# Data Mining Final Project
James Crim's copy/version of the script

Teammates are Amy Sharp and Chirag Lakhanpal!

## Setup

In [49]:
# widen the notebook container so wide tables and plots use most of the browser window
# (import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated)
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [50]:
# import packages
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import wbgapi as wb
import statsmodels.api as sm
import sklearn as skl

# sklearn subpackages
from sklearn import model_selection as mdl_slct
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import RepeatedKFold
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# make every figure wide by default (figsize is [width, height])
plt.rcParams.update({'figure.figsize': [24, 6]})

# fix NumPy's global random seed so repeated runs draw the same random numbers
np.random.seed(seed=2022)

## Importing and Cleaning Data

In [51]:
# configuration for the World Bank API download

# country names are read from the first column of a header-less CSV and
# converted to the country codes used to request data from the World Bank
country_names_raw = pd.read_csv('CountrUsedN(1).csv', header=None)
country_names = country_names_raw.iloc[:, 0].tolist()
country_codes_list = [*wb.economy.coder(country_names).values()]
country_codes_set = {*country_codes_list}

# years of interest ("since 1994"): 1994 through 2022 inclusive
year_list = range(1994, 2022 + 1)

# the seven indexes to use (copied from WorldBankIndUsed.docx from the Data Mining Blackboard)
# note: CO2 emissions per capita (CO2 kt / pop) is the outcome variable,
# the other variables are predictors
world_bank_index_codes = [
    'EN.ATM.CO2E.KT',
    'NY.GDP.MKTP.PP.KD',
    'SP.POP.TOTL',
    'EG.FEC.RNEW.ZS',
    'SP.URB.TOTL.IN.ZS',
    'NV.MNF.TECH.ZS.UN',
    'NE.TRD.GNFS.ZS',
]

# hand-typed display names, in the same order as the codes above (will probably be unused)
# NOTE(review): the GDP entries say "per Capita", but the code NY.GDP.MKTP.PP.KD
# looks like the total-GDP (PPP) series rather than a per-capita one - confirm
world_bank_index_names_long = [
    'CO2 Emissions (kt)',
    'GDP per Capita, PPP (Constant 2017 International $)',
    'Population, Total',
    'Renewable Energy Consumption (% of Total Final Energy)',
    'Urban Population (% of Total Population)',
    'Medium and High-Tech Manufacturing Value Added (% Manufacturing Value Added)',
    'Trade (% of GDP)',
]
world_bank_index_names_short = [
    'CO2 Emissions',
    'GDP per Capita',
    'Population',
    '% Renewable Energy',
    '% Urban',
    '% Med/High Manufacturing',
    '% Trade of GDP',
]

In [52]:
# import the data from the API
# URL of API package: https://pypi.org/project/wbgapi/
raw_dat = wb.data.DataFrame(world_bank_index_codes, country_codes_set, year_list, labels=True)

# swap around the way the three dimensions are multi-indexed such that country and then year
# are on the x axis (rows) and economic indicators are on the y axis (columns)
reordered_data = raw_dat.pivot(index='Series', columns='Country').transpose().swaplevel().sort_index()

# renaming columns to shorter names (unused)
# NOTE(review): if this is ever re-enabled, check the column order first - after the pivot
# the columns may no longer be in the same order as world_bank_index_codes
#dat.columns = world_bank_index_names_short

# remove missing data
# (start from reordered_data: `dat` is not defined anywhere earlier in this cell, so calling
# dat.dropna() here raised a NameError on a fresh kernel, or cleaned a stale `dat` left over
# from a previous run instead of the data just downloaded)
dat = reordered_data.dropna()

# detect any missing data, all zeros means no missing data
# (printed explicitly: a bare expression in the middle of a cell is not displayed)
print(dat.isna().sum())

# save created data
# NOTE(review): this still pickles reordered_data (rows with missing values included), exactly
# as before, so the cell that reloads this file gets the uncleaned data - confirm whether the
# cleaned `dat` should be saved instead
reordered_data.to_pickle("Data Mining Final Project Starting Data (WB Data).pkl")

In [53]:
# load created data
# Reads the pickle written by the import cell above (same file name) into the
# module-level `dat`, which the get_* helper functions below read as a global.
# Only load pickle files you created yourself: unpickling untrusted files is unsafe.
dat = pd.read_pickle("Data Mining Final Project Starting Data (WB Data).pkl")

## Functions to Get Subsections of the Cleaned Data

In [78]:
# function to get a 2D dataframe of all country and measure data from a specific, single year
def get_year_data(year):
    """Return the rows of the global `dat` for one year as a DataFrame.

    `year` is turned into the label 'YR<year>' (e.g. 2003 -> 'YR2003') and
    looked up on `dat` after its two row-index levels have been swapped.
    Assumes the rows of `dat` are indexed by (country, year) - TODO confirm.
    """
    year_label = f'YR{year}'
    year_first = dat.swaplevel()
    return pd.DataFrame(year_first.loc[year_label])

# function to get a 2D dataframe of all years and measure data from a specific, single country
def get_country_data(country):
    """Return whatever the global `dat` holds under the row label `country`.

    The label is looked up with `dat.loc`, so it must match the first level
    of the row index exactly (e.g. 'United States').
    """
    country_rows = dat.loc[country]
    return country_rows

# function to get a single-measure dataframe (rows keep the multi-index) for all years and all countries
def get_measure_data(measure_name):
    """Return the `measure_name` column of the global `dat`, wrapped in a DataFrame.

    `measure_name` must match a column label of `dat` (e.g. 'Population, total').
    All rows are kept.
    """
    measure_column = dat.loc[:, measure_name]
    return pd.DataFrame(measure_column)

# function to get a single-measure dataframe (rows keep the multi-index) of country data
# from a specific range of years
def get_specific_measures_for_specific_years(measure_name, year_start, year_end):
    """Return one measure from the global `dat` for an inclusive range of years.

    Parameters
    ----------
    measure_name : column label of `dat` to select (e.g. 'Population, total').
    year_start : first year to include (int).
    year_end : last year to include (int, inclusive).

    Returns
    -------
    pandas.DataFrame holding the selected measure for the rows whose year label
    is 'YR<year_start>' ... 'YR<year_end>'. The row index levels are swapped
    relative to `dat`, because the lookup is done on `dat.swaplevel()`.

    Raises
    ------
    ValueError : if `year_end` is earlier than `year_start`.
    """
    if year_end < year_start:
        raise ValueError(
            f"year_end ({year_end}) must not be earlier than year_start ({year_start})"
        )

    # year labels in the data look like 'YR1994'; range() excludes its stop value,
    # hence the + 1 to keep year_end in the selection
    years_of_interest = [f'YR{year}' for year in range(year_start, year_end + 1)]
    return pd.DataFrame(dat.swaplevel().loc[years_of_interest, measure_name])

# example use of functions
#get_country_data('United States')
#get_year_data(2003)
#get_measure_data('Population, total')
#get_specific_measures_for_specific_years(measure_name='Population, total', year_start=1994, year_end=2008)

# Instructions

Note: The following instructions are copied directly from Blackboard and the files downloaded from there about our final project.

Assigned Model:

Mixed Effects Random Forests in Python

Provided Link: https://towardsdatascience.com/mixed-effects-random-forests-6ecbb85cb177

World Bank Data

Consider

1. around 58 countries (subject to data availability), the list of countries to start from is attached 

2. macroeconomic indexes (attached)

3. measured since 1994.
 
CO2 emission per capita, is a response, all other indexes are covariates.   
 
Tasks

1. Study patterns of heterogeneity in relationships between the response and covariates.

2. Build a model to predict a response.

Deadlines:

1. Dec 8th (Thursday): In-class presentation
2. Dec 20th (Tuesday): Final report due