In [None]:
import math
from functools import reduce
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
# enable_iterative_imputer must be imported before IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

In [None]:
# Load the three CSV inputs; paths are relative to the notebook's working directory.
# `df` is the main table analysed below (it has 'Country Name', 'Country Code',
# 'Indicator Name' and 'Indicator Code' columns, see the cells that follow).
df = pd.read_csv('dataset/worldbank/API.csv')
# Companion metadata files -- presumably one row per country / per indicator; verify against the files.
meta_country = pd.read_csv('dataset/worldbank/Metadata_Country_API_19_DS2_en_csv_v2_3159902.csv')
meta_indicator = pd.read_csv('dataset/worldbank/Metadata_Indicator_API_19_DS2_en_csv_v2_3159902.csv')

## Dataset Overview

In [None]:
df.info()

In [None]:
df.describe()

## Missing value identification

In [None]:
# function to count missing value
def get_nan(df, sort=False):
    """Print how many rows contain missing values and the NaN count per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    sort : bool, default False
        When True, the per-column counts are printed in ascending order.

    Returns
    -------
    int or None
        0 when no row has a missing value (the per-column counts are then
        skipped); otherwise None after printing the counts.
    """
    rows = int(df.isna().any(axis=1).sum())
    print(rows, "rows with missing values")

    # The guard used to read `rows < 0`, which can never be true for a count,
    # so the early exit for a complete frame was unreachable.
    if rows == 0:
        return 0

    per_column = df.isna().sum()
    print(per_column.sort_values() if sort else per_column)
        

In [None]:
get_nan(df, sort=True)

In [None]:
# function to visualize the proportion of missing value
def visual_nan(df, name="", isSorted=False, savePlot=True):
    """Plot, per column, the proportion of missing vs. present values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    name : str, default ""
        Label used in the plot title; the part before the first '.' is also
        used as the output file stem.
    isSorted : bool, default False
        When True, columns are ordered by ascending missing-value count and
        '_sorted' is appended to the output file stem.
    savePlot : bool, default True
        When True, the figure is written to ``plot/<stem><suffix>.png``.

    Returns
    -------
    The object returned by ``sns.displot``.
    """
    sort = ''
    if isSorted:
        df = df[df.isna().sum().sort_values().keys()]
        sort = '_sorted'

    # Pin var_name: melt otherwise names the column after df.columns.name,
    # which would break the y="variable" lookup below.
    missing_value = df.isna().melt(var_name="variable", value_name="missing")

    ax = sns.displot(data=missing_value,
                     y="variable",
                     hue="missing",
                     multiple="fill",
                     height=10,
                     aspect=1.5)
    plt.xlabel("Proportion of missing value")
    plt.title(f"Missing values of {name}")

    if savePlot:
        filename = name.split('.')
        # savefig does not create missing directories, so make sure it exists.
        out_dir = Path("plot")
        out_dir.mkdir(parents=True, exist_ok=True)
        plt.savefig(out_dir / f"{filename[0]}{sort}.png")

    return ax

In [None]:
visual_nan(df, 'API.csv', True)

In [None]:
# Per-column missing-value count and its share of all rows, in percent.
# The denominator is taken from the frame itself rather than a hard-coded
# row count, so the ratios stay correct if the input file changes.
missing = (
    df.isna()
    .sum()
    .rename_axis('col')
    .reset_index(name='missing')
)
missing['ratio'] = (missing['missing'] / len(df) * 100).round(2)

missing.sort_values(by=['ratio'])

In [None]:
missing.ratio.hist(bins=10)

In [None]:
# Distinct values of 'Country Name', followed by how many there are.
unique_countries = df['Country Name'].unique()
print(unique_countries, len(unique_countries), sep='\n')

In [None]:
# Distinct values of 'Country Code', followed by how many there are.
unique_country_codes = df['Country Code'].unique()
print(unique_country_codes, len(unique_country_codes), sep='\n')

In [None]:
# Distinct values of 'Indicator Name', followed by how many there are.
unique_indicator_names = df['Indicator Name'].unique()
print(unique_indicator_names, len(unique_indicator_names), sep='\n')

In [None]:
# Distinct values of 'Indicator Code', followed by how many there are.
unique_indicator_codes = df['Indicator Code'].unique()
print(unique_indicator_codes, len(unique_indicator_codes), sep='\n')

In [None]:
px.histogram(df, x="Country Name")

In [None]:
px.histogram(df, x="Country Code")

In [None]:
px.histogram(df, x="Indicator Name")

In [None]:
px.histogram(df, x="Indicator Code")

## Top countries that produce the most Carbon Dioxide 

The three countries examined below are China, the United States, and India.

In [None]:
def get_country_data(df, country_name, keep_indicator_code=False):
    """Return one country's rows reshaped to one row per year, one column per indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with 'Country Name', 'Country Code', 'Indicator Name' and
        'Indicator Code' columns; every remaining column becomes a row of the
        result.
    country_name : str
        Value matched exactly against 'Country Name'.
    keep_indicator_code : bool, default False
        When True the result's columns are labelled by 'Indicator Code',
        otherwise by 'Indicator Name'.

    Returns
    -------
    pandas.DataFrame or int
        The reshaped frame: a 'Year' column holding the original column labels,
        followed by one numeric column per indicator (non-numeric values become
        NaN). Returns 0 when no row matches ``country_name``.
    """
    df_country = df[df['Country Name'] == country_name]
    if df_country.empty:
        return 0  # country name does not exist

    if keep_indicator_code:
        header_to_keep, header_to_drop = 'Indicator Code', 'Indicator Name'
    else:
        header_to_keep, header_to_drop = 'Indicator Name', 'Indicator Code'

    # Rows become columns: after the transpose + reset_index the first row
    # carries the indicator labels and the first column the old column labels.
    # (A former `set_index(...)` call here discarded its result and was a no-op.)
    df_country = (
        df_country
        .drop(columns=['Country Name', 'Country Code', header_to_drop])
        .reset_index(drop=True)
        .transpose()
        .reset_index()
    )

    new_header = df_country.iloc[0]  # grab the first row for the header
    # Explicit copy so the assignments below never write into a slice view.
    df_country = df_country.iloc[1:].copy()
    df_country.columns = new_header  # set the header row as the df header
    df_country = df_country.rename(columns={header_to_keep: 'Year'})

    # Convert every column except 'Year' to numeric, one whole column at a time.
    for header in df_country.columns.tolist()[1:]:
        df_country[header] = pd.to_numeric(df_country[header], errors='coerce')

    return df_country

In [None]:
# function to get indicator name
def indicator_code_to_name(code):
    """Return the INDICATOR_NAME of the first `meta_indicator` row whose INDICATOR_CODE equals `code`.

    Raises
    ------
    KeyError
        If no row of `meta_indicator` has that code.
    """
    names = meta_indicator.loc[meta_indicator['INDICATOR_CODE'] == code, 'INDICATOR_NAME']
    if names.empty:
        raise KeyError(f"Unknown indicator code: {code!r}")
    # Positional access: the filtered Series keeps the original row labels, so
    # `[0]` only worked when the match happened to sit at index label 0.
    return str(names.iloc[0])

# function to get indicator code
def indicator_name_to_code(name):
    """Return the INDICATOR_CODE of the first `meta_indicator` row whose INDICATOR_NAME equals `name`.

    Raises
    ------
    KeyError
        If no row of `meta_indicator` has that name.
    """
    codes = meta_indicator.loc[meta_indicator['INDICATOR_NAME'] == name, 'INDICATOR_CODE']
    if codes.empty:
        raise KeyError(f"Unknown indicator name: {name!r}")
    # Positional access: the filtered Series keeps the original row labels, so
    # `[0]` only worked when the match happened to sit at index label 0.
    return str(codes.iloc[0])
    

In [None]:
df_china = get_country_data(df, "China", True)
df_china

In [None]:
def missing_ratio_summary(df):
    """Display each column's missing-value count and percentage, sorted by percentage.

    The displayed frame has the columns 'col' (column label of `df`),
    'missing' (NaN count) and 'ratio' (NaN count as a percentage of
    ``len(df)``, rounded to two decimals). Nothing is returned.
    """
    # rename_axis pins the label column to 'col' regardless of how df.columns
    # is named; renaming the key 0 only worked for frames whose columns index
    # was named 0.
    missing = df.isna().sum().rename_axis('col').reset_index(name='missing')
    missing['ratio'] = (missing['missing'] / len(df) * 100).round(2)

    display(missing.sort_values(by=['ratio']))

In [None]:
df_china.info()

In [None]:
df_china.describe()

In [None]:
missing_ratio_summary(df_china)

In [None]:
df_china.describe()

In [None]:
missing_ratio_summary(df_china)

In [None]:
# Number of missing values in each column of df_china
null_china = df_china.isna().sum()

# create figure with a single axes filling the whole canvas
fig = plt.figure(figsize=(14, 8), dpi=80)
ax = fig.add_axes([0,0,1,1])
# set Y axis label
ax.set_ylabel('count')
# set orientation for X axis labels (pyplot call: acts on the current axes, i.e. `ax`)
plt.xticks(rotation='vertical')
# draw bar chart: one bar per column, height = its missing-value count
ax.bar(df_china.columns, null_china)
plt.show()

In [None]:
# Percentage of missing values in each column of df_china
china_percent_missing = df_china.isnull().sum() * 100 / len(df_china)
china_missing_value_df = pd.DataFrame({'column_name': df_china.columns,
                                 'percent_missing': china_percent_missing})

# NOTE(review): the sorted frame is not used by the plot below, which draws the
# unsorted `china_percent_missing` -- confirm whether a sorted chart was intended.
china_missing_value_df.sort_values('percent_missing', inplace=True)
# NOTE(review): same computation as `china_percent_missing` above; not used in this cell.
percent_missing = df_china.isnull().sum() * 100 / len(df_china)

# create figure with a single axes filling the whole canvas
fig = plt.figure(figsize=(14, 8), dpi=80)
ax = fig.add_axes([0,0,1,1])
# set Y axis label
ax.set_ylabel('Percentage')
# set orientation for X axis labels (pyplot call: acts on the current axes, i.e. `ax`)
plt.xticks(rotation='vertical')
# draw bar chart: one bar per column, height = its percentage of missing values
ax.bar(df_china.columns, china_percent_missing)
plt.show()

In [None]:
china_percent_missing

In [None]:
df_us = get_country_data(df, "United States", True)
df_us

In [None]:
missing_ratio_summary(df_us)

In [None]:
df_india = get_country_data(df, "India", True)
df_india

In [None]:
# Summarise the India frame built in the previous cell (the US frame is already summarised above).
missing_ratio_summary(df_india)

### Drop Columns with Missing Values > 70%

In [None]:
# Percentage of missing values at which a column is considered too sparse.
perc = 70.0


def _min_non_null(frame):
    """Smallest non-null count a column of `frame` needs in order to be kept.

    Because of the `+ 1`, a column survives only if strictly more than
    (100 - perc)% of its values are non-null.
    """
    return int(((100 - perc) / 100) * frame.shape[0] + 1)


# dropna(axis=1, thresh=k) keeps the columns that have at least k non-null values.
min_count = _min_non_null(df_china)
df_china = df_china.dropna(axis=1, thresh=min_count)

min_count = _min_non_null(df_us)
df_us = df_us.dropna(axis=1, thresh=min_count)

min_count = _min_non_null(df_india)
df_india = df_india.dropna(axis=1, thresh=min_count)

In [None]:
df_china.head()

### Dataframe Intersection

In [None]:
# Column labels present in all three country frames. np.intersect1d returns the
# sorted, unique common values, folded pairwise over the three column indexes.
# (`reduce` is imported in the notebook's top import cell.)
common_cols = reduce(np.intersect1d, (df_china.columns, df_us.columns, df_india.columns))

In [None]:
common_cols

### Multivariate Imputation

Next, we use multivariate imputation to fill in the missing values.

In [None]:
print(df_china.shape)
print(df_us.shape)
print(df_india.shape)

In [None]:
print(df_china.shape[0] *0.8)
print(df_us.shape[0] *0.8)
print(df_india.shape[0] *0.8)

In [None]:
def multivariate_imputation(df, n_test_rows=25):
    """Fit an IterativeImputer on the leading rows of `df` and impute the trailing rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'iso_code', 'country' and 'year' columns. The first
        three columns are excluded from imputation; all columns from the fourth
        onward are passed to the imputer.
    n_test_rows : int, default 25
        Number of trailing rows to impute; the rows before them are used to fit
        the imputer.

    Returns
    -------
    pandas.DataFrame
        One row per imputed row (fresh 0-based index), with the same column
        order as `df`; 'iso_code', 'country' and 'year' are copied from the
        imputed rows.

    Raises
    ------
    ValueError
        If `n_test_rows` does not leave at least one row for fitting and one
        row for imputation.
    """
    if not 0 < n_test_rows < len(df):
        raise ValueError(
            f"n_test_rows must be between 1 and {len(df) - 1}, got {n_test_rows}"
        )

    df_train = df.iloc[:-n_test_rows]
    df_test = df.iloc[-n_test_rows:]  # trailing rows (last 25 years by default)

    print(f'Training years: {df_train.shape[0]}')
    print(f'Test years: {df_test.shape[0]}')

    imp = IterativeImputer(max_iter=10, random_state=2022)
    imp.fit(df_train.iloc[:, 3:])

    # NOTE(review): scikit-learn documents that features entirely missing at fit
    # time are dropped by transform unless keep_empty_features is set; if that
    # happens the column count below will not match -- confirm for this data.
    df_predict_array = imp.transform(df_test.iloc[:, 3:])

    columns = df.columns.tolist()[3:]
    df_predict = pd.DataFrame(df_predict_array, columns=columns)  # array to df

    # Copy identifiers by position from the imputed rows. Assigning
    # df['iso_code'] directly aligned on index labels and therefore picked the
    # first rows of `df` (or NaN when `df` is not 0-indexed).
    for id_col in ('iso_code', 'country', 'year'):
        df_predict[id_col] = df_test[id_col].to_list()

    return df_predict.reindex(columns=df_test.columns.tolist())