In [108]:
# Notes on how missing values are encoded in each dataset:
# - Emissions: both "" and "--" mean missing
# - Energy_use: "" means missing; Energy_consumption: "--" means missing
# - Fossil_fuel_energy: both 0 and "" mean missing; this dataset can be omitted in favour of Renewable_energy (too much missing data)
# - GDP and Population: "--" means missing
# - Renewable_energy_consumption: "" means missing
# - Rents: "" means missing
import pandas as pd
import os
import numpy as np

import missingno as mno
from sklearn.impute import KNNImputer
from sklearn import linear_model

from sklearn.preprocessing import PolynomialFeatures

In [109]:
# Reference table of countries read from countries.xlsx. getData() keeps only the
# rows of each dataset whose "Code" value appears in this table's "Code" column.
countries = pd.read_excel("./countries.xlsx")

def getData(file):
    """Read one dataset CSV and keep only the countries in the reference list.

    Parameters
    ----------
    file : str or file-like
        Passed straight to ``pd.read_csv``; the first line of the file is
        skipped (``skiprows=1``).

    Returns
    -------
    pandas.DataFrame
        Rows whose "Code" appears in the module-level ``countries["Code"]``,
        with whitespace stripped from "Country" and a fresh 0..n-1 index.
    """
    data = pd.read_csv(file, skiprows=1)

    # The vectorised .str accessor leaves a missing name as NaN, whereas
    # calling .strip() on each element raises on a NaN (float) entry.
    data["Country"] = data["Country"].str.strip()

    # Keep only rows whose code is in the reference list, then renumber them.
    in_reference = data["Code"].isin(countries["Code"])
    data = data[in_reference].reset_index(drop=True)

    return data

In [110]:
# Process all files and add them to a list of dataframes for easy manipulation.
# df_list[i] is the dataframe loaded from the file named df_names[i].

df_list = []
df_names = []
# os.listdir() returns entries in arbitrary order; sorting keeps the positional
# indexes used in other cells (e.g. df_list[0]) pointing at the same dataset
# on every machine and every run.
for filename in sorted(os.listdir("data_processed/")):
    path = os.path.join("data_processed/", filename)
    df_list.append(getData(path))
    df_names.append(filename)

# List whatever was actually loaded rather than assuming exactly 15 files.
for name in df_names:
    print(name)

#mno.matrix(df_list[7], figsize = (10, 3))
#mno.matrix(df_list[8], figsize = (10, 3))

Emissions_Coal.csv
Emissions_GHG_(fromCAIT).csv
Emissions_Natural_Gas.csv
Emissions_Petroleum_other.csv
Emissions_Total.csv
Energy_Consumption_per_Capita.csv
Energy_consumption_per_GDP.csv
Energy_use(kg-of-oil-equivalent-per-capita).csv
Fossil_fuel_energy_consumption(%_of_total).csv
GDP.csv
Population.csv
Renewable_energy_consumption(%_of_total_final_energy_consumption).csv
Rents_Coal.csv
Rents_NaturalGas.csv
Rents_Oil.csv


### KNN imputation

In [79]:
def KNNimpute(dataframe):
    """Fill missing values with scikit-learn's KNNImputer (3 neighbours).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'Country' and 'Code' columns; every other column is
        treated as a value column in which '' and '--' mark missing entries.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``dataframe`` with a fresh 0..n-1 index and the value
        columns imputed as floats. Rows with no observed value at all are
        returned with their values still missing.
    """
    values = dataframe.drop(columns=['Code', 'Country'])
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    values = values.replace(['', '--'], np.nan).astype(float)

    # KNNImputer treats columns as features, so transpose to make every input
    # row (one Country/Code entry) a feature.
    by_feature = values.transpose()

    # A feature that is entirely missing is dropped from KNNImputer's output by
    # default; that would shift every later row against its Country/Code.
    # Impute only the rows that have at least one observation.
    has_data = by_feature.notna().any(axis=0).to_numpy()

    imputed = values.to_numpy(copy=True)
    if has_data.any():
        imputer = KNNImputer(n_neighbors=3)
        imputed[has_data] = imputer.fit_transform(by_feature.loc[:, has_data]).T

    # Write the values back by position so the result does not depend on the
    # index labels of the input (e.g. when a filtered frame is passed in).
    result = dataframe.reset_index(drop=True)
    result[list(values.columns)] = imputed
    return result

In [None]:
# Preview: run the KNN imputation on the first 15 rows of the first loaded dataset.
KNNimpute(df_list[0].head(15))


### Inter/Extra-polation method with added noise

In [None]:
# https://www.kaggle.com/shashankasubrahmanya/missing-data-imputation-using-regression

In [111]:
def clean_df(temp):
    """Return the value columns of ``temp`` transposed, with gaps as NaN.

    Drops the 'Country' and 'Code' columns, transposes the remainder (so the
    former value-column labels become the index and each input row becomes a
    column), and replaces the placeholders '' and '--' with NaN. The input
    frame is not modified.
    """
    temp = temp.drop(columns=['Country', 'Code'])
    temp = temp.transpose()
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    temp = temp.replace(['', '--'], np.nan)
    return temp

# Fill gaps with values drawn at random (with replacement) from the observed ones
np.random.seed(42)
def random_imputation(df, df_imp, feature):
    """Fill the gaps of one column with random draws from its observed values.

    The rows where ``df[feature]`` is null are located, and the same rows of
    ``df_imp[feature]`` are overwritten in place with values sampled (with
    replacement, via the global NumPy random state) from the non-null entries
    of ``df[feature]``. Returns ``df_imp[feature]``.
    """
    is_missing = df[feature].isnull()
    donors = df.loc[df[feature].notnull(), feature]
    draws = np.random.choice(donors, is_missing.sum(), replace=True)
    df_imp.loc[is_missing, feature] = draws
    return df_imp[feature]

In [None]:
# Preview the stochastic-regression imputation on the first loaded dataset.
# NOTE(review): StochasticRegression is defined in the following cell, so on a
# fresh kernel this cell only works after that definition has been run.
StochasticRegression(df_list[0]).head(15)

In [None]:
def StochasticRegression(dataframe):
    """Impute missing values by regression on the other series plus noise.

    After ``clean_df`` every input row is one series (column). Each gap is
    first seeded with a random draw from that series' observed values. Each
    series is then regressed on the seeded versions of all the others, and
    every gap is replaced by ``prediction + Gaussian noise`` whenever that
    value is positive; otherwise the random draw is kept. The noise scale is
    the standard deviation of the regression residuals at the observed points.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'Country' and 'Code' columns; every other column is a
        value column in which '' and '--' mark missing entries.

    Returns
    -------
    pandas.DataFrame
        Same column labels as ``dataframe`` with a fresh 0..n-1 index. Series
        with no observed value at all are returned entirely missing.
    """
    df = clean_df(dataframe.copy()).astype(float)
    df.columns = df.columns.map(str)
    all_columns = list(df.columns)
    # A series with no observation can neither be seeded nor used as a
    # predictor, so it is carried through untouched.
    missing_columns = [name for name in all_columns if df[name].notnull().any()]

    # Seed every gap with a random draw from the same series. random_imputation
    # fills its second argument in place and returns the filled column, so it
    # is handed a scratch copy of the single column.
    for feature in missing_columns:
        if df[feature].isnull().any():
            df[feature + '_imp'] = random_imputation(df, df[[feature]].copy(), feature)
        else:
            df[feature + '_imp'] = df[feature]

    random_data = pd.DataFrame(index=df.index,
                               columns=["Ran" + name for name in all_columns],
                               dtype=float)

    for feature in missing_columns:
        target = feature + '_imp'
        random_data["Ran" + feature] = df[target]

        is_missing = df[feature].isnull().to_numpy()
        # Ordered list (not a set) so the design matrix is identical on every run.
        parameters = [name + '_imp' for name in missing_columns if name != feature]
        if not is_missing.any() or not parameters:
            continue

        model = linear_model.LinearRegression()
        model.fit(X = df[parameters], y = df[target])

        # Standard error of the regression = std of the residuals at observed points
        predict = model.predict(df[parameters])
        residuals = predict[~is_missing] - df.loc[~is_missing, target]
        std_error = residuals.std()

        random_predict = np.random.normal(size = df[feature].shape[0],
                                          loc = predict,
                                          scale = std_error)
        # Only positive draws replace the seed value.
        replace_mask = is_missing & (random_predict > 0)
        random_data.loc[replace_mask, "Ran" + feature] = random_predict[replace_mask]

    random_data = random_data.transpose()
    random_data.reset_index(drop=True, inplace=True)
    # Positional (array) insert: does not depend on the input's index labels.
    random_data.insert(0, 'c1', dataframe['Country'].to_numpy())
    random_data.insert(1, 'c2', dataframe['Code'].to_numpy())
    random_data.columns = dataframe.columns
    return random_data

In [141]:
def LinReg_imp(dataframe):
    """Impute missing values with a per-row straight-line fit.

    For every input row, a linear regression of the observed values on the
    value-column labels (converted to float) is fitted, and the missing
    entries are replaced by its predictions.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'Country' and 'Code' columns; the labels of the remaining
        columns must be convertible to float, and '' / '--' mark missing
        entries.

    Returns
    -------
    pandas.DataFrame
        Same column labels as ``dataframe`` with a fresh 0..n-1 index. Rows
        that are complete, or that have no observation to fit on, are
        returned unchanged.
    """
    df = clean_df(dataframe.copy()).astype(float)
    # Regressor: the value-column labels, as one numeric column.
    x_all = df.index.to_numpy().astype(float).reshape(-1, 1)

    for feature in df.columns:
        is_missing = df[feature].isnull().to_numpy()
        # Skip when there is nothing to impute, or nothing to learn from
        # (fitting on zero samples raises).
        if not is_missing.any() or is_missing.all():
            continue

        # train on available data, then predict missing data
        model = linear_model.LinearRegression()
        model.fit(x_all[~is_missing], df[feature].to_numpy()[~is_missing])
        df.loc[is_missing, feature] = model.predict(x_all[is_missing])

    temp = df.transpose()
    temp.reset_index(drop=True, inplace=True)
    # Positional (array) insert: does not depend on the input's index labels.
    temp.insert(0, 'c1', dataframe['Country'].to_numpy())
    temp.insert(1, 'c2', dataframe['Code'].to_numpy())
    temp.columns = dataframe.columns

    return temp


In [144]:
# Using n-th degree polynomial regression
def PolyReg_imp(dataframe, poly_degrees):
    """Impute missing values with a per-row polynomial fit.

    For every input row, a polynomial of degree ``poly_degrees`` in the
    value-column labels (converted to float) is fitted to the observed
    values, and the missing entries are replaced by its predictions.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'Country' and 'Code' columns; the labels of the remaining
        columns must be convertible to float, and '' / '--' mark missing
        entries.
    poly_degrees : int
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    pandas.DataFrame
        Same column labels as ``dataframe`` with a fresh 0..n-1 index. Rows
        that are complete, or that have no observation to fit on, are
        returned unchanged.
    """
    df = clean_df(dataframe.copy()).astype(float)
    # Regressor: the value-column labels, as one numeric column.
    # NOTE(review): if these labels are large numbers (e.g. calendar years),
    # high powers of them are badly conditioned; consider centring x first.
    x_all = df.index.to_numpy().astype(float).reshape(-1, 1)

    for feature in df.columns:
        is_missing = df[feature].isnull().to_numpy()
        # Skip when there is nothing to impute, or nothing to learn from
        # (fitting on zero samples raises).
        if not is_missing.any() or is_missing.all():
            continue

        # train on available data, then predict missing data
        poly = PolynomialFeatures(degree = poly_degrees)
        x_train = poly.fit_transform(x_all[~is_missing])

        model = linear_model.LinearRegression()
        model.fit(x_train, df[feature].to_numpy()[~is_missing])

        # Reuse the fitted expander: transform only, no refit at prediction time.
        x_predict = poly.transform(x_all[is_missing])
        df.loc[is_missing, feature] = model.predict(x_predict)

    temp = df.transpose()
    temp.reset_index(drop=True, inplace=True)
    # Positional (array) insert: does not depend on the input's index labels.
    temp.insert(0, 'c1', dataframe['Country'].to_numpy())
    temp.insert(1, 'c2', dataframe['Code'].to_numpy())
    temp.columns = dataframe.columns

    return temp


In [None]:
# Count missing values per column of `df` and display only the columns that have any.
# NOTE(review): `df` is not assigned at notebook level in the cells visible here
# (it only exists as a local inside the imputation functions) — confirm which
# frame this cell is meant to inspect.
xx = df.isnull().sum(axis = 0)
xx = xx[xx>0]
xx