In [19]:
# Notes: For Emissions: "" and -- means missing
# For Energy_Use, "" is missing, For Energy_consumption -- means missing
# For Fossil_fuel_Energy both 0 and "" mean missing, can actually omit this and use renewable_energy instead (too much missing data)
#for GDP and Population -- means missing
# For Renewable_energy_cons "" means missing
# for Rents "" missing
import pandas as pd
import os
import numpy as np

import missingno as mno
from sklearn.impute import KNNImputer
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Load all data and keep only those whose ISO matches all the datasets
countries = pd.read_excel("./countries.xlsx")

def getData(file):
    data = pd.read_csv(file, skiprows=1)
    
    data["Country"]=data["Country"].apply(lambda x: x.strip()) #strip trailing spaces
    data = data[data["Code"].isin(countries["Code"])] # check to see if it is in list
    data.reset_index(drop=True,inplace=True)
    
    return data

In [3]:
#Process all files and add them to a list of dataframes for easy manipulation

df_list = []
df_names = []
for filename in os.listdir("data_processed/"):
    #print(filename)
    path = "data_processed/" + filename
    df_list.append(getData(path))
    df_names.append(filename)

for i in range(15):
    #print(df_list[i].shape)
    print(df_names[i])

#mno.matrix(df_list[7], figsize = (10, 3))
#mno.matrix(df_list[8], figsize = (10, 3))

Emissions_Coal.csv
Emissions_GHG_(fromCAIT).csv
Emissions_Natural_Gas.csv
Emissions_Petroleum_other.csv
Emissions_Total.csv
Energy_Consumption_per_Capita.csv
Energy_consumption_per_GDP.csv
Energy_use(kg-of-oil-equivalent-per-capita).csv
Fossil_fuel_energy_consumption(%_of_total).csv
GDP.csv
Population.csv
Renewable_energy_consumption(%_of_total_final_energy_consumption).csv
Rents_Coal.csv
Rents_NaturalGas.csv
Rents_Oil.csv


### KNN imputation

In [4]:
def KNNimpute(dataframe):
    temp = dataframe
    temp = temp.drop(columns=['Code','Country'])
    temp = temp.transpose()

    temp = temp.replace('', np.NaN)
    temp = temp.replace('--', np.NaN)
     
    #impute
    imputer = KNNImputer(n_neighbors = 3)
    temp = pd.DataFrame(imputer.fit_transform(temp))
    
    #transpose back and add columns names and index back
    temp = temp.transpose()
    temp.insert(0,'c1',dataframe.Country)
    temp.insert(1,'c2',dataframe.Code)
    temp.columns = dataframe.columns
    return temp

In [5]:
KNNimpute(df_list[0].head(15))


Unnamed: 0,Country,Code,1990,1991,1992,1993,1994,1995,1996,1997,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Albania,ALB,3.130703,1.779817,0.502372,0.332177,0.269835,0.205073,0.16131,0.119023,...,0.329973,0.309336,0.277316,0.246716,0.311598,0.264177,0.205667,0.204719,0.20428,0.234532
1,Algeria,DZA,3.136276,2.898887,2.687475,2.110768,2.150592,1.284432,1.44313,1.537015,...,1.099877,1.407936,1.388799,0.982267,0.833787,0.513685,0.396965,0.410189,0.232647,0.352335
2,Angola,AGO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Antigua and Barbuda,ATG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Argentina,ARG,3.373197,2.903367,2.906775,2.671719,3.631131,3.278262,3.042341,2.907947,...,3.664644,3.937608,4.320512,4.357923,4.120342,3.856025,3.725291,3.656726,3.053126,1.97429
5,Armenia,ARM,0.220518,0.164039,0.351333,0.220518,0.089702,0.05108,0.040175,0.02927,...,0.005791,0.007508,0.006603,0.003152,0.002484,0.002798,0.002849,0.003707,0.004417,0.011523
6,Australia,AUS,155.822075,158.44184,162.299485,157.161279,157.663172,164.275304,175.733781,180.658842,...,193.453158,184.804494,180.995687,168.465633,160.055324,167.850923,166.152335,160.700259,146.703499,141.675568
7,Austria,AUT,14.127443,15.232404,10.807675,9.358016,9.760491,11.893954,12.51665,12.795775,...,12.262878,12.373503,11.797346,12.090293,11.263249,12.029014,11.297731,11.916427,10.410354,11.069428
8,Azerbaijan,AZE,0.025163,0.026736,0.063694,0.009436,0.002359,0.014154,0.004718,0.007304,...,0.017598,0.013336,0.01388,0.012824,0.004828,0.000342,0.000733,0.006178,0.005768,0.029498
9,Bahrain,BHR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Inter/Extra-polation method with added noise

In [6]:
# https://www.kaggle.com/shashankasubrahmanya/missing-data-imputation-using-regression

In [7]:
def clean_df(temp):
    temp = temp.drop(columns=['Country','Code'])
    temp = temp.transpose()
    temp = temp.replace('', np.NaN)
    temp = temp.replace('--', np.NaN)
    return temp

#randomly impute missing values by sampling from original 
np.random.seed(42)
def random_imputation(df, df_imp, feature):
    number_missing = df[feature].isnull().sum()
    observed_values = df.loc[df[feature].notnull(), feature]
    df_imp.loc[df[feature].isnull(), feature] = np.random.choice(observed_values, number_missing, replace = True)
    
    return df_imp[feature]

In [8]:
def StochasticRegression(dataframe):    
    df = clean_df(dataframe.copy())
    df.columns = df.columns.map(str)
    missing_columns = df.columns

    for feature in missing_columns:
        df[feature + '_imp'] = df[feature]
        df = random_imputation(df, feature)

    random_data = pd.DataFrame(columns = ["Ran" + name for name in missing_columns])

    for feature in missing_columns:

        random_data["Ran" + feature] = df[feature + '_imp']
        parameters = list(set(df.columns) - set(missing_columns) - {feature + '_imp'})
        
        model = linear_model.LinearRegression()
        model.fit(X = df[parameters], y = df[feature + '_imp'])
        if feature == '5':
            print(random_data["Ran" + feature])
        #Standard Error of the regression estimates is equal to std() of the errors of each estimates
        predict = model.predict(df[parameters])
        std_error = (predict[df[feature].notnull()] - df.loc[df[feature].notnull(), feature + '_imp'].astype(float)).std()


        random_predict = np.random.normal(size = df[feature].shape[0], 
                                          loc = predict, 
                                          scale = std_error)
        random_data.loc[(df[feature].isnull()) & (random_predict > 0), "Ran" + feature] = random_predict[(df[feature].isnull()) & 
                                                                                (random_predict > 0)]
    
    random_data = random_data.transpose()
    random_data.reset_index(drop=True, inplace=True)
    random_data.insert(0,'c1',dataframe.Country)
    random_data.insert(1,'c2',dataframe.Code)
    random_data.columns = dataframe.columns
    return random_data

In [9]:
def LinReg_imp(dataframe):
    df = clean_df(dataframe.copy())
    
    for feature in df.columns:
        if df[feature].isnull().sum() == 0: # break if nothing to impute
            continue
        #train on available data, then predict missing data
        model = linear_model.LinearRegression()
        X = df[feature][df[feature].notnull()].index
        X = np.array(X).reshape(-1,1)
        y = df[feature][df[feature].notnull()]
        model.fit(X, y)


        y_missing = df[feature][df[feature].isnull()].index
        y_missing = np.array(y_missing).reshape(-1,1)
        y_predicted = model.predict(y_missing)

        #Standard Error of the regression estimates is equal to std() of the errors of each estimates
        df.loc[df[feature].isnull(),feature] = y_predicted

    temp = df.transpose()
    temp.reset_index(drop=True, inplace=True)
    temp.insert(0,'c1',dataframe.Country)
    temp.insert(1,'c2',dataframe.Code)
    temp.columns = dataframe.columns

    return temp


In [10]:
# USing n-th degree polynomial regression
def PolyReg_imp(dataframe, poly_degrees):
    df = clean_df(dataframe.copy())

    for feature in df.columns:
        if df[feature].isnull().sum() == 0: # break if nothing to impute
            continue
        #train on available data, then predict missing data
        X = df[feature][df[feature].notnull()].index
        X = np.array(X).reshape(-1,1)
        y = df[feature][df[feature].notnull()]

        poly = PolynomialFeatures(degree = poly_degrees)
        X_poly = poly.fit_transform(X)

        poly.fit(X_poly, y)
        lin2 = linear_model.LinearRegression()
        lin2.fit(X_poly, y)


        y_missing = df[feature][df[feature].isnull()].index
        y_missing = np.array(y_missing).reshape(-1,1)

        X_polytest = poly.fit_transform(y_missing)
        y_predicted = lin2.predict(X_polytest)
        #Standard Error of the regression estimates is equal to std() of the errors of each estimates
        df.loc[df[feature].isnull(),feature] = y_predicted 
    
    temp = df.transpose()
    temp.reset_index(drop=True, inplace=True)
    temp.insert(0,'c1',dataframe.Country)
    temp.insert(1,'c2',dataframe.Code)
    temp.columns = dataframe.columns

    return temp


In [70]:
df = clean_df(df_list[0].copy())

feature = 5
model = linear_model.LinearRegression()
X = df[feature][df[feature].notnull()].index
X = np.array(X).reshape(-1,1)
y = df[feature][df[feature].notnull()]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
error = np.abs(y_test.astype(np.float) - y_pred)/y_test.astype(np.float)
error.sum()

28.05352591716192

In [71]:
poly = PolynomialFeatures(degree = 2)
X_poly = poly.fit_transform(X_train)

poly.fit(X_poly, y_train)
lin2 = linear_model.LinearRegression()
lin2.fit(X_poly, y_train)

X_polytest = poly.fit_transform(X_test)
y_pred = lin2.predict(X_polytest)
error = np.abs(y_test.astype(np.float) - y_pred)/y_test.astype(np.float)
error.sum()

13.979945238712594

In [83]:
imputer = KNNImputer(n_neighbors = 3)
#temp = pd.DataFrame(imputer.fit_transform(df))
df[5]

1990            NaN
1991            NaN
1992    0.351333388
1993    0.220517767
1994    0.089702145
1995    0.051080392
1996     0.04017517
1997    0.029269948
1998    0.018364726
1999    0.012736337
2000     0.00607375
2001    0.004687995
2002    0.004485106
2003     0.00680737
2004    0.003953347
2005    0.001729256
2006    0.002788238
2007    0.006802404
2008    0.006249242
2009    0.000892037
2010    0.005790765
2011    0.007507562
2012    0.006603066
2013    0.003152399
2014     0.00248405
2015    0.002797653
2016     0.00284884
2017    0.003707182
2018    0.004416816
2019    0.011522965
Name: 5, dtype: object

In [12]:
xx = df_list[0].isnull().sum(axis = 0)
xx = xx[xx>0]
xx

Series([], dtype: int64)

In [91]:
for i in range(len(df_list)):
    df_list[i].to_csv(df_names[i])
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [89]:
df_list[0].to_csv(df_names[0] - '.csv' + '_KNN' + '.csv')

TypeError: unsupported operand type(s) for -: 'str' and 'str'