In [8]:
import pandas as pd

# Folder holding the CSV files (joined with each file name by plain concatenation)
datasetFolder = "datasets/"

# Output column name -> CSV file name inside datasetFolder.
# The datasets are loaded and merged in the iteration order of this mapping.
datasetFiles = {
    "Agricultural Land": "agriculturalLand.csv",
    "Debt": "centralGovernmentDebt.csv",
    "CO2 Emissions": "CO2Emissions.csv",
    "Account Balance": "currentAccountBalance.csv",
    "Electric Consumption": "electricPowerConsumption.csv",
    "Energy Imports": "energyImports.csv",
    "Energy Use": "energyUse.csv",
    "Forest Area": "forestArea.csv",
    "Fuel Exports": "fuelExports.csv",
    "GDP": "GDP.csv",
    "GDP Procapite": "GDPProCapite.csv",
    "Life expectancy": "lifeExpectancy.csv",
    "Population": "Population.csv",
    "Renewable Electricity Consumption": "renewableElectricityConsumption.csv",
    "Renewable Electricity Output": "renewableElectricityOutput.csv",
    "Money Reserves": "TotalReserves.csv",
}

# Earliest year column of the CSV files; its missing values are replaced by 0
startingYear = 1960

# (first year, last year) kept in the final dataframe, both ends included
interestInterval = (1990, 2016)

# Dataframe building
#
# Every CSV is reshaped from one column per year to one row per
# (country, year) and collected in dfList, one long dataframe per dataset.

dfList = []
for key, value in datasetFiles.items():
    tempDfList = []
    dataframe = pd.read_csv(datasetFolder+value, sep=',', header=2)

    # Missing-value handling, applied to each row:
    #   1. a NaN in the first year column (startingYear) becomes 0;
    #   2. every later NaN takes the closest previous non-NaN value of its row.
    # e.g.: NaN NaN NaN 1 2 3 NaN NaN 4
    # Supposing that the first NaN refers to 1960,
    # this row first turns into:  0 NaN NaN 1 2 3 NaN NaN 4
    # and then into:              0 0 0 1 2 3 3 3 4
    #
    # The filled column is assigned back rather than calling
    # fillna(inplace=True) on the selected column: that chained in-place call
    # is not guaranteed to modify the frame (it does not under pandas
    # copy-on-write), and the row-wise forward fill below would then copy
    # whatever non-year column sits to the left into the empty year cells.
    firstYearColumn = str(startingYear)
    dataframe[firstYearColumn] = dataframe[firstYearColumn].fillna(0)
    # ffill(axis=1) is the non-deprecated spelling of fillna(method='ffill', axis=1)
    dataframe = dataframe.ffill(axis=1)

    # For each file a new dataframe is created in which each row is composed of
    # Country, value of interest of the dataset (e.g. GDP), Year of that value,
    # so each country appears once for each year of interest (here 27 times).
    for i in range(interestInterval[0], interestInterval[1]+1):
        # .copy() yields an independent frame, so adding the Year column below
        # neither warns nor touches `dataframe` (replaces the removed
        # `is_copy = False` idiom)
        temp_df = dataframe[["Country Name", str(i)]].copy()
        temp_df['Year'] = i
        # Renaming: "Country Name" -> "Country", year column -> dataset name
        temp_df.columns = ["Country", key, "Year"]
        tempDfList.append(temp_df)
    # Each element of dfList refers to a single "new" dataset.
    # Those datasets must later be merged by column.
    dfList.append(pd.concat(tempDfList))
    

# Merging of datasets by columns "Country" and "Year".
# pd.merge defaults to an inner join, so only (Country, Year) pairs present
# in every dataset survive.
df = dfList[0]
for i in range(1, len(dfList)):
    df = pd.merge(df, dfList[i], on=["Country","Year"])


# Columns rearrangement to have Year as the first column
cols = df.columns.tolist()
year_i = cols.index("Year")
cols.pop(year_i)
cols = ["Year"] + cols
df = df[cols]


def _toNumericIfPossible(column):
    """Return `column` converted to a numeric dtype, or unchanged on failure.

    Same effect as pd.to_numeric(column, errors='ignore'), without relying on
    the deprecated errors='ignore' option.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # At least one value cannot be parsed as a number: keep the column as is
        return column


# Conversion, when possible, to numeric values (instead of string)
df = df.apply(_toNumericIfPossible)
df.dtypes

Year                                   int64
Country                               object
Agricultural Land                    float64
Debt                                 float64
CO2 Emissions                        float64
Account Balance                      float64
Electric Consumption                 float64
Energy Imports                       float64
Energy Use                           float64
Forest Area                          float64
Fuel Exports                         float64
GDP                                  float64
GDP Procapite                        float64
Life expectancy                      float64
Population                           float64
Renewable Electricity Consumption    float64
Renewable Electricity Output         float64
Money Reserves                       float64
dtype: object

# Supervised Learning

## Linear Regression

In [7]:
import numpy as np
from numpy.linalg import inv

# LINEAR REGRESSION

# Country whose rows are used for the regression
country = "Italy"

# Keep only that country's rows, then drop the second column of df
# (the country name) so that Year and the indicator columns remain
is_target_country = df['Country'] == country
target_df = df.loc[is_target_country].drop(df.columns[[1]], axis=1)
    
class LinearRegression:
    """Least-squares linear regression with an intercept term.

    A column of ones is prepended to the inputs, so the first row of `beta`
    holds the intercept(s). `y` may be a vector or a matrix with one column
    per output variable, in which case `beta` has one column per output.
    """

    def __init__(self):
        # Fitted coefficients (intercept first); None until train() is called
        self.beta = None
    
    def train(self, X, y):
        """Fit `beta` so that [1, X] * beta approximates y in the least-squares sense.

        Parameters:
            X: array of shape (n_samples, n_features)
            y: array of shape (n_samples,) or (n_samples, n_outputs)
        """
        X = self._x_format(X)
        # beta = X^+ * y, with X^+ the Moore-Penrose pseudo-inverse of X.
        # This equals (X^T * X)^(-1) * X^T * y whenever X^T * X is invertible,
        # but it still returns the minimum-norm least-squares solution when
        # X^T * X is singular, and it avoids forming X^T * X, which squares
        # the condition number of the problem.
        self.beta = np.dot(np.linalg.pinv(X), y)
    
    def predict(self, X_input):
        """Return the predictions [1, X_input] * beta.

        Raises:
            RuntimeError: if called before train().
        """
        if self.beta is None:
            raise RuntimeError("LinearRegression.train() must be called before predict()")
        # result = x_input * beta
        return np.dot(self._x_format(X_input), self.beta)
    
    def _x_format(self, X):
        """Return X with a leading column of ones (the intercept input)."""
        return np.c_[np.ones(X.shape[0]), X]

def dataPreprocessing(df):
    """Build one-step-ahead training pairs and the latest row from a yearly table.

    `df` must have a "Year" column; every other column is treated as a
    variable. Rows are assumed to be sorted by Year with one row per year,
    so that row i of X is paired with the following year's row in y.

    Returns:
        X: variables of every year except the last one in `df`
        y: variables of every year except the first one in `df`
        x_input: variables of the last year in `df`, i.e. the input from
            which the next, unseen year is predicted
    All three are numpy arrays without the Year column.
    """
    # The first/last years are read from the table itself: once predicted
    # years have been appended, the latest row is no longer the one for
    # interestInterval[1], and using that fixed year would make every
    # prediction start from the same row.
    first_year = df.Year.min()
    last_year = df.Year.max()

    # Year removal and conversion to matrix (.values replaces the removed
    # DataFrame.as_matrix())
    variables = df.drop("Year", axis=1)
    years = df.Year.values

    X = variables[years != last_year].values
    y = variables[years != first_year].values
    x_input = variables[years == last_year].values

    return X,y,x_input

# Prediction trial for 10 years: each pass fits a fresh model on target_df,
# predicts one more row and appends it, so later passes also see the rows
# predicted by earlier ones
pred_years = 10
for i in range(1, pred_years+1):
    X,y,x_input = dataPreprocessing(target_df)

    LR = LinearRegression()
    LR.train(X,y)
    # LR's beta in this case has dimension 17x16:
    # 17 input variables (16 indicators + 1 for the bias)
    # 16 output variables
    y_hat = LR.predict(x_input)

    # Wrap the prediction in a DataFrame labelled with df's column names
    # minus Country and Year, then put the predicted year in front
    columns = [name for name in df.columns if name not in ("Country", "Year")]
    result = pd.DataFrame(data=y_hat, columns=columns)
    result.insert(loc=0, column="Year", value=interestInterval[1]+i)

    target_df = pd.concat([target_df, result])
target_df


Unnamed: 0,Year,Agricultural Land,Debt,CO2 Emissions,Account Balance,Electric Consumption,Energy Imports,Energy Use,Forest Area,Fuel Exports,GDP,GDP Procapite,Life expectancy,Population,Renewable Electricity Consumption,Renewable Electricity Output,Money Reserves
114,1990,57.257489,0.0,417550.289,-16437770000.0,4144.907442,82.727968,2583.888077,75900.0,2.289327,1177326000000.0,20757.088678,76.970732,56719240.0,3.781463,16.376022,88595440000.0
378,1991,54.585019,71.659463,423894.199,-24301560000.0,4224.740106,82.634121,2645.673237,76679.0,2.304151,1242109000000.0,21884.104371,77.019512,56758520.0,4.669413,20.846513,72254310000.0
642,1992,54.326612,77.720025,420535.227,-29279160000.0,4310.537968,81.736186,2627.340008,77458.0,2.211071,1315807000000.0,23166.804063,77.419512,56797090.0,4.839463,20.553919,49861590000.0
906,1993,54.095406,77.720025,411917.777,7732703000.0,4303.645312,80.938751,2611.990438,78237.0,2.174555,1061445000000.0,18676.952579,77.721951,56831820.0,4.771495,20.579763,53589810000.0
1170,1994,53.388188,77.720025,407378.031,13199670000.0,4439.319253,79.870484,2578.465978,79016.0,1.593084,1095591000000.0,19273.84417,77.921951,56843400.0,5.248022,21.089447,57817130000.0
1434,1995,52.133555,77.720025,430483.798,25096130000.0,4597.892598,81.512308,2799.375762,79795.0,1.262211,1170787000000.0,20596.388576,78.170732,56844300.0,4.462278,17.466002,60690110000.0
1698,1996,52.187957,77.720025,425830.375,39973600000.0,4658.700157,80.960961,2796.154736,80574.0,1.237706,1308929000000.0,23020.09994,78.521951,56860280.0,4.748914,19.308014,70566440000.0
1962,1997,52.174357,77.720025,430487.465,32276690000.0,4791.285246,81.145993,2834.402946,81353.0,1.434947,1239051000000.0,21779.624367,78.82439,56890370.0,4.83142,18.743003,75086490000.0
2226,1998,52.646969,77.720025,440153.677,20107130000.0,4955.721944,81.728602,2912.947365,82132.0,1.111574,1266309000000.0,22252.358086,78.97561,56906740.0,4.757625,18.322525,53879700000.0
2490,1999,53.717997,77.720025,441998.178,14006570000.0,5079.492406,82.639022,2957.311486,82911.0,1.205071,1248563000000.0,21936.823129,79.42439,56916320.0,5.02246,19.793254,45301900000.0
