# Supervised Machine Learning Model/Linear Regression for BRICS_US Macro Data

In [445]:

#https://matplotlib.org/gallery/color/named_colors.html
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import math 

In [446]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

%matplotlib inline
from sklearn.svm import SVR

In [447]:
import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [448]:
from pandas_datareader import wb

In [449]:
import geopandas

# Part I: Regression for India Data

In [450]:
worldbank_indicators = ["NY.GDP.MKTP.KD.ZG", "NE.GDI.TOTL.KD.ZG","SL.EMP.TOTL.SP.ZS","TX.QTY.MRCH.XD.WD","SP.POP.GROW","EG.USE.PCAP.KG.OE"]
 
 
world_bank = wb.download (indicator =worldbank_indicators, country = ["IND"], start = 1995, end =2020)

world_bank = world_bank.reset_index(1)                        

In [451]:
world_bank_df = world_bank.rename(columns = {"NY.GDP.MKTP.KD.ZG":"GDP",\
                                               "NE.GDI.TOTL.KD.ZG":"Gross_Capital_Formation_Growth",\
                                               "SL.EMP.TOTL.SP.ZS":"Employment_Population_Ratio",\
                                            "TX.QTY.MRCH.XD.WD":"Exports",\
                                            "SP.POP.GROW":"Population_Growth",\
                                            "EG.USE.PCAP.KG.OE":"Energy_Use"})

In [452]:
world_bank_final = world_bank_df.loc[::-1]

In [453]:
world_bank_final.head()

Unnamed: 0_level_0,year,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
India,1995,7.574492,7.780467,54.91,66.922085,1.918941,385.091857
India,1996,7.549522,0.909082,54.77,75.840974,1.89522,389.426516
India,1997,4.049821,14.880201,54.63,70.603934,1.869172,397.378186
India,1998,6.184416,3.779184,54.49,73.737727,1.839659,399.498829
India,1999,8.845756,17.634883,54.34,83.327389,1.80556,414.962493


In [454]:
world_bank_final1 = world_bank_final.fillna({
    'Exports' : 500,
    'Energy_Use': 700})

In [455]:
world_bank_final1.to_excel('India.xlsx', index = False)

In [456]:
world_bank_final1.describe()

Unnamed: 0,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
count,26.0,26.0,26.0,26.0,26.0,26.0
mean,6.015853,7.858675,51.551923,265.855571,1.453925,528.203348
std,3.201107,8.94863,3.465178,156.988108,0.310959,119.585799
min,-7.251755,-10.107401,43.0,66.922085,0.989414,385.091857
25%,4.928303,3.700722,48.63,113.007412,1.154981,418.283314
50%,7.102806,7.627011,53.1,254.211635,1.487056,493.329632
75%,7.907675,14.679947,54.475,419.327275,1.718967,628.877385
max,8.845756,29.772987,54.91,500.0,1.918941,700.0


In [457]:
X = world_bank_final1.drop("GDP", axis = 1)
Y = world_bank_final1[['GDP']]


In [458]:
Y.shape


(26, 1)

In [459]:
# Split X and y into X_
#X_train, X_test, y_train, y_test = train_test_split(X, Y)


In [460]:
X.shape

(26, 6)

In [461]:
X_train = X[:20].values
X_test = X[20:].values
y_train = Y[:20].values
y_test = Y[20:].values

In [462]:
X_test

array([['2015', 4.72952352097509, 48.08, 422.4718463, 1.11689591278599,
        700.0],
       ['2016', 3.67456808127109, 47.64, 434.3545807, 1.09045932094686,
        700.0],
       ['2017', 10.8362897885911, 47.21, 461.2043143, 1.06335942989132,
        700.0],
       ['2018', 9.74553265704785, 46.79, 477.8606632, 1.03782784787162,
        700.0],
       ['2019', 2.28274228197319, 46.74, 491.4029668, 1.01326124930492,
        700.0],
       ['2020', -10.1074012471816, 43.0, 500.0, 0.989413800188014, 700.0]],
      dtype=object)

In [463]:
y_test

array([[ 7.99625379],
       [ 8.2563055 ],
       [ 6.79538342],
       [ 6.53298901],
       [ 4.04155419],
       [-7.25175478]])

In [464]:
# create a Linear Regression model object
regression_model_India = LinearRegression()


In [465]:
X_test

array([['2015', 4.72952352097509, 48.08, 422.4718463, 1.11689591278599,
        700.0],
       ['2016', 3.67456808127109, 47.64, 434.3545807, 1.09045932094686,
        700.0],
       ['2017', 10.8362897885911, 47.21, 461.2043143, 1.06335942989132,
        700.0],
       ['2018', 9.74553265704785, 46.79, 477.8606632, 1.03782784787162,
        700.0],
       ['2019', 2.28274228197319, 46.74, 491.4029668, 1.01326124930492,
        700.0],
       ['2020', -10.1074012471816, 43.0, 500.0, 0.989413800188014, 700.0]],
      dtype=object)

In [466]:
# pass through the X_train & y_train data set
regression_model_India.fit(X_train, y_train)

LinearRegression()

In [467]:
# let's grab the coefficient of our model and the intercept
intercept = regression_model_India.intercept_[0]
coefficent = regression_model_India.coef_[0][0]

In [469]:
print("The intercept for our model is {:.4}".format(intercept))
print('-'*100)

The intercept for our model is 3.089e+03
----------------------------------------------------------------------------------------------------


In [470]:
# loop through the dictionary and print the data
for coef in zip(X.columns, regression_model_India.coef_[0]):
    print("The Coefficient for {} is {:.2}".format(coef[0],coef[1]))

The Coefficient for year is -1.6
The Coefficient for Gross_Capital_Formation_Growth is 0.097
The Coefficient for Employment_Population_Ratio is 2.1
The Coefficient for Exports is -0.0078
The Coefficient for Population_Growth is -3.4e+01
The Coefficient for Energy_Use is 0.079


In [471]:
# Get multiple predictions
y_predict = regression_model_India.predict(X_test)

# Show the first 5 predictions
y_predict[:5]

array([[10.52030294],
       [ 8.71753345],
       [ 7.63974891],
       [ 5.80725507],
       [ 4.11811312]])

In [472]:
import math
# calculate the mean squared error
model_mse = mean_squared_error(y_test, y_predict)

# calculate the mean absolute error
model_mae = mean_absolute_error(y_test, y_predict)

# calulcate the root mean squared error
model_rmse =  math.sqrt(model_mse)

# display the output
print("MSE {:.3}".format(model_mse))
print("MAE {:.3}".format(model_mae))
print("RMSE {:.3}".format(model_rmse))

MSE 1.69
MAE 1.03
RMSE 1.3


In [473]:
y_test

array([[ 7.99625379],
       [ 8.2563055 ],
       [ 6.79538342],
       [ 6.53298901],
       [ 4.04155419],
       [-7.25175478]])

In [474]:
y_predict

array([[10.52030294],
       [ 8.71753345],
       [ 7.63974891],
       [ 5.80725507],
       [ 4.11811312],
       [-5.72521592]])

In [475]:
regression_model_India.score(X_train, y_train)

0.5292128090093442

In [476]:
r2_score_India = r2_score(y_test, y_predict)
r2_score_India

0.9416071747965893

# Regression for USA Data

In [477]:
worldbank_indicators = ["NY.GDP.MKTP.KD.ZG", "NE.GDI.TOTL.KD.ZG","SL.EMP.TOTL.SP.ZS","TX.QTY.MRCH.XD.WD","SP.POP.GROW","EG.USE.PCAP.KG.OE"]
 
 
world_bank = wb.download (indicator =worldbank_indicators, country = ["USA"], start = 1995, end =2020)

world_bank = world_bank.reset_index(1)          

In [478]:
world_bank_df = world_bank.rename(columns = {"NY.GDP.MKTP.KD.ZG":"GDP",\
                                               "NE.GDI.TOTL.KD.ZG":"Gross_Capital_Formation_Growth",\
                                               "SL.EMP.TOTL.SP.ZS":"Employment_Population_Ratio",\
                                            "TX.QTY.MRCH.XD.WD":"Exports",\
                                            "SP.POP.GROW":"Population_Growth",\
                                            "EG.USE.PCAP.KG.OE":"Energy_Use"})

In [479]:
world_bank_final = world_bank_df.loc[::-1]

In [480]:
world_bank_final.head()

Unnamed: 0_level_0,year,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
United States,1995,2.684217,2.864561,62.11,69.331525,1.190787,7763.755106
United States,1996,3.772566,7.591728,62.4,73.153759,1.163412,7844.468266
United States,1997,4.381775,9.481793,63.01,80.10629,1.20396,7828.581096
United States,1998,4.481408,8.660396,63.31,84.165066,1.165715,7803.697605
United States,1999,4.753236,7.951288,63.56,87.531198,1.14834,7923.223893


In [481]:
world_bank_final1 = world_bank_final.fillna({
    "Gross_Capital_Formation_Growth":12.0, 
    'Exports' :160.0,
    'Energy_Use': 6800.0})
world_bank_final1.head()

Unnamed: 0_level_0,year,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
United States,1995,2.684217,2.864561,62.11,69.331525,1.190787,7763.755106
United States,1996,3.772566,7.591728,62.4,73.153759,1.163412,7844.468266
United States,1997,4.381775,9.481793,63.01,80.10629,1.20396,7828.581096
United States,1998,4.481408,8.660396,63.31,84.165066,1.165715,7803.697605
United States,1999,4.753236,7.951288,63.56,87.531198,1.14834,7923.223893


In [482]:
world_bank_final1.to_excel('USA.xlsx', index = False)

In [483]:
world_bank_final1.describe()

Unnamed: 0,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
count,26.0,26.0,26.0,26.0,26.0,26.0
mean,2.251511,3.058375,60.544231,121.970683,0.864981,7390.327178
std,1.941325,5.863005,2.210808,32.303388,0.229266,479.723261
min,-3.642014,-16.595895,56.31,69.331525,0.350911,6800.0
25%,1.766792,1.07048,58.4525,93.22702,0.72843,6880.420121
50%,2.54487,4.30647,61.3,122.504765,0.899182,7592.867228
75%,3.403789,7.218287,62.1325,154.051194,0.98337,7828.407404
max,4.753236,10.457543,63.77,167.697222,1.20396,8056.86385


In [484]:
X = world_bank_final1.drop("GDP", axis = 1)
Y = world_bank_final1[['GDP']]


In [485]:
X_train = X[:20].values
X_test = X[20:].values
y_train = Y[:20].values
y_test = Y[20:].values

In [486]:
X_test

array([['2015', 5.04799124347824, 58.73, 155.0967981, 0.73621730882542,
        6803.99660728499],
       ['2016', -0.473252554625759, 59.13, 154.7712254,
        0.724676067451429, 6800.0],
       ['2017', 3.74679542893868, 59.58, 161.0373425, 0.63264399508256,
        6800.0],
       ['2018', 5.10467700795469, 59.89, 167.6972224, 0.526435395564053,
        6800.0],
       ['2019', 3.32735318928567, 60.27, 166.9141484, 0.455381285963537,
        6800.0],
       ['2020', -3.95644835328591, 56.31, 160.0, 0.350911063312921,
        6800.0]], dtype=object)

In [487]:
# create a Linear Regression model object
regression_model_USA = LinearRegression()

In [488]:
# pass through the X_train & y_train data set
regression_model_USA.fit(X_train, y_train)

LinearRegression()

In [489]:
# let's grab the coefficient of our model and the intercept
intercept = regression_model_USA.intercept_[0]
coefficent = regression_model_USA.coef_[0][0]

In [490]:
print("The intercept for our model is {:.4}".format(intercept))
print('-'*100)

The intercept for our model is -420.4
----------------------------------------------------------------------------------------------------


In [491]:
# loop through the dictionary and print the data
for coef in zip(X.columns, regression_model_USA.coef_[0]):
    print("The Coefficient for {} is {:.2}".format(coef[0],coef[1]))

The Coefficient for year is 0.2
The Coefficient for Gross_Capital_Formation_Growth is 0.24
The Coefficient for Employment_Population_Ratio is 0.47
The Coefficient for Exports is -0.027
The Coefficient for Population_Growth is 1.8
The Coefficient for Energy_Use is -0.0005


In [492]:
# Get multiple predictions
y_predict = regression_model_USA.predict(X_test)

# Show the first 5 predictions
y_predict[:5]

array([[2.70940662],
       [1.73682139],
       [2.84120157],
       [3.14515422],
       [2.98152012]])

In [493]:
import math
# calculate the mean squared error
model_mse = mean_squared_error(y_test, y_predict)

# calculate the mean absolute error
model_mae = mean_absolute_error(y_test, y_predict)

# calulcate the root mean squared error
model_rmse =  math.sqrt(model_mse)

# display the output
print("MSE {:.3}".format(model_mse))
print("MAE {:.3}".format(model_mae))
print("RMSE {:.3}".format(model_rmse))

MSE 1.88
MAE 0.844
RMSE 1.37


In [494]:
regression_model_USA.score(X_train, y_train)

0.975923999233812

In [495]:
r2_score_USA = r2_score(y_test, y_predict)
r2_score_USA

0.6501675179594838

# Regression for China Data

In [496]:
worldbank_indicators = ["NY.GDP.MKTP.KD.ZG", "NE.GDI.TOTL.KD.ZG","SL.EMP.TOTL.SP.ZS","TX.QTY.MRCH.XD.WD","SP.POP.GROW","EG.USE.PCAP.KG.OE"] 
 
world_bank = wb.download (indicator =worldbank_indicators, country = ["CHN"], start = 1995, end =2020)

world_bank = world_bank.reset_index(1)        
  

In [499]:
world_bank_df = world_bank.rename(columns = {"NY.GDP.MKTP.KD.ZG":"GDP",\
                                               "NE.GDI.TOTL.KD.ZG":"Gross_Capital_Formation_Growth",\
                                               "SL.EMP.TOTL.SP.ZS":"Employment_Population_Ratio",\
                                            "TX.QTY.MRCH.XD.WD":"Exports",\
                                            "SP.POP.GROW":"Population_Growth",\
                                            "EG.USE.PCAP.KG.OE":"Energy_Use"})

In [500]:
world_bank_final = world_bank_df.loc[::-1]

In [501]:
world_bank_final.head()

Unnamed: 0_level_0,year,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
China,1995,10.953954,,76.4,56.859363,1.086509,866.834374
China,1996,9.922557,12.994646,76.15,56.647133,1.048142,881.653737
China,1997,9.23678,5.04826,75.86,67.917261,1.02345,871.756324
China,1998,7.845952,8.51841,75.6,70.884441,0.95955,869.358607
China,1999,7.661652,6.326913,75.2,77.447298,0.865851,878.524536


In [502]:
world_bank_final1 = world_bank_final.fillna({
    "Gross_Capital_Formation_Growth":8.0, 
    'Exports' :580.0,
    'Energy_Use': 2150.0})
world_bank_final1.head()

Unnamed: 0_level_0,year,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
China,1995,10.953954,8.0,76.4,56.859363,1.086509,866.834374
China,1996,9.922557,12.994646,76.15,56.647133,1.048142,881.653737
China,1997,9.23678,5.04826,75.86,67.917261,1.02345,871.756324
China,1998,7.845952,8.51841,75.6,70.884441,0.95955,869.358607
China,1999,7.661652,6.326913,75.2,77.447298,0.865851,878.524536


In [503]:
world_bank_final1.to_excel('China.xlsx', index = False)

In [504]:
world_bank_final1.describe()

Unnamed: 0,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
count,26.0,26.0,26.0,26.0,26.0,26.0
mean,8.779018,10.991079,70.043077,393.719316,0.649056,1579.833913
std,2.338477,5.806275,3.876595,261.642682,0.208208,551.762447
min,2.347514,3.560256,63.48,56.647133,0.225948,866.834374
25%,7.484736,6.879195,67.1375,116.935203,0.528318,942.811254
50%,8.811862,8.672674,69.12,409.024495,0.599589,1651.537574
75%,10.009162,15.014587,73.4725,634.775003,0.714372,2150.0
max,14.230861,23.790644,76.4,784.492573,1.086509,2224.354898


In [505]:
X = world_bank_final1.drop("GDP", axis = 1)
Y = world_bank_final1[['GDP']]


In [506]:
X_train = X[:20].values
X_test = X[20:].values
y_train = Y[:20].values
y_test = Y[20:].values

In [507]:
X_test

array([['2015', 3.56025625894819, 66.76, 680.590456, 0.581456146657648,
        2150.0],
       ['2016', 7.23926986101135, 66.48, 690.2470659, 0.573050906069647,
        2150.0],
       ['2017', 6.32395939581063, 66.14, 738.9275362, 0.605245013482969,
        2150.0],
       ['2018', 6.75917007998336, 65.76, 769.440752, 0.467672053461959,
        2150.0],
       ['2019', 3.98543867617695, 65.1, 784.4925731, 0.354740890170827,
        2150.0],
       ['2020', 5.21683592750635, 63.48, 580.0, 0.225947587112011,
        2150.0]], dtype=object)

In [508]:
y_test

array([[7.04132888],
       [6.84876221],
       [6.94720079],
       [6.74977383],
       [5.94971423],
       [2.34751357]])

In [509]:
# create a Linear Regression model object
regression_model_China = LinearRegression()

In [510]:
# pass through the X_train & y_train data set
regression_model_China.fit(X_train, y_train)

LinearRegression()

In [511]:
# Get multiple predictions
y_predict = regression_model_China.predict(X_test)

# Show the first 5 predictions
y_predict[:5]

array([[6.2068302 ],
       [4.97815968],
       [4.32969702],
       [5.22030962],
       [5.78141234]])

In [513]:
# let's grab the coefficient of our model and the intercept
intercept = regression_model_China.intercept_[0]
coefficent = regression_model_China.coef_[0][0]

In [514]:
print("The intercept for our model is {:.4}".format(intercept))
print('-'*100)

The intercept for our model is 3.891e+03
----------------------------------------------------------------------------------------------------


In [515]:
# loop through the dictionary and print the data
for coef in zip(X.columns, regression_model_China.coef_[0]):
    print("The Coefficient for {} is {:.2}".format(coef[0],coef[1]))

The Coefficient for year is -1.9
The Coefficient for Gross_Capital_Formation_Growth is -0.0011
The Coefficient for Employment_Population_Ratio is -1.2
The Coefficient for Exports is 0.025
The Coefficient for Population_Growth is -1.1e+01
The Coefficient for Energy_Use is 0.00017


In [516]:
import math
# calculate the mean squared error
model_mse = mean_squared_error(y_test, y_predict)

# calculate the mean absolute error
model_mae = mean_absolute_error(y_test, y_predict)

# calulcate the root mean squared error
model_rmse =  math.sqrt(model_mse)

# display the output
print("MSE {:.3}".format(model_mse))
print("MAE {:.3}".format(model_mae))
print("RMSE {:.3}".format(model_rmse))

MSE 2.24
MAE 1.19
RMSE 1.5


In [517]:
regression_model_China.score(X_train, y_train)

0.7856971478420963

In [518]:
r2_score_China =r2_score(y_test, y_predict)
r2_score_China


0.19103083326511194

# Regression for Brazil Data

In [519]:
worldbank_indicators = ["NY.GDP.MKTP.KD.ZG", "NE.GDI.TOTL.KD.ZG","SL.EMP.TOTL.SP.ZS","TX.QTY.MRCH.XD.WD","SP.POP.GROW","EG.USE.PCAP.KG.OE"] 
 
world_bank = wb.download (indicator =worldbank_indicators, country = ["BRA"], start = 1995, end =2020)

world_bank = world_bank.reset_index(1)       

In [520]:
world_bank_df = world_bank.rename(columns = {"NY.GDP.MKTP.KD.ZG":"GDP",\
                                               "NE.GDI.TOTL.KD.ZG":"Gross_Capital_Formation_Growth",\
                                               "SL.EMP.TOTL.SP.ZS":"Employment_Population_Ratio",\
                                            "TX.QTY.MRCH.XD.WD":"Exports",\
                                            "SP.POP.GROW":"Population_Growth",\
                                            "EG.USE.PCAP.KG.OE":"Energy_Use"})

In [521]:
world_bank_final = world_bank_df.loc[::-1]

In [522]:
world_bank_final.head()

Unnamed: 0_level_0,year,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Brazil,1995,4.223794,7.290004,59.68,72.158558,1.60971,994.289849
Brazil,1996,2.208864,0.798463,57.88,74.083146,1.588838,1030.668072
Brazil,1997,3.394846,8.418114,57.93,80.843315,1.563732,1066.186951
Brazil,1998,0.338098,-0.163841,57.32,82.890311,1.528962,1075.199081
Brazil,1999,0.467938,-8.875302,57.34,88.936536,1.48111,1083.403984


In [523]:
world_bank_final1 = world_bank_final.fillna({
    "Gross_Capital_Formation_Growth":0.6, 
    'Exports' :230.0,
    'Energy_Use': 800.0})
world_bank_final1.head()

Unnamed: 0_level_0,year,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Brazil,1995,4.223794,7.290004,59.68,72.158558,1.60971,994.289849
Brazil,1996,2.208864,0.798463,57.88,74.083146,1.588838,1030.668072
Brazil,1997,3.394846,8.418114,57.93,80.843315,1.563732,1066.186951
Brazil,1998,0.338098,-0.163841,57.32,82.890311,1.528962,1075.199081
Brazil,1999,0.467938,-8.875302,57.34,88.936536,1.48111,1083.403984


In [524]:
world_bank_final1.to_excel('Brazil.xlsx', index = False)

In [525]:
world_bank_final1.describe()

Unnamed: 0,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
count,26.0,26.0,26.0,26.0,26.0,26.0
mean,2.120676,2.062374,58.372308,164.349065,1.106152,1105.185877
std,2.853383,7.374636,2.038651,55.36371,0.29327,216.416964
min,-4.059048,-13.9465,51.14,72.158558,0.712873,800.0
25%,0.663174,-1.96135,57.4425,111.820248,0.860248,1003.384405
50%,2.06502,1.630388,58.795,185.784523,1.016088,1086.876916
75%,3.971314,6.790732,59.7225,196.584015,1.353404,1239.735885
max,7.528226,17.853921,60.82,241.30419,1.60971,1495.541141


In [526]:
X = world_bank_final1.drop("GDP", axis = 1)
Y = world_bank_final1[['GDP']]

In [527]:
X_train = X[:20].values
X_test = X[20:].values
y_train = Y[:20].values
y_test = Y[20:].values

In [528]:
X_test

array([['2015', -13.9465002050587, 58.65, 209.4169305, 0.838838953278811,
        800.0],
       ['2016', -12.1298264348974, 56.39, 216.1630581, 0.823752152538353,
        800.0],
       ['2017', -2.55735555667499, 55.84, 230.8527606, 0.807145245445964,
        800.0],
       ['2018', 5.23175830926645, 56.15, 241.3041896, 0.783844259734468,
        800.0],
       ['2019', 3.36410575283553, 56.76, 236.3478881, 0.751550842096973,
        800.0],
       ['2020', -0.776685637163737, 51.14, 230.0, 0.712872791114979,
        800.0]], dtype=object)

In [529]:
y_test

array([[-3.54576339],
       [-3.27591691],
       [ 1.32286905],
       [ 1.78366676],
       [ 1.41115299],
       [-4.05904827]])

In [530]:
# create a Linear Regression model object
regression_model_Brazil = LinearRegression()

In [531]:
# pass through the X_train & y_train data set
regression_model_Brazil.fit(X_train, y_train)

LinearRegression()

In [532]:
# Get multiple predictions
y_predict = regression_model_Brazil.predict(X_test)

# Show the first 5 predictions
y_predict[:5]

array([[ 3.7851174 ],
       [ 4.89420683],
       [ 8.58753226],
       [11.52274533],
       [10.9376773 ]])

In [534]:
# let's grab the coefficient of our model and the intercept
intercept = regression_model_Brazil.intercept_[0]
coefficent = regression_model_Brazil.coef_[0][0]

In [535]:
print("The intercept for our model is {:.4}".format(intercept))
print('-'*100)

The intercept for our model is -1.211e+03
----------------------------------------------------------------------------------------------------


In [536]:
# loop through the dictionary and print the data
for coef in zip(X.columns, regression_model_Brazil.coef_[0]):
    print("The Coefficient for {} is {:.2}".format(coef[0],coef[1]))

The Coefficient for year is 0.6
The Coefficient for Gross_Capital_Formation_Growth is 0.3
The Coefficient for Employment_Population_Ratio is 0.01
The Coefficient for Exports is 0.03
The Coefficient for Population_Growth is 1.5e+01
The Coefficient for Energy_Use is -0.008


In [537]:
import math
# calculate the mean squared error
model_mse = mean_squared_error(y_test, y_predict)

# calculate the mean absolute error
model_mae = mean_absolute_error(y_test, y_predict)

# calulcate the root mean squared error
model_rmse =  math.sqrt(model_mse)

# display the output
print("MSE {:.3}".format(model_mse))
print("MAE {:.3}".format(model_mae))
print("RMSE {:.3}".format(model_rmse))

MSE 90.3
MAE 9.26
RMSE 9.5


In [538]:
regression_model_Brazil.score(X_train, y_train)

0.8631682110626837

In [539]:
r2_score_Brazil =r2_score(y_test, y_predict)
r2_score_Brazil


-12.5597352721006

# Regression for South Africa Data

In [540]:
worldbank_indicators = ["NY.GDP.MKTP.KD.ZG", "NE.GDI.TOTL.KD.ZG","SL.EMP.TOTL.SP.ZS","TX.QTY.MRCH.XD.WD","SP.POP.GROW","EG.USE.PCAP.KG.OE"] 
 
world_bank = wb.download (indicator =worldbank_indicators, country = ["ZAF"], start = 1995, end =2020)

world_bank = world_bank.reset_index(1)       

In [541]:
world_bank_df = world_bank.rename(columns = {"NY.GDP.MKTP.KD.ZG":"GDP",\
                                               "NE.GDI.TOTL.KD.ZG":"Gross_Capital_Formation_Growth",\
                                               "SL.EMP.TOTL.SP.ZS":"Employment_Population_Ratio",\
                                            "TX.QTY.MRCH.XD.WD":"Exports",\
                                            "SP.POP.GROW":"Population_Growth",\
                                            "EG.USE.PCAP.KG.OE":"Energy_Use"})

In [542]:
world_bank_final = world_bank_df.loc[::-1]

In [543]:
world_bank_final.head()

Unnamed: 0_level_0,year,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
South Africa,1995,3.1,14.229407,39.34,75.526461,2.126182,2498.893625
South Africa,1996,4.3,1.027631,39.2,84.748864,1.924718,2504.903115
South Africa,1997,2.6,2.944509,39.09,91.578779,1.751687,2527.940174
South Africa,1998,0.5,3.340042,39.01,85.363103,1.603369,2471.477036
South Africa,1999,2.4,-3.129856,38.87,89.073329,1.491248,2464.32336


In [544]:
world_bank_final1 = world_bank_final.fillna({
    "Gross_Capital_Formation_Growth":1.6, 
    'Exports' :100.0,
    'Energy_Use': 2300.0})
world_bank_final1.head()

Unnamed: 0_level_0,year,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
South Africa,1995,3.1,14.229407,39.34,75.526461,2.126182,2498.893625
South Africa,1996,4.3,1.027631,39.2,84.748864,1.924718,2504.903115
South Africa,1997,2.6,2.944509,39.09,91.578779,1.751687,2527.940174
South Africa,1998,0.5,3.340042,39.01,85.363103,1.603369,2471.477036
South Africa,1999,2.4,-3.129856,38.87,89.073329,1.491248,2464.32336


In [545]:
world_bank_final1.to_excel('SouthAfrica.xlsx', index = False)

In [546]:
world_bank_final1.describe()

Unnamed: 0,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
count,26.0,26.0,26.0,26.0,26.0,26.0
mean,2.296737,2.81917,39.631923,114.306491,1.461051,2541.316674
std,2.470958,8.517843,1.453638,21.606507,0.219257,190.031628
min,-6.431975,-24.730526,36.69,75.526461,1.217762,2300.0
25%,1.344853,-0.624441,38.9125,99.912373,1.297697,2394.323134
50%,2.65,3.158226,39.545,110.369759,1.412985,2511.617954
75%,3.573042,8.756069,40.6375,131.951093,1.569076,2691.267888
max,5.603806,15.976655,43.1,147.420926,2.126182,2950.15361


In [547]:
X = world_bank_final1.drop("GDP", axis = 1)
Y = world_bank_final1[['GDP']]

In [548]:
X_train = X[:20].values
X_test = X[20:].values
y_train = Y[:20].values
y_test = Y[20:].values

In [549]:
X_test

array([['2015', 4.00533113024204, 41.45, 142.8619585, 1.53224277341692,
        2300.0],
       ['2016', -8.38581264256115, 40.79, 141.7059987, 1.47193342939738,
        2300.0],
       ['2017', 2.97641067709678, 40.99, 144.2538, 1.41694725378631,
        2300.0],
       ['2018', -0.69657782263657, 40.93, 147.4209256, 1.36370321600461,
        2300.0],
       ['2019', -0.408032219323644, 40.12, 142.4716299, 1.31629200967216,
        2300.0],
       ['2020', -24.7305255204144, 36.69, 100.0, 1.27335626330668,
        2300.0]], dtype=object)

In [550]:
y_test

array([[ 1.32186224],
       [ 0.66455231],
       [ 1.15794695],
       [ 1.48761737],
       [ 0.1130537 ],
       [-6.43197483]])

In [551]:
# create a Linear Regression model object
regression_model_SouthAfrica = LinearRegression()

In [552]:
# pass through the X_train & y_train data set
regression_model_SouthAfrica.fit(X_train, y_train)

LinearRegression()

In [553]:
# Get multiple predictions
y_predict = regression_model_SouthAfrica.predict(X_test)

# Show the first 5 predictions
y_predict[:5]

array([[1.71738746],
       [0.16625209],
       [1.2323233 ],
       [0.92100673],
       [0.23564837]])

In [555]:
# let's grab the coefficient of our model and the intercept
intercept = regression_model_SouthAfrica.intercept_[0]
coefficent = regression_model_SouthAfrica.coef_[0][0]

In [556]:
print("The intercept for our model is {:.4}".format(intercept))
print('-'*100)

The intercept for our model is 839.7
----------------------------------------------------------------------------------------------------


In [557]:
# loop through the dictionary and print the data
for coef in zip(X.columns, regression_model_SouthAfrica.coef_[0]):
    print("The Coefficient for {} is {:.2}".format(coef[0],coef[1]))

The Coefficient for year is -0.42
The Coefficient for Gross_Capital_Formation_Growth is 0.1
The Coefficient for Employment_Population_Ratio is -0.44
The Coefficient for Exports is 0.14
The Coefficient for Population_Growth is -0.6
The Coefficient for Energy_Use is 0.0025


In [558]:
import math
# calculate the mean squared error
model_mse = mean_squared_error(y_test, y_predict)

# calculate the mean absolute error
model_mae = mean_absolute_error(y_test, y_predict)

# calulcate the root mean squared error
model_rmse =  math.sqrt(model_mse)

# display the output
print("MSE {:.3}".format(model_mse))
print("MAE {:.3}".format(model_mae))
print("RMSE {:.3}".format(model_rmse))

MSE 0.185
MAE 0.377
RMSE 0.43


In [559]:
regression_model_SouthAfrica.score(X_train, y_train)

0.6468133294312179

In [560]:
r2_score_SouthAfrica =r2_score(y_test, y_predict)
r2_score_SouthAfrica

0.9762357050308849

# Regression for Russian Federation Data

In [561]:
worldbank_indicators = ["NY.GDP.MKTP.KD.ZG", "NE.GDI.TOTL.KD.ZG","SL.EMP.TOTL.SP.ZS","TX.QTY.MRCH.XD.WD","SP.POP.GROW","EG.USE.PCAP.KG.OE"] 
 
world_bank = wb.download (indicator =worldbank_indicators, country = ["RUS"], start = 1995, end =2020)

world_bank = world_bank.reset_index(1)       

In [562]:
world_bank_df = world_bank.rename(columns = {"NY.GDP.MKTP.KD.ZG":"GDP",\
                                               "NE.GDI.TOTL.KD.ZG":"Gross_Capital_Formation_Growth",\
                                               "SL.EMP.TOTL.SP.ZS":"Employment_Population_Ratio",\
                                            "TX.QTY.MRCH.XD.WD":"Exports",\
                                            "SP.POP.GROW":"Population_Growth",\
                                            "EG.USE.PCAP.KG.OE":"Energy_Use"})

In [563]:
world_bank_final = world_bank_df.loc[::-1]

In [564]:
world_bank_final.head()

Unnamed: 0_level_0,year,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Russian Federation,1995,-4.143528,-10.8,55.62,,-0.021649,4290.693467
Russian Federation,1996,-3.755069,-6.888909,54.51,,-0.145452,4252.611619
Russian Federation,1997,1.399916,-4.100188,51.95,,-0.165342,4069.687292
Russian Federation,1998,-5.299962,-45.199827,50.31,,-0.165486,3981.499468
Russian Federation,1999,6.399915,-6.600615,53.18,,-0.309278,4136.751144


In [566]:
world_bank_final1 = world_bank_final.fillna({
    "Gross_Capital_Formation_Growth":1.6, 
    'Exports' :150.0,
    'Energy_Use': 4000.0})
world_bank_final1.head()

Unnamed: 0_level_0,year,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Russian Federation,1995,-4.143528,-10.8,55.62,150.0,-0.021649,4290.693467
Russian Federation,1996,-3.755069,-6.888909,54.51,150.0,-0.145452,4252.611619
Russian Federation,1997,1.399916,-4.100188,51.95,150.0,-0.165342,4069.687292
Russian Federation,1998,-5.299962,-45.199827,50.31,150.0,-0.165486,3981.499468
Russian Federation,1999,6.399915,-6.600615,53.18,150.0,-0.309278,4136.751144


In [567]:
world_bank_final1.to_excel('Russia.xlsx', index = False)

In [568]:
world_bank_final1.describe()

Unnamed: 0,GDP,Gross_Capital_Formation_Growth,Employment_Population_Ratio,Exports,Population_Growth,Energy_Use
count,26.0,26.0,26.0,26.0,26.0,26.0
mean,2.563569,3.774255,56.781154,155.628475,-0.113188,4416.831743
std,4.645481,22.28703,2.490902,26.248143,0.229201,387.010604
min,-7.799994,-45.199827,50.31,100.0,-0.460024,3981.499468
25%,0.329334,-6.077411,55.29,150.0,-0.322904,4017.421823
50%,3.415666,1.307718,57.455,154.280139,-0.09751,4289.538049
75%,6.099928,13.775086,58.975,167.483361,0.06971,4704.481421
max,10.000067,75.201129,59.39,203.728337,0.217642,5167.010353


In [569]:
X = world_bank_final1.drop("GDP", axis = 1)
Y = world_bank_final1[['GDP']]

In [570]:
X_train = X[:20].values
X_test = X[20:].values
y_train = Y[:20].values
y_test = Y[20:].values

In [571]:
X_test

array([['2015', -11.74313147216, 59.14, 180.5162551, 0.192557946508244,
        4000.0],
       ['2016', -0.639176708239958, 59.28, 187.5903055,
        0.170245238698228, 4000.0],
       ['2017', 6.40962100599796, 59.08, 194.7974436, 0.106870569429971,
        4000.0],
       ['2018', -1.59119425116916, 59.23, 203.7283367,
        -0.0130668924933287, 4000.0],
       ['2019', 3.25461218970347, 58.66, 197.6251167,
        -0.0495686654330689, 4000.0],
       ['2020', -1.97334879957343, 57.62, 150.0, -0.209476796881133,
        4000.0]], dtype=object)

In [572]:
y_test

array([[-1.97271923],
       [ 0.19369007],
       [ 1.82579006],
       [ 2.80724541],
       [ 2.03298274],
       [-2.9512739 ]])

In [573]:
# create a Linear Regression model object
regression_model_Russia = LinearRegression()

In [574]:
# pass through the X_train & y_train data set
regression_model_Russia.fit(X_train, y_train)

LinearRegression()

In [575]:
# Get multiple predictions
y_predict = regression_model_Russia.predict(X_test)

# Show the first 5 predictions
y_predict[:5]

array([[-6.48660304],
       [-4.07050412],
       [-1.18017988],
       [ 0.78823007],
       [ 2.16271135]])

In [576]:
# let's grab the coefficient of our model and the intercept
intercept = regression_model_Russia.intercept_[0]
coefficent = regression_model_Russia.coef_[0][0]

In [577]:
print("The intercept for our model is {:.4}".format(intercept))
print('-'*100)

The intercept for our model is -419.5
----------------------------------------------------------------------------------------------------


In [578]:
# loop through the dictionary and print the data
for coef in zip(X.columns, regression_model_Russia.coef_[0]):
    print("The Coefficient for {} is {:.2}".format(coef[0],coef[1]))

The Coefficient for year is 0.2
The Coefficient for Gross_Capital_Formation_Growth is 0.12
The Coefficient for Employment_Population_Ratio is -0.64
The Coefficient for Exports is 0.075
The Coefficient for Population_Growth is -1.8e+01
The Coefficient for Energy_Use is 0.0075


In [579]:
import math
# calculate the mean squared error
model_mse = mean_squared_error(y_test, y_predict)

# calculate the mean absolute error
model_mae = mean_absolute_error(y_test, y_predict)

# calulcate the root mean squared error
model_rmse =  math.sqrt(model_mse)

# display the output
print("MSE {:.3}".format(model_mse))
print("MAE {:.3}".format(model_mae))
print("RMSE {:.3}".format(model_rmse))

MSE 12.3
MAE 3.1
RMSE 3.5


In [580]:
regression_model_Russia.score(X_train, y_train)

0.891333128350766

In [581]:
r2_score_Russia =r2_score(y_test, y_predict)
r2_score_Russia

-1.693545691835383