## 1. Import Data

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn import metrics

In [3]:
# Load the raw population data set; the first CSV column is used as the row index
df_217 = pd.read_csv('217_countries_population.csv', index_col=0)

In [4]:
# Preview the raw frame (pandas truncates the display to the first and last rows)
df_217

Unnamed: 0,Country Name,Country Code,Year,Birth Rate,Death_Rate,Population_65_above,population_growth_rate,Population_male,Population_female,Population_total,Life_expectancy_in_years
0,Aruba,ABW,2000,14.427,6.335,6165.0,2.539234,42833,46269,89101,73.569
1,Afghanistan,AFG,2000,49.664,12.096,446519.0,1.443803,9815442,9727541,19542982,55.298
2,Angola,AGO,2000,47.647,18.287,397810.0,3.244121,8054751,8339311,16394062,46.024
3,Albania,ALB,2000,17.076,5.798,241623.0,-0.637357,1531486,1557540,3089027,75.404
4,Andorra,AND,2000,11.300,0.000,8406.0,0.670960,34285,31812,66097,0.000
...,...,...,...,...,...,...,...,...,...,...,...
4769,Kosovo,XKX,2021,11.143,7.220,177017.0,-0.229016,890237,895801,1786038,76.806
4770,"Yemen, Rep.",YEM,2021,30.544,6.845,886889.0,2.137790,16668432,16313210,32981641,63.753
4771,South Africa,ZAF,2021,19.821,11.432,3546983.0,0.998920,28894608,30497646,59392255,62.341
4772,Zambia,ZMB,2021,34.511,6.973,338624.0,2.840806,9609004,9864121,19473125,61.223


In [5]:
# Summary statistics for every column; include='all' also covers the non-numeric columns
df_217.describe(include='all')

Unnamed: 0,Country Name,Country Code,Year,Birth Rate,Death_Rate,Population_65_above,population_growth_rate,Population_male,Population_female,Population_total,Life_expectancy_in_years
count,4774,4774,4774.0,4774.0,4774.0,4774.0,4774.0,4774.0,4774.0,4774.0,4774.0
unique,217,217,,,,,,,,,
top,Aruba,ABW,,,,,,,,,
freq,22,22,,,,,,,,,
mean,,,2010.5,21.140549,7.987881,2578611.0,1.317821,16217990.0,16022700.0,32240690.0,68.16329
std,,,6.344953,11.131876,3.33239,10682470.0,1.566314,65997550.0,62770120.0,128754300.0,15.243362
min,,,2000.0,0.0,0.0,135.0,-6.852118,4769.0,4822.0,9609.0,0.0
25%,,,2005.0,11.9,5.829,35075.25,0.372536,369076.5,333887.2,708748.8,63.72
50%,,,2010.5,18.231,7.5,335301.5,1.201967,2855405.0,2792258.0,5654036.0,72.066427
75%,,,2016.0,29.05,9.71,1343373.0,2.21431,10172000.0,10287730.0,20531810.0,76.936439


In [6]:
# Per column: True if at least one value is missing (NaN)
df_217.isnull().any()

Country Name                False
Country Code                False
Year                        False
Birth Rate                  False
Death_Rate                  False
Population_65_above         False
population_growth_rate      False
Population_male             False
Population_female           False
Population_total            False
Life_expectancy_in_years    False
dtype: bool

In [7]:
# Frequency of each distinct Birth Rate value, to spot suspicious repeated values such as 0
df_217['Birth Rate'].value_counts()

0.000     66
9.800     35
10.200    35
9.400     35
10.300    34
          ..
39.801     1
42.086     1
40.911     1
39.767     1
30.537     1
Name: Birth Rate, Length: 3354, dtype: int64

In [8]:
# Frequency of each distinct life-expectancy value, to spot suspicious repeated values such as 0
df_217['Life_expectancy_in_years'].value_counts()

0.000000     156
81.404878      4
73.569000      3
82.860976      3
81.451220      3
            ... 
79.641463      1
63.553000      1
80.504878      1
81.434146      1
59.253000      1
Name: Life_expectancy_in_years, Length: 4318, dtype: int64

In [9]:
# Frequency of each distinct Death_Rate value, to spot suspicious repeated values such as 0
df_217['Death_Rate'].value_counts()

0.000     71
9.500     29
9.900     28
10.200    27
9.800     27
          ..
3.625      1
10.220     1
8.457      1
6.630      1
9.057      1
Name: Death_Rate, Length: 2983, dtype: int64

In [10]:
# Frequency of each distinct Population_total value (checks for repeated totals)
df_217['Population_total'].value_counts()

68742        2
34056        2
127445000    2
19372014     1
4516500      1
            ..
9547082      1
29996        1
23708320     1
3860158      1
15993524     1
Name: Population_total, Length: 4771, dtype: int64

* Delete invalid rows, i.e. rows in which the value is 0

In [11]:
# A value of 0 in any of these columns is treated as invalid, so the whole row is discarded.
zero_check_columns = ['Birth Rate', 'Death_Rate', 'Life_expectancy_in_years']
has_zero_value = (df_217[zero_check_columns] == 0).any(axis=1)
# Filter with a boolean mask instead of dropping by index label: the mask works
# row by row, so it stays correct even if index labels were ever duplicated.
# .copy() makes `df` an independent frame; df_217 itself is left untouched.
df = df_217.loc[~has_zero_value].copy()

In [12]:
# Preview the cleaned frame (rows containing a 0 in the checked columns have been removed)
df

Unnamed: 0,Country Name,Country Code,Year,Birth Rate,Death_Rate,Population_65_above,population_growth_rate,Population_male,Population_female,Population_total,Life_expectancy_in_years
0,Aruba,ABW,2000,14.427,6.335,6165.0,2.539234,42833,46269,89101,73.569
1,Afghanistan,AFG,2000,49.664,12.096,446519.0,1.443803,9815442,9727541,19542982,55.298
2,Angola,AGO,2000,47.647,18.287,397810.0,3.244121,8054751,8339311,16394062,46.024
3,Albania,ALB,2000,17.076,5.798,241623.0,-0.637357,1531486,1557540,3089027,75.404
5,United Arab Emirates,ARE,2000,16.660,2.107,29399.0,5.580387,2237588,1037745,3275333,74.380
...,...,...,...,...,...,...,...,...,...,...,...
4769,Kosovo,XKX,2021,11.143,7.220,177017.0,-0.229016,890237,895801,1786038,76.806
4770,"Yemen, Rep.",YEM,2021,30.544,6.845,886889.0,2.137790,16668432,16313210,32981641,63.753
4771,South Africa,ZAF,2021,19.821,11.432,3546983.0,0.998920,28894608,30497646,59392255,62.341
4772,Zambia,ZMB,2021,34.511,6.973,338624.0,2.840806,9609004,9864121,19473125,61.223


In [13]:
# Summary statistics of the cleaned frame, for comparison with the raw-frame statistics
df.describe(include='all')

Unnamed: 0,Country Name,Country Code,Year,Birth Rate,Death_Rate,Population_65_above,population_growth_rate,Population_male,Population_female,Population_total,Life_expectancy_in_years
count,4618,4618,4618.0,4618.0,4618.0,4618.0,4618.0,4618.0,4618.0,4618.0,4618.0
unique,212,212,,,,,,,,,
top,Aruba,ABW,,,,,,,,,
freq,22,22,,,,,,,,,
mean,,,2010.48874,21.623967,8.144531,2665537.0,1.355802,16765000.0,16563110.0,33328110.0,70.465904
std,,,6.343122,10.935192,3.206438,10850790.0,1.534997,67034980.0,63751680.0,130773100.0,8.82724
min,,,2000.0,5.0,0.795,135.0,-6.852118,4769.0,4822.0,9609.0,41.957
25%,,,2005.0,12.12575,5.96125,46600.75,0.411579,521899.5,485449.8,1040748.0,64.81575
50%,,,2010.0,18.7955,7.5755,365313.0,1.223175,3047855.0,3067532.0,6169620.0,72.415
75%,,,2016.0,29.39975,9.8,1384682.0,2.23441,10703650.0,10845250.0,21571470.0,77.094


In [14]:
# Re-check after cleaning: 0 should no longer appear among the life-expectancy values
df['Life_expectancy_in_years'].value_counts()

81.404878    4
73.569000    3
82.860976    3
81.451220    3
78.539024    3
            ..
79.641463    1
63.553000    1
80.504878    1
81.434146    1
59.253000    1
Name: Life_expectancy_in_years, Length: 4317, dtype: int64

In [15]:
# Re-check after cleaning: 0 should no longer appear among the Death_Rate values
df['Death_Rate'].value_counts()

9.500     28
9.700     27
9.800     27
9.100     26
9.900     26
          ..
6.501      1
3.625      1
10.220     1
8.457      1
9.057      1
Name: Death_Rate, Length: 2978, dtype: int64

In [16]:
# Re-check after cleaning: 0 should no longer appear among the Birth Rate values
df['Birth Rate'].value_counts()

10.200    35
9.400     34
9.600     33
9.800     33
10.300    32
          ..
11.441     1
39.801     1
42.086     1
40.911     1
30.537     1
Name: Birth Rate, Length: 3349, dtype: int64

In [17]:
# Pairwise correlations (pandas default method) between the candidate
# features and the population / life-expectancy columns of the cleaned frame
corr_columns = [
    'Year',
    'Birth Rate',
    'Death_Rate',
    'Population_total',
    'Life_expectancy_in_years',
]
corr_matrix = df[corr_columns].corr()
corr_matrix

Unnamed: 0,Year,Birth Rate,Death_Rate,Population_total,Life_expectancy_in_years
Year,1.0,-0.140697,-0.092812,0.019906,0.192056
Birth Rate,-0.140697,1.0,0.198335,-0.044303,-0.859163
Death_Rate,-0.092812,0.198335,1.0,-0.016224,-0.501371
Population_total,0.019906,-0.044303,-0.016224,1.0,0.006579
Life_expectancy_in_years,0.192056,-0.859163,-0.501371,0.006579,1.0


## 2. Preprocessing

*   We define 4 features: Country, Year, Birth rate, Death rate

In [18]:
# Separate the target variable from the features
feature_list = ['Country Name', 'Year', 'Birth Rate', 'Death_Rate']
# X holds the four model inputs, y the value to predict (total population)
X = df[feature_list]
y = df['Population_total']

In [19]:
# Split the data set into a training part (80%) and a test part (20%).
# Stratifying on Year keeps the share of each year the same in both parts;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=X['Year'],
)

In [20]:
# Build the preprocessing step and fit it on the training set only.
numeric_features = ['Birth Rate', 'Death_Rate']
categorial_features = ['Country Name', 'Year']

# Numeric columns are standardized.
numeric_transformer = StandardScaler()
# Country and year are one-hot encoded; drop='first' removes one indicator per
# column. NOTE: with handle_unknown='ignore' a category not seen during fit is
# encoded as all zeros, which is indistinguishable from the dropped category.
categorial_transformer = OneHotEncoder(
    categories='auto',
    handle_unknown='ignore',
    drop='first',
)

feature_encoder = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorial_transformer, categorial_features),
    ]
)

# NOTE: X_train is rebound to the encoded matrix here, so this cell cannot be
# re-run on its own; re-run the train/test split cell first.
X_train = feature_encoder.fit_transform(X_train)

In [21]:
# Show the first five encoded training rows
print(X_train[:5])

  (0, 0)	-0.6678535732027909
  (0, 1)	-0.7791738227895973
  (0, 5)	1.0
  (0, 223)	1.0
  (1, 0)	0.9184617942083184
  (1, 1)	3.204816628238733
  (1, 62)	1.0
  (1, 216)	1.0
  (2, 0)	0.4216905152644803
  (2, 1)	-1.4142121500950229
  (2, 98)	1.0
  (2, 225)	1.0
  (3, 0)	0.07487582559430289
  (3, 1)	0.5296551706922545
  (3, 133)	1.0
  (3, 215)	1.0
  (4, 0)	-1.0629438417045012
  (4, 1)	1.8697360503603973
  (4, 114)	1.0
  (4, 230)	1.0


In [22]:
# Standardizing features on the test set: reuse the encoder fitted on the
# training set (transform only, no refit)
X_test = feature_encoder.transform(X_test)

In [23]:
# Show the first five encoded test rows
print(X_test[:5])

  (0, 0)	-0.9532932385520443
  (0, 1)	-0.9685602530785186
  (0, 26)	1.0
  (0, 219)	1.0
  (1, 0)	0.39475878817440313
  (1, 1)	-0.7672981060388069
  (1, 57)	1.0
  (1, 228)	1.0
  (2, 0)	-0.9346975222279432
  (2, 1)	1.1196907818894217
  (2, 130)	1.0
  (2, 232)	1.0
  (3, 0)	-0.6860828714712446
  (3, 1)	-0.5082199695544576
  (3, 12)	1.0
  (3, 224)	1.0
  (4, 0)	0.11490699817377142
  (4, 1)	-1.0079376296732447
  (4, 79)	1.0
  (4, 231)	1.0


## 3. Build model

In [24]:
# Train the model: fit a linear regression on the encoded training features
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [25]:
# Prediction on the training set; print the first five predicted values
y_train_predict = regressor.predict(X_train)
print(y_train_predict[:5])

[  527145.1424497  -3952589.4652236   8494684.36945752 46346710.47421503
  3764970.93055778]


In [26]:
# Prediction on the test set; print the first five predicted values
y_test_predict = regressor.predict(X_test)
print(y_test_predict[:5])

[-1406888.1978389  88770069.44514641  2715081.42163979   844740.97258611
 19467100.4638059 ]


## 4. Evaluate model

In [27]:
# Report the regressor's score on the training and test sets, printed as R2
print("R2 score on training set is :", regressor.score(X_train, y_train))
print("R2 score on test set is : ", regressor.score(X_test, y_test))

R2 score on training set is : 0.9952052302158636
R2 score on test set is :  0.9961633260433176


----------------------------------

## 5. Model validation - Can we use the above global model to predict a specific country?

In [48]:
# Apply the global model to a single country: France.
# NOTE: the rows come from the raw frame df_217, which is also the source of the
# data the model was trained on, so this is not a strict hold-out evaluation.
country_columns = ['Country Name', 'Year', 'Birth Rate', 'Death_Rate', 'Population_total']
# .copy() makes df_FR an independent frame, so adding a column to it later
# does not raise SettingWithCopyWarning.
df_FR = df_217.loc[df_217['Country Name'] == 'France', country_columns].copy()
# Everything except the target is passed to the encoder
df_FR_features = df_FR.drop(columns=['Population_total'])
# Reuse the already-fitted encoder and regressor (transform / predict only)
X_features_FR = feature_encoder.transform(df_FR_features)
FR_test_predict = regressor.predict(X_features_FR)
FR_test_predict

array([62303174.78344794, 62684175.74514675, 62351571.36748311,
       62847060.2093984 , 63422204.10946828, 63204679.33086744,
       63506673.88502479, 63988481.87253939, 64343539.55732952,
       64834396.76906151, 64809981.99232632, 65274326.80314748,
       65071215.32908689, 65603664.77908929, 66214218.54447912,
       66362166.99934459, 66422618.90945493, 66724180.98321708,
       66468000.30378751, 67703343.92861879, 67584732.67816494,
       66917684.00199892])

In [49]:
# Evaluate the result by RMSE: square root of the mean squared difference
# between France's actual and predicted population
mse_FR = metrics.mean_squared_error(df_FR['Population_total'], FR_test_predict)
np.sqrt(mse_FR)

557454.7539881794

In [35]:
# Select Malaysia's rows and the columns needed for prediction and evaluation.
# .copy() makes df_M an independent frame, so adding a column to it later
# does not raise SettingWithCopyWarning.
df_M = df_217.loc[
    df_217['Country Name'] == 'Malaysia',
    ['Country Name', 'Year', 'Birth Rate', 'Death_Rate', 'Population_total'],
].copy()

In [36]:
# Display the selected Malaysia rows
df_M

Unnamed: 0,Country Name,Year,Birth Rate,Death_Rate,Population_total
138,Malaysia,2000,23.047,4.584,22945150
355,Malaysia,2001,21.523,4.435,23542517
572,Malaysia,2002,20.091,4.43,24142445
789,Malaysia,2003,19.339,4.43,24739411
1006,Malaysia,2004,18.4,4.488,25333247
1223,Malaysia,2005,17.722,4.516,25923536
1440,Malaysia,2006,17.308,4.603,26509413
1657,Malaysia,2007,17.077,4.705,27092604
1874,Malaysia,2008,17.36,4.817,27664296
2091,Malaysia,2009,17.348,4.96,28217204


In [37]:
# Remove the target column; what remains is passed to the encoder
df_M_features = df_M.drop(columns=['Population_total'])

In [38]:
# Display the Malaysia feature frame (target column removed)
df_M_features

Unnamed: 0,Country Name,Year,Birth Rate,Death_Rate
138,Malaysia,2000,23.047,4.584
355,Malaysia,2001,21.523,4.435
572,Malaysia,2002,20.091,4.43
789,Malaysia,2003,19.339,4.43
1006,Malaysia,2004,18.4,4.488
1223,Malaysia,2005,17.722,4.516
1440,Malaysia,2006,17.308,4.603
1657,Malaysia,2007,17.077,4.705
1874,Malaysia,2008,17.36,4.817
2091,Malaysia,2009,17.348,4.96


In [39]:
# Encode Malaysia's features with the already-fitted encoder (transform only, no refit)
X_features = feature_encoder.transform(df_M_features)

In [40]:
# Display the encoded Malaysia feature matrix
X_features

<22x234 sparse matrix of type '<class 'numpy.float64'>'
	with 87 stored elements in Compressed Sparse Row format>

In [41]:
# Predict Malaysia's total population with the trained regressor
M_test_predict = regressor.predict(X_features)

In [42]:
# Display the predicted population values for Malaysia
M_test_predict

array([23857134.62360374, 24971499.1447649 , 25300151.92150683,
       26300495.05032563, 26930681.88513078, 27165231.32627616,
       27699435.40778495, 28091218.79607461, 28394283.56655946,
       28763599.88788522, 28952099.84111898, 29161155.9990705 ,
       29098562.30238143, 29496779.91753051, 29885251.55311631,
       30133252.89324246, 30302410.62405849, 30712217.52726573,
       30528264.2970171 , 31882291.58458394, 32076974.11095116,
       31023726.50743725])

In [46]:
# Evaluate the result by RMSE: square root of the mean squared difference
# between Malaysia's actual and predicted population
np.sqrt(metrics.mean_squared_error(df_M['Population_total'], M_test_predict))

1194159.1594084988

In [51]:
# Put the predictions next to the actual values for Malaysia.
# assign() returns a new frame instead of writing into df_M in place, which
# avoids SettingWithCopyWarning (df_M originates from a .loc selection) and
# keeps the cell safe to re-run.
df_M = df_M.assign(Predict=M_test_predict)
df_M

Unnamed: 0,Country Name,Year,Birth Rate,Death_Rate,Population_total,Predict
138,Malaysia,2000,23.047,4.584,22945150,23857130.0
355,Malaysia,2001,21.523,4.435,23542517,24971500.0
572,Malaysia,2002,20.091,4.43,24142445,25300150.0
789,Malaysia,2003,19.339,4.43,24739411,26300500.0
1006,Malaysia,2004,18.4,4.488,25333247,26930680.0
1223,Malaysia,2005,17.722,4.516,25923536,27165230.0
1440,Malaysia,2006,17.308,4.603,26509413,27699440.0
1657,Malaysia,2007,17.077,4.705,27092604,28091220.0
1874,Malaysia,2008,17.36,4.817,27664296,28394280.0
2091,Malaysia,2009,17.348,4.96,28217204,28763600.0


In [52]:
# Put the predictions next to the actual values for France.
# assign() returns a new frame instead of writing into df_FR in place, which
# avoids SettingWithCopyWarning (df_FR originates from a .loc selection) and
# keeps the cell safe to re-run.
df_FR = df_FR.assign(Predict=FR_test_predict)
df_FR

Unnamed: 0,Country Name,Year,Birth Rate,Death_Rate,Population_total,Predict
64,France,2000,13.3,8.9,60921384,62303170.0
281,France,2001,13.1,8.8,61367388,62684180.0
498,France,2002,12.9,8.8,61816234,62351570.0
715,France,2003,12.8,9.1,62256970,62847060.0
932,France,2004,12.8,8.3,62716306,63422200.0
1149,France,2005,12.8,8.5,63188395,63204680.0
1366,France,2006,13.1,8.3,63628261,63506670.0
1583,France,2007,12.8,8.3,64021737,63988480.0
1800,France,2008,12.9,8.5,64379696,64343540.0
2017,France,2009,12.8,8.5,64710879,64834400.0


In [55]:
# Write the Malaysia frame to a CSV file in the current working directory
df_M.to_csv("Malaysie prediction.csv")

In [56]:
# Write the France frame to a CSV file in the current working directory
df_FR.to_csv("France prediction.csv")