## 1. Import Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn import metrics

In [2]:
# Load the country-level population table; the CSV's first column is used as the row index.
df_217 = pd.read_csv('217_countries_population.csv', index_col=0)

In [3]:
# Preview the raw table (one row per country and year).
# NOTE(review): the preview shows 0.000 for Andorra's Death_Rate and
# Life_expectancy_in_years, which looks like missing data stored as zero --
# confirm before modelling any country other than France.
df_217

Unnamed: 0,Country Name,Country Code,Year,Birth Rate,Death_Rate,Population_65_above,population_growth_rate,Population_male,Population_female,Population_total,Life_expectancy_in_years
0,Aruba,ABW,2000,14.427,6.335,6165.0,2.539234,42833,46269,89101,73.569
1,Afghanistan,AFG,2000,49.664,12.096,446519.0,1.443803,9815442,9727541,19542982,55.298
2,Angola,AGO,2000,47.647,18.287,397810.0,3.244121,8054751,8339311,16394062,46.024
3,Albania,ALB,2000,17.076,5.798,241623.0,-0.637357,1531486,1557540,3089027,75.404
4,Andorra,AND,2000,11.300,0.000,8406.0,0.670960,34285,31812,66097,0.000
...,...,...,...,...,...,...,...,...,...,...,...
4769,Kosovo,XKX,2021,11.143,7.220,177017.0,-0.229016,890237,895801,1786038,76.806
4770,"Yemen, Rep.",YEM,2021,30.544,6.845,886889.0,2.137790,16668432,16313210,32981641,63.753
4771,South Africa,ZAF,2021,19.821,11.432,3546983.0,0.998920,28894608,30497646,59392255,62.341
4772,Zambia,ZMB,2021,34.511,6.973,338624.0,2.840806,9609004,9864121,19473125,61.223


* We use the data for France to build a model that predicts the French population.

In [4]:
# Keep only the rows for France and the columns used for modelling.
france_rows = df_217['Country Name'] == 'France'
modelling_columns = ['Year', 'Birth Rate', 'Death_Rate', 'population_growth_rate', 'Population_total']
df_FR = df_217.loc[france_rows, modelling_columns]
df_FR

Unnamed: 0,Year,Birth Rate,Death_Rate,population_growth_rate,Population_total
64,2000,13.3,8.9,0.686783,60921384
281,2001,13.1,8.8,0.729431,61367388
498,2002,12.9,8.8,0.728746,61816234
715,2003,12.8,9.1,0.710448,62256970
932,2004,12.8,8.3,0.735098,62716306
1149,2005,12.8,8.5,0.749918,63188395
1366,2006,13.1,8.3,0.693707,63628261
1583,2007,12.8,8.3,0.616494,64021737
1800,2008,12.9,8.5,0.557564,64379696
2017,2009,12.8,8.5,0.513103,64710879


In [5]:
# Summary statistics for every selected column of the France subset.
df_FR.describe(include='all')

Unnamed: 0,Year,Birth Rate,Death_Rate,population_growth_rate,Population_total
count,22.0,22.0,22.0,22.0,22.0
mean,2010.5,12.359091,8.790909,0.514103,64883260.0
std,6.493587,0.753189,0.428629,0.172906,2137446.0
min,2000.0,10.9,8.3,0.263855,60921380.0
25%,2005.25,11.85,8.5,0.356301,63298360.0
50%,2010.5,12.75,8.75,0.502962,65187900.0
75%,2015.75,12.875,9.05,0.691976,66680150.0
max,2021.0,13.3,9.9,0.749918,67764300.0


In [6]:
# Per-column flag: True if the column contains at least one missing value.
df_FR.isna().any()

Year                      False
Birth Rate                False
Death_Rate                False
population_growth_rate    False
Population_total          False
dtype: bool

## 2. Preprocessing

* First model (3 features): birth rate, death rate and population growth rate

In [7]:
# Separate the target variable from the features.
# Positional slice 1:-1 skips the first column (Year) and the last column
# (Population_total), leaving Birth Rate, Death_Rate and population_growth_rate.
feature_list = df_FR.columns[1:-1]
X = df_FR[feature_list]
Y = df_FR['Population_total']

In [8]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Standardize the numeric features; the scaler statistics are learned from the
# training rows only. X_train is replaced by its scaled array.
numeric_step = ('num', StandardScaler(), feature_list)
feature_encoder = ColumnTransformer(transformers=[numeric_step])
X_train = feature_encoder.fit_transform(X_train)

In [10]:
# Show the first five scaled training rows.
print(X_train[:5])

[[-2.01992249  2.48641679 -1.46257607]
 [ 0.4623919  -0.99141935 -0.21067418]
 [-1.60620343  0.88126165 -1.01911667]
 [ 0.7382046  -0.7238935   0.21762321]
 [ 1.01401731 -1.25894521  0.99644554]]


In [11]:
# Standardize the test set with the encoder already fitted on the training set
# (transform only, no refit). X_test is replaced by its scaled array.
X_test = feature_encoder.transform(X_test)

In [12]:
# Show the first five scaled test rows.
print(X_test[0:5, :])

[[-2.01992249  3.0214685  -1.41969569]
 [ 0.7382046  -0.7238935  -0.15274516]
 [ 0.04867283 -0.99141935 -0.2933283 ]
 [ 0.04867283 -0.18884178 -0.01706029]
 [ 1.01401731  0.07868408  1.20081019]]


## 3. Build models

In [13]:
# Train the first model: ordinary linear regression on the scaled training features.
# The same `regressor` object is refitted by the later model cells.
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

In [14]:
# Predict on the training set and show the first five predictions.
Y_train_predict = regressor.predict(X_train)
print(Y_train_predict[:5])

[67360702.10134804 65218888.46365892 67450839.105946   64112321.79820926
 63133986.05692938]


In [15]:
# Predict on the held-out test set and show the first five predictions.
# Y_test_predict is reused later for the first model's RMSE.
Y_test_predict = regressor.predict(X_test)
print(Y_test_predict[:5])

[66907574.94774146 64604657.53844152 65837505.07867579 64876069.72894011
 61872007.15082061]


In [16]:
# Evaluate the first model: R2 on the rows it was fitted on and on the held-out rows.
# This cell must run before `regressor` is refitted for the second model.
print('First model')
print("R2 score on training set is :", regressor.score(X_train, Y_train))
print("R2 score on test set is : ", regressor.score(X_test, Y_test))

Frist model
R2 score on training set is : 0.9543851411568954
R2 score on test set is :  0.8932672878264877


* Second model (2 features): birth rate and death rate

In [17]:
# Second model: use only Birth Rate and Death_Rate. The author's rationale is that
# population growth rate is quite correlated with birth and death rate.
# Positional slice 1:-2 skips Year and drops the last two columns
# (population_growth_rate and Population_total).
feature_list2 = df_FR.columns[1:-2]
# Build X2 from feature_list2 so it holds exactly the two intended columns.
# ColumnTransformer silently drops columns it is not given, which would hide
# a wrong selection here.
X2 = df_FR.loc[:, feature_list2]
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X2, Y, test_size=0.2, random_state=0)
# Fit the scaler on the training rows only, then apply it unchanged to the test rows.
feature_encoder2 = ColumnTransformer(transformers=[('num', StandardScaler(), feature_list2)])
X2_train = feature_encoder2.fit_transform(X2_train)
X2_test = feature_encoder2.transform(X2_test)
# Refits the shared `regressor` in place: after this cell it no longer holds
# the first model's coefficients.
regressor.fit(X2_train, Y2_train)
Y2_train_predict = regressor.predict(X2_train)
Y2_test_predict = regressor.predict(X2_test)

In [18]:
# R2 of the second model on its training and test splits.
r2_train_second = regressor.score(X2_train, Y2_train)
r2_test_second = regressor.score(X2_test, Y2_test)
print('Second model')
print("R2 score on training set is :", r2_train_second)
print("R2 score on test set is : ", r2_test_second)

Second model
R2 score on training set is : 0.8548453841466994
R2 score on test set is :  0.778023239844316


* Third model (3 features): year, birth rate and death rate

In [19]:
# Third model: add Year to Birth Rate and Death_Rate.
# Positional slice :-2 keeps every column except the last two
# (population_growth_rate and Population_total).
# NOTE(review): the split is random, so held-out years sit between training
# years; with Year as a feature the test score may mostly reflect
# interpolation along the trend -- confirm with a chronological split.
feature_list3 = df_FR.columns[:-2]
X3 = df_FR[feature_list3]
X3_train, X3_test, Y3_train, Y3_test = train_test_split(
    X3,
    Y,
    test_size=0.2,
    random_state=0,
)
# Scale with statistics learned from the training rows only.
numeric_step3 = ('num', StandardScaler(), feature_list3)
feature_encoder3 = ColumnTransformer(transformers=[numeric_step3])
X3_train = feature_encoder3.fit_transform(X3_train)
X3_test = feature_encoder3.transform(X3_test)
# Refit the shared `regressor` on the third feature set, then predict on both splits.
regressor.fit(X3_train, Y3_train)
Y3_train_predict = regressor.predict(X3_train)
Y3_test_predict = regressor.predict(X3_test)

In [20]:
# R2 of the third model on its training and test splits.
r2_train_third = regressor.score(X3_train, Y3_train)
r2_test_third = regressor.score(X3_test, Y3_test)
print('Third model')
print("R2 score on training set is :", r2_train_third)
print("R2 score on test set is : ", r2_test_third)

Third model
R2 score on training set is : 0.997129576914723
R2 score on test set is :  0.9953082088657151


## 4. Assessment of the models' performance by RMSE

In [21]:
# Test-set RMSE of the first model, in the same units as Population_total.
rmse_first = np.sqrt(metrics.mean_squared_error(Y_test, Y_test_predict))
print ("First model's RMSE is ", rmse_first)

First model's RMSE is  688462.0279270854


In [22]:
# Test-set RMSE of the second model, in the same units as Population_total.
rmse_second = np.sqrt(metrics.mean_squared_error(Y2_test, Y2_test_predict))
print ("Second model's RMSE is ", rmse_second)

Second model's RMSE is  992853.0477199681


In [23]:
# Test-set RMSE of the third model, in the same units as Population_total.
rmse_third = np.sqrt(metrics.mean_squared_error(Y3_test, Y3_test_predict))
print ("Third model's RMSE is ", rmse_third)

Third model's RMSE is  144344.60427487188


### The winner is the third model (3 features: Year, Birth rate & Death rate): it has the highest test R² (about 0.995) and the lowest test RMSE (about 144,000).