In [1]:
import pandas as pd
import numpy as np
import math
# For model evaluation
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# For model training
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
#load dataset into a dataframe
df = pd.read_csv('Datasets/LifeExpectancyData.csv')
#show top 5 rows of dataframe
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [3]:
#find missing values in all records
df.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

In [4]:
df.describe()

Unnamed: 0,Year,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
count,2938.0,2928.0,2928.0,2938.0,2744.0,2938.0,2385.0,2938.0,2904.0,2938.0,2919.0,2712.0,2919.0,2938.0,2490.0,2286.0,2904.0,2904.0,2771.0,2775.0
mean,2007.51872,69.224932,164.796448,30.303948,4.602861,738.251295,80.940461,2419.59224,38.321247,42.035739,82.550188,5.93819,82.324084,1.742103,7483.158469,12753380.0,4.839704,4.870317,0.627551,11.992793
std,4.613841,9.523867,124.292079,117.926501,4.052413,1987.914858,25.070016,11467.272489,20.044034,160.445548,23.428046,2.49832,23.716912,5.077785,14270.169342,61012100.0,4.420195,4.508882,0.210904,3.35892
min,2000.0,36.3,1.0,0.0,0.01,0.0,1.0,0.0,1.0,0.0,3.0,0.37,2.0,0.1,1.68135,34.0,0.1,0.1,0.0,0.0
25%,2004.0,63.1,74.0,0.0,0.8775,4.685343,77.0,0.0,19.3,0.0,78.0,4.26,78.0,0.1,463.935626,195793.2,1.6,1.5,0.493,10.1
50%,2008.0,72.1,144.0,3.0,3.755,64.912906,92.0,17.0,43.5,4.0,93.0,5.755,93.0,0.1,1766.947595,1386542.0,3.3,3.3,0.677,12.3
75%,2012.0,75.7,228.0,22.0,7.7025,441.534144,97.0,360.25,56.2,28.0,97.0,7.4925,97.0,0.8,5910.806335,7420359.0,7.2,7.2,0.779,14.3
max,2015.0,89.0,723.0,1800.0,17.87,19479.91161,99.0,212183.0,87.3,2500.0,99.0,17.6,99.0,50.6,119172.7418,1293859000.0,27.7,28.6,0.948,20.7


In [5]:
#countries with zero value of life expentancy
empty_country_names = df[df['Life expectancy '].isna()]['Country']
empty_country_names.value_counts()

Cook Islands             1
Dominica                 1
Marshall Islands         1
Monaco                   1
Nauru                    1
Niue                     1
Palau                    1
Saint Kitts and Nevis    1
San Marino               1
Tuvalu                   1
Name: Country, dtype: int64

In [6]:
#strip white spaces in country row
df['Country'] = df['Country'].str.strip()
# show rows from empty_country_names only
[df[df['Country']==value] for value in empty_country_names]

[          Country  Year      Status  Life expectancy   Adult Mortality  \
 624  Cook Islands  2013  Developing               NaN              NaN   
 
      infant deaths  Alcohol  percentage expenditure  Hepatitis B  Measles   \
 624              0     0.01                     0.0         98.0         0   
 
      ...  Polio  Total expenditure  Diphtheria    HIV/AIDS  GDP  Population  \
 624  ...   98.0               3.58         98.0        0.1  NaN         NaN   
 
       thinness  1-19 years   thinness 5-9 years  \
 624                    0.1                  0.1   
 
      Income composition of resources  Schooling  
 624                              NaN        NaN  
 
 [1 rows x 22 columns],
       Country  Year      Status  Life expectancy   Adult Mortality  \
 769  Dominica  2013  Developing               NaN              NaN   
 
      infant deaths  Alcohol  percentage expenditure  Hepatitis B  Measles   \
 769              0     0.01               11.419555         96.0    

In [7]:
#convert the list of countries to a list
value_list = empty_country_names.index.tolist()
#print(value_list)
#summary of empty values of
df.loc[value_list].isnull().sum()

Country                             0
Year                                0
Status                              0
Life expectancy                    10
Adult Mortality                    10
infant deaths                       0
Alcohol                             1
percentage expenditure              0
Hepatitis B                         0
Measles                             0
 BMI                                2
under-five deaths                   0
Polio                               0
Total expenditure                   0
Diphtheria                          0
 HIV/AIDS                           0
GDP                                 5
Population                          8
 thinness  1-19 years               2
 thinness 5-9 years                 2
Income composition of resources     7
Schooling                           3
dtype: int64

In [8]:
#dropping the rows due to multple empty values
df = df.drop(value_list,axis=0).reset_index(drop=True)
df.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                      0
Adult Mortality                      0
infant deaths                        0
Alcohol                            193
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                32
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                443
Population                         644
 thinness  1-19 years               32
 thinness 5-9 years                 32
Income composition of resources    160
Schooling                          160
dtype: int64

In [9]:
#select developing countries data only
df = df[df['Status']== 'Developing']
#print(df)
df.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                      0
Adult Mortality                      0
infant deaths                        0
Alcohol                            165
percentage expenditure               0
Hepatitis B                        380
Measles                              0
 BMI                                32
under-five deaths                    0
Polio                               19
Total expenditure                  194
Diphtheria                          19
 HIV/AIDS                            0
GDP                                379
Population                         548
 thinness  1-19 years               32
 thinness 5-9 years                 32
Income composition of resources    112
Schooling                          112
dtype: int64

In [10]:
# fill missing values in alcohol column with the median of that country
df_alcohol = df.groupby(df['Country'])[['Alcohol']].median()
for row_ in df_alcohol.itertuples():
    for row in df.itertuples():
        if row.Country== row_[0]:
            med = row_.Alcohol
            df["Alcohol"].fillna(med, inplace=True)
        # df.fillna()

In [11]:
# fill missing values in bmi column with the median of that country
df_bmi = df.groupby(df['Country'])[[' BMI ']].median()
for row_ in df_bmi.itertuples():
    for row in df.itertuples():
        if row.Country== row_[0]:
            med = row_[1]
            # print(row)
            # print(med)
            df[" BMI "].fillna(med, inplace=True)
        # df.fillna()

In [12]:
# fill missing values in bmi column with the median of that country
df_gdp = df.groupby(df['Country'])[['GDP']].median()
for row_ in df_gdp.itertuples():
    for row in df.itertuples():
        if row.Country== row_[0]:
            med = row_[1]
            # print(row)
            # print(med)
            df["GDP"].fillna(med, inplace=True)
df.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                      0
Adult Mortality                      0
infant deaths                        0
Alcohol                              0
percentage expenditure               0
Hepatitis B                        380
Measles                              0
 BMI                                 0
under-five deaths                    0
Polio                               19
Total expenditure                  194
Diphtheria                          19
 HIV/AIDS                            0
GDP                                  0
Population                         548
 thinness  1-19 years               32
 thinness 5-9 years                 32
Income composition of resources    112
Schooling                          112
dtype: int64

In [13]:
# fill missing values in bmi column with the median of that country
df_schooling = df.groupby(df['Country'])[['Schooling']].median()
for row_ in df_schooling.itertuples():
    for row in df.itertuples():
        if row.Country== row_[0]:
            med = row_[1]
            # print(row)
            # print(med)
            df["Schooling"].fillna(med, inplace=True)
df.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                      0
Adult Mortality                      0
infant deaths                        0
Alcohol                              0
percentage expenditure               0
Hepatitis B                        380
Measles                              0
 BMI                                 0
under-five deaths                    0
Polio                               19
Total expenditure                  194
Diphtheria                          19
 HIV/AIDS                            0
GDP                                  0
Population                         548
 thinness  1-19 years               32
 thinness 5-9 years                 32
Income composition of resources    112
Schooling                            0
dtype: int64

In [14]:
# fill missing values in bmi column with the median of that country
df_schooling = df.groupby(df['Country'])[['Income composition of resources']].median()
for row_ in df_schooling.itertuples():
    for row in df.itertuples():
        if row.Country== row_[0]:
            med = row_[1]
            # print(row)
            # print(med)
            df["Income composition of resources"].fillna(med, inplace=True)
df.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                      0
Adult Mortality                      0
infant deaths                        0
Alcohol                              0
percentage expenditure               0
Hepatitis B                        380
Measles                              0
 BMI                                 0
under-five deaths                    0
Polio                               19
Total expenditure                  194
Diphtheria                          19
 HIV/AIDS                            0
GDP                                  0
Population                         548
 thinness  1-19 years               32
 thinness 5-9 years                 32
Income composition of resources      0
Schooling                            0
dtype: int64

In [15]:
df = df[[' BMI ','Adult Mortality', 'Income composition of resources', 'Schooling', 'Life expectancy ']]
# ['Country',
#  'Year',
#  'Status',
#  'Life expectancy ',
#  'Adult Mortality',
#  'infant deaths',
#  'Alcohol',
#  'percentage expenditure',
#  'Hepatitis B',
#  'Measles ',
#  ' BMI ',
#  'under-five deaths ',
#  'Polio',
#  'Total expenditure',
#  'Diphtheria ',
#  ' HIV/AIDS',
#  'GDP',
#  'Population',
#  ' thinness  1-19 years',
#  ' thinness 5-9 years',
#  'Income composition of resources',
#  'Schooling']
df.isna().sum()

 BMI                               0
Adult Mortality                    0
Income composition of resources    0
Schooling                          0
Life expectancy                    0
dtype: int64

In [16]:
# renaming columns
df_trim = df.rename(columns={'Life expectancy ':'life_expectancy', 
                                                          ' BMI ':'bmi',  
                                                          'Income composition of resources':'income_comp_res', 
                                                          'Schooling':'schooling',
                                                         'Adult Mortality':'adult_mortality'})
df_trim.head()

Unnamed: 0,bmi,adult_mortality,income_comp_res,schooling,life_expectancy
0,19.1,263.0,0.479,10.1,65.0
1,18.6,271.0,0.476,10.0,59.9
2,18.1,268.0,0.47,9.9,59.9
3,17.6,272.0,0.463,9.8,59.5
4,17.2,275.0,0.454,9.5,59.2


In [17]:
# Train test split
training_data = df_trim.sample(frac=0.7, random_state=25)
testing_data = df_trim.drop(training_data.index)

print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

No. of training examples: 1691
No. of testing examples: 725


In [18]:
# Separate target from features
y = training_data['life_expectancy']

features = training_data.drop(['life_expectancy'], axis=1)

# Remove target variable from test data
y_testing = testing_data['life_expectancy']
testing_data.drop(['life_expectancy'], axis=1, inplace=True)

# Preview features
features.head()

Unnamed: 0,bmi,adult_mortality,income_comp_res,schooling
1954,56.3,118.0,0.773,12.9
728,17.8,299.0,0.424,8.55
2687,53.5,143.0,0.641,10.7
1302,49.7,142.0,0.721,12.7
402,18.2,27.0,0.392,7.5


In [19]:
X_train, X_valid, y_train, y_valid = train_test_split(features, y, random_state=0)
#Define the model (Multivariate) 
regr = LinearRegression()

# Train the model
model = regr.fit(X_train, y_train)
#coefficeints
model.coef_

array([ 0.08740194, -0.0318224 ,  6.24597661,  0.80884827])

In [20]:
#y-intercept
model.intercept_

57.35543928847564

In [21]:
#predict
preds_valid = model.predict(X_valid)

print(mean_squared_error(y_valid, preds_valid, squared=False))

5.122379623771172


In [24]:
#predict testing data and find  mean squqred error
preds_test = model.predict(testing_data)

mse = mean_squared_error(y_testing, preds_test, squared=False)
print(math.sqrt(mse))
print(mse)

2.2495856942173442
5.060635795627331


In [23]:
# predicting the accuracy score
score=r2_score(y_testing, preds_test)
print('r2 score is ',score)

r2 score is  0.6802763422693132


In [29]:
print(preds_test[:50])

[61.81671547 61.35221808 60.6273246  60.17620369 58.19953651 57.1175082
 55.29666219 74.73812772 73.50690272 76.0805195  68.2800229  74.2535154
 74.15836957 73.8167351  73.69051733 78.23940483 75.04142131 68.01583848
 71.74107279 69.53935426 60.47758621 58.3979258  56.12900084 55.3875115
 63.90350737 63.28061907 73.42194073 73.29960879 73.10698227 77.14560565
 78.08604701 81.35147183 76.82531362 76.59861085 76.2464169  75.81089963
 73.29940431 72.94669885 72.07579871 70.54997279 70.31844944 70.37389492
 68.2729267  71.96408475 75.20385073 72.66771121 72.63202371 67.72692475
 67.70630599 71.68089245]


In [33]:
print(y_testing[:50])

0      65.0
2      59.9
4      59.2
5      58.8
10     57.3
11     57.0
14     55.3
20     76.6
22     76.1
23     75.3
25     74.2
26     73.5
27     73.0
29     73.3
31     72.6
32     75.6
34     75.3
40     73.8
41     73.4
46     71.4
50     51.1
52     51.0
56     48.2
57     47.7
62     45.7
63     45.3
65     76.2
66     76.1
67     75.9
70     75.4
82     76.0
83     75.9
86     75.6
87     75.4
89     75.2
93     74.1
96     74.8
99     74.4
101    73.5
105    72.9
106    73.0
107    73.0
146    72.2
147    71.9
149    71.1
159    66.6
163    74.9
166    74.6
167    74.5
171    73.8
Name: life_expectancy, dtype: float64
