## IMPORT Regression Models

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px
from collections import Counter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder , OneHotEncoder , StandardScaler , MinMaxScaler ,  PolynomialFeatures
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.metrics import mean_squared_error , mean_absolute_error , mean_absolute_percentage_error , r2_score
from sklearn.linear_model import LinearRegression  , Lasso , Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor , VotingRegressor , StackingRegressor , BaggingRegressor , GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor


In [2]:
df = pd.read_csv("Life Expectancy Data.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

In [4]:
df.head(10)

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
5,Afghanistan,2010,Developing,58.8,279.0,74,0.01,79.679367,66.0,1989,...,66.0,9.2,66.0,0.1,553.32894,2883167.0,18.4,18.4,0.448,9.2
6,Afghanistan,2009,Developing,58.6,281.0,77,0.01,56.762217,63.0,2861,...,63.0,9.42,63.0,0.1,445.893298,284331.0,18.6,18.7,0.434,8.9
7,Afghanistan,2008,Developing,58.1,287.0,80,0.03,25.873925,64.0,1599,...,64.0,8.33,64.0,0.1,373.361116,2729431.0,18.8,18.9,0.433,8.7
8,Afghanistan,2007,Developing,57.5,295.0,82,0.02,10.910156,63.0,1141,...,63.0,6.73,63.0,0.1,369.835796,26616792.0,19.0,19.1,0.415,8.4
9,Afghanistan,2006,Developing,57.3,295.0,84,0.03,17.171518,64.0,1990,...,58.0,7.43,58.0,0.1,272.56377,2589345.0,19.2,19.3,0.405,8.1


In [5]:
df.columns

Index(['Country', 'Year', 'Status', 'Life expectancy ', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')

In [6]:
# df["Life expectancy "].rename("Life expectancy" , inplace=True) ### wrong
df.rename(columns={"Life expectancy " : "Life expectancy"} , inplace=True)

In [7]:
df.columns

Index(['Country', 'Year', 'Status', 'Life expectancy', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')

In [8]:
def check (col):
    return col.strip()

df.columns = list(map(check , df.columns))

In [9]:
df.columns

Index(['Country', 'Year', 'Status', 'Life expectancy', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles', 'BMI', 'under-five deaths', 'Polio', 'Total expenditure',
       'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'thinness  1-19 years',
       'thinness 5-9 years', 'Income composition of resources', 'Schooling'],
      dtype='object')

In [10]:
df["Life expectancy"].describe()

count    2928.000000
mean       69.224932
std         9.523867
min        36.300000
25%        63.100000
50%        72.100000
75%        75.700000
max        89.000000
Name: Life expectancy, dtype: float64

In [11]:
df.corr(numeric_only=True)

Unnamed: 0,Year,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
Year,1.0,0.170033,-0.079052,-0.037415,-0.05299,0.0314,0.104333,-0.082493,0.108974,-0.042937,0.094158,0.09074,0.134337,-0.139741,0.10162,0.016969,-0.047876,-0.050929,0.243468,0.2094
Life expectancy,0.170033,1.0,-0.696359,-0.196557,0.404877,0.381864,0.256762,-0.157586,0.567694,-0.222529,0.465556,0.218086,0.479495,-0.556556,0.461455,-0.021538,-0.477183,-0.471584,0.724776,0.751975
Adult Mortality,-0.079052,-0.696359,1.0,0.078756,-0.195848,-0.24286,-0.162476,0.031176,-0.387017,0.094146,-0.274823,-0.115281,-0.275131,0.523821,-0.296049,-0.013647,0.302904,0.308457,-0.457626,-0.454612
infant deaths,-0.037415,-0.196557,0.078756,1.0,-0.115638,-0.085612,-0.223566,0.501128,-0.227279,0.996629,-0.170689,-0.128616,-0.175171,0.025231,-0.108427,0.556801,0.465711,0.47135,-0.145139,-0.19372
Alcohol,-0.05299,0.404877,-0.195848,-0.115638,1.0,0.341285,0.087549,-0.051827,0.330408,-0.11237,0.221734,0.296942,0.22202,-0.048845,0.354712,-0.035252,-0.428795,-0.417414,0.45004,0.547378
percentage expenditure,0.0314,0.381864,-0.24286,-0.085612,0.341285,1.0,0.016274,-0.056596,0.2287,-0.087852,0.147259,0.17442,0.143624,-0.097857,0.899373,-0.025662,-0.251369,-0.252905,0.381952,0.389687
Hepatitis B,0.104333,0.256762,-0.162476,-0.223566,0.087549,0.016274,1.0,-0.120529,0.15038,-0.233126,0.486171,0.05828,0.611495,-0.112675,0.083903,-0.123321,-0.120429,-0.12496,0.199549,0.231117
Measles,-0.082493,-0.157586,0.031176,0.501128,-0.051827,-0.056596,-0.120529,1.0,-0.175977,0.507809,-0.136166,-0.106241,-0.141882,0.030899,-0.076466,0.265966,0.224808,0.221072,-0.129568,-0.137225
BMI,0.108974,0.567694,-0.387017,-0.227279,0.330408,0.2287,0.15038,-0.175977,1.0,-0.237669,0.284569,0.242503,0.283147,-0.243717,0.301557,-0.072301,-0.532025,-0.538911,0.508774,0.546961
under-five deaths,-0.042937,-0.222529,0.094146,0.996629,-0.11237,-0.087852,-0.233126,0.507809,-0.237669,1.0,-0.18872,-0.130148,-0.195668,0.038062,-0.112081,0.544423,0.467789,0.472263,-0.163305,-0.209373


In [12]:
df.corr(numeric_only=True)["Life expectancy"]

Year                               0.170033
Life expectancy                    1.000000
Adult Mortality                   -0.696359
infant deaths                     -0.196557
Alcohol                            0.404877
percentage expenditure             0.381864
Hepatitis B                        0.256762
Measles                           -0.157586
BMI                                0.567694
under-five deaths                 -0.222529
Polio                              0.465556
Total expenditure                  0.218086
Diphtheria                         0.479495
HIV/AIDS                          -0.556556
GDP                                0.461455
Population                        -0.021538
thinness  1-19 years              -0.477183
thinness 5-9 years                -0.471584
Income composition of resources    0.724776
Schooling                          0.751975
Name: Life expectancy, dtype: float64

In [13]:
df.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
BMI                                 34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
HIV/AIDS                             0
GDP                                448
Population                         652
thinness  1-19 years                34
thinness 5-9 years                  34
Income composition of resources    167
Schooling                          163
dtype: int64

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10  BMI                              2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

In [15]:
df2 = df.dropna()

In [16]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1649 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          1649 non-null   object 
 1   Year                             1649 non-null   int64  
 2   Status                           1649 non-null   object 
 3   Life expectancy                  1649 non-null   float64
 4   Adult Mortality                  1649 non-null   float64
 5   infant deaths                    1649 non-null   int64  
 6   Alcohol                          1649 non-null   float64
 7   percentage expenditure           1649 non-null   float64
 8   Hepatitis B                      1649 non-null   float64
 9   Measles                          1649 non-null   int64  
 10  BMI                              1649 non-null   float64
 11  under-five deaths                1649 non-null   int64  
 12  Polio                    

In [17]:
df3 = df.drop(columns=["Hepatitis B","GDP","Population"])

In [18]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Measles                          2938 non-null   int64  
 9   BMI                              2904 non-null   float64
 10  under-five deaths                2938 non-null   int64  
 11  Polio                            2919 non-null   float64
 12  Total expenditure   

In [19]:
df3.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Measles                              0
BMI                                 34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
HIV/AIDS                             0
thinness  1-19 years                34
thinness 5-9 years                  34
Income composition of resources    167
Schooling                          163
dtype: int64

In [20]:
df3.dropna(inplace=True)

In [21]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2556 entries, 0 to 2937
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2556 non-null   object 
 1   Year                             2556 non-null   int64  
 2   Status                           2556 non-null   object 
 3   Life expectancy                  2556 non-null   float64
 4   Adult Mortality                  2556 non-null   float64
 5   infant deaths                    2556 non-null   int64  
 6   Alcohol                          2556 non-null   float64
 7   percentage expenditure           2556 non-null   float64
 8   Measles                          2556 non-null   int64  
 9   BMI                              2556 non-null   float64
 10  under-five deaths                2556 non-null   int64  
 11  Polio                            2556 non-null   float64
 12  Total expenditure        

In [22]:
df3.corr(numeric_only=True)["Life expectancy"]

Year                               0.163817
Life expectancy                    1.000000
Adult Mortality                   -0.677911
infant deaths                     -0.178457
Alcohol                            0.381245
percentage expenditure             0.414247
Measles                           -0.146643
BMI                                0.566911
under-five deaths                 -0.203460
Polio                              0.452269
Total expenditure                  0.189895
Diphtheria                         0.462185
HIV/AIDS                          -0.571707
thinness  1-19 years              -0.463537
thinness 5-9 years                -0.458098
Income composition of resources    0.717953
Schooling                          0.757445
Name: Life expectancy, dtype: float64

In [23]:
df["Country"].nunique()

193

In [24]:
df["Status"].nunique()

2

In [25]:
from sklearn.impute import SimpleImputer

In [26]:
col_obj = df.select_dtypes("object").columns
col_obj

Index(['Country', 'Status'], dtype='object')

In [27]:
col_num = df.select_dtypes("number").columns
col_num

Index(['Year', 'Life expectancy', 'Adult Mortality', 'infant deaths',
       'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles', 'BMI',
       'under-five deaths', 'Polio', 'Total expenditure', 'Diphtheria',
       'HIV/AIDS', 'GDP', 'Population', 'thinness  1-19 years',
       'thinness 5-9 years', 'Income composition of resources', 'Schooling'],
      dtype='object')

In [28]:
imputer_obj = SimpleImputer(strategy="most_frequent")
imputer_num = SimpleImputer(strategy="median")

In [29]:
df[col_obj] = imputer_obj.fit_transform(df[col_obj])
df[col_num] = imputer_num.fit_transform(df[col_num])

In [30]:
df.isna().sum()

Country                            0
Year                               0
Status                             0
Life expectancy                    0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
BMI                                0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
HIV/AIDS                           0
GDP                                0
Population                         0
thinness  1-19 years               0
thinness 5-9 years                 0
Income composition of resources    0
Schooling                          0
dtype: int64

In [31]:
le = LabelEncoder()
df["Country"] = le.fit_transform(df["Country"])
df["Status"] = le.fit_transform(df["Status"])

In [38]:
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,0,2015.0,1,65.0,263.0,62.0,0.01,71.279624,65.0,1154.0,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,0,2014.0,1,59.9,271.0,64.0,0.01,73.523582,62.0,492.0,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,0,2013.0,1,59.9,268.0,66.0,0.01,73.219243,64.0,430.0,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,0,2012.0,1,59.5,272.0,69.0,0.01,78.184215,67.0,2787.0,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,0,2011.0,1,59.2,275.0,71.0,0.01,7.097109,68.0,3013.0,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [35]:
df.corr(numeric_only=True)["Life expectancy"]

Country                           -0.016402
Year                               0.170819
Status                            -0.481415
Life expectancy                    1.000000
Adult Mortality                   -0.696390
infant deaths                     -0.196769
Alcohol                            0.388918
percentage expenditure             0.381418
Hepatitis B                        0.170219
Measles                           -0.157767
BMI                                0.556901
under-five deaths                 -0.222738
Polio                              0.458399
Total expenditure                  0.208844
Diphtheria                         0.472211
HIV/AIDS                          -0.556703
GDP                                0.430461
Population                        -0.029014
thinness  1-19 years              -0.468002
thinness 5-9 years                -0.462473
Income composition of resources    0.688662
Schooling                          0.713054
Name: Life expectancy, dtype: fl

In [39]:
x = df.drop(columns=['Life expectancy'])
y = df["Life expectancy"]
x_train , x_test , y_train , y_test = train_test_split(x , y , train_size=.7 , random_state= 42)

In [40]:
LR = LinearRegression()
LR.fit(x_train , y_train)

In [41]:
y_predict = LR.predict(x_test)

In [42]:
error = mean_squared_error(y_test , y_predict)
error

16.586093851254212

In [43]:
rmse = np.sqrt(error)
rmse

4.072602834951404

In [44]:
mean_absolute_error(y_test , y_predict)

2.96347080008231

In [46]:
mean_absolute_percentage_error(y_test , y_predict)

0.0461582918621013

In [45]:
r2_score(y_test , y_predict)

0.8226063800019516

In [47]:
scaler = StandardScaler()

In [48]:
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [49]:
model2 = LinearRegression().fit(x_train_scaled , y_train)

In [50]:
y_scaled_predict = model2.predict(x_test_scaled)

In [52]:
error = mean_squared_error(y_test , y_scaled_predict)
print(error)
rmse = np.sqrt(error)
print(rmse)
print(mean_absolute_error(y_test , y_scaled_predict))
print(mean_absolute_percentage_error(y_test , y_scaled_predict))
print(r2_score(y_test , y_scaled_predict))

16.586093851250716
4.072602834950975
2.9634708000815753
0.046158291862093206
0.8226063800019889


In [53]:
np.abs(df.corr()["Life expectancy"])

Country                            0.016402
Year                               0.170819
Status                             0.481415
Life expectancy                    1.000000
Adult Mortality                    0.696390
infant deaths                      0.196769
Alcohol                            0.388918
percentage expenditure             0.381418
Hepatitis B                        0.170219
Measles                            0.157767
BMI                                0.556901
under-five deaths                  0.222738
Polio                              0.458399
Total expenditure                  0.208844
Diphtheria                         0.472211
HIV/AIDS                           0.556703
GDP                                0.430461
Population                         0.029014
thinness  1-19 years               0.468002
thinness 5-9 years                 0.462473
Income composition of resources    0.688662
Schooling                          0.713054
Name: Life expectancy, dtype: fl

In [54]:
model2.coef_   ## low values for low correlations

array([ 1.70106478e-01,  3.61445027e-03, -5.89935119e-01, -2.43633149e+00,
        1.12096254e+01,  2.49409186e-01,  1.79595032e-01, -3.93959882e-01,
       -3.00619329e-01,  8.33244136e-01, -1.13864642e+01,  5.37399175e-01,
        5.81996665e-02,  9.36658284e-01, -2.50553265e+00,  4.72417232e-01,
       -4.12735251e-02, -3.46891732e-01,  5.49872316e-02,  1.17361764e+00,
        2.20975898e+00])

In [55]:
Model3 = DecisionTreeRegressor()

In [56]:
Model3.fit(x_train,y_train)

In [57]:
y_predict3 = Model3.predict(x_test)

In [58]:
error = mean_squared_error(y_test , y_predict3)
print(error)
rmse = np.sqrt(error)
print(rmse)
print(mean_absolute_error(y_test , y_predict3))
print(mean_absolute_percentage_error(y_test , y_predict3))
print(r2_score(y_test , y_predict3))

8.10280045351474
2.8465418411670576
1.659297052154195
0.02562318356537209
0.9133379373430884
