## Life Expectancy Dataset



### Load the dataset

In [1]:
import os
from pathlib import Path

import pandas as pd

# Directory that holds Life.csv. Set the LIFE_DATA_DIR environment variable to
# run the notebook on another machine; the default keeps the original location
# so existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get(
    "LIFE_DATA_DIR",
    "/Users/danieltomaro/Documents/University/Machine_Learning/Data",
))

dataset = pd.read_csv(DATA_DIR / "Life.csv")
print("dataset length:", len(dataset))
dataset.head()

dataset length: 2928


Unnamed: 0,Life expectancy,Status,Adult mortality,Infant deaths,Alcohol,Percentage expenditure,Hepatitis B,Measles,BMI,Under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness 1–19 years,Thinness 5–9 years,Income composition of resources,Schooling
0,65.0,Developing,263.0,62,0.01,71.279624,65.0,1154,19.1,83,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,59.9,Developing,271.0,64,0.01,73.523582,62.0,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,59.9,Developing,268.0,66,0.01,73.219243,64.0,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,59.5,Developing,272.0,69,0.01,78.184215,67.0,2787,17.6,93,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,59.2,Developing,275.0,71,0.01,7.097109,68.0,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


### Preprocess the dataset
##### Check if there is any missing value in the dataset

In [2]:
# count the missing (NaN) entries in every column
dataset.isnull().sum()

Life expectancy                     0
Status                              0
Adult mortality                     0
Infant deaths                       0
Alcohol                             0
Percentage expenditure              0
Hepatitis B                        19
Measles                             0
BMI                                 0
Under-five deaths                   0
Polio                              19
Total expenditure                   0
Diphtheria                         19
HIV/AIDS                            0
GDP                                 0
Population                          0
Thinness  1–19 years                0
Thinness 5–9 years                  0
Income composition of resources     0
Schooling                           0
dtype: int64

##### Drop the rows that have missing values

In [3]:
# Only a very small number of rows contain missing values, so dropping them
# loses little data and keeps the preprocessing simple.
dataset = dataset.dropna(axis=0, how="any")
print("dataset length:", dataset.shape[0])

dataset length: 2909


#### Dealing with duplicated rows


In [4]:
# check whether any row is an exact duplicate of an earlier row
dataset.duplicated().any()

False

##### Check variable data types

In [5]:
# list the dtype of every column to spot non-numeric (object) features
dataset.dtypes

Life expectancy                    float64
Status                              object
Adult mortality                    float64
Infant deaths                        int64
Alcohol                            float64
Percentage expenditure             float64
Hepatitis B                        float64
Measles                              int64
BMI                                float64
Under-five deaths                    int64
Polio                              float64
Total expenditure                  float64
Diphtheria                         float64
HIV/AIDS                           float64
GDP                                float64
Population                         float64
Thinness  1–19 years               float64
Thinness 5–9 years                 float64
Income composition of resources    float64
Schooling                          float64
dtype: object

##### Deal with categorical variables

For Status, we can use 0 and 1 to replace Developing and Developed

<br> 1 = Developed; 0 = Developing


In [6]:
# Encode the categorical Status column as integers: 1 = Developed, 0 = Developing.
status_codes = {'Developed': 1, 'Developing': 0}

# Series.replace on an object column relies on silent downcasting (deprecated in
# recent pandas) and quietly leaves unknown labels untouched. An explicit
# map + astype yields a real int64 column and fails loudly on unexpected labels.
# The dtype guard keeps the cell safe to re-run after Status is already encoded.
if not pd.api.types.is_numeric_dtype(dataset['Status']):
    # assign() builds a new frame instead of writing into a possibly-copied one
    dataset = dataset.assign(Status=dataset['Status'].map(status_codes).astype('int64'))
dataset.head()

  dataset['Status'] = dataset['Status'].replace({'Developed': 1, 'Developing': 0})


Unnamed: 0,Life expectancy,Status,Adult mortality,Infant deaths,Alcohol,Percentage expenditure,Hepatitis B,Measles,BMI,Under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness 1–19 years,Thinness 5–9 years,Income composition of resources,Schooling
0,65.0,0,263.0,62,0.01,71.279624,65.0,1154,19.1,83,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,59.9,0,271.0,64,0.01,73.523582,62.0,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,59.9,0,268.0,66,0.01,73.219243,64.0,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,59.5,0,272.0,69,0.01,78.184215,67.0,2787,17.6,93,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,59.2,0,275.0,71,0.01,7.097109,68.0,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


#### Get a summary of numerical columns

In [7]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dataset.describe()

Unnamed: 0,Life expectancy,Status,Adult mortality,Infant deaths,Alcohol,Percentage expenditure,Hepatitis B,Measles,BMI,Under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness 1–19 years,Thinness 5–9 years,Income composition of resources,Schooling
count,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0,2909.0
mean,69.285631,0.176006,164.091784,30.496391,4.652744,744.647851,79.343692,2443.713304,38.152114,42.282571,82.548298,5.933355,82.321416,1.744792,6561.63078,52000230.0,4.865761,4.896287,0.628241,12.021829
std,9.494097,0.38089,124.004307,118.489329,3.976931,1996.659244,24.510698,11521.750689,19.909082,161.209969,23.416674,2.393236,23.706644,5.099864,13391.60944,91197410.0,4.40982,4.498319,0.204345,3.214716
min,36.3,0.0,1.0,0.0,0.01,0.0,1.0,0.0,1.0,0.0,3.0,0.37,2.0,0.1,1.68135,34.0,0.1,0.1,0.0,0.0
25%,63.3,0.0,73.0,0.0,1.11,5.140669,74.0,0.0,19.4,0.0,78.0,4.36,78.0,0.1,579.43485,421758.0,1.6,1.6,0.491,10.1
50%,72.1,0.0,144.0,3.0,4.0,67.333787,88.4,18.0,43.0,4.0,93.0,5.9,93.0,0.1,1175.788981,3696958.0,3.4,3.4,0.677,12.3
75%,75.7,0.0,226.0,22.0,7.68,445.442337,96.0,372.0,56.1,27.0,97.0,7.33,97.0,0.8,4855.17564,65824160.0,7.2,7.3,0.775,14.2
max,89.0,1.0,723.0,1800.0,17.87,19479.91161,99.0,212183.0,77.6,2500.0,99.0,17.6,99.0,50.6,119172.7418,1293859000.0,27.7,28.6,0.948,20.7


##### Check dataset shape

In [8]:
# (number of rows, number of columns) of the preprocessed dataset
dataset.shape

(2909, 20)

##### Define the input variables and the target variable
Input is everything except for life expectancy which is what we are measuring

In [9]:
# Select the target by column name rather than by position, so the split stays
# correct even if the CSV's column order or column count changes.
target_column = 'Life expectancy'

array = dataset.values  # kept in case other cells still reference the raw array
X = dataset.drop(columns=[target_column]).values  # every column except the target
y = dataset[target_column].values                 # regression target

### Split the dataset and normalize data

##### Split the training and testing dataset
10% of the data will be used for testing and 90% for training

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed makes the split reproducible.
split = train_test_split(X, y, random_state=123, test_size=0.1)
X_train, X_test, y_train, y_test = split

##### Apply normalization on both train and testing dataset

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, so no statistics
# from the test split influence the scaling.
norm = MinMaxScaler()
norm.fit(X_train)

# Rescale both splits with the parameters learned from the training split.
X_train_norm, X_test_norm = (norm.transform(part) for part in (X_train, X_test))

### Train a model

#### Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression

# Baseline: linear regression with default settings, fitted on the scaled
# training split (fit() returns the estimator itself).
model = LinearRegression().fit(X_train_norm, y_train)

# score() reports R^2 on the held-out test split.
test_score = model.score(X_test_norm, y_test)
print("R2 of LR:", test_score)

R2 of LR: 0.8323515207054643


### SVM Model


In [13]:
from sklearn.svm import SVR

# Baseline: support vector regressor with default hyper-parameters, fitted on
# the scaled training split (fit() returns the estimator itself).
model = SVR().fit(X_train_norm, y_train)

# score() reports R^2 on the held-out test split.
test_score = model.score(X_test_norm, y_test)
print("R2 of SVM:", test_score)

R2 of SVM: 0.8673842661091744


####  Train the model based on training dataset with cross validation and then evaluate the model based on testing dataset
#####  Define a 5-fold cross validation with data shuffling and set the random state to 123

In [14]:
from sklearn.model_selection import KFold

# 5-fold cross validation; rows are shuffled before splitting, seeded with 123
# so the folds are the same on every run.
kfold = KFold(random_state=123, shuffle=True, n_splits=5)

##### 2) Run 5-fold cross validation and print the average r-squared score based on the cross validation results

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# Linear regression with default settings, scored with the previously defined
# 5-fold splitter (one R^2 value per fold).
lr = LinearRegression()
results = cross_val_score(estimator=lr, X=X_train_norm, y=y_train, cv=kfold)

# report the mean R^2 across the folds
print("Average R2 of LR:",results.mean())

Average R2 of LR: 0.8376013975750766


In [16]:
# per-fold R^2 scores from the most recent cross_val_score call (linear regression when run in order)
results

array([0.84017581, 0.8233233 , 0.87157675, 0.83764178, 0.81528934])

### The SVM Model


In [17]:
from sklearn.svm import SVR

# Support vector regressor with default hyper-parameters, evaluated with the
# same 5-fold splitter used for linear regression.
svr = SVR()
results = cross_val_score(estimator=svr, X=X_train_norm, y=y_train, cv=kfold)

# report the mean R^2 across the folds
print("Average R2 of SVM:",results.mean())

Average R2 of SVM: 0.8569512784232334


In [18]:
# per-fold R^2 scores from the most recent cross_val_score call (SVR when run in order)
results

array([0.85254191, 0.85455611, 0.8836527 , 0.85773855, 0.83626713])

### Optimize models with cross validation

In [19]:
# fine tune parameters for lr model
from sklearn.model_selection import GridSearchCV

# every combination of these options is evaluated (2 x 2 = 4 candidates)
grid_params_lr = dict(fit_intercept=[True, False], positive=[True, False])

lr = LinearRegression()
gs_lr_result = GridSearchCV(estimator=lr, param_grid=grid_params_lr, cv=kfold)
gs_lr_result.fit(X_train_norm, y_train)

# best mean cross-validated R^2 found over the grid
print(gs_lr_result.best_score_)

0.8376013975750766


### SVM Model


In [20]:
# fine tune parameters for SVM model
from sklearn.model_selection import ParameterGrid, RandomizedSearchCV

grid_params_svr = {
    'kernel' : ['linear', 'rbf'],
    'C' : [1, 5],
    'gamma' : ['auto', 'scale']
}

# The grid holds only 2 * 2 * 2 = 8 combinations, so requesting 20 random draws
# makes scikit-learn warn and fall back to the grid size. Cap n_iter explicitly.
n_candidates = min(20, len(ParameterGrid(grid_params_svr)))

# random_state pins which candidates are sampled, so re-runs are reproducible.
random_search_svr = RandomizedSearchCV(
    svr, grid_params_svr, n_iter=n_candidates, cv=kfold, random_state=123
)
random_search_svr.fit(X_train_norm, y_train)
# best mean cross-validated R^2 found by the randomized search
print(random_search_svr.best_score_)

# Exhaustive search over the same grid, run in parallel on all cores (n_jobs=-1).
gs_svr_result = GridSearchCV(svr, grid_params_svr, cv=kfold, n_jobs=-1).fit(X_train_norm, y_train)




0.8969641953863705


### Evaluate the trained Linear Regression model using testing dataset

In [21]:
# Take the best linear regression estimator found by the grid search and
# report its R^2 on the held-out test split.
best_lr = gs_lr_result.best_estimator_
lr_test_R2 = best_lr.score(X_test_norm, y_test)
print("R2 of LR in testing:", lr_test_R2)

R2 of LR in testing: 0.8323515207054643


In [22]:
# show the hyper-parameter combination selected by the linear regression grid search
gs_lr_result.best_params_

{'fit_intercept': True, 'positive': False}

### Evaluate the trained Support Vector Machine model using testing dataset

In [23]:
# Take the best SVR estimator found by the grid search and report its R^2 on
# the held-out test split.
best_svr = gs_svr_result.best_estimator_
svr_test_R2 = best_svr.score(X_test_norm, y_test)
print("R2 of SVM in testing:", svr_test_R2)

R2 of SVM in testing: 0.9128669443963928


In [24]:
# show the hyper-parameter combination selected by the SVR grid search
gs_svr_result.best_params_

{'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}

### Predict with a trained model

In [25]:
# predict the target for the first 5 rows of the scaled test split with the tuned LR model
first_five = X_test_norm[:5]
y_predict = gs_lr_result.best_estimator_.predict(first_five)
print(y_predict)

[64.95827501 82.21869871 65.07313691 79.79445358 54.2406662 ]


In [26]:
# predict the target for the first 5 rows of the scaled test split with the tuned SVR model
first_five = X_test_norm[:5]
y_predict = gs_svr_result.best_estimator_.predict(first_five)
print(y_predict)

[62.54105202 81.63012209 64.39137987 79.94090667 54.82653629]


### Save and load a trained model

#### linear regression model

In [27]:
import pickle

# Persist the tuned linear regression model in the current working directory.
# Only the estimator is stored; it was fitted on MinMax-scaled features, so any
# new data must pass through the same scaler (`norm`) before prediction.
pkl_filename = "lr_model.pkl"
with open(pkl_filename, mode='wb') as file:
    pickle.dump(gs_lr_result.best_estimator_, file)

# Read the model back from disk.
# NOTE: only unpickle files you trust -- pickle.load can execute arbitrary code.
with open(pkl_filename, mode='rb') as file:
    pickle_model = pickle.load(file)

# R^2 of the restored model on the test split
score = pickle_model.score(X_test_norm, y_test)
print("R2 score:", score)

R2 score: 0.8323515207054643


#### similarly for a svm model

In [28]:
import pickle

# Persist the tuned SVR model in the current working directory.
# Only the estimator is stored; it was fitted on MinMax-scaled features, so any
# new data must pass through the same scaler (`norm`) before prediction.
pkl_filename = "svm_model.pkl"
with open(pkl_filename, mode='wb') as file:
    pickle.dump(gs_svr_result.best_estimator_, file)

# Read the model back from disk.
# NOTE: only unpickle files you trust -- pickle.load can execute arbitrary code.
with open(pkl_filename, mode='rb') as file:
    pickle_model = pickle.load(file)

# R^2 of the restored model on the test split
score = pickle_model.score(X_test_norm, y_test)
print("R2 score:", score)

R2 score: 0.9128669443963928
