In [1]:
# Data handing libraries
import pandas as pd
import numpy as np
# For model evaluation
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# For model training
from sklearn.linear_model import LinearRegression, RANSACRegressor, SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

Loading Data

In [2]:
# Read the data
train_data = pd.read_csv('Datasets/train.csv')
test_data = pd.read_csv('Datasets/test.csv')

In [3]:
train_data.head()

Unnamed: 0,life_expectancy,adult_mortality,bmi,hiv_aids,income_comp_res,schooling
0,73.9,92.0,64.7,0.1,0.804,13.9
1,82.1,61.0,64.2,0.1,0.891,15.9
2,63.8,247.0,18.9,1.3,0.464,10.2
3,65.2,24.0,39.6,1.1,0.622,11.7
4,64.0,221.0,38.1,0.1,0.0,10.2


In [5]:
# Separate target from features
y = train_data['life_expectancy']
features = train_data.drop(['life_expectancy'], axis=1)

# Remove target variable from test data
# y_testing = test_data['life_expectancy']
# test_data.drop(['life_expectancy'], axis=1, inplace=True)

# Preview features
features.head()
# test_data.head()

Unnamed: 0,adult_mortality,bmi,hiv_aids,income_comp_res,schooling
0,92.0,64.7,0.1,0.804,13.9
1,61.0,64.2,0.1,0.891,15.9
2,247.0,18.9,1.3,0.464,10.2
3,24.0,39.6,1.1,0.622,11.7
4,221.0,38.1,0.1,0.0,10.2


In [6]:
# Hold out cross-validation
X_train, X_valid, y_train, y_valid = train_test_split(features, y, random_state=0)

In [7]:
#Define the model (Multivariate) 
regr = LinearRegression()
# Train the model
model = regr.fit(X_train, y_train)

In [8]:
#Linear regression(LR) function coefficeints
model.coef_

array([-0.01911768,  0.09120046, -0.48649032,  6.90987896,  0.81824723])

In [9]:
#LR y-intercept
model.intercept_

55.14134296359811

In [10]:
#predict using validation data
preds_valid = model.predict(X_valid)

In [11]:
#find MSE of LR using validation data
print(mean_squared_error(y_valid, preds_valid, squared=False))

4.43824779486579


In [13]:
#predict testing data 
preds_test = model.predict(X_valid)

In [17]:
# predicting the accuracy score(R2) of LR
score=r2_score(y_valid, preds_test)
print('r2 score is ',score)

r2 score is  0.7569342343832728


In [18]:
print(preds_test[:50])

[73.13728869 70.63420719 66.25108465 72.35256202 73.81557421 73.05213374
 76.28535846 71.7980724  75.69364929 57.6145361  76.36914555 63.16019362
 76.87723515 79.02547305 72.87407427 79.87589882 68.30255045 59.61183439
 60.34703207 65.43780024 63.63675205 50.44950363 60.70874979 61.90540997
 60.07265912 77.26145175 73.54664184 73.09917746 63.75821533 57.47496285
 59.72809891 67.59247166 58.88992361 73.58681089 55.71723888 69.96403319
 67.10013608 60.54872129 58.94652592 62.99591432 60.14598687 63.32801009
 66.52905783 53.4451125  61.04596984 75.47865393 69.42547163 61.87310374
 70.77023616 66.92416753]


In [20]:
print(y_valid[:50])

768     76.000000
704     73.200000
726     68.300000
215     67.111465
1158    74.700000
813     72.900000
436     76.600000
1611    71.400000
148     77.200000
1423    56.900000
76      76.600000
1870    62.800000
1280    78.100000
1514    81.600000
1063    73.400000
1697    77.500000
353     69.800000
18      58.200000
828     59.400000
971     68.400000
317     62.800000
17      54.000000
1410    62.600000
1906    58.800000
1850    67.000000
572     79.200000
574     75.700000
303     73.500000
1534    66.000000
1511    55.000000
512     59.300000
1467    71.800000
156     69.000000
553     71.800000
1000    51.200000
1542    82.300000
80      66.000000
1263    63.000000
1899    57.000000
575     69.900000
1773    61.800000
279     55.700000
491     75.000000
264     57.000000
1425    68.000000
1936    73.700000
615     72.300000
1433    51.300000
666     73.000000
1849    65.400000
Name: life_expectancy, dtype: float64


In [21]:
# RANCANRegressor
ransac = RANSACRegressor()
# Train the model
model2 = ransac.fit(X_train, y_train)
#predict
preds_valid2 = model2.predict(X_valid)
#r2 value
model2.score(X_valid, y_valid)

0.7410843775648954

In [22]:
ransac.estimator_.coef_

array([-2.53420867e-02, -4.23534916e-03, -3.74764075e-01,  1.48111160e+01,
        6.73561999e-01])

In [23]:
# print MSE
print(mean_squared_error(y_valid, preds_valid2, squared=False))

4.580667580572665


In [24]:
# SGDRegressor
sgd = make_pipeline(StandardScaler(), SGDRegressor(max_iter=1000, tol=1e-3))
sgd.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor', SGDRegressor())])

In [27]:
sgd_preds_valid = sgd.predict(X_valid)

In [28]:
# print MSE
print(mean_squared_error(y_valid, sgd_preds_valid, squared=False))

4.436164238796246


In [29]:
#r2 value
sgd.score(X_valid, y_valid)

0.7571623975229055

In [30]:
#XGBRegressor
xgb_model = XGBRegressor(n_estimators=850, learning_rate=0.079)

In [31]:
#train xgb
xgb_model.fit(X_train, y_train,
              early_stopping_rounds=5,
              eval_set=[(X_valid, y_valid)],
              verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.079, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=850, n_jobs=2,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [32]:
#predit using validation dataset on xgb
xgb_preds_valid = xgb_model.predict(X_valid)

In [33]:
#calculate MSE
print(mean_squared_error(y_valid, xgb_preds_valid, squared=False))

2.2226905622309414


In [36]:
#r2 value
sgd.score(X_valid, y_valid)

0.7571623975229055