In [1]:
import numpy as np
import pandas as pd
# Use the fully qualified option key: the bare 'max_columns' pattern is
# ambiguous on newer pandas (it also matches 'styler.render.max_columns')
# and raises OptionError there.
pd.set_option('display.max_columns', 30)
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
# Read the CSV, keep only rows without missing values, then shuffle every row
# with a fixed seed so the positional train/test slicing further down is
# reproducible.
data = (
    pd.read_csv('Life Expectancy Data.csv')
    .dropna()
    .sample(frac=1, random_state=0)
)
print(f'Initial Data shape: {data.shape}')

Initial Data shape: (1649, 22)


In [3]:
# Peek at the first two shuffled rows to check the raw column layout.
data.head(n=2)

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
343,Botswana,2008,Developing,57.5,427.0,2,6.56,476.862587,94.0,0,34.2,3,96.0,5.55,96.0,12.7,5623.379566,1946351.0,8.8,8.6,0.646,12.1
934,France,2011,Developing,81.7,83.0,3,11.8,683.919057,74.0,14949,6.6,3,99.0,11.33,99.0,0.1,4381.288,65342776.0,0.6,0.6,0.882,16.1


In [4]:
# Deal with categorical values: replace the string columns with integer codes.
# Each column gets its own LabelEncoder so both fitted mappings remain usable
# afterwards (e.g. `country_encoder.inverse_transform(...)`). Re-fitting one
# shared encoder on Status would throw away the Country mapping.
country_encoder = preprocessing.LabelEncoder()
status_encoder = preprocessing.LabelEncoder()
data['Country'] = country_encoder.fit_transform(data['Country'])
data['Status'] = status_encoder.fit_transform(data['Status'])
# Backward-compatible alias: `le` used to end this cell fitted on Status.
le = status_encoder

# Remove the stray leading/trailing spaces from the listed column headers.
data = data.rename(
    columns={
        'Life expectancy ': 'Life expectancy',
        'Measles ': 'Measles',
        ' BMI ': 'BMI',
        'under-five deaths ': 'under-five deaths',
        ' HIV/AIDS': 'HIV/AIDS',
        ' thinness  1-19 years': 'thinness  1-19 years',
        ' thinness 5-9 years': 'thinness 5-9 years',
    }
)
data.head(2)

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
343,16,2008,1,57.5,427.0,2,6.56,476.862587,94.0,0,34.2,3,96.0,5.55,96.0,12.7,5623.379566,1946351.0,8.8,8.6,0.646,12.1
934,43,2011,1,81.7,83.0,3,11.8,683.919057,74.0,14949,6.6,3,99.0,11.33,99.0,0.1,4381.288,65342776.0,0.6,0.6,0.882,16.1


In [5]:
# Train / validation / test split.
# The first 1400 (already shuffled) rows feed training and validation; every
# remaining row is held out as the test set.
train, test = data.iloc[:1400], data.iloc[1400:]
X_test, y_test = test.drop('Life expectancy', axis=1), test['Life expectancy']

# Carve a 20% validation subset out of the training rows (seeded split).
X_train, X_val, y_train, y_val = train_test_split(
    train.drop('Life expectancy', axis=1),
    train['Life expectancy'],
    test_size=0.20,
    random_state=42,
)
print(f'Train Dataset has {X_train.shape[0]} samples, Validation Data has {X_val.shape[0]} samples, Test Dataset has {X_test.shape[0]} samples.')

# The full frame is no longer referenced after the split; drop the name.
del data

Train Dataset has 1120 samples, Validation Data has 280 samples, Test Dataset has 249 samples.


In [25]:
# Gradient-boosted regressor: at most 100 trees of depth 3, seeded.
params = {'objective': 'regression', 'n_estimators': 100, 'max_depth': 3, 'random_state': 0, 'n_jobs': -1}
model = lgb.LGBMRegressor(**params)

# Fit with early stopping on the validation split.
# - `feature_name` receives the actual column labels; the previous
#   `X_train.columns.name` is the Index's own `.name` attribute, not the
#   list of column names.
# - `eval_set` is passed in its documented form, a list of (X, y) pairs.
# - Early stopping goes through the callback API, which is accepted by both
#   older and current LightGBM releases (the `early_stopping_rounds` fit
#   keyword was removed in LightGBM 4).
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    feature_name=list(X_train.columns),
    categorical_feature=['Country', 'Status'],
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)

[1]	valid_0's l2: 66.188
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l2: 55.7562
[3]	valid_0's l2: 47.2626
[4]	valid_0's l2: 40.0878
[5]	valid_0's l2: 34.1666
[6]	valid_0's l2: 29.3857
[7]	valid_0's l2: 25.308
[8]	valid_0's l2: 22.1062
[9]	valid_0's l2: 19.6241
[10]	valid_0's l2: 17.3496
[11]	valid_0's l2: 15.666
[12]	valid_0's l2: 13.9299
[13]	valid_0's l2: 12.6042
[14]	valid_0's l2: 11.5183
[15]	valid_0's l2: 10.6669
[16]	valid_0's l2: 9.87696
[17]	valid_0's l2: 9.3034
[18]	valid_0's l2: 8.71718
[19]	valid_0's l2: 8.24947
[20]	valid_0's l2: 7.81109
[21]	valid_0's l2: 7.48067
[22]	valid_0's l2: 7.22233
[23]	valid_0's l2: 6.9235
[24]	valid_0's l2: 6.70476
[25]	valid_0's l2: 6.48037
[26]	valid_0's l2: 6.29739
[27]	valid_0's l2: 6.1616
[28]	valid_0's l2: 5.99923
[29]	valid_0's l2: 5.89374
[30]	valid_0's l2: 5.79599
[31]	valid_0's l2: 5.69636
[32]	valid_0's l2: 5.63673
[33]	valid_0's l2: 5.57907
[34]	valid_0's l2: 5.50765
[35]	valid_0's l2: 5.45698
[36]	valid

New categorical_feature is ['Country', 'Status']


LGBMRegressor(max_depth=3, objective='regression', random_state=0)