In [1]:
#pip install to lgboost
!pip install lightgbm



In [2]:
# Import the libraries used throughout the notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Load the insurance data from the CSV file (path is relative to the working directory)
# into a pandas DataFrame.
dataset = pd.read_csv('insurance_pre.csv')

In [4]:
# Display the dataset as loaded, before any encoding is applied.
dataset

Unnamed: 0,Age,Sex,BMI,Children,Smoker,Charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [5]:
# Sex and Smoker are nominal (text) columns, so one-hot encode them with get_dummies.
# drop_first=True drops the first level of each encoded column: it is implied by the
# remaining indicator, so no information is lost and the frame has fewer columns.
dataset = pd.get_dummies(
    dataset,
    drop_first=True,
    dtype=int,
)
# Display the encoded dataset.
dataset

Unnamed: 0,Age,BMI,Children,Charges,Sex_male,Smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [6]:
# List the column names of the encoded dataset.
dataset.columns

Index(['Age', 'BMI', 'Children', 'Charges', 'Sex_male', 'Smoker_yes'], dtype='object')

In [6]:
# Split the encoded frame into the model inputs (independent variables)
# and the value to predict (dependent variable).
indep = dataset.loc[:, ['Age', 'BMI', 'Children', 'Sex_male', 'Smoker_yes']]
dep = dataset.loc[:, 'Charges']

In [7]:
# Tune a LightGBM regressor (LGBMRegressor) with an exhaustive, cross-validated grid search.
from sklearn.model_selection import GridSearchCV
import lightgbm
from lightgbm import LGBMRegressor

# Candidate settings: 2 boosting types x 3 ensemble sizes x 2 leaf limits = 12 combinations.
param_grid = {
    'boosting_type': ['gbdt', 'dart'],
    'n_estimators': [10, 20, 30],
    'num_leaves': [32, 64],
}

# refit=True retrains the best combination on all the data once the search finishes,
# so `grid` can be used directly for prediction; n_jobs=-1 uses every available core.
grid = GridSearchCV(
    estimator=LGBMRegressor(),
    param_grid=param_grid,
    refit=True,
    verbose=3,
    n_jobs=-1,
)

# Run the search on the full feature matrix and target.
grid.fit(indep, dep)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000133 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 313
[LightGBM] [Info] Number of data points in the train set: 1338, number of used features: 5
[LightGBM] [Info] Start training from score 13270.422260


In [8]:
# Report the best hyperparameter combination together with its mean cross-validated score.
# The original message announced the score but only printed the parameters.
# NOTE: the name `re` is kept because a later cell builds a table from it, but it is also the
# name of Python's standard regex module and would shadow that module if it were imported.
re=grid.cv_results_
print("The R_score value for best parameter {}: {}".format(grid.best_params_, grid.best_score_))

The R_score value for best parameter {'boosting_type': 'gbdt', 'n_estimators': 30, 'num_leaves': 64}:


In [9]:
# Convert the cv_results_ dictionary into a DataFrame for easier inspection.
table=pd.DataFrame.from_dict(re)

In [10]:
# Display the grid-search results: timings, per-split test scores, mean/std test score and rank.
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_boosting_type,param_n_estimators,param_num_leaves,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.230321,0.116975,0.013431,0.006951,gbdt,10,32,"{'boosting_type': 'gbdt', 'n_estimators': 10, ...",0.777877,0.724922,0.762781,0.754488,0.750949,0.754203,0.017332,9
1,0.06279,0.009626,0.00625,0.005281,gbdt,10,64,"{'boosting_type': 'gbdt', 'n_estimators': 10, ...",0.779004,0.723502,0.763463,0.754134,0.750288,0.754078,0.018214,10
2,0.09185,0.003739,0.003542,0.004888,gbdt,20,32,"{'boosting_type': 'gbdt', 'n_estimators': 20, ...",0.86635,0.798105,0.869432,0.835054,0.84759,0.843306,0.025858,6
3,0.107184,0.008283,0.00214,0.002144,gbdt,20,64,"{'boosting_type': 'gbdt', 'n_estimators': 20, ...",0.866173,0.79805,0.871204,0.834461,0.846988,0.843375,0.026234,5
4,0.120712,0.01238,0.006521,0.006085,gbdt,30,32,"{'boosting_type': 'gbdt', 'n_estimators': 30, ...",0.873178,0.802397,0.884628,0.83947,0.858717,0.851678,0.028897,2
5,0.14713,0.002681,0.006216,0.006008,gbdt,30,64,"{'boosting_type': 'gbdt', 'n_estimators': 30, ...",0.872715,0.802236,0.886153,0.840729,0.858971,0.852161,0.029146,1
6,0.063065,0.009265,0.002783,0.001308,dart,10,32,"{'boosting_type': 'dart', 'n_estimators': 10, ...",0.738277,0.688701,0.719949,0.716108,0.709285,0.714464,0.016065,11
7,0.060223,0.003876,0.006992,0.005059,dart,10,64,"{'boosting_type': 'dart', 'n_estimators': 10, ...",0.7371,0.687212,0.720772,0.715369,0.70894,0.713879,0.016276,12
8,0.089928,0.002809,0.002767,0.003934,dart,20,32,"{'boosting_type': 'dart', 'n_estimators': 20, ...",0.856601,0.790371,0.855857,0.828315,0.835964,0.833422,0.024193,7
9,0.107065,0.005698,0.001503,0.002076,dart,20,64,"{'boosting_type': 'dart', 'n_estimators': 20, ...",0.85629,0.788501,0.855966,0.825668,0.836192,0.832523,0.024951,8


In [11]:
# Collect the five feature values for a single prediction from the user.
# Age, BMI and Children are read as numbers; the two indicator values must be 0 or 1.
Age_input=float(input("Age:"))
BMI_input=float(input("BMI:"))
Children_input=float(input("Children:"))
Sex_male_input=int(input("Sex Male 0 or 1:"))
Smoker_yes_input=int(input("Smoker Yes 0 or 1:"))

# The prompts ask for 0 or 1, but int() alone accepts any integer and the model would
# then be given an out-of-range indicator. Reject such values with ValueError, the same
# error that non-numeric input already raises.
for _flag_label, _flag_value in (("Sex Male", Sex_male_input), ("Smoker Yes", Smoker_yes_input)):
    if _flag_value not in (0, 1):
        raise ValueError("{} must be 0 or 1, got {}".format(_flag_label, _flag_value))

Age: 20
BMI: 20
Children: 2
Sex Male 0 or 1: 1
Smoker Yes 0 or 1: 1


In [12]:
# Predict the charges for the values entered above using the fitted grid search.
# The row is wrapped in a DataFrame that reuses the training column names, so the input
# carries the same feature names the model was fitted with. A bare nested list has no
# feature names, which can make scikit-learn warn at predict time.
new_record = pd.DataFrame(
    [[Age_input, BMI_input, Children_input, Sex_male_input, Smoker_yes_input]],
    columns=indep.columns,
)
Future_Prediction=grid.predict(new_record)
# Change the input values and re-run to explore other predictions.
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[15459.85601438]
