In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset = pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
dataset = pd.get_dummies(dataset,drop_first = True)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [4]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [5]:
independent = dataset[['age', 'bmi', 'children','sex_male', 'smoker_yes']]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [6]:
dependent = dataset[['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [7]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(independent,dependent,test_size=0.30,random_state=0) 

In [8]:
x_train

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
1163,18,28.215,0,0,0
196,39,32.800,0,0,0
438,52,46.750,5,0,0
183,44,26.410,0,0,0
1298,33,27.455,2,1,0
...,...,...,...,...,...
763,27,26.030,0,1,0
835,42,35.970,2,1,0
1216,40,25.080,0,1,0
559,19,35.530,0,1,0


In [9]:
# Model creation / learning phase 
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
param_grid = {'criterion' :["mse","mae","friedman_mse"],'max_features':["auto", "sqrt", "log2"],
                         'splitter':["best", "random"]}
grid = GridSearchCV(DecisionTreeRegressor(),param_grid, refit=True, verbose=3, n_jobs=1)
grid.fit(x_train,y_train) 

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END criterion=mse, max_features=auto, splitter=best;, score=0.729 total time=   0.0s
[CV 2/5] END criterion=mse, max_features=auto, splitter=best;, score=0.558 total time=   0.0s
[CV 3/5] END criterion=mse, max_features=auto, splitter=best;, score=0.763 total time=   0.0s
[CV 4/5] END criterion=mse, max_features=auto, splitter=best;, score=0.600 total time=   0.0s
[CV 5/5] END criterion=mse, max_features=auto, splitter=best;, score=0.649 total time=   0.0s
[CV 1/5] END criterion=mse, max_features=auto, splitter=random;, score=0.700 total time=   0.0s
[CV 2/5] END criterion=mse, max_features=auto, splitter=random;, score=0.677 total time=   0.0s
[CV 3/5] END criterion=mse, max_features=auto, splitter=random;, score=0.624 total time=   0.0s
[CV 4/5] END criterion=mse, max_features=auto, splitter=random;, score=0.619 total time=   0.0s
[CV 5/5] END criterion=mse, max_features=auto, splitter=random;, score=0.624 total ti

GridSearchCV(estimator=DecisionTreeRegressor(), n_jobs=1,
             param_grid={'criterion': ['mse', 'mae', 'friedman_mse'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['best', 'random']},
             verbose=3)

In [10]:
# print best parameter after tuning
# print(grid.best_params_)
re = grid.cv_results_
grid_prediction = grid.predict(x_test)
# print classification report
from sklearn.metrics import r2_score
r_score = r2_score(y_test,grid_prediction)
print("The R Score Value for Best Parameter {}:".format(grid.best_params_),r_score)


The R Score Value for Best Parameter {'criterion': 'mae', 'max_features': 'auto', 'splitter': 'random'}: 0.7740393410944832


In [11]:
table = pd.DataFrame.from_dict(re)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002802,0.001918,0.0014,0.001199,mse,auto,best,"{'criterion': 'mse', 'max_features': 'auto', '...",0.728697,0.55752,0.762642,0.600126,0.649429,0.659683,0.07675,3
1,0.00313,0.006259,0.003124,0.006249,mse,auto,random,"{'criterion': 'mse', 'max_features': 'auto', '...",0.699508,0.676889,0.624456,0.618819,0.62417,0.648768,0.033041,8
2,0.006249,0.007653,0.0,0.0,mse,sqrt,best,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.666413,0.539461,0.554642,0.635112,0.622129,0.603551,0.048564,15
3,0.001596,0.001954,0.006639,0.006576,mse,sqrt,random,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.705767,0.572359,0.582612,0.583205,0.53548,0.595884,0.057643,16
4,0.003989,2e-06,0.001816,0.000358,mse,log2,best,"{'criterion': 'mse', 'max_features': 'log2', '...",0.714914,0.6576,0.613756,0.606598,0.69643,0.65786,0.043165,4
5,0.00313,0.006259,0.0,0.0,mse,log2,random,"{'criterion': 'mse', 'max_features': 'log2', '...",0.644652,0.59109,0.499754,0.497301,0.626475,0.571854,0.062305,18
6,0.008919,0.007575,0.00766,0.006511,mae,auto,best,"{'criterion': 'mae', 'max_features': 'auto', '...",0.755026,0.598074,0.607141,0.529692,0.658875,0.629762,0.074925,11
7,0.012502,0.006251,0.0,0.0,mae,auto,random,"{'criterion': 'mae', 'max_features': 'auto', '...",0.587752,0.70352,0.736255,0.65356,0.681024,0.672422,0.05028,1
8,0.007287,0.008595,0.003523,0.006101,mae,sqrt,best,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.699306,0.636453,0.675247,0.643216,0.627551,0.656355,0.026833,6
9,0.007925,0.001961,0.002312,0.000392,mae,sqrt,random,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.680517,0.549436,0.672094,0.675642,0.47032,0.609602,0.085223,14


In [12]:
age_input = float(input("Age :"))
bmi_input = float(input("BMI :"))
children_input = float(input("Children :"))
sex_male_input = int(input("Sex Male 0 or 1 :"))
smoker_yes_input = int(input("Smoker_yes 0 or 1 :"))

Age :25
BMI :30
Children :2
Sex Male 0 or 1 :1
Smoker_yes 0 or 1 :0


In [13]:
Future_Prediction = grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
print("Future_Prediction {}:".format(Future_Prediction))

Future_Prediction [3877.30425]:


In [18]:
#import pickle
#filename = "DT_GRID.sav" 

In [19]:
#pickle.dump(grid,open(filename,"wb"))