In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset = pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
dataset = pd.get_dummies(dataset,drop_first = True)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [4]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [5]:
independent = dataset[['age', 'bmi', 'children','sex_male', 'smoker_yes']]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [6]:
dependent = dataset[['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [7]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(independent,dependent,test_size=0.30,random_state=0) 

In [8]:
x_train

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
1163,18,28.215,0,0,0
196,39,32.800,0,0,0
438,52,46.750,5,0,0
183,44,26.410,0,0,0
1298,33,27.455,2,1,0
...,...,...,...,...,...
763,27,26.030,0,1,0
835,42,35.970,2,1,0
1216,40,25.080,0,1,0
559,19,35.530,0,1,0


In [9]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train =sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [10]:
# Model creation / learning phase 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
param_grid = {'criterion' :["squared_error","mae"],'max_features':["auto", "sqrt", "log2"],
                         'n_estimators':[10,100]}
grid = GridSearchCV(RandomForestRegressor(),param_grid, refit=True, verbose=3, n_jobs=1)
grid.fit(x_train,y_train) 

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END criterion=squared_error, max_features=auto, n_estimators=10;, score=0.856 total time=   0.0s
[CV 2/5] END criterion=squared_error, max_features=auto, n_estimators=10;, score=0.764 total time=   0.0s
[CV 3/5] END criterion=squared_error, max_features=auto, n_estimators=10;, score=0.789 total time=   0.0s
[CV 4/5] END criterion=squared_error, max_features=auto, n_estimators=10;, score=0.787 total time=   0.0s
[CV 5/5] END criterion=squared_error, max_features=auto, n_estimators=10;, score=0.753 total time=   0.0s
[CV 1/5] END criterion=squared_error, max_features=auto, n_estimators=100;, score=0.861 total time=   0.0s
[CV 2/5] END criterion=squared_error, max_features=auto, n_estimators=100;, score=0.767 total time=   0.0s
[CV 3/5] END criterion=squared_error, max_features=auto, n_estimators=100;, score=0.813 total time=   0.0s
[CV 4/5] END criterion=squared_error, max_features=auto, n_estimators=100;, score=0.802 

GridSearchCV(estimator=RandomForestRegressor(), n_jobs=1,
             param_grid={'criterion': ['squared_error', 'mae'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [10, 100]},
             verbose=3)

In [11]:
# print best parameter after tuning
# print(grid.best_params_)
re = grid.cv_results_
grid_prediction = grid.predict(x_test)
# print classification report
from sklearn.metrics import r2_score
r_score = r2_score(y_test,grid_prediction)
print("The R Score Value for Best Parameter {}:".format(grid.best_params_),r_score)


The R Score Value for Best Parameter {'criterion': 'squared_error', 'max_features': 'sqrt', 'n_estimators': 100}: 0.8725245594622375


In [12]:
table = pd.DataFrame.from_dict(re)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.013308,0.004627,0.0,0.0,squared_error,auto,10,"{'criterion': 'squared_error', 'max_features':...",0.856103,0.764492,0.788784,0.787463,0.753004,0.789969,0.035768,11
1,0.117994,0.008032,0.008381,0.00948,squared_error,auto,100,"{'criterion': 'squared_error', 'max_features':...",0.860634,0.767049,0.813433,0.801721,0.766339,0.801835,0.034819,5
2,0.009786,0.007177,0.0,0.0,squared_error,sqrt,10,"{'criterion': 'squared_error', 'max_features':...",0.845308,0.781403,0.784011,0.808171,0.755378,0.794855,0.030265,10
3,0.092233,0.004417,0.001593,0.003187,squared_error,sqrt,100,"{'criterion': 'squared_error', 'max_features':...",0.867261,0.7959,0.809419,0.82435,0.769569,0.8133,0.032447,1
4,0.009379,0.007658,0.003123,0.006247,squared_error,log2,10,"{'criterion': 'squared_error', 'max_features':...",0.848714,0.760056,0.805577,0.797416,0.766781,0.795709,0.031679,9
5,0.094273,0.012511,0.007879,0.006959,squared_error,log2,100,"{'criterion': 'squared_error', 'max_features':...",0.863631,0.794782,0.809495,0.826301,0.771313,0.813104,0.031056,2
6,0.064128,0.002346,0.001589,0.000795,mae,auto,10,"{'criterion': 'mae', 'max_features': 'auto', '...",0.836671,0.744952,0.784487,0.789517,0.731975,0.77752,0.03695,12
7,0.598167,0.008287,0.006626,0.008137,mae,auto,100,"{'criterion': 'mae', 'max_features': 'auto', '...",0.850482,0.781001,0.795583,0.799173,0.762617,0.797771,0.029333,7
8,0.034371,0.006256,0.003124,0.006249,mae,sqrt,10,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.831927,0.766802,0.797445,0.821044,0.764104,0.796265,0.02753,8
9,0.361508,0.008834,0.007906,0.00494,mae,sqrt,100,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.854951,0.790592,0.808583,0.82629,0.767609,0.809605,0.029869,3


In [13]:
age_input = float(input("Age :"))
bmi_input = float(input("BMI :"))
children_input = float(input("Children :"))
sex_male_input = int(input("Sex Male 0 or 1 :"))
smoker_yes_input = int(input("Smoker_yes 0 or 1 :"))

Age :25
BMI :30
Children :2
Sex Male 0 or 1 :1
Smoker_yes 0 or 1 :0


In [14]:
Future_Prediction = grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
print("Future_Prediction {}:".format(Future_Prediction))

Future_Prediction [16459.0719802]:


In [15]:
#import pickle
#filename = "RF_GRID.sav" 

In [16]:
#pickle.dump(grid,open(filename,"wb"))