# Import Necessary Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

# modelling Libraries
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Getting the data

In [43]:
df = pd.read_csv('data/insurance.csv')

In [44]:
df.shape

(1338, 7)

In [45]:
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [48]:
df.region.unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [5]:
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [7]:
ordinalEncoder = OrdinalEncoder()

In [8]:
df[['region', 'smoker', 'sex']]

Unnamed: 0,region,smoker,sex
0,southwest,yes,female
1,southeast,no,male
2,southeast,no,male
3,northwest,no,male
4,northwest,no,male
...,...,...,...
1333,northwest,no,male
1334,northeast,no,female
1335,southeast,no,female
1336,southwest,no,female


In [9]:
df[['region', 'smoker', 'sex']] = ordinalEncoder.fit_transform(df[['region', 'smoker', 'sex']]).astype('int')

In [10]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,3,16884.92400
1,18,1,33.770,1,0,2,1725.55230
2,28,1,33.000,3,0,2,4449.46200
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830
1334,18,0,31.920,0,0,0,2205.98080
1335,18,0,36.850,0,0,2,1629.83350
1336,21,0,25.800,0,0,3,2007.94500


In [11]:
X = df.drop(['charges'], axis =1)
y = df.charges.copy()

In [12]:
df.corr()['charges'].sort_values(ascending = False)

charges     1.000000
smoker      0.787251
age         0.299008
bmi         0.198341
children    0.067998
sex         0.057292
region     -0.006208
Name: charges, dtype: float64

In [13]:
X.shape, y.shape

((1338, 6), (1338,))

In [14]:
train, test = train_test_split(df, test_size = 0.2, random_state = 42)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [16]:
std_scaler = StandardScaler()

In [17]:
std_scaler.fit(X_train)

In [None]:
pickle.dump(std_scaler, open('models/standard_scaler.pkl', 'wb'))

In [18]:
X_train = std_scaler.transform(X_train)
X_test  = std_scaler.transform(X_test)

In [19]:
lr = LinearRegression()
ridge = Ridge(random_state = 42)
lasso = Lasso(random_state = 42)
enet = ElasticNet(random_state = 42)
tree_reg = DecisionTreeRegressor(random_state = 42)
forest_reg = RandomForestRegressor(random_state = 42)

In [20]:
def get_scores(y_pred, y_test):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return {
        'mae': mae,
        'mse': np.sqrt(mse),
        'r2': r2
    }
def cross_val_scores(scores):
    s = np.sqrt(-scores)
    return{
        'scores': s,
        'Mean': s.mean(),
        'Standard Deviation': scores.std()
    }

In [21]:
lr_val_score = cross_val_score(lr, X_train, y_train, scoring = 'neg_mean_squared_error', cv =10)
print(cross_val_scores(lr_val_score))

{'scores': array([6060.98885372, 6574.75014965, 5219.82631416, 5999.01186446,
       5830.1772097 , 6113.31062453, 7273.89106275, 6312.44941225,
       6176.02374016, 5637.90480279]), 'Mean': 6119.833403416722, 'Standard Deviation': 6547170.862043856}


In [22]:
ridge_val_score = cross_val_score(ridge, X_train, y_train, scoring = 'neg_mean_squared_error', cv = 10)
print(cross_val_scores(ridge_val_score))

{'scores': array([6060.95530298, 6574.72492434, 5220.64575755, 5999.72717817,
       5831.95365003, 6110.85060545, 7271.94853908, 6311.88070832,
       6176.34961626, 5639.34161083]), 'Mean': 6119.837789302047, 'Standard Deviation': 6536313.383631487}


In [23]:
en_val_score = cross_val_score(enet, X_train, y_train, scoring = 'neg_mean_squared_error', cv = 10)
print(cross_val_scores(en_val_score))

{'scores': array([6940.04136497, 7365.96423669, 6593.26924313, 7369.33286496,
       7256.93517177, 6194.22083245, 7536.47055745, 7011.12953295,
       7260.0437437 , 6926.18122113]), 'Mean': 7045.358876918783, 'Standard Deviation': 5319599.693071428}


In [24]:
tree_val_score = cross_val_score(tree_reg, X_train, y_train, scoring = 'neg_mean_squared_error', cv = 5)
print(cross_val_scores(tree_val_score))

{'scores': array([6334.05623462, 6652.3474318 , 7026.51812857, 7346.78103044,
       5404.44926165]), 'Mean': 6552.830417415265, 'Standard Deviation': 8489926.471777828}


In [25]:
forest_val_score = cross_val_score(forest_reg, X_train, y_train, scoring = 'neg_mean_squared_error', cv = 5)
print(cross_val_scores(forest_val_score))

{'scores': array([5087.02595263, 4034.75609906, 5119.50476399, 5453.41655098,
       4942.89382775]), 'Mean': 4927.519438880597, 'Standard Deviation': 4469007.7878497075}


In [27]:
param_grid = [
{'n_estimators': [3, 10, 30, 40, 45, 50, 55, 57, 60, 62, 65, 70, 72, 75, 80], 'max_features': [2, 3, 4, 6, 8, 10, 15, 20, 22, 25]},
{'bootstrap': [False], 'n_estimators': [3, 10, 30, 40, 45, 50, 55, 57, 60, 62, 65, 70, 72, 75, 80], 'max_features': [2, 3, 4, 6, 8, 10, 15, 20, 22, 25]},
]

In [28]:
forest_grid_search = GridSearchCV(forest_reg, param_grid, cv = 5, scoring = 'neg_mean_squared_error', return_train_score = True)

In [29]:
forest_grid_search.fit(X_train, y_train)

In [30]:
forest_grid_search.best_estimator_

In [31]:
forest_grid_search.best_params_

{'max_features': 4, 'n_estimators': 80}

In [32]:
feature_importances = forest_grid_search.best_estimator_.feature_importances_

In [33]:
feature_importances

array([0.13699331, 0.00769461, 0.19730436, 0.02034821, 0.62044744,
       0.01721207])

In [34]:
columns = df.drop('charges', axis = 1).columns
columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region'], dtype='object')

In [35]:
sorted(zip(feature_importances, columns), reverse = True)

[(0.6204474409329407, 'smoker'),
 (0.19730435829580095, 'bmi'),
 (0.1369933140295561, 'age'),
 (0.02034820795169878, 'children'),
 (0.017212072629301065, 'region'),
 (0.007694606160702461, 'sex')]

In [36]:
final_model = forest_grid_search.best_estimator_

final_predictions = final_model.predict(X_test)
final_mse = get_scores(final_predictions, y_test)
final_mse

{'mae': 2453.384195658549, 'mse': 4437.615040238891, 'r2': 0.873155631893856}

In [37]:
# Save Model

filename = 'models/insurance_forest2_model.pkl'
pkl = pickle.dump(final_model, open(filename, 'wb'))

In [38]:
filename = 'models/insurance_forest2_model.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

In [39]:
loaded_model

In [40]:
# Load Model And Checkout

loaded_model = pickle.load(open(filename, 'rb'))
loaded_model_pred = loaded_model.predict(X_test)
get_scores(loaded_model_pred, y_test)

{'mae': 2453.384195658549, 'mse': 4437.615040238891, 'r2': 0.873155631893856}

In [None]:
# test.to_csv('data/test_data.csv')
# train.to_csv('data/train_data.csv')