# Import Necessary Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

# modelling Libraries
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Getting the data

In [5]:
df = pd.read_csv('data/insurance.csv')

In [6]:
df.shape

(1338, 7)

In [7]:
df.sample(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
523,38,female,37.73,0,no,southeast,5397.6167
11,62,female,26.29,0,yes,southeast,27808.7251
737,26,male,23.7,2,no,southwest,3484.331
461,42,male,30.0,0,yes,southwest,22144.032
1241,64,male,36.96,2,yes,southeast,49577.6624


In [8]:
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [14]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [17]:
ordinalEncoder = OrdinalEncoder()

In [22]:
region = np.array(df.region).reshape(df.shape[0], 1)

In [24]:
region

array([['southwest'],
       ['southeast'],
       ['southeast'],
       ...,
       ['southeast'],
       ['southwest'],
       ['northwest']], dtype=object)

In [30]:
df.region = ordinalEncoder.fit_transform(region).astype('int')

In [27]:
df = pd.get_dummies(df, columns = ['sex', 'smoker'])

In [31]:
df.sample()

Unnamed: 0,age,bmi,children,region,charges,sex_female,sex_male,smoker_no,smoker_yes
3,33,22.705,0,1,21984.47061,False,True,True,False


In [33]:
X = df.drop(['charges'], axis =1)
y = df.charges.copy()

In [92]:
df.corr()['charges'].sort_values(ascending = False)

charges       1.000000
smoker_yes    0.787251
age           0.299008
bmi           0.198341
children      0.067998
sex_male      0.057292
region       -0.006208
sex_female   -0.057292
smoker_no    -0.787251
Name: charges, dtype: float64

In [34]:
X.shape, y.shape

((1338, 8), (1338,))

In [104]:
train, test = train_test_split(df, test_size = 0.2, random_state = 42)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [40]:
std_scaler = StandardScaler()

In [41]:
std_scaler.fit(X_train)

In [42]:
X_train = std_scaler.transform(X_train)
X_test  = std_scaler.transform(X_test)

In [87]:
lr = LinearRegression()
ridge = Ridge(random_state = 42)
lasso = Lasso(random_state = 42)
enet = ElasticNet(random_state = 42)
tree_reg = DecisionTreeRegressor(random_state = 42)
forest_reg = RandomForestRegressor(random_state = 42)

In [45]:
def get_scores(y_pred, y_test):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return {
        'mae': mae,
        'mse': np.sqrt(mse),
        'r2': r2
    }
def cross_val_scores(scores):
    s = np.sqrt(-scores)
    return{
        'scores': s,
        'Mean': s.mean(),
        'Standard Deviation': scores.std()
    }

In [54]:
lr_val_score = cross_val_score(lr, X_train, y_train, scoring = 'neg_mean_squared_error', cv =10)
print(cross_val_scores(lr_val_score))

{'scores': array([6060.98885372, 6575.45295959, 5228.79989368, 5999.01186446,
       5827.68886011, 6099.42562673, 7273.89106275, 6312.44941225,
       6176.02374016, 5637.90480279]), 'Mean': 6119.163707623453, 'Standard Deviation': 6535785.280908808}


In [55]:
ridge_val_score = cross_val_score(ridge, X_train, y_train, scoring = 'neg_mean_squared_error', cv = 10)
print(cross_val_scores(ridge_val_score))

{'scores': array([6060.80647195, 6574.56217457, 5220.49852613, 5999.62082842,
       5831.12680406, 6111.90480944, 7272.97981763, 6312.02152795,
       6176.09086554, 5638.68673895]), 'Mean': 6119.829856464383, 'Standard Deviation': 6541099.842795581}


In [72]:
en_val_score = cross_val_score(enet, X_train, y_train, scoring = 'neg_mean_squared_error', cv = 10)
print(cross_val_scores(en_val_score))

{'scores': array([6444.50400723, 6911.30576456, 6000.90939071, 6713.16578406,
       6636.93761517, 5987.98327294, 7308.39881248, 6584.32339807,
       6704.33008843, 6375.36799317]), 'Mean': 6566.722612683897, 'Standard Deviation': 4973126.994739247}


In [67]:
tree_val_score = cross_val_score(tree_reg, X_train, y_train, scoring = 'neg_mean_squared_error', cv = 5)
print(cross_val_scores(tree_val_score))

{'scores': array([6278.93632135, 6602.71263542, 7162.34237542, 7356.89711968,
       5433.38625394]), 'Mean': 6566.854941160331, 'Standard Deviation': 8782472.070020402}


In [66]:
forest_val_score = cross_val_score(forest_reg, X_train, y_train, scoring = 'neg_mean_squared_error', cv = 5)
print(cross_val_scores(forest_val_score))

{'scores': array([5009.92440804, 3912.83829057, 5155.36415251, 5522.13448542,
       5016.60115366]), 'Mean': 4923.37249804002, 'Standard Deviation': 5009873.163249708}


In [None]:
ss

In [61]:
param_grid = [
{'n_estimators': [3, 10, 30, 40, 45, 50, 55, 57, 60, 62, 65, 70, 72, 75, 80], 'max_features': [2, 3, 4, 6, 8, 10, 15, 20, 22, 25]},
{'bootstrap': [False], 'n_estimators': [3, 10, 30, 40, 45, 50, 55, 57, 60, 62, 65, 70, 72, 75, 80], 'max_features': [2, 3, 4, 6, 8, 10, 15, 20, 22, 25]},
]

In [73]:
forest_grid_search = GridSearchCV(forest_reg, param_grid, cv = 5, scoring = 'neg_mean_squared_error', return_train_score = True)

In [77]:
forest_grid_search.fit(X_train, y_train)

In [78]:
forest_grid_search.best_estimator_

In [79]:
forest_grid_search.best_params_

{'max_features': 4, 'n_estimators': 62}

In [80]:
feature_importances = forest_grid_search.best_estimator_.feature_importances_

In [81]:
feature_importances

array([0.14277415, 0.18887781, 0.02258926, 0.01912117, 0.00481375,
       0.00504531, 0.3442906 , 0.27248795])

In [84]:
columns = df.drop('charges', axis = 1).columns
columns

Index(['age', 'bmi', 'children', 'region', 'sex_female', 'sex_male',
       'smoker_no', 'smoker_yes'],
      dtype='object')

In [85]:
sorted(zip(feature_importances, columns), reverse = True)

[(0.3442906015841808, 'smoker_no'),
 (0.27248795210821325, 'smoker_yes'),
 (0.18887780972871557, 'bmi'),
 (0.14277414731552493, 'age'),
 (0.022589259794466198, 'children'),
 (0.01912116885235967, 'region'),
 (0.005045309076506776, 'sex_male'),
 (0.00481375154003273, 'sex_female')]

In [86]:
final_model = forest_grid_search.best_estimator_

final_predictions = final_model.predict(X_test)
final_mse = get_scores(final_predictions, y_test)
final_mse

{'mae': 2453.7426478522507, 'mse': 4531.27600756696, 'r2': 0.8677447341472161}

In [100]:
# Save Model

filename = 'models/insurance_forest_model.sav'
pkl = pickle.dump(final_model, open(filename, 'wb'))

In [3]:
filename = 'models/insurance_forest_model.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

In [4]:
loaded_model

In [101]:
# Load Model And Checkout

loaded_model = pickle.load(open(filename, 'rb'))
loaded_model_pred = loaded_model.predict(X_test)
get_scores(loaded_model_pred, y_test)

{'mae': 2453.7426478522507, 'mse': 4531.27600756696, 'r2': 0.8677447341472161}

In [105]:
# test.to_csv('data/test_data.csv')
# train.to_csv('data/train_data.csv')