In [2]:
import pandas as pd
from numpy import math
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pickle

In [144]:
df=pd.read_csv('insurance.csv')

In [145]:
df.head(5)

Unnamed: 0,age,sex,New_Sex,bmi,children,smoker,New_Smoker,region,New_Region,charges
0,19,female,0,27.9,0,yes,1,southwest,3,16884.924
1,18,male,1,33.77,1,no,0,southeast,2,1725.5523
2,28,male,1,33.0,3,no,0,southeast,2,4449.462
3,33,male,1,22.705,0,no,0,northwest,1,21984.47061
4,32,male,1,28.88,0,no,0,northwest,1,3866.8552


In [146]:
# While in excel,I already encoded sex,smoker and region to New_sex,New_smoker and New_Region respectively 
# so that computer  can understand for machine learning task in regression

In [147]:
# So drop uneccessary columns
df=df.drop(['sex','smoker','region'],axis=1)

In [148]:
df.shape

(1338, 7)

In [149]:
df

Unnamed: 0,age,New_Sex,bmi,children,New_Smoker,New_Region,charges
0,19,0,27.900,0,1,3,16884.92400
1,18,1,33.770,1,0,2,1725.55230
2,28,1,33.000,3,0,2,4449.46200
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830
1334,18,0,31.920,0,0,0,2205.98080
1335,18,0,36.850,0,0,2,1629.83350
1336,21,0,25.800,0,0,3,2007.94500


### Data Exploration

### Find correlation between variables


In [115]:
df.corr()

Unnamed: 0,age,New_Sex,bmi,children,New_Smoker,New_Region,charges
age,1.0,-0.020856,0.109272,0.042469,-0.025019,0.002127,0.299008
New_Sex,-0.020856,1.0,0.046371,0.017163,0.076185,0.004588,0.057292
bmi,0.109272,0.046371,1.0,0.012759,0.00375,0.157566,0.198341
children,0.042469,0.017163,0.012759,1.0,0.007673,0.016569,0.067998
New_Smoker,-0.025019,0.076185,0.00375,0.007673,1.0,-0.002181,0.787251
New_Region,0.002127,0.004588,0.157566,0.016569,-0.002181,1.0,-0.006208
charges,0.299008,0.057292,0.198341,0.067998,0.787251,-0.006208,1.0


In [104]:
y=df.iloc[:,6]
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [105]:
X=df.iloc[:,0:6]
X

Unnamed: 0,age,New_Sex,bmi,children,New_Smoker,New_Region
0,19,0,27.900,0,1,3
1,18,1,33.770,1,0,2
2,28,1,33.000,3,0,2
3,33,1,22.705,0,0,1
4,32,1,28.880,0,0,1
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1
1334,18,0,31.920,0,0,0
1335,18,0,36.850,0,0,2
1336,21,0,25.800,0,0,3


In [106]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [70]:
# Now lets normalize the datasets so as to enable all features have equal chance of contribution
# So scale X_train and X_test
scaling=StandardScaler()
X_train=scaling.fit_transform(X_train)
X_test=scaling.fit_transform(X_test)

In [71]:
X_train

array([[ 1.54446486, -1.02597835,  0.10318223, -0.91501097, -0.51298918,
        -0.42907066],
       [ 0.48187425,  0.97467943, -0.4908445 , -0.91501097, -0.51298918,
         1.36784271],
       [ 1.04858924, -1.02597835,  0.2266597 ,  1.56027883, -0.51298918,
         0.46938603],
       ...,
       [ 1.33194673,  0.97467943, -0.8954835 , -0.91501097, -0.51298918,
        -1.32752735],
       [-0.15568012, -1.02597835,  2.84638435,  0.73518223,  1.94935887,
         1.36784271],
       [ 1.11942861,  0.97467943, -0.10706319, -0.91501097, -0.51298918,
         1.36784271]])

In [34]:
# # After scaling transform back to a dataframe so that the indexes and features names can be retained
# scaled_features_df = pd.DataFrame(scaled_features, index=X.index, columns=X.columns)

In [72]:
# X=scaled_features_df
# X

In [73]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(936, 6) (936,)
(402, 6) (402,)


In [75]:
model = LinearRegression()  # define the linear regression model
model.fit(X_train, y_train)  # fit the data

LinearRegression()

In [76]:
print('The weight vector is:', model.coef_)
print()
print('The bias is:', model.intercept_)

The weight vector is: [3693.22419232   54.80557279 2064.85587799  514.32785602 9592.79601818
 -363.35933577]

The bias is: 13379.157306773504


In [77]:
y_pred = model.predict(X_test) # make predictions for the test data
print(y_pred)  # it returns us a `np.ndarray` object
print(y_pred.shape)

[ 9.17802505e+03  7.16305826e+03  3.77321932e+04  9.83623445e+03
  2.75099058e+04  1.07080009e+04  1.86393349e+02  1.73012249e+04
  1.21139960e+03  1.15968890e+04  2.86059370e+04  9.65551478e+03
  5.25566409e+03  3.90657608e+04  4.10260085e+04  3.77848973e+04
  1.52884276e+04  3.64876258e+04  9.27714747e+03  3.21429418e+04
  3.83654322e+03  1.01710828e+04  2.19877622e+03  6.97056930e+03
  1.15671685e+04  1.31596444e+04  1.45370625e+04  6.19052885e+03
  1.01759106e+04  2.18439855e+03  9.05060452e+03  1.33420390e+04
  4.41860310e+03  3.57477033e+03  4.36407552e+03  1.32017754e+04
  1.74746466e+03  8.74639049e+03  3.40711254e+04  3.32023583e+04
  3.66555274e+03  4.41397674e+03  1.41340507e+04  1.19044216e+04
  8.87542656e+03  1.23303850e+04  5.21600171e+03  2.99177914e+03
  3.61828234e+04  9.20565746e+03  1.60643152e+04  2.36572770e+03
  1.23851880e+04  1.54606678e+03  1.36999103e+04  1.28842468e+04
  4.52265595e+03  3.28094723e+04  1.33513131e+04  1.30797199e+04
  1.42573513e+04  1.03752

In [84]:
# Compute mse
math.sqrt(mean_squared_error(y_test,y_pred))

5815.818415654073

In [85]:
r2_score(y_test,y_pred)

0.7693169411699159

### below try to predict the price by specifying 6 features of the independent variables

In [95]:
print(model.predict([[5,6,27,60,10,5]]))

[212896.05527491]


### Second model

In [None]:
df=pd.read_csv('insurance.csv')

In [150]:
df.corr()

Unnamed: 0,age,New_Sex,bmi,children,New_Smoker,New_Region,charges
age,1.0,-0.020856,0.109272,0.042469,-0.025019,0.002127,0.299008
New_Sex,-0.020856,1.0,0.046371,0.017163,0.076185,0.004588,0.057292
bmi,0.109272,0.046371,1.0,0.012759,0.00375,0.157566,0.198341
children,0.042469,0.017163,0.012759,1.0,0.007673,0.016569,0.067998
New_Smoker,-0.025019,0.076185,0.00375,0.007673,1.0,-0.002181,0.787251
New_Region,0.002127,0.004588,0.157566,0.016569,-0.002181,1.0,-0.006208
charges,0.299008,0.057292,0.198341,0.067998,0.787251,-0.006208,1.0


So from above we can observe that there is a strong linear correlation between the charges(target) and New_smoker

In [151]:
y=df.iloc[:,6]
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [152]:
X=df.iloc[:,4]
X

0       1
1       0
2       0
3       0
4       0
       ..
1333    0
1334    0
1335    0
1336    0
1337    1
Name: New_Smoker, Length: 1338, dtype: int64

### Lets reshape X and y 


We have to reshape X and y because they are both single arrays

In [153]:
X=X.values.reshape(-1,1)
y=y.values.reshape(-1,1)

In [154]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [155]:
# Now lets normalize the datasets so as to enable all features have equal chance of contribution
# So scale X_train and X_test
scaling=StandardScaler()
X_train=scaling.fit_transform(X_train)
X_test=scaling.fit_transform(X_test)

In [156]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(936, 1) (936, 1)
(402, 1) (402, 1)


In [157]:
model = LinearRegression()  # define the linear regression model
model.fit(X_train, y_train)  # fit the data

LinearRegression()

In [158]:
print('The weight vector is:', model.coef_)
print()
print('The bias is:', model.intercept_)

The weight vector is: [[9427.649439]]

The bias is: [13379.15730677]


In [159]:
y_pred = model.predict(X_test) # make predictions for the test data
print(y_pred)  # it returns us a `np.ndarray` object
print(y_pred.shape)

[[ 8716.69096492]
 [ 8716.69096492]
 [32442.15260323]
 [ 8716.69096492]
 [32442.15260323]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [32442.15260323]
 [ 8716.69096492]
 [ 8716.69096492]
 [32442.15260323]
 [32442.15260323]
 [32442.15260323]
 [ 8716.69096492]
 [32442.15260323]
 [ 8716.69096492]
 [32442.15260323]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [32442.15260323]
 [32442.15260323]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [32442.15260323]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69096492]
 [ 8716.69

In [160]:
# Compute mse
math.sqrt(mean_squared_error(y_test,y_pred))

7169.787786759297

In [161]:
r2_score(y_test,y_pred)

0.6494042655991139

---

### Third model

In [29]:
df=pd.read_csv('insurance.csv')

In [30]:
df.corr()

Unnamed: 0,age,New_Sex,bmi,children,New_Smoker,New_Region,charges
age,1.0,-0.020856,0.109272,0.042469,-0.025019,0.002127,0.299008
New_Sex,-0.020856,1.0,0.046371,0.017163,0.076185,0.004588,0.057292
bmi,0.109272,0.046371,1.0,0.012759,0.00375,0.157566,0.198341
children,0.042469,0.017163,0.012759,1.0,0.007673,0.016569,0.067998
New_Smoker,-0.025019,0.076185,0.00375,0.007673,1.0,-0.002181,0.787251
New_Region,0.002127,0.004588,0.157566,0.016569,-0.002181,1.0,-0.006208
charges,0.299008,0.057292,0.198341,0.067998,0.787251,-0.006208,1.0


In [31]:
#y=df.iloc[:,6]
y=df.loc[:,'charges']
y


0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [32]:
X=df.loc[:,['bmi','New_Smoker']]
X

Unnamed: 0,bmi,New_Smoker
0,27.900,1
1,33.770,0
2,33.000,0
3,22.705,0
4,28.880,0
...,...,...
1333,30.970,0
1334,31.920,0
1335,36.850,0
1336,25.800,0


In [39]:
#Merge and X and y
df = pd.concat([X, y], axis=1)
# Next save as a csv
df.to_csv('insurance_1.csv')

In [38]:
df.head(2)

Unnamed: 0,bmi,New_Smoker,charges
0,27.9,1,16884.924
1,33.77,0,1725.5523


In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
# Now lets normalize the datasets so as to enable all features have equal chance of contribution
# So scale X_train and X_test
scaling=StandardScaler()
X_train=scaling.fit_transform(X_train)
X_test=scaling.fit_transform(X_test)

In [9]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(936, 2) (936,)
(402, 2) (402,)


In [10]:
regressor = LinearRegression()  # define the linear regression model
regressor.fit(X_train, y_train)  # fit the data

LinearRegression()

In [11]:
print('The weight vector is:', regressor.coef_)
print()
print('The bias is:', regressor.intercept_)

The weight vector is: [5.80620397e-17 4.06116431e-01]

The bias is: 0.20833333333333334


In [12]:
y_pred = regressor.predict(X_test) # make predictions for the test data
print(y_pred)  # it returns us a `np.ndarray` object
print(y_pred.shape)

[0.00748749 0.00748749 1.02951317 0.00748749 1.02951317 0.00748749
 0.00748749 0.00748749 0.00748749 0.00748749 1.02951317 0.00748749
 0.00748749 1.02951317 1.02951317 1.02951317 0.00748749 1.02951317
 0.00748749 1.02951317 0.00748749 0.00748749 0.00748749 0.00748749
 0.00748749 0.00748749 0.00748749 0.00748749 0.00748749 0.00748749
 0.00748749 0.00748749 0.00748749 0.00748749 0.00748749 0.00748749
 0.00748749 0.00748749 1.02951317 1.02951317 0.00748749 0.00748749
 0.00748749 0.00748749 0.00748749 0.00748749 0.00748749 0.00748749
 1.02951317 0.00748749 0.00748749 0.00748749 0.00748749 0.00748749
 0.00748749 0.00748749 0.00748749 1.02951317 0.00748749 0.00748749
 0.00748749 0.00748749 0.00748749 0.00748749 0.00748749 0.00748749
 1.02951317 0.00748749 0.00748749 0.00748749 0.00748749 0.00748749
 0.00748749 0.00748749 0.00748749 0.00748749 0.00748749 0.00748749
 0.00748749 0.00748749 0.00748749 1.02951317 0.00748749 1.02951317
 1.02951317 1.02951317 0.00748749 0.00748749 0.00748749 0.0074

In [13]:
# Compute mse
math.sqrt(mean_squared_error(y_test,y_pred))

0.014704327956826755

In [14]:
r2_score(y_test,y_pred)

0.9986306550851586

---

### Define the pickle

In [88]:
pickle.dump(regressor,open('model.pkl','wb'))

In [90]:
model=pickle.load(open('model.pkl','rb'))