### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Importing the Dataset

In [2]:
dataset = pd.read_csv('insurance.csv')
print(dataset)
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

      age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3     no  southeast   4449.46200
3      33    male  22.705         0     no  northwest  21984.47061
4      32    male  28.880         0     no  northwest   3866.85520
...   ...     ...     ...       ...    ...        ...          ...
1333   50    male  30.970         3     no  northwest  10600.54830
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[1338 rows x 7 columns]


In [3]:
print(x)

[[19 'female' 27.9 0 'yes' 'southwest']
 [18 'male' 33.77 1 'no' 'southeast']
 [28 'male' 33.0 3 'no' 'southeast']
 ...
 [18 'female' 36.85 0 'no' 'southeast']
 [21 'female' 25.8 0 'no' 'southwest']
 [61 'female' 29.07 0 'yes' 'northwest']]


In [4]:
print(y)

[16884.924   1725.5523  4449.462  ...  1629.8335  2007.945  29141.3603]


### Creating dummy variables

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(), [1,4,5])],remainder = 'passthrough')
#creating the object for ColumnTransformer with OneHotEncoder for 2nd,4th and 5th column
x = np.array(ct.fit_transform(x))
# fitting and transform the initial data set and then converting it into a numpy array.

In [6]:
print(x)

[[1.0 0.0 0.0 ... 19 27.9 0]
 [0.0 1.0 1.0 ... 18 33.77 1]
 [0.0 1.0 1.0 ... 28 33.0 3]
 ...
 [1.0 0.0 1.0 ... 18 36.85 0]
 [1.0 0.0 1.0 ... 21 25.8 0]
 [1.0 0.0 0.0 ... 61 29.07 0]]


In [7]:
# avoiding dummy variable trap
x = x[:,1:]
print(x)
x = np.append(arr = np.ones((1338,1)).astype(int), values = x, axis = 1)
print('x after adding the intercept')
print(x)

[[0.0 0.0 1.0 ... 19 27.9 0]
 [1.0 1.0 0.0 ... 18 33.77 1]
 [1.0 1.0 0.0 ... 28 33.0 3]
 ...
 [0.0 1.0 0.0 ... 18 36.85 0]
 [0.0 1.0 0.0 ... 21 25.8 0]
 [0.0 0.0 1.0 ... 61 29.07 0]]
x after adding the intercept
[[1 0.0 0.0 ... 19 27.9 0]
 [1 1.0 1.0 ... 18 33.77 1]
 [1 1.0 1.0 ... 28 33.0 3]
 ...
 [1 0.0 1.0 ... 18 36.85 0]
 [1 0.0 1.0 ... 21 25.8 0]
 [1 0.0 0.0 ... 61 29.07 0]]


### Splitting the dataset inot Train set and Test set

In [8]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.25 , random_state = 0)

### Applying the PCA model

In [9]:
from sklearn.decomposition import PCA
pca = PCA(n_components= 5) # since we do not know how many eigenvectors are need we keep the value of n_components = None 
# so that we can the eignvalues of all the evectors to figure out the best ones
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
explained_variance = pca.explained_variance_ratio_
print(explained_variance)
# after all the evalues are obtained select the number of evectors an replece 
# the value of n_components by that number

[0.82858134 0.15993773 0.00607105 0.00141108 0.0010495 ]


### Using PCA creating MLP

In [10]:
from sklearn.linear_model import LinearRegression
lR = LinearRegression()
lR.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
y_predi = lR.predict(x_test)
np.set_printoptions(precision = 2)
print(np.concatenate((y_predi.reshape(len(y_predi),1),y_test.reshape(len(y_test),1)),1))

[[ 1.36e+04  9.72e+03]
 [ 1.10e+04  8.55e+03]
 [ 3.43e+04  4.57e+04]
 [ 1.53e+04  1.30e+04]
 [ 4.20e+03  9.64e+03]
 [ 7.28e+03  4.50e+03]
 [ 3.96e+02  2.20e+03]
 [ 9.75e+03  1.14e+04]
 [ 5.06e+03  7.54e+03]
 [ 7.43e+03  5.43e+03]
 [ 4.18e+03  6.75e+03]
 [ 6.73e+03  1.05e+04]
 [ 7.34e+03  7.34e+03]
 [ 5.84e+03  4.19e+03]
 [ 2.53e+04  1.83e+04]
 [ 1.24e+04  1.07e+04]
 [ 1.19e+04  1.25e+04]
 [ 8.24e+03  3.49e+03]
 [ 8.33e+03  6.46e+03]
 [ 2.67e+04  3.35e+04]
 [ 3.15e+04  2.40e+04]
 [ 9.97e+03  1.26e+04]
 [ 1.00e+04  2.30e+04]
 [ 3.11e+04  2.31e+04]
 [ 9.16e+03  1.67e+03]
 [ 1.09e+04  4.67e+03]
 [ 8.78e+02  3.73e+03]
 [ 1.10e+04  7.68e+03]
 [ 5.66e+03  3.76e+03]
 [ 1.04e+04  8.41e+03]
 [ 5.22e+03  8.06e+03]
 [ 4.15e+04  4.90e+04]
 [ 1.73e+04  1.30e+04]
 [ 1.56e+04  2.06e+04]
 [ 2.15e+04  1.46e+04]
 [ 3.80e+03  4.14e+03]
 [ 1.72e+04  8.35e+03]
 [ 2.98e+04  5.12e+04]
 [ 2.66e+04  4.00e+04]
 [ 1.83e+03  1.88e+03]
 [ 3.49e+03  5.46e+03]
 [ 9.61e+03  2.87e+03]
 [ 3.06e+04  2.01e+04]
 [ 3.60e+04

In [12]:
# errors and accuracy
import sklearn.metrics as met
mse= met.mean_squared_error(y_test, y_predi)
print('MSE : ', mse)
rmse = np.sqrt(mse)
print('RMSE : ', rmse)
r2_c = met.r2_score(y_test, y_predi)
print('R-squarescore : ', r2_c)
mae = met.mean_absolute_error(y_test,y_predi)
print('MAE :', mae)

MSE :  41056008.41921837
RMSE :  6407.496267593011
R-squarescore :  0.7392184998608806
MAE : 4945.712958747364
