In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression,Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the insurance CSV into a DataFrame and preview the first rows.
# NOTE(review): absolute path (looks like a Colab location) — adjust when running elsewhere.
csv_path = '/content/insurance.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1338, 7)

In [4]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [5]:
# Three columns (sex, smoker, region) hold categorical strings; encode them as integers.
#
# Explicit lookup tables replace pd.get_dummies(..., drop_first=True):
#   * they produce the same codes (female=0/male=1, no=0/yes=1) as plain integers on
#     every pandas version (get_dummies returns booleans in pandas >= 2.0);
#   * they do not rely on assigning a one-column DataFrame to a single column.
#
# NOTE(review): `region` is a nominal category, so the integer codes below impose an
# arbitrary order on it for the linear models. One-hot encoding would avoid that but
# changes the number of features, so the original coding is kept here.
region = {
    'southwest' : 0, 'southeast' : 1, 'northwest' : 2, 'northeast' : 3
}
category_codes = {
    'sex': {'female': 0, 'male': 1},
    'smoker': {'no': 0, 'yes': 1},
    'region': region,
}

for column, codes in category_codes.items():
    if pd.api.types.is_numeric_dtype(df[column]):
        # Already encoded (the cell was re-run): mapping the integer codes through the
        # string-keyed table again would turn the whole column into NaN.
        continue
    encoded = df[column].map(codes)
    if encoded.isna().any():
        # Fail loudly instead of silently feeding NaN into the scaler and the models.
        unknown = df.loc[encoded.isna(), column].unique().tolist()
        raise ValueError('unexpected values in {!r}: {}'.format(column, unknown))
    df[column] = encoded

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,1,1725.5523
2,28,1,33.0,3,0,1,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552


In [6]:
# Regression target is `charges`; every other column is used as a feature.
y = df['charges']
x = df.drop(columns='charges')

In [7]:
# Standardise every feature column and rebuild a DataFrame with the original column names.
# NOTE(review): the scaler is fitted on all rows, before the train/test split in the next
# cell, so test-set statistics influence the scaling (mild data leakage). Fitting on the
# training rows only would be cleaner.
feature_names = x.columns
scaler = StandardScaler().fit(x)
scaling = scaler.transform(x)
x = pd.DataFrame(data=scaling, columns=feature_names)
x.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,-1.438764,-1.010519,-0.45332,-0.908614,1.970587,-1.343905
1,-1.509965,0.989591,0.509621,-0.078767,-0.507463,-0.438495
2,-0.797954,0.989591,0.383307,1.580926,-0.507463,-0.438495
3,-0.441948,0.989591,-1.305531,-0.908614,-0.507463,0.466915
4,-0.513149,0.989591,-0.292556,-0.908614,-0.507463,0.466915


In [8]:
# Keep 80% of the rows for training; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

**1) Apply gradient descent or regularization alongside regression and store the result**





In [9]:
# Lasso: linear regression with an L1 penalty, fitted on the standardised training split.
# alpha=1.0 is the default penalty strength, spelled out here for the reader.

model = Lasso(alpha=1.0)
model.fit(X=x_train, y=y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [10]:
# R^2 (coefficient of determination) of the Lasso model on the held-out test split.
# For a regressor, .score() reports R^2, not a classification accuracy.

lasso_r2 = model.score(x_test, y_test)
print(lasso_r2)

0.7833294586868254


**2) Apply a feature-space transformation (PCA/LDA) alongside regression and store the result**

In [11]:
# Shape of the standardised feature matrix before PCA, as (rows, features).

x.shape

(1338, 6)

In [12]:
# Project the standardised features onto their first two principal components
# and show the reduced shape.
# NOTE(review): this rebinds `x` from the scaled DataFrame to the reduced array, so the
# cell is not safe to re-run and the "before pca" view above goes stale. PCA is also
# fitted on all rows, before the second train/test split.

N_COMPONENTS = 2
pca = PCA(n_components=N_COMPONENTS)
x = pca.fit_transform(x)
print(x.shape)


(1338, 2)


In [13]:
# Same 80/20 split and seed as before, applied to the PCA-reduced features.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [14]:
# Lasso (default penalty strength alpha=1.0) fitted on the two PCA components.
model1 = Lasso(alpha=1.0)
model1.fit(X=x_train1, y=y_train1)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [15]:
# R^2 of the PCA + Lasso model on its held-out test split (not a classification accuracy).
pca_lasso_r2 = model1.score(x_test1, y_test1)
print(pca_lasso_r2)

0.2442346208497631


**3) Simple regression without any augmenting method and store the result**

In [16]:
# Plain least-squares linear regression as the baseline.
# It uses x_train from the split made before PCA, i.e. all six standardised features.

lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
# R^2 of the baseline linear regression on the held-out test split.
linear_r2 = lr.score(x_test, y_test)
print(linear_r2)

0.7833463107364536
