# Basic Analysis of Data

In [45]:
#Importing library
import pandas as pd
import numpy as np

In [2]:
#Loading data
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook only runs where this exact file location exists.
DATA_PATH = 'C:\\Users\\bhavi\\OneDrive\\Desktop\\Data\\insurance.csv'
data = pd.read_csv(DATA_PATH)

In [3]:
#Checking shape of the data
#The bare expression is the cell output: a (rows, columns) tuple
data.shape

(1338, 7)

In [4]:
#View the data
#head() with no argument shows the first five rows
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
#Checking correlation
# Pairwise correlation between the numeric columns only.
# The frame also holds string columns (sex, smoker, region); since pandas 2.0
# DataFrame.corr() no longer drops those silently and raises instead, so the
# numeric columns are selected explicitly. This works on old and new pandas.
data.select_dtypes(include='number').corr()

Unnamed: 0,age,bmi,children,charges
age,1.0,0.109272,0.042469,0.299008
bmi,0.109272,1.0,0.012759,0.198341
children,0.042469,0.012759,1.0,0.067998
charges,0.299008,0.198341,0.067998,1.0


# Checking NA 

In [6]:
#Checking missing values
#isnull() flags missing entries; sum() then counts them per column
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# Creating dummy variables

In [7]:
#Creating dummy variables
# One-hot encode every non-numeric column of `data`; numeric columns pass
# through unchanged.
# dtype=int pins the indicator columns to 0/1 integers: pandas >= 2.0 would
# otherwise emit True/False booleans, which does not match the 0/1 table
# shown for this frame.
data2 = pd.get_dummies(data, dtype=int)

In [8]:
#Checking dummy data
#Preview the first five rows of the encoded frame
data2.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


# Splitting data into X & y

In [9]:
#Splitting data into X and y
# Target: the 'charges' column. Features: every other column of data2.
y = data2['charges']
X = data2.drop(columns = 'charges')

# Report the dimensions of both pieces
for label, part in (('Shape of X:', X), ('Shape of y:', y)):
    print(label, part.shape)

Shape of X: (1338, 11)
Shape of y: (1338,)


# Splitting into train & test

In [10]:
#Importing train and test library
from sklearn.model_selection import train_test_split

In [11]:
#Splitting data into train and test
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition the same on every run.
splits = train_test_split(X, y, test_size = 0.2, random_state = 51)
X_train, X_test, y_train, y_test = splits

# Report the dimensions of each of the four pieces
for label, part in (
    ('Shape of X_train:', X_train),
    ('Shape of X_test:', X_test),
    ('Shape of y_train:', y_train),
    ('Shape of y_test:', y_test),
):
    print(label, part.shape)

Shape of X_train: (1070, 11)
Shape of X_test: (268, 11)
Shape of y_train: (1070,)
Shape of y_test: (268,)


# Standardizing the data with StandardScaler

In [12]:
#Importing feature scaling library
from sklearn.preprocessing import StandardScaler

In [13]:
#Feature Scaling

#Create the scaler
sc = StandardScaler()

#Learn the scaling statistics from the training features only and apply
#them to both splits, so the test rows never influence the fitted scaler.
# NOTE: X_train / X_test are overwritten here. Re-running this cell without
# re-running the train/test split first would scale already-scaled data.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [14]:
#Viewing X_train data
#After scaling this is a plain array (see the array(...) output), not a DataFrame
X_train

array([[-0.01679025, -1.10886921,  3.22869711, ..., -0.57663083,
         1.62083887, -0.55647958],
       [ 1.46855153,  1.53991862, -0.90867523, ..., -0.57663083,
        -0.61696447,  1.79701113],
       [-0.93628753, -0.5478814 ,  1.57374817, ..., -0.57663083,
        -0.61696447, -0.55647958],
       ...,
       [-0.58263473,  0.08002838, -0.90867523, ..., -0.57663083,
        -0.61696447, -0.55647958],
       [-1.50213202,  0.18165062,  0.7462737 , ..., -0.57663083,
         1.62083887, -0.55647958],
       [ 0.54905424,  0.90870405, -0.08120077, ..., -0.57663083,
         1.62083887, -0.55647958]])

# Ridge Regression

In [15]:
#Importing linear regression library
from sklearn.linear_model import Ridge, Lasso

In [16]:
#Calling regression model
#Ridge() with no arguments keeps the estimator's default settings
rd = Ridge()

#Implementing ridge regression model on X_train & y_train data
#fit() is the last expression, so the estimator's repr is the cell output
rd.fit(X_train, y_train)

Ridge()

# Checking Slope & Intercept

In [17]:
#Checking coefficient
#Fitted weights of the ridge model, one per feature column
rd.coef_

array([ 3462.95199616,  1968.33260927,   604.07817204,    49.75745068,
         -49.75745068, -4757.77240211,  4757.77240211,   276.45468948,
         155.48563363,  -274.7568276 ,  -146.59965506])

In [19]:
#Viewing intercept
#Fitted constant term of the ridge model
rd.intercept_

13128.763458176636

# Prediction on the data

In [20]:
#Viewing first row of the test data
#[0,:] selects row 0 with every column
X_test[0,:]

array([ 0.6197848 , -0.14552342, -0.90867523, -0.99813258,  0.99813258,
        0.50145986, -0.50145986, -0.55935984, -0.57663083,  1.62083887,
       -0.55647958])

In [21]:
#Viewing the whole X_test array
X_test

array([[ 0.6197848 , -0.14552342, -0.90867523, ..., -0.57663083,
         1.62083887, -0.55647958],
       [ 1.46855153,  0.36671877, -0.90867523, ..., -0.57663083,
        -0.61696447,  1.79701113],
       [-1.07774866, -0.47269746, -0.90867523, ..., -0.57663083,
         1.62083887, -0.55647958],
       ...,
       [-1.43140146, -0.05464174, -0.90867523, ..., -0.57663083,
         1.62083887, -0.55647958],
       [ 1.61001265,  0.23700583, -0.90867523, ..., -0.57663083,
        -0.61696447, -0.55647958],
       [ 1.18562929,  0.28409906,  1.57374817, ..., -0.57663083,
        -0.61696447, -0.55647958]])

In [22]:
#Predicting on x test
#Returns one predicted 'charges' value per row of X_test
rd.predict(X_test)

array([ 8960.65401056, 36925.33603571,  2637.21995031, 11291.7912347 ,
       34028.49647231, 11519.46924538, 11363.99323091, 14788.37286458,
        5498.52460069, 10721.99280642,  9679.16353316, 12085.29970801,
        9838.06239475,  3997.66212977,  5644.74776479, 12804.21893869,
        5440.88401006,  4566.42278548, 25618.36060981, 29077.55597729,
       10355.52339291,  8467.05013092, 32543.21937731, 13414.26437738,
        6211.04156176, 16124.45665664, 10219.56292415,  2538.76734902,
       23587.04520785,  8603.60289119,  3981.22821354, 30444.3799025 ,
        5478.53395845,  4754.46214104,  7828.17029354, 11280.30779235,
       13239.82226179,  2225.06500376, 12100.21360392,  8057.59488655,
        9957.92430324,   941.06691127,  6050.92777883,  2219.74840451,
        4330.51045291, 15091.91583444, 15232.2046977 , 34646.34542386,
        8492.27515789, 12663.36688372,  5754.2005164 , 30758.11220041,
        7079.53094216, 39987.99378723,  4603.53146442, 27565.95491438,
      

# Checking Accuracy (R² score) of the model

In [23]:
#Checking score
#For scikit-learn regressors score() reports R^2; here it is computed on the test split
rd.score(X_test, y_test)

0.7482187261936969

# Lasso Regression

In [24]:
#Calling regression model
#Lasso() with no arguments keeps the estimator's default settings
ls = Lasso()

#Implementing lasso regression model on X_train & y_train data
#fit() is the last expression, so the estimator's repr is the cell output
ls.fit(X_train, y_train)

Lasso()

# Checking Accuracy (R² score) of the model

In [25]:
#Checking score
#For scikit-learn regressors score() reports R^2; here it is computed on the test split
ls.score(X_test, y_test)

0.7482616365589023

# Ridge Regression with changing alpha

In [27]:
#Ridge model with regularisation strength alpha = 2, fitted on the training
#split (fit() returns the estimator itself, so the calls can be chained)
rd = Ridge(alpha = 2).fit(X_train, y_train)

#Score on the test split, shown as the cell output
rd.score(X_test, y_test)

0.7481707305264254

In [40]:
#Using various alpha values and checking the score (R^2) for each
# NOTE(review): every alpha is scored on the test split, so picking the best
# alpha from this output tunes the model on the test data; a validation split
# or cross-validation would give an unbiased comparison.
alpha = list(range(1, 21))

for alpha_value in alpha:
    # A fresh ridge model per regularisation strength, fitted on the training split
    rd = Ridge(alpha = alpha_value)
    rd.fit(X_train, y_train)
    # Built-in round() works whether score() returns a NumPy float or a plain
    # Python float; a plain float has no .round() method, so the method-call
    # form fails on scikit-learn releases that return one.
    print(round(rd.score(X_test, y_test), 4))

0.7482
0.7482
0.7481
0.7481
0.748
0.748
0.7479
0.7479
0.7478
0.7478
0.7477
0.7477
0.7476
0.7476
0.7475
0.7475
0.7474
0.7474
0.7473
0.7472


In [49]:
#Using various alpha values and checking the score (R^2) for each
# NOTE(review): every alpha is scored on the test split, so picking the best
# alpha from this output tunes the model on the test data; a validation split
# or cross-validation would give an unbiased comparison.
alpha = list(range(1, 21))

for alpha_value in alpha:
    # A fresh lasso model per regularisation strength, fitted on the training split
    ls = Lasso(alpha = alpha_value)
    ls.fit(X_train, y_train)
    # Built-in round() works whether score() returns a NumPy float or a plain
    # Python float; a plain float has no .round() method, so the method-call
    # form fails on scikit-learn releases that return one.
    print(round(ls.score(X_test, y_test), 6))

0.748262
0.748256
0.748251
0.748246
0.74824
0.748234
0.748229
0.748223
0.748217
0.74821
0.748204
0.748197
0.748191
0.748184
0.748177
0.74817
0.748163
0.748156
0.748148
0.748141
