# Basic Analysis of Data

In [1]:
#Importing library
import pandas as pd

In [2]:
#Loading data
data = pd.read_csv('C:\\Users\\bhavi\\OneDrive\\Desktop\\Data\\insurance.csv')

In [3]:
#Setting full view of the data
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
#Checking shape of the data
data.shape

(1338, 7)

In [5]:
#View the data
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
#Checking correlation
data.corr()

Unnamed: 0,age,bmi,children,charges
age,1.0,0.109272,0.042469,0.299008
bmi,0.109272,1.0,0.012759,0.198341
children,0.042469,0.012759,1.0,0.067998
charges,0.299008,0.198341,0.067998,1.0


# Checking NA 

In [7]:
#Checking missing values
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# Creating dummy variables

In [8]:
#Creating dummy variables
data2 = pd.get_dummies(data)

In [9]:
#Checkiing dummy data
data2.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


# Splitting data into X & y

In [10]:
#Splitting data into X and y
X = data2.drop('charges', axis = 1)
y = data2['charges']

print('Shape of X:',X.shape)
print('Shape of y:',y.shape)

Shape of X: (1338, 11)
Shape of y: (1338,)


# Splitting into train & test

In [11]:
#Importing train and test library
from sklearn.model_selection import train_test_split

In [12]:
#Splitting data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 51)
print('Shape of X_train:',X_train.shape)
print('Shape of X_test:',X_test.shape)
print('Shape of y_train:',y_train.shape)
print('Shape of y_test:',y_test.shape)

Shape of X_train: (1070, 11)
Shape of X_test: (268, 11)
Shape of y_train: (1070,)
Shape of y_test: (268,)


# Standard Scaler of the data

In [13]:
#Importing feature scaling library
from sklearn.preprocessing import StandardScaler

In [14]:
#Feature Scaling

#Calling the standard scaler function
sc = StandardScaler()

#Implement standard function
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [15]:
#Viewing X_train data
X_train

array([[-0.01679025, -1.10886921,  3.22869711, ..., -0.57663083,
         1.62083887, -0.55647958],
       [ 1.46855153,  1.53991862, -0.90867523, ..., -0.57663083,
        -0.61696447,  1.79701113],
       [-0.93628753, -0.5478814 ,  1.57374817, ..., -0.57663083,
        -0.61696447, -0.55647958],
       ...,
       [-0.58263473,  0.08002838, -0.90867523, ..., -0.57663083,
        -0.61696447, -0.55647958],
       [-1.50213202,  0.18165062,  0.7462737 , ..., -0.57663083,
         1.62083887, -0.55647958],
       [ 0.54905424,  0.90870405, -0.08120077, ..., -0.57663083,
         1.62083887, -0.55647958]])

# Polynomial Regression

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [18]:
poly_reg = PolynomialFeatures(degree=2)
poly_reg.fit(X_train)
X_train_poly = poly_reg.transform(X_train)
X_test_poly = poly_reg.transform(X_test)

In [19]:
X_train_poly.shape, X_test_poly.shape

((1070, 78), (268, 78))

In [21]:
lr = LinearRegression()
lr.fit(X_train_poly, y_train)

LinearRegression()

In [22]:
lr.score(X_test_poly, y_test)

0.8481378246793845

In [24]:
lr.predict([X_test_poly[0,:]])

array([10169.])

In [27]:
y_pred = lr.predict(X_test_poly)
y_pred

array([10165., 41362.,  4171., 10399., 35207., 11407., 10039., 12851.,
        7675., 11961., 10034., 12467.,  9387.,  4329.,  5751., 13431.,
        4697.,  2315., 24594., 22850.,  3496.,  8723., 37239., 13311.,
        7687., 17631., 12630.,  3482.,  8791., 13371.,  2214., 28615.,
        4371.,  4451.,  9167., 10183.,  5719.,  3133., 12895.,  8278.,
       12450.,  2715.,  5871.,  3559.,  6954., 14685., 11003., 41531.,
        8951., 13775.,  6666., 37114.,  9263., 49169.,  6567., 33394.,
        9639., 11799., 14266.,  5967., 10307.,  8118., 26263.,  3427.,
        8687.,  9023., 11410.,  6100., 15483.,  7751., 31479., 16707.,
        5407.,  4751., 15907., 12087., 15607., 41691.,  4658., 10383.,
        5154., 14258.,  9006.,  2258.,  6023., 51647.,  4084., 11379.,
       12935.,  6891.,  5889., 12075.,  7271.,  6263.,  8733., 26083.,
       39258., 13519., 21487.,  9043.,  4271.,  3167., 43719.,  6904.,
        6314.,  9767.,  6994.,  9131., 10720., 42859.,  8187.,  7902.,
      

In [28]:
y_test

151      7789.63500
1146    52590.82939
1305     2464.61880
392      8964.06055
123     39556.49450
365      9778.34720
473     20878.78443
1151    12235.83920
1216     5415.66120
539     27346.04207
178      8823.27900
333     11658.37915
228      7358.17565
1147     2261.56880
80       4441.21315
115     30259.99556
174      2855.43755
399      1631.66830
126     17081.08000
982     19199.94400
1131     3693.42800
1164     7153.55390
298     38746.35510
1145    11289.10925
838      6402.29135
36      15612.19335
1035    12094.47800
855      1875.34400
250     12829.45510
812     11013.71190
723      1263.24900
144     20745.98910
113      2404.73380
439      2897.32350
1319     7201.70085
1013     8765.24900
660      6435.62370
349      1635.73365
1237    12224.35085
355     24603.04837
602     11070.53500
121      1705.62450
291     20277.80751
1283     1720.35370
832      4719.73655
1018    12495.29085
344     10977.20630
488     48885.13561
1185     8603.82340
342     13217.09450


In [30]:
from sklearn.metrics import mean_squared_error
import numpy as np
mse = mean_squared_error(y_test, y_pred)
print(mse)
rmse = np.sqrt(mse)
print(rmse)

26406184.951883297
5138.694868532602
