# Basic Analysis of Data

In [1]:
#Importing library
import pandas as pd

In [2]:
#Loading data
data = pd.read_csv('C:\\Users\\bhavi\\OneDrive\\Desktop\\Data\\insurance.csv')

In [3]:
#Setting full view of the data
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
#Checking shape of the data
data.shape

(1338, 7)

In [5]:
#View the data
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
#Checking correlation
data.corr()

Unnamed: 0,age,bmi,children,charges
age,1.0,0.109272,0.042469,0.299008
bmi,0.109272,1.0,0.012759,0.198341
children,0.042469,0.012759,1.0,0.067998
charges,0.299008,0.198341,0.067998,1.0


# Checking NA 

In [7]:
#Checking missing values
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# Creating dummy variables

In [8]:
#Creating dummy variables
data2 = pd.get_dummies(data)

In [9]:
#Checkiing dummy data
data2.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


# Splitting data into X & y

In [10]:
#Splitting data into X and y
X = data2.drop('charges', axis = 1)
y = data2['charges']

print('Shape of X:',X.shape)
print('Shape of y:',y.shape)

Shape of X: (1338, 11)
Shape of y: (1338,)


# Splitting into train & test

In [11]:
#Importing train and test library
from sklearn.model_selection import train_test_split

In [12]:
#Splitting data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 51)
print('Shape of X_train:',X_train.shape)
print('Shape of X_test:',X_test.shape)
print('Shape of y_train:',y_train.shape)
print('Shape of y_test:',y_test.shape)

Shape of X_train: (1070, 11)
Shape of X_test: (268, 11)
Shape of y_train: (1070,)
Shape of y_test: (268,)


# Standard Scaler of the data

In [13]:
#Importing feature scaling library
from sklearn.preprocessing import StandardScaler

In [14]:
#Feature Scaling

#Calling the standard scaler function
sc = StandardScaler()

#Implement standard function
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [15]:
#Viewing X_train data
X_train

array([[-0.01679025, -1.10886921,  3.22869711, ..., -0.57663083,
         1.62083887, -0.55647958],
       [ 1.46855153,  1.53991862, -0.90867523, ..., -0.57663083,
        -0.61696447,  1.79701113],
       [-0.93628753, -0.5478814 ,  1.57374817, ..., -0.57663083,
        -0.61696447, -0.55647958],
       ...,
       [-0.58263473,  0.08002838, -0.90867523, ..., -0.57663083,
        -0.61696447, -0.55647958],
       [-1.50213202,  0.18165062,  0.7462737 , ..., -0.57663083,
         1.62083887, -0.55647958],
       [ 0.54905424,  0.90870405, -0.08120077, ..., -0.57663083,
         1.62083887, -0.55647958]])

# Linear Regression

In [16]:
#Importing linear regression library
from sklearn.linear_model import LinearRegression

In [17]:
#Calling regression model
lr = LinearRegression()

#Implementing regression model on X_train & y_train data
lr.fit(X_train, y_train)

LinearRegression()

# Checking Slop & Intercept

In [18]:
#Checking coefficient
lr.coef_

array([ 3.46538698e+03,  1.98956617e+03,  6.14837567e+02, -6.19898168e+15,
       -6.19898168e+15,  9.28026926e+15,  9.28026926e+15, -8.78024339e+16,
       -8.91807383e+16, -9.20914941e+16, -8.75646768e+16])

In [19]:
#Viewing intercept
lr.intercept_

13139.928199165144

# Prediction on the data

In [20]:
#Viewing first of the test data
X_test[0,:]

array([ 0.6197848 , -0.14552342, -0.90867523, -0.99813258,  0.99813258,
        0.50145986, -0.50145986, -0.55935984, -0.57663083,  1.62083887,
       -0.55647958])

In [21]:
X_test

array([[ 0.6197848 , -0.14552342, -0.90867523, ..., -0.57663083,
         1.62083887, -0.55647958],
       [ 1.46855153,  0.36671877, -0.90867523, ..., -0.57663083,
        -0.61696447,  1.79701113],
       [-1.07774866, -0.47269746, -0.90867523, ..., -0.57663083,
         1.62083887, -0.55647958],
       ...,
       [-1.43140146, -0.05464174, -0.90867523, ..., -0.57663083,
         1.62083887, -0.55647958],
       [ 1.61001265,  0.23700583, -0.90867523, ..., -0.57663083,
        -0.61696447, -0.55647958],
       [ 1.18562929,  0.28409906,  1.57374817, ..., -0.57663083,
        -0.61696447, -0.55647958]])

In [22]:
#Predicting on x test
pred = lr.predict(X_test)
pred

array([ 8923.92819917, 36787.92819917,  2619.92819917, 11339.92819917,
       34091.92819917, 11595.92819917, 11435.92819917, 14915.92819917,
        5443.92819917, 10691.92819917,  9523.92819917, 12139.92819917,
        9899.92819917,  4107.92819917,  5675.92819917, 12843.92819917,
        5555.92819917,  4579.92819917, 25491.92819917, 28947.92819917,
       10227.92819917,  8571.92819917, 32707.92819917, 13531.92819917,
        6251.92819917, 16267.92819917, 10067.92819917,  2355.92819917,
       23627.92819917,  8555.92819917,  3795.92819917, 30587.92819917,
        5595.92819917,  4779.92819917,  7939.92819917, 11371.92819917,
       13283.92819917,  2291.92819917, 12203.92819917,  7827.92819917,
        9779.92819917,   939.92819917,  6091.92819917,  2187.92819917,
        4427.92819917, 15243.92819917, 15299.92819917, 34715.92819917,
        8523.92819917, 12715.92819917,  5587.92819917, 30611.92819917,
        7147.92819917, 40043.92819917,  4651.92819917, 27411.92819917,
      

# Checking Accuracy of the model

In [23]:
#Checking score
lr.score(X_test, y_test)

0.7481483247434315

# Accuracy Evaluation

In [24]:
from sklearn.metrics import mean_squared_error
import numpy as np

In [25]:
#Computing mean square error
mse = mean_squared_error(y_test, pred)
mse

43792615.92444599

In [26]:
#Computing root mean square error
rmse = np.sqrt(mse)
rmse

6617.598954639514

In [27]:
#Checking manually
mse_manual = (sum((np.array(y_test) - lr.predict(X_test))**2))/len(np.array(y_test))
mse_manual

43792615.924445994

In [28]:
#Checking manually
rmse_manual = np.sqrt((sum((np.array(y_test) - lr.predict(X_test))**2))/len(np.array(y_test)))
rmse_manual

6617.598954639515

# R Square

In [29]:
from sklearn.metrics import r2_score

In [30]:
r2_score(y_test,pred)

0.7481483247434315