In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score

# **Loading the dataset**

In [2]:
df = pd.read_csv("insurance.csv")

In [3]:
df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


charges is dependent variable

# **Data Cleaning**

either remove that row who have missing value (when data is large)

use this when data is max and minimum missing value  (df = df.dropna())

either fill 

if there is stock data then  df = df.fillna(method="ffill")  for filling previous

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   gender    1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
df.isna().sum()

age         0
gender      0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

there is no missing values in this dataset

# **Dependent and independent variable**

In [6]:
X = df.iloc[:,:-1]
y = df['charges']

In [7]:
X.columns

Index(['age', 'gender', 'bmi', 'children', 'smoker', 'region'], dtype='object')

# **Encoding** 

for categorical columns(ordinal or not ordinal)

ORDINAL = LABEL ENCODING

NON ORDINAL = ONEHOT ENCODING

In [8]:
# label encoder can be done on only single dimentional array
le = LabelEncoder()
X['gender'] = le.fit_transform(X['gender'])

lo = LabelEncoder()
X['smoker'] = lo.fit_transform(X['smoker'])

In [9]:
# array should be 2-D in one hot encoding
ct = ColumnTransformer(transformers=[("encode",OneHotEncoder(drop="first",sparse=False),[5])],remainder="passthrough")
X = ct.fit_transform(X)
X
# drop = first to avoid dummy variable trap (region southwest)
# sparse false to stop sparse compression



array([[ 0.  ,  0.  ,  1.  , ..., 27.9 ,  0.  ,  1.  ],
       [ 0.  ,  1.  ,  0.  , ..., 33.77,  1.  ,  0.  ],
       [ 0.  ,  1.  ,  0.  , ..., 33.  ,  3.  ,  0.  ],
       ...,
       [ 0.  ,  1.  ,  0.  , ..., 36.85,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  1.  , ..., 25.8 ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  , ..., 29.07,  0.  ,  1.  ]])

# **Scaling**

In [10]:
sc = StandardScaler()
X = sc.fit_transform(X)
X

array([[-0.56641788, -0.61132367,  1.76548098, ..., -0.45332   ,
        -0.90861367,  1.97058663],
       [-0.56641788,  1.63579466, -0.56641788, ...,  0.5096211 ,
        -0.07876719, -0.5074631 ],
       [-0.56641788,  1.63579466, -0.56641788, ...,  0.38330685,
         1.58092576, -0.5074631 ],
       ...,
       [-0.56641788,  1.63579466, -0.56641788, ...,  1.0148781 ,
        -0.90861367, -0.5074631 ],
       [-0.56641788, -0.61132367,  1.76548098, ..., -0.79781341,
        -0.90861367, -0.5074631 ],
       [ 1.76548098, -0.61132367, -0.56641788, ..., -0.26138796,
        -0.90861367,  1.97058663]])

# **Split**

In [11]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)
print(X_train)
print(y_train)

[[-0.56641788 -0.61132367  1.76548098 ...  0.56375578  2.41077224
   1.97058663]
 [-0.56641788  1.63579466 -0.56641788 ...  0.61789045 -0.90861367
  -0.5074631 ]
 [-0.56641788 -0.61132367 -0.56641788 ...  0.98535009  0.75107928
   1.97058663]
 ...
 [-0.56641788  1.63579466 -0.56641788 ... -0.91592544 -0.90861367
  -0.5074631 ]
 [ 1.76548098 -0.61132367 -0.56641788 ...  0.79833938 -0.90861367
  -0.5074631 ]
 [-0.56641788 -0.61132367  1.76548098 ... -1.99533811 -0.07876719
  -0.5074631 ]]
621     40182.24600
194      1137.46970
240     38511.62830
1168     4670.64000
1192    13019.16105
           ...     
763      3070.80870
835      7160.33030
1216     5415.66120
559      1646.42970
684      4766.02200
Name: charges, Length: 1070, dtype: float64


# **Training**

In [12]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

# **Prediction / Testing**

In [13]:
y_pred = regressor.predict(X_test)

In [14]:
# metrices to evaluate model
print(mean_absolute_error(y_test,y_pred))

3933.2726494052426


In [15]:
print(r2_score(y_test,y_pred))

0.7999876970680433


# **Support_Vector_Machine Training**

In [16]:
from sklearn.svm import SVR
regressor = SVR(C=5000,kernel="poly")
regressor.fit(X_train,y_train)

# **Testing after SVR**

In [17]:
y_pred = regressor.predict(X_test)
print(r2_score(y_test,y_pred))

0.8630433594602276


# **Decision Tree**

In [19]:

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(max_depth=10,min_samples_split=20)
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)

In [20]:
print(r2_score(y_test,y_pred))

0.8268571871961186
