EDA and prediction of the cost of treating different patients based on various factors.

In [106]:
# Modules required for EDA

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Location of the insurance CSV. This is an absolute path (it looks like a
# Colab working directory); adjust it when running the notebook elsewhere.
DATA_PATH = '/content/insurance.csv'

# Read the whole file into the DataFrame that every later cell works on.
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to see the columns and their value formats

data.head(n=5)

In [None]:
# Missing value check: count the null entries in each column
# (nothing is imputed or dropped here; this cell only reports the counts)

missing_per_column = data.isnull().sum()
missing_per_column

In [None]:
# Replace each categorical column with integer codes, in place on `data`

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Code meanings, as annotated by the original author:
#   sex:    0 = Female, 1 = Male
#   smoker: 0 = No,     1 = Yes
#   region: 0 = NE, 1 = NW, 2 = SE, 3 = SW
# The single encoder is refit for every column, so once the loop finishes
# `le` only holds the classes of the last column (region).
for column in ('sex', 'smoker', 'region'):
    le.fit(data[column].drop_duplicates())
    data[column] = le.transform(data[column])

data.head()

In [None]:
# Heat map of the pairwise correlations between all columns
# (assumes the categorical columns were already encoded as numbers)

f, ax = plt.subplots(figsize=(5, 4))
corr = data.corr()

# An all-False mask hides nothing: every cell of the matrix is drawn.
no_mask = np.zeros_like(corr, dtype=bool)
sns.heatmap(data=corr, mask=no_mask, cmap='viridis_r', square=True, ax=ax)

**INFERENCES:**
1.   The biggest factor affecting the cost or charge is smoking.
2.   BMI isn't as big a factor as might be expected.

In [None]:
# Check the inference numerically: correlation of every column with
# `charges`, listed from weakest/most negative to strongest

charges_corr = data.corr()['charges']
charges_corr.sort_values(ascending=True)

In [None]:
# Side-by-side histograms of charges for smokers and non-smokers

f = plt.figure(figsize=(9, 5))

# Left panel: rows where smoker == 1
smoker_charges = data.loc[data.smoker == 1, "charges"]
ax = f.add_subplot(1, 2, 1)
sns.histplot(smoker_charges, color='c', ax=ax)
ax.set_title('Distribution of charges for smokers')

# Right panel: rows where smoker == 0
non_smoker_charges = data.loc[data.smoker == 0, 'charges']
ax = f.add_subplot(1, 2, 2)
sns.histplot(non_smoker_charges, color='b', ax=ax)
ax.set_title('Distribution of charges for non-smokers')

**INFERENCE:**

*   On average, a smoker is charged much more than a non-smoker.

In [None]:
# Check the inference numerically: mean charges for each smoker value
# (the result has one entry per distinct value in the `smoker` column)

mean_cost_for_smokers = data['charges'].groupby(data['smoker']).mean()
mean_cost_for_smokers

In [None]:
# Charges plotted against BMI, with points coloured by smoker status

fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=data, x='bmi', y='charges', hue='smoker', palette='magma', ax=ax)
ax.set_title('Scatter plot of charges and bmi')

**Inference:**
*   BMI doesn't affect the cost of treatment for non-smokers.
*   The cost of treatment for smokers rises sharply once their BMI exceeds 30.

In [None]:
# Charges plotted against age, with points coloured by smoker status

fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=data, x='age', y='charges', hue='smoker', palette='magma', ax=ax)
ax.set_title('Scatter plot of charges and age')

**We can't draw a comparable inference about age, as the data has many outliers.**

In [116]:
# Modules required for Regression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, normalize
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline: ordinary linear regression of `charges` on every other column

x = data.drop(['charges'], axis = 1)
y = data['charges']

# 75% of the rows train the model, the rest are held out; the fixed
# random_state keeps the split the same on every run.
x_train,x_test,y_train,y_test = train_test_split(x, y, random_state=0, train_size=0.75)
lr = LinearRegression().fit(x_train,y_train)

y_train_pred = lr.predict(x_train)
y_test_pred = lr.predict(x_test)

# For a regressor, score() is the coefficient of determination (R^2), not a
# classification accuracy, so label it as such. The predictions computed
# above are reused rather than predicting again inside score().
print(f"Train R^2 = {r2_score(y_train, y_train_pred)}")
print(f"Test R^2 = {r2_score(y_test, y_test_pred)}")

# RMSE is in the same units as `charges`, which makes the error easier to read.
print(f"Test RMSE = {mean_squared_error(y_test, y_test_pred) ** 0.5}")

In [None]:
# Second model: drop `region` and fit a linear regression on degree-2
# polynomial features of the remaining columns

X = data.drop(['charges','region'], axis = 1)
Y = data.charges

# Expand the features to all polynomial combinations up to degree 2
quad = PolynomialFeatures (degree = 2)
x_quad = quad.fit_transform(X)

# Same split settings as the baseline model (75% train, random_state=0)
X_train,X_test,Y_train,Y_test = train_test_split(x_quad,Y, random_state = 0, train_size=0.75)

plr = LinearRegression().fit(X_train,Y_train)

Y_train_pred = plr.predict(X_train)
Y_test_pred = plr.predict(X_test)

# For a regressor, score() is R^2, not accuracy. Label the train and test
# values separately so the two printed numbers can be told apart, and reuse
# the predictions computed above.
print(f"Train R^2 = {r2_score(Y_train, Y_train_pred)}")
print(f"Test R^2 = {r2_score(Y_test, Y_test_pred)}")

# RMSE is in the same units as `charges`, which makes the error easier to read.
print(f"Test RMSE = {mean_squared_error(Y_test, Y_test_pred) ** 0.5}")