Predicting Medical Charges

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the insurance dataset from a local CSV file and display it.
csv_path = r"C:\Imp\SamanyuK\Datasets\insurance.csv"
df = pd.read_csv(csv_path)
df

In [None]:
# Structural overview of the frame: (rows, columns) first, then the
# per-column dtype / non-null summary.
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

In [None]:
# Inspect the distinct values present in the 'sex' column.
df['sex'].unique()

In [None]:
# Binary-encode 'sex': rows equal to 'female' become 1, every other value 0.
is_female = df['sex'].eq('female')
df['sex'] = is_female.astype('int64')
# Show the resulting dtype as the cell output.
df['sex'].dtype

In [None]:
# Inspect the distinct values present in the 'smoker' column.
df['smoker'].unique()

In [None]:
# Binary-encode 'smoker': rows equal to 'no' become 0, every other value 1.
not_a_no = df['smoker'].ne('no')
df['smoker'] = not_a_no.astype('int64')
# Show the resulting dtype as the cell output.
df['smoker'].dtype

In [None]:
# Inspect the distinct values present in the 'region' column.
df['region'].unique()

In [None]:
# One indicator column per distinct 'region' value (named 'region_<value>'),
# appended to the right of the existing columns.
df1 = pd.get_dummies(df.region, prefix='region')
df = pd.concat((df, df1), axis='columns')
df.head()

In [None]:
# The original 'region' column is no longer needed once its indicator
# columns exist; drop it in place.
df.drop(columns='region', inplace=True)

# Cast each region indicator column to int, one column at a time.
region_indicator_cols = (
    'region_northeast',
    'region_northwest',
    'region_southeast',
    'region_southwest',
)
for col_name in region_indicator_cols:
    df[col_name] = df[col_name].astype(int)

df.head()

In [None]:
# Show the current column labels of df.
df.columns

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
plt.figure(figsize=(15, 6))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Split the charges by the encoded smoker flag (1 vs 0).
smoker_charges = df.loc[df['smoker'] == 1, 'charges']
non_smoker_charges = df.loc[df['smoker'] == 0, 'charges']

# Each group gets its own figure: histogram with a KDE curve on top.
plt.figure(figsize=(15, 5))
plt.title("Distribution of cost for Smokers")
sns.histplot(smoker_charges, kde=True, color='#4B0082')

plt.figure(figsize=(15, 5))
plt.title("Distribution of cost for Non-Smokers")
sns.histplot(non_smoker_charges, kde=True, color='violet')

In [None]:
# Count of rows per smoker flag, split by the encoded 'sex' column.
# catplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty extra figure and its
# size is never applied. The size is set through height/aspect instead
# (height 6 x aspect 2.5 gives the intended 15-wide plotting area).
sns.catplot(
    data=df,
    x='smoker',
    hue='sex',
    kind='count',
    palette='pink',
    height=6,
    aspect=2.5,
)
plt.title("Smoker and Non-Smoker count")

In [None]:
# Histogram of charges with one overlaid ("layer") series per age value,
# coloured along the 'coolwarm' palette.
plt.figure(figsize=(15, 6))
sns.histplot(
    data=df,
    x='charges',
    hue='age',
    multiple='layer',
    palette='coolwarm',
)

In [None]:
# Histogram of the 'bmi' column with a KDE curve on top.
plt.figure(figsize=(15, 6))
plt.title("Distribution of BMI")
sns.histplot(data=df, x='bmi', color='gold', kde=True)

In [None]:
from sklearn.model_selection import train_test_split
# Target is 'charges'; the feature matrix is every remaining column except
# the four region indicator columns, which are left out of the model.
target_col = 'charges'
excluded_region_cols = [
    'region_northeast',
    'region_northwest',
    'region_southeast',
    'region_southwest',
]
y = df[target_col]
x = df.drop(columns=[target_col, *excluded_region_cols])

# 75/25 train/test split with a fixed seed so the split is reproducible.
x_tr, x_te, y_tr, y_te = train_test_split(
    x, y, test_size=0.25, random_state=82
)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
# Two-step model: standardize the features, then fit a linear regression.
model_steps = [
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps=model_steps)

# 5-fold cross-validation on the training split, scored by R^2.
cv_scores = cross_val_score(
    estimator=pipeline, X=x_tr, y=y_tr, scoring='r2', cv=5
)

print("Cross-validation R^2 scores:", cv_scores)
print("Mean R^2 score:", cv_scores.mean())

In [None]:
from sklearn.metrics import r2_score,mean_squared_error

# Fit on the whole training split, then predict the held-out test split.
pred = pipeline.fit(x_tr, y_tr).predict(x_te)

# Pair each feature name with the coefficient learned by the regression step.
# The regressor sits behind the scaler, so these coefficients apply to the
# standardized features.
fitted_regressor = pipeline.named_steps['regressor']
feature_names = x_tr.columns
coefficients = fitted_regressor.coef_
coef_df = pd.DataFrame(
    {
        'Feature': feature_names,
        'Coefficient': coefficients,
    }
)
print(coef_df)
print('\n\n',pred)

# Test-set metrics: R^2 first, then mean squared error.
test_r2 = r2_score(y_te, pred)
test_mse = mean_squared_error(y_te, pred)
print(test_r2)
print(test_mse)