# Insurance Cost Prediction
## 1) Import Libraries

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## 2) Load Dataset

In [None]:
# Relative path to the dataset. A forward slash replaces the original
# backslash: "\i" is an invalid escape sequence that Python warns about, and a
# backslash separator only resolves on Windows, whereas "/" works everywhere.
url = "insurancedata_link/insurance.csv"
df = pd.read_csv(url)

## 3) Data Understanding

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column names, non-null counts and dtypes. `info` is a method: without the
# call parentheses the cell only echoes the bound-method object.
df.info()

## 4) Data Cleaning

In [None]:
# Number of missing values in each column (`isnull` is an alias of `isna`).
df.isnull().sum()

In [None]:
# Count the rows that repeat an earlier row (the first occurrence is not flagged).
df.duplicated(keep="first").sum()

In [None]:
# Show only the repeated rows (each first occurrence is excluded).
df.loc[df.duplicated()]

In [None]:
# Show every member of each duplicate group, first occurrences included.
df.loc[df.duplicated(keep=False)]

In [None]:
# Keep the first occurrence of each repeated row, then confirm none remain.
df = df.drop_duplicates(keep="first")
df.duplicated().sum()

## 5) EDA (Visualization)

In [None]:
# Summary statistics of the target. `describe` has to be called, and because it
# is not the last expression of the cell it must be printed to be displayed.
print(df["charges"].describe())

# Histogram of charges with a kernel density estimate overlaid.
sns.histplot(df["charges"], kde=True)
plt.show()

In [None]:
# Skewness of the target column.
df.loc[:, "charges"].skew()

In [None]:
# Scatter of charges against age.
sns.scatterplot(data=df, x="age", y="charges")
plt.show()

In [None]:
# Distribution of charges for each value of the `smoker` column.
sns.boxplot(data=df, x="smoker", y="charges")
plt.show()

In [None]:
# Scatter of charges against BMI, coloured by smoker status.
plt.figure(figsize=(7, 5))
sns.scatterplot(data=df, x="bmi", y="charges", hue="smoker")
# Title typo fixed ("Chareges" -> "Charges").
plt.title("BMI vs Charges (Smoker vs Non-Smoker)")
plt.show()

In [None]:
# Distribution of charges for each value of the `region` column.
sns.boxplot(data=df, x="region", y="charges")
plt.show()

## 6) Model Training & Evaluation

In [None]:
# Features are every column except the target; the target is `charges`.
x = df.drop(columns="charges")
y = df["charges"]

In [None]:
# Column groups consumed by the preprocessing step.
numerical_columns = ["age", "bmi", "children"]
categorical_columns = ["sex", "smoker", "region"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
# Standardise the numeric columns and one-hot encode the categorical ones,
# dropping the first level of each categorical feature.
preprocessing_ = ColumnTransformer(
    [
        ("numericals", StandardScaler(), numerical_columns),
        ("categoricals", OneHotEncoder(drop="first"), categorical_columns),
    ]
)

In [None]:

# Baseline: preprocessing followed by an ordinary linear regression.
model = Pipeline(
    [
        ("preprocessing_", preprocessing_),
        ("model", LinearRegression()),
    ]
)

In [None]:
# Fit the preprocessing step and the regressor on the training rows.
model.fit(X=x_train, y=y_train)


In [None]:
# Score the fitted pipeline on the held-out rows.
y_prediction = model.predict(x_test)

MAE = mean_absolute_error(y_test, y_prediction)
MSE = mean_squared_error(y_test, y_prediction)
RMSE = np.sqrt(MSE)  # square root of MSE, i.e. in the units of the target
R_square = r2_score(y_test, y_prediction)

# Report in the same order as before: MSE, MAE, RMSE, R^2.
print("Mean Squared Error :", MSE)
print("Mean Absolute Error :", MAE)
print("Root Mean Squared Error :", RMSE)
print("R Square of the model is :", R_square)

In [None]:
# Ridge regression on the same preprocessing recipe. The transformer is cloned
# so this pipeline fits its own unfitted copy instead of re-fitting the single
# `preprocessing_` instance that the other pipelines also hold.
ridge_model = Pipeline(
    steps=[
        ("preprocessing_", clone(preprocessing_)),
        ("model", Ridge(alpha=1.0)),
    ]
)
ridge_model.fit(x_train, y_train)

# R^2 on the held-out rows.
ridge_prediction = ridge_model.predict(x_test)
print("R square of Ridge regularization :", r2_score(y_test, ridge_prediction))

In [None]:
# Lasso regression on the same preprocessing recipe. The transformer is cloned
# so this pipeline fits its own unfitted copy instead of re-fitting the single
# `preprocessing_` instance that the other pipelines also hold.
lasso_model = Pipeline(
    steps=[
        ("preprocessing_", clone(preprocessing_)),
        ("model", Lasso(alpha=0.01)),
    ]
)
lasso_model.fit(x_train, y_train)

# R^2 on the held-out rows.
lasso_prediction = lasso_model.predict(x_test)
print("R square of Lasso regularization :", r2_score(y_test, lasso_prediction))

## Conclusion
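A linear regression model was trained and evaluated on a held-out 20% test set using MAE, MSE, RMSE and R2 score; Ridge and Lasso variants were then compared using their R2 scores.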