In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
# load dataset
df = pd.read_csv("C:/Users/Admin/Desktop/Work/Projects/LMS projects/insurance.csv)
df.head()

SyntaxError: unterminated string literal (detected at line 2) (1544838953.py, line 2)

In [None]:
df.info()

In [None]:
df.describe()

# EDA

In [None]:
# define a pie plot function
def pie_plot(column):
       fig , ax = plt.subplots()
       ax.pie(df[column].value_counts(),autopct="%0.2f%%",
              labels=df[column].value_counts().index)
       ax.set(title=f"Pie Chart of {column}");

In [None]:
# take a loop on categorical columns and plot the pie chart
columns = ["sex","smoker","region"]

for i in columns:
    pie_plot(i)

In [None]:
# plot the distribution of BMI index with histogram
sns.histplot(data=df,x="bmi",kde=True)
plt.title("Distribution of BMI")

In [None]:
sns.violinplot(data=df,y="bmi",x="sex",split=True)
sns.swarmplot(data=df,y="bmi",x="sex",color="w")

In [None]:
sns.kdeplot(df["age"],fill=True,color="m")
sns.rugplot(df["age"],height=0.05)

In [None]:
# plot the boxplot the 2features of the dataset(age and BMI)
fig , ax = plt.subplots(figsize=(10,6))
boxplot = ax.boxplot(df[["age", "bmi"]].values, patch_artist=True, notch=True)

colors = ["magenta", "blue"]
for box, color in zip(boxplot['boxes'], colors):
    box.set_facecolor(color)

for element in ['caps', 'whiskers', 'medians']:
    plt.setp(boxplot[element], color="black")

ax.set_xticks([1, 2], ["Age", "BMI"])
ax.set_title("Boxplots",size=18)

In [None]:
# define a function to remove the outliers with IQR methods

def detect_outlier(data,treshold=1.5):
    q1 = np.quantile(data,0.25)
    q3 = np.quantile(data,0.75)
    iqr = q3 - q1

    lower_bound = q1 - treshold * iqr
    upper_bound = q3 + treshold * iqr

    return lower_bound,upper_bound


In [None]:
# remove the BMI outlires with IQR methods and print the outlier
low,up = detect_outlier(df["bmi"])
print(df[(df["bmi"] < low) | (df["bmi"] > up)])
index = df[(df["bmi"] < low) | (df["bmi"] > up)].index
df.drop(index=index,inplace=True)
df.reset_index(drop=True,inplace=True)

In [None]:
# check the NAN values
df.isna().sum()

In [None]:
# check the duplicated
df.duplicated().sum()

In [None]:
df[df.duplicated()]

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.duplicated().sum()

In [None]:
# map the specific values to each categorical
mapper_sex = {
    "male" : 1,
    "female" : 0
}

df["sex"] = df["sex"].map(mapper_sex)

In [None]:
mapper_smoker = {
    "yes":1,
    "no":0
}
df["smoker"] = df["smoker"].map(mapper_smoker)

In [None]:
mapper_region = {
    "southeast":0,
    "southwest":1,
    "northwest":2,
    "northeast":3
}
df["region"] = df["region"].map(mapper_region)

In [None]:
# check the dtypes and ensure to every features are numeric values
df.dtypes

In [None]:
# grouped data into smokers and get mean values on charges columns
fig , ax = plt.subplots()
df.groupby("smoker").mean()["charges"].plot(kind="bar",ax=ax,width=0.2)
ax.set(title="Smokers mean of charges",xticks=[0,1],
       xticklabels=["No","Yes"])
ax.grid(axis="y",ls="--",color="gray")

In [None]:
# plot heatmap and check the correlation between features
corr = df.corr()

fig , ax = plt.subplots(figsize=(10,7))
sns.heatmap(corr,annot=True,cmap="Blues",ax=ax)
ax.set_title("HeatMap",size=20,color="blue");

In [None]:
# plot the barplot on number of children
fig , ax = plt.subplots()
df["children"].value_counts().plot(kind="bar",ax=ax,colormap="plasma")
ax.grid(axis="y",ls="--",color="gray")
ax.set(ylabel="counts",title="Number of Children");

In [None]:
# plot features versus features
sns.pairplot(df,hue="smoker")

In [None]:
# determine the features(X) and target(y)
X = np.array(df.iloc[:,:-1])
y = np.array(df["charges"])

In [None]:
# split data to testset for evaluating data and train set for training the model
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)

In [None]:
#scaling with method of Standard Scaling
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

In [None]:
# build a model of Linear Regression and evaluate the metrics
lin_reg = LinearRegression().fit(X_train_s,y_train)
pred_lin = lin_reg.predict(X_test_s)
mse_lin = mean_squared_error(y_test,pred_lin)
rmse_lin = np.sqrt(mse_lin)
mae_lin = mean_absolute_error(y_test,pred_lin)
r2_lin = r2_score(y_test,pred_lin)

In [None]:
# plot std of true values versus predicted values 
fig , ax = plt.subplots()
ax.scatter(y_test,pred_lin,c="blue")
ax.plot(y_test,y_test,ls="--",color="red")
ax.set(xlabel="Actual Values",ylabel="Predicted Values",
       title="Evaluating Lieanr Regression")
ax.grid(axis="both",ls="--",color="gray")
#ax.set(xlim=[0,65000],ylim=[0,65000])

In [None]:
# get number of values had predicted negative
(pred_lin < 0).sum()

- 8 values of test set  predicted less than 0 and they are negative

In [None]:
# print metrics of Linear Regression
print("mse for linear regression:",mse_lin.round(2))
print("mae for linear regression:",mae_lin.round(2))
print("rmse for linear regression:",rmse_lin.round(2))
print("r2-score for linear regression:",r2_lin.round(3))

In [None]:
fig , ax = plt.subplots()
x = np.arange(6)
ax.barh(x,lin_reg.coef_,color="m")
ax.set_yticks(x,df.columns[:-1])
ax.set(xlabel="Coefficients",title="Linear Regression");

In [None]:
grb_reg = GradientBoostingRegressor(n_estimators=350,learning_rate=0.05,random_state=0,max_depth=2)
grb_reg.fit(X_train_s,y_train)
pred_grb = grb_reg.predict(X_test_s)
mse_grb = mean_squared_error(y_test,pred_grb)
rmse_grb = np.sqrt(mse_grb)
mae_grb = mean_absolute_error(y_test,pred_grb)
r2_grb = r2_score(y_test,pred_grb)

In [None]:
fig , ax = plt.subplots()
ax.scatter(y_test,pred_grb,c="blue")
ax.plot(y_test,y_test,ls="--",color="red")
ax.set(xlabel="Actual Values",ylabel="Predicted Values",
       title="Evaluating Gradient Boosting Regression")
ax.grid(axis="both",ls="--",color="gray")
#ax.set(xlim=[0,65000],ylim=[0,65000])

In [None]:
(pred_grb < 0).sum()

- None of them were predicted negative

In [None]:
fig , ax = plt.subplots()
x = np.arange(6)
ax.barh(x,grb_reg.feature_importances_,color="m")
ax.set_yticks(x,df.columns[:-1])
ax.set(xlabel="Feature Importances",title="Gradient Boosting Regression");

In [None]:
print("mse for linear regression:",mse_grb.round(2))
print("mae for linear regression:",mae_grb.round(2))
print("rmse for linear regression:",rmse_grb.round(2))
print("r2-score for linear regression:",r2_grb.round(3))

In [None]:
rmse = [rmse_grb,rmse_lin]
r2 = [r2_grb,r2_lin]

x = [0,0.5]

fig , ax = plt.subplots(1,2,figsize=(10,4))
ax[0].bar(x,rmse,width=0.15,color=["blue","green"])
ax[0].set(xticks=[0,0.5],xticklabels=["Gradient","Linear"],
          ylabel="RMSE")
ax[1].bar(x,r2,width=0.15,color=["blue","green"])
ax[1].set(xticks=[0,0.5],xticklabels=["Gradient","Linear"],
          ylabel="r2 score")
fig.suptitle("Comparing the Linear and Ensembel models");