In [None]:
import numpy as np  # Importing NumPy library
import pandas as pd  # Importing Pandas library
import matplotlib.pyplot as plt  # Importing Matplotlib library's "pyplot" module
import seaborn as sns  # Importing Seaborn library

import os


In [None]:
# Load the dataset into a DataFrame. The path is relative, so "insurance.csv"
# must be present in the notebook's working directory.
data = pd.read_csv("insurance.csv")

In [None]:
# Print a schema summary of the loaded frame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Preview the first rows (DataFrame.head defaults to five).
data.head()


In [None]:
# Preview the last rows (DataFrame.tail defaults to five).
data.tail()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Pairwise Pearson correlation of the numeric columns.
# The frame still contains string-valued columns at this point; pandas >= 2.0
# raises ValueError if corr() is asked to process them, so restrict to numeric
# dtypes explicitly (this form works on older pandas versions as well).
data.select_dtypes(include="number").corr()

In [None]:
# Annotated heatmap of the correlation matrix of the numeric columns.
# corr() is computed on numeric dtypes only: the frame still holds string
# columns here, and pandas >= 2.0 raises ValueError when corr() meets them.
fig, axes = plt.subplots(figsize=(8, 8))
sns.heatmap(data=data.select_dtypes(include="number").corr(), annot=True, linewidths=.5, ax=axes)
plt.show()

In [None]:
# 2x2 grid of histograms for age, bmi, children and charges.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

# (column, number of bins, colour) for each panel, in row-major panel order.
hist_specs = [
    ("age", 70, "b"),
    ("bmi", 200, "r"),
    ("children", 5, "g"),
    ("charges", 200, "orange"),
]
for panel, (column, n_bins, colour) in zip(axes.flat, hist_specs):
    data.plot(kind="hist", y=column, bins=n_bins, color=colour, ax=panel)

plt.show()

In [None]:
# Bar chart of the number of rows per "sex" category.
sns.catplot(
    data=data,
    x="sex",
    kind="count",
    palette="Set1",
)

In [None]:
# One row of three scatter panels: each feature plotted against charges.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

# (x column, marker colour, panel title) for each panel, left to right.
scatter_specs = [
    ("age", "green", "Age vs. Charges"),
    ("bmi", "red", "BMI vs. Charges"),
    ("children", "blue", "Children vs. Charges"),
]
for panel, (feature, colour, heading) in zip(axes, scatter_specs):
    data.plot(kind="scatter", x=feature, y="charges", alpha=0.5, color=colour, ax=panel, title=heading)

plt.show()

In [None]:
# Age vs. charges, with points coloured by the "smoker" column.
sns.scatterplot(
    data=data,
    x="age",
    y="charges",
    hue="smoker",
    palette="Set2",
)

In [None]:
# Remove the "region" column from the working frame.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for the already-missing column.
data = data.drop(columns=["region"], errors="ignore")


In [None]:
# Re-inspect the schema to confirm "region" is no longer among the columns.
data.info()

In [None]:
# Upper-case these six column labels in place, then show the resulting labels.
data.rename(
    columns={
        label: label.upper()
        for label in ("age", "sex", "bmi", "children", "smoker", "charges")
    },
    inplace=True,
)
data.columns

In [None]:
# Re-inspect the schema after the rename; the column labels should now be upper case.
data.info()

In [None]:
# Feature frame: every column except the target.
X = data.drop(columns=["CHARGES"])
# Target as a plain NumPy array.
y = data["CHARGES"].values

In [None]:
# Binary-encode the two categorical features: SEX -> 1 for "female", else 0;
# SMOKER -> 1 for "yes", else 0.
# The encoding is derived from the untouched string columns in `data` rather
# than from X itself, so re-running this cell yields the same result. Reading
# and overwriting X.SEX / X.SMOKER would turn both columns to all zeros on a
# second run, because the already-encoded integers never equal the strings.
X["SEX"] = (data["SEX"] == "female").astype(int)
X["SMOKER"] = (data["SMOKER"] == "yes").astype(int)
X.head()

In [None]:
# Min-max scale BMI to the [0, 1] range, then display the scaled column.
# NOTE(review): the min/max come from the full dataset, i.e. before the
# train/test split below — confirm this leakage is acceptable.
bmi_low = np.min(X["BMI"])
bmi_high = np.max(X["BMI"])
X["BMI"] = (X["BMI"] - bmi_low) / (bmi_high - bmi_low)
X.BMI

In [None]:
from sklearn.model_selection import train_test_split 

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model of the target on the training features.
# The intercept is left enabled (scikit-learn's default). None of the feature
# columns is a constant, so fit_intercept=False would force the fitted
# hyperplane through the origin and distort every coefficient to compensate.
multiple_linear_reg = LinearRegression()
multiple_linear_reg.fit(x_train, y_train)


In [None]:
from sklearn.model_selection import cross_val_predict  # For K-Fold Cross Validation
from sklearn.metrics import r2_score  # For find accuracy with R2 Score
from sklearn.metrics import mean_squared_error  # For MSE
from math import sqrt  # For squareroot operation

# Predictions of the fitted model on both splits.
y_pred_MLR_train = multiple_linear_reg.predict(x_train)
y_pred_MLR_test = multiple_linear_reg.predict(x_test)

# R^2 (reported below as "accuracy") and root-mean-squared error per split.
accuracy_MLR_train = r2_score(y_train, y_pred_MLR_train)
accuracy_MLR_test = r2_score(y_test, y_pred_MLR_test)
RMSE_MLR_train = sqrt(mean_squared_error(y_train, y_pred_MLR_train))
RMSE_MLR_test = sqrt(mean_squared_error(y_test, y_pred_MLR_test))

# Report: R^2 for train then test, followed by RMSE for train then test.
print("Training Accuracy for Multiple Linear Regression Model: ", accuracy_MLR_train)
print("Testing Accuracy for Multiple Linear Regression Model: ", accuracy_MLR_test)
print("RMSE for Training Data: ", RMSE_MLR_train)
print("RMSE for Testing Data: ", RMSE_MLR_test)



In [None]:
# Actual (x axis) versus predicted (y axis) target values on the training split.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError when x/y are given as positional arguments.
sns.scatterplot(x=y_train, y=y_pred_MLR_train)

In [None]:
# Fitted regression coefficients, one per feature column of x_train, in column order.
multiple_linear_reg.coef_
