In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [25]:
# Read the insurance dataset from the working directory and preview the first rows.
df = pd.read_csv("insurance.csv")
df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [26]:
# Inspect column dtypes and non-null counts (spots missing values and the
# object-typed columns that need encoding before modelling).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [27]:
# List the distinct labels of the categorical "region" column.
df["region"].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [28]:
# One-hot encode the two binary categoricals. drop_first=True keeps a single
# indicator per feature (sex_male, smoker_yes).
# dtype=int pins the indicators to 0/1: pandas >= 2.0 emits booleans by
# default, which would no longer match the 0/1 table recorded in this notebook.
df = pd.get_dummies(df, columns=["sex", "smoker"], drop_first=True, dtype=int)

In [29]:
# One-hot encode "region" into one indicator column per label.
# dtype=int pins the indicators to 0/1 regardless of pandas version
# (pandas >= 2.0 would otherwise produce booleans).
# NOTE: unlike the sex/smoker encoding, no level is dropped here, so the four
# region columns always sum to 1 and are collinear with the model intercept.
# Predictions are unaffected, but the individual region coefficients are not
# uniquely identified; pass drop_first=True if they need to be interpreted.
df = pd.get_dummies(df, columns=["region"], dtype=int)

In [30]:
# Preview the frame after encoding: every remaining column is numeric.
df.head(3)

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,0,1
1,18,33.77,1,1725.5523,1,0,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,0,1,0


In [31]:
# Target: the insurance charges. Features: every other column.
y = df["charges"]
x = df.drop(columns="charges")

In [32]:
# 70% of the rows go to training, the remainder to testing; the fixed
# random_state makes the shuffle (and every metric computed later) repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=5,
)

In [33]:
# Fit an ordinary least-squares linear regression on the training split.
model = LinearRegression().fit(X=x_train, y=y_train)

In [34]:
# R^2 (coefficient of determination) of the fitted model on the held-out test split.
model.score(x_test, y_test)

0.7570564375401182

In [35]:
# Fitted coefficient for each feature, in the column order of x.
model.coef_

array([  259.69206273,   335.25347982,   594.72285952,  -294.42233055,
       24061.71678252,   403.56146008,   435.69237053,  -436.77497683,
        -402.47885379])

In [36]:
# Fitted intercept (constant term) of the regression.
model.intercept_

-12441.082226086255

In [37]:
# Predicted charges for the test rows; every error metric in the rest of the
# notebook compares these against y_test.
y_pred = model.predict(x_test)

# Error metrics: MSE, RMSE, MAE and MAPE

In [49]:
# Put actual and predicted charges side by side; the frame takes its row
# index from y_test, so each row keeps its original label from df.
df_error = pd.DataFrame({"y_test": y_test, "y_pred": y_pred})
df_error.head(5)

Unnamed: 0,y_test,y_pred
471,2203.47185,2733.094908
1250,18648.4217,27962.994494
1257,11305.93455,11880.786841
139,2166.732,4938.789574
919,5245.2269,8275.109397


In [51]:
# Signed residual: actual minus predicted (negative means over-prediction).
df_error["error"] = df_error["y_test"] - df_error["y_pred"]
df_error.head(3)

Unnamed: 0,y_test,y_pred,error
471,2203.47185,2733.094908,-529.623058
1250,18648.4217,27962.994494,-9314.572794
1257,11305.93455,11880.786841,-574.852291


**Squared Error**

In [53]:
# Per-row squared residual; its mean is the MSE.
df_error["squared_error"] = np.square(df_error["error"])
df_error.head(3)

Unnamed: 0,y_test,y_pred,error,squared_error
471,2203.47185,2733.094908,-529.623058,280500.6
1250,18648.4217,27962.994494,-9314.572794,86761270.0
1257,11305.93455,11880.786841,-574.852291,330455.2


**Absolute Error**

In [54]:
# Per-row magnitude of the residual; its mean is the MAE.
df_error["abs_error"] = df_error["error"].abs()
df_error.head(3)

Unnamed: 0,y_test,y_pred,error,squared_error,abs_error
471,2203.47185,2733.094908,-529.623058,280500.6,529.623058
1250,18648.4217,27962.994494,-9314.572794,86761270.0,9314.572794
1257,11305.93455,11880.786841,-574.852291,330455.2,574.852291


**Absolute Percentage Error**

In [56]:
# Per-row absolute error as a fraction of the actual value (0.24 == 24%);
# its mean is the MAPE.
relative_error = (y_test - y_pred) / y_test
df_error["percent_error"] = relative_error.abs()
df_error.head(3)

Unnamed: 0,y_test,y_pred,error,squared_error,abs_error,percent_error
471,2203.47185,2733.094908,-529.623058,280500.6,529.623058,0.240358
1250,18648.4217,27962.994494,-9314.572794,86761270.0,9314.572794,0.499483
1257,11305.93455,11880.786841,-574.852291,330455.2,574.852291,0.050845


In [57]:
# Column means: the means of squared_error, abs_error and percent_error are
# the MSE, MAE and MAPE respectively.
df_error.mean()

y_test           1.263697e+04
y_pred           1.319309e+04
error           -5.561171e+02
squared_error    3.454381e+07
abs_error        4.074552e+03
percent_error    4.417625e-01
dtype: float64

**Root Mean Square Error**

In [59]:
# RMSE: square root of the mean squared error, expressed in the same units as charges.
np.sqrt(np.mean(df_error["squared_error"]))

5877.398541469592

## Error calculation with Sklearn

In [60]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

**MSE (Mean Squared Error)**

In [61]:
# Cross-check of the hand-computed MSE using scikit-learn.
mean_squared_error(y_true=y_test, y_pred=y_pred)

34543813.615268886

**MAE (Mean Absolute Error)**

In [62]:
# Cross-check of the hand-computed MAE using scikit-learn.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

4074.551503309394

**MAPE (Mean Absolute Percentage Error)**

In [63]:
# Cross-check of the hand-computed MAPE using scikit-learn (returned as a fraction, not a percent).
mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)

0.4417625307589062