In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score



In [2]:
# NOTE(review): absolute, machine-specific location -- the notebook only runs
# where this exact file exists; consider a project-relative data directory.
DATA_PATH = "/home/andrew/VS/glubinnaya-avtomatizaciya/lectures/insurance.csv"

df = pd.read_csv(DATA_PATH)
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()
df.head(15)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [3]:
# Encode the two yes/no style text columns as 0/1 integers.
# A value that is already 1 stays 1 (and 0 stays 0), so re-running this cell
# leaves the columns unchanged.
df["sex"] = df["sex"].map(lambda value: int(value == "male" or value == 1))
df["smoker"] = df["smoker"].map(lambda value: int(value == "yes" or value == 1))
df.head(15)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552
5,31,0,25.74,0,0,southeast,3756.6216
6,46,0,33.44,1,0,southeast,8240.5896
7,37,0,27.74,3,0,northwest,7281.5056
8,37,1,29.83,2,0,northeast,6406.4107
9,60,0,25.84,0,0,northwest,28923.13692


In [4]:
# One-hot encode `region` into integer indicator columns prefixed with "rg_".
# NOTE(review): every category keeps its own column, so the indicators sum to 1
# on each row and are linearly dependent with the intercept column that grad()
# prepends -- the fitted weights are then not unique (predictions are unaffected).
df_regions = pd.get_dummies(df["region"], prefix="rg", dtype=int)
df_regions.head()

Unnamed: 0,rg_northeast,rg_northwest,rg_southeast,rg_southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0


In [5]:
# Feature matrix: the numeric/encoded columns of df followed by the region indicators.
X = pd.concat(
    [df[["age", "sex", "bmi", "children", "smoker"]], df_regions],
    axis=1,
)
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,rg_northeast,rg_northwest,rg_southeast,rg_southwest
0,19,0,27.9,0,1,0,0,0,1
1,18,1,33.77,1,0,0,0,1,0
2,28,1,33.0,3,0,0,0,1,0
3,33,1,22.705,0,0,0,1,0,0
4,32,1,28.88,0,0,0,1,0,0


In [6]:
# Regression target: the `charges` column.
Y = df["charges"]
Y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [7]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=5,
)
X_test

Unnamed: 0,age,sex,bmi,children,smoker,rg_northeast,rg_northwest,rg_southeast,rg_southwest
471,18,0,30.115,0,0,1,0,0,0
1250,24,1,29.830,0,1,1,0,0,0
1257,54,0,27.645,1,0,0,1,0,0
139,22,0,36.000,0,0,0,0,0,1
919,35,0,34.210,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...
825,64,0,31.825,2,0,1,0,0,0
1275,57,1,23.700,0,0,0,0,0,1
60,43,1,27.360,3,0,1,0,0,0
1215,18,1,39.140,0,0,1,0,0,0


In [13]:
def MSE(X, Y, vec):
    """Mean squared error of the linear prediction ``X.dot(vec)`` against ``Y``.

    X   -- object exposing ``.dot`` (e.g. a NumPy array) used as the design matrix.
    Y   -- observed targets; ``len(Y)`` is the averaging denominator.
    vec -- coefficient vector multiplied into ``X``.

    Returns the sum of squared residuals scaled by ``1 / len(Y)``.
    """
    residuals = X.dot(vec) - Y
    squared_total = np.sum(residuals ** 2)
    n_obs = len(Y)
    return (1. / n_obs) * squared_total


def grad(X,Y,lr,iter,seed=None):
    """Fit linear-regression weights by batch gradient descent on the MSE loss.

    A column of ones is prepended to ``X`` so the model includes an intercept;
    the returned weight vector therefore has ``X.shape[1] + 1`` entries with
    the intercept first.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Feature matrix (NumPy array or DataFrame); converted to float.
    Y : array-like, length m
        Targets; flattened to 1-D so a column-shaped input cannot broadcast
        the residual into an (m, m) matrix.
    lr : float
        Learning rate.
    iter : int
        Number of gradient steps. (The name shadows the ``iter`` builtin; it
        is kept so existing keyword callers continue to work.)
    seed : int or None, optional
        Seed for the random initial weights. ``None`` (default) draws from
        NumPy's global random state exactly as before, so callers that use
        ``np.random.seed`` are unaffected.

    Returns
    -------
    track : ndarray, shape (iter, 1)
        MSE after each update step.
    params : ndarray, shape (n + 1,)
        Final weights, intercept first.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, is empty, or its row count differs from ``len(Y)``.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float).ravel()
    if X.ndim != 2:
        raise ValueError("X must be 2-dimensional, got shape %s" % (X.shape,))
    m = X.shape[0]
    if m == 0:
        raise ValueError("X must contain at least one row")
    if Y.shape[0] != m:
        raise ValueError(
            "X and Y disagree on the number of samples: %d vs %d" % (m, Y.shape[0])
        )

    X = np.hstack((np.ones((m, 1)), X))
    if seed is None:
        # Legacy path: global RNG, identical to the previous behaviour.
        params = np.random.rand(X.shape[1])
    else:
        # Same generator family as np.random.rand, but isolated and seeded.
        params = np.random.RandomState(seed).rand(X.shape[1])
    track = np.zeros((iter, 1))

    # Loop-invariant step factor of the MSE gradient: (2 / m) * lr.
    step = 2. / m * lr
    for i in range(iter):
        params = params - (step * np.dot(X.T, (np.dot(X, params) - Y)))
        track[i] = MSE(X, Y, params)

    return track, params


In [9]:
def pred(X, params):
    """Return linear predictions for ``X`` using weights whose first entry is the intercept.

    A column of ones is placed in front of ``X`` before the dot product, so
    ``params`` must have one more entry than ``X`` has columns.
    """
    n_rows = X.shape[0]
    bias = np.ones((n_rows, 1))
    design = np.concatenate((bias, X), axis=1)
    return design.dot(params)

In [14]:
LEARNING_RATE = 0.0003
N_ITERATIONS = 100000
SEED = 5

# grad() draws its starting weights from NumPy's global random state, so fix
# that state here to make the fit and the reported score repeatable.
np.random.seed(SEED)
track, weights = grad(X_train, Y_train, LEARNING_RATE, N_ITERATIONS)

Y_pred = pred(X_test, weights)

print("R-squared score:", r2_score(Y_test, Y_pred))

R-squared score: 0.7603189828707034
