In [159]:
import pandas as pd

# Load the insurance records from the CSV next to the notebook and preview
# the first five rows.
source_csv = 'insurance.csv'
df = pd.read_csv(source_csv)
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [160]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [161]:
# ETL

# One-hot encode `sex`: add one 0/1 indicator column per category
# (`is_male`, then `is_female`). The original `sex` column is left in place.
for sex_value in ('male', 'female'):
    df[f'is_{sex_value}'] = df['sex'].eq(sex_value).astype(int)
df.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges,is_male,is_female
0,19,female,27.9,0,yes,southwest,16884.924,0,1
1,18,male,33.77,1,no,southeast,1725.5523,1,0
2,28,male,33.0,3,no,southeast,4449.462,1,0
3,33,male,22.705,0,no,northwest,21984.47061,1,0
4,32,male,28.88,0,no,northwest,3866.8552,1,0


In [162]:
# Recode `smoker` as a 0/1 flag: 1 where the value is 'yes', 0 for anything else.
# NOTE: the column is overwritten, so this cell is not idempotent — running it
# again on the already-numeric column compares integers to 'yes' and sets
# every row to 0.
smoker_is_yes = df['smoker'].eq('yes')
df['smoker'] = smoker_is_yes.astype(int)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,is_male,is_female
0,19,female,27.9,0,1,southwest,16884.924,0,1
1,18,male,33.77,1,0,southeast,1725.5523,1,0
2,28,male,33.0,3,0,southeast,4449.462,1,0
3,33,male,22.705,0,0,northwest,21984.47061,1,0
4,32,male,28.88,0,0,northwest,3866.8552,1,0


In [163]:
# One-hot encode `region`: one 0/1 indicator column `is_<region>` per distinct
# value, added in order of first appearance in the data.
regions = df['region'].unique()
df = df.assign(**{
    f'is_{name}': df['region'].eq(name).astype(int)
    for name in regions
})
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,is_male,is_female,is_southwest,is_southeast,is_northwest,is_northeast
0,19,female,27.9,0,1,southwest,16884.924,0,1,1,0,0,0
1,18,male,33.77,1,0,southeast,1725.5523,1,0,0,1,0,0
2,28,male,33.0,3,0,southeast,4449.462,1,0,0,1,0,0
3,33,male,22.705,0,0,northwest,21984.47061,1,0,0,0,1,0
4,32,male,28.88,0,0,northwest,3866.8552,1,0,0,0,1,0


In [164]:
# Remove the raw `sex` and `region` columns, which are now represented by the
# is_* indicator columns, together with `bmi`.
# NOTE(review): `bmi` is numeric and has no encoded replacement, so dropping it
# removes it from the feature set entirely — confirm this is intentional.
df = df.drop(columns=['sex', 'bmi', 'region'])
df.head()

Unnamed: 0,age,children,smoker,charges,is_male,is_female,is_southwest,is_southeast,is_northwest,is_northeast
0,19,0,1,16884.924,0,1,1,0,0,0
1,18,1,0,1725.5523,1,0,0,1,0,0
2,28,3,0,4449.462,1,0,0,1,0,0
3,33,0,0,21984.47061,1,0,0,0,1,0
4,32,0,0,3866.8552,1,0,0,0,1,0


In [165]:
# Shuffle Data
# Sampling 100% of the rows without replacement returns them in random order.
# A fixed seed keeps the shuffle — and therefore the train/validation split
# and the fitted weights that follow — identical on every Restart & Run All.
df = df.sample(frac=1, random_state=42)
df.head()

Unnamed: 0,age,children,smoker,charges,is_male,is_female,is_southwest,is_southeast,is_northwest,is_northeast
17,23,0,0,2395.17155,1,0,0,0,0,1
1090,47,0,1,41676.0811,1,0,0,1,0,0
687,40,0,0,5438.7491,1,0,0,1,0,0
56,58,2,0,13607.36875,0,1,0,0,0,1
149,19,1,0,1842.519,1,0,1,0,0,0


In [166]:
# Partition the rows by position: the first 2/3 become the training set and
# the remaining 1/3 the validation set.
split_index = 2 * len(df) // 3

training_df, validating_df = df.iloc[:split_index], df.iloc[split_index:]

training_df.shape, validating_df.shape

((892, 10), (446, 10))

In [167]:
# Format as training data matrix
import numpy as np

# Target vectors: the `charges` column of each split as a NumPy array.
Y_tr, Y_v = (
    split['charges'].to_numpy()
    for split in (training_df, validating_df)
)

Y_tr.shape, Y_v.shape

((892,), (446,))

In [168]:
# Feature matrices: every column of each split except the target `charges`,
# converted to a NumPy array.
X_tr, X_v = (
    split.drop(columns='charges').to_numpy()
    for split in (training_df, validating_df)
)

X_tr.shape, X_v.shape

((892, 9), (446, 9))

In [169]:
# Fit the linear-regression weights W that minimise ||X_tr @ W - Y_tr||^2.
#
# The indicator columns are collinear: is_male + is_female and the sum of the
# is_<region> columns are presumably both 1 on every row (true whenever each
# row has a 'male'/'female' sex and a non-missing region), so X_tr is
# rank-deficient and the normal equations have no unique solution.
# np.linalg.lstsq solves the least-squares problem directly and, like the
# pseudo-inverse, returns the minimum-norm solution; rcond=None uses the
# machine-precision-based cutoff for treating small singular values as zero.
W = np.linalg.lstsq(X_tr, Y_tr, rcond=None)[0]
W

array([  269.92319717,   547.26352131, 22770.30175103, -1916.80930141,
       -1785.79211232, -1323.32870223,  -148.9521489 , -1399.01211024,
        -831.30845237])