In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Path of the raw insurance records, resolved relative to the working directory.
insurance_data_path = 'insurance.csv'
df = pd.read_csv(filepath_or_buffer=insurance_data_path)
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0.0,yes,southwest,16884.924
1,18.0,male,33.77,1.0,no,Southeast,1725.5523
2,28.0,male,33.0,3.0,no,southeast,$4449.462
3,33.0,male,22.705,0.0,no,northwest,$21984.47061
4,32.0,male,28.88,0.0,no,northwest,$3866.8552


In [144]:
# Inspect five random rows; the fixed seed keeps the displayed sample stable across re-runs.
df.sample(5, random_state=42)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
565,19.0,female,30.495,0.0,no,Northwest,2128.43105
309,,,,,no,,7749.1564
1105,54.0,female,31.24,0.0,no,Southeast,$10338.9316
153,42.0,F,23.37,0.0,yes,northeast,19964.7463
1023,18.0,male,23.32,1.0,no,Southeast,1711.0268


In [145]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children
count,1272.0,1272.0,1272.0
mean,35.214623,30.56055,0.948899
std,22.478251,6.095573,1.303532
min,-64.0,15.96,-4.0
25%,24.75,26.18,0.0
50%,38.0,30.21,1.0
75%,51.0,34.485,2.0
max,64.0,53.13,5.0


In [146]:
# (rows, columns) of the frame at this point, i.e. before any rows are dropped.
df.shape

(1338, 7)

In [147]:
# Number of missing entries in each column.
df.isna().sum()

age         66
sex         66
bmi         66
children    66
smoker      66
region      66
charges     54
dtype: int64

In [148]:
# Discard, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

In [149]:
# Tally the distinct labels used in the 'sex' column.
df['sex'].value_counts()

male      495
female    479
M          61
woman      60
man        57
F          56
Name: sex, dtype: int64

In [150]:
# Map the alternative spellings onto the two canonical labels 'male' and 'female'.
sex_aliases = {
    'M': 'male',
    'man': 'male',
    'F': 'female',
    'woman': 'female',
}
df['sex'] = df['sex'].replace(to_replace=sex_aliases)

In [151]:
# Tally the distinct labels used in the 'region' column (the comparison is case-sensitive).
df['region'].value_counts()

Southeast    164
southeast    158
southwest    157
Northwest    149
Northeast    149
northeast    146
northwest    145
Southwest    140
Name: region, dtype: int64

In [152]:
# Lower-case every region label so that capitalised variants merge with the lower-case ones.
region_lowercase = df['region'].str.lower()
df['region'] = region_lowercase

In [153]:
# Tally the distinct raw values in 'charges'; some entries carry a '$' prefix.
df['charges'].value_counts()

1639.5631       2
19798.05455     1
7633.7206       1
4564.19145      1
8125.7845       1
               ..
12730.9996      1
7345.084        1
26109.32905     1
$28287.89766    1
29141.3603      1
Name: charges, Length: 1207, dtype: int64

In [154]:
# Remove '$' characters from the charge values, then convert the column to float.
# The pattern is a raw string: '\$' in a normal string literal is an unrecognised escape
# sequence, which Python deprecates (SyntaxWarning on recent versions).
df['charges'] = df['charges'].replace({r'\$': ''}, regex=True).astype(float)

In [155]:
# Keep only rows with a strictly positive age.
# .copy() makes the filtered frame independent of the original, so the in-place
# assignments in later cells do not raise SettingWithCopyWarning.
df = df[df['age'] > 0].copy()

In [156]:
# Negative child counts are overwritten with 0.
negative_children = df['children'] < 0
df.loc[negative_children, 'children'] = 0

In [157]:
# Show the first ten rows after cleaning.
df.head(n=10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0.0,yes,southwest,16884.924
1,18.0,male,33.77,1.0,no,southeast,1725.5523
2,28.0,male,33.0,3.0,no,southeast,4449.462
3,33.0,male,22.705,0.0,no,northwest,21984.47061
4,32.0,male,28.88,0.0,no,northwest,3866.8552
6,46.0,female,33.44,1.0,no,southeast,8240.5896
7,37.0,female,27.74,3.0,no,northwest,7281.5056
8,37.0,male,29.83,2.0,no,northeast,6406.4107
9,60.0,female,25.84,0.0,no,northwest,28923.13692
10,25.0,male,26.22,0.0,no,northeast,2721.3208


In [158]:
# Features: every column except the target.
X = df.drop('charges', axis=1)
# Target: the charges column as a plain NumPy array.
y = df['charges'].to_numpy()

In [159]:
# Display the feature frame (all columns of df except 'charges').
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19.0,female,27.900,0.0,yes,southwest
1,18.0,male,33.770,1.0,no,southeast
2,28.0,male,33.000,3.0,no,southeast
3,33.0,male,22.705,0.0,no,northwest
4,32.0,male,28.880,0.0,no,northwest
...,...,...,...,...,...,...
1332,52.0,female,44.700,3.0,no,southwest
1333,50.0,male,30.970,3.0,no,northwest
1335,18.0,female,36.850,0.0,no,southeast
1336,21.0,female,25.800,0.0,no,southwest


In [160]:
# One-hot encode the categorical columns; drop_first removes the first level of each
# so that level acts as the baseline.
categorical_columns = ['sex', 'smoker', 'region']
X = pd.get_dummies(data=X, columns=categorical_columns, drop_first=True)

In [161]:
# NOTE(review): the hold-out split below is disabled; the following cells scale, fit and
# cross-validate on the full X / y instead. Confirm this is intended, or remove the dead code.
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [162]:
# Fit the scaler on the full feature frame, then standardise those same features with it.
scaler = StandardScaler()
scaler.fit(X)
X_train_scaled = scaler.transform(X)

In [163]:
# Linear regression estimator (LinearRegression is imported in the top import cell);
# it is fitted in a later cell.
insurance_model = LinearRegression()

In [164]:
# Train the regressor on the standardised features against the charges target.
insurance_model.fit(X=X_train_scaled, y=y)

In [165]:
# 5-fold cross-validated R^2: score each fold, then average the fold scores.
fold_r2_scores = cross_val_score(
    estimator=insurance_model,
    X=X_train_scaled,
    y=y,
    scoring='r2',
    cv=5,
)
r2_score = fold_r2_scores.mean()
print(r2_score)

0.7450511466263763


In [166]:
# Load the records that the fitted model will score.
validation_data_path = 'validation_dataset.csv'
validation_data = pd.read_csv(filepath_or_buffer=validation_data_path)

In [167]:
# Preview the first five validation rows.
validation_data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region
0,18.0,female,24.09,1.0,no,southeast
1,39.0,male,26.41,0.0,yes,northeast
2,27.0,male,29.15,0.0,yes,southeast
3,71.0,male,65.502135,13.0,yes,southeast
4,28.0,male,38.06,0.0,no,southeast


In [168]:
# One-hot encode the validation categoricals, then keep exactly the columns the model was
# trained on, in the same order as the training feature frame X.
# Encoding without drop_first and selecting X.columns afterwards makes the baseline level
# come from the training data rather than from whichever levels happen to be present in
# the validation file. Training dummies that are absent here are added as all-zero columns.
# Any other column (e.g. a prediction column from an earlier run) is discarded.
# NOTE(review): a label never seen in training is silently treated as the baseline level;
# this assumes the validation labels are already normalised like the training ones - confirm.
validation_data_processed = (
    pd.get_dummies(validation_data, columns=['sex', 'smoker', 'region'])
    .reindex(columns=X.columns, fill_value=0)
)

In [169]:
# Preview the first five encoded validation rows.
validation_data_processed.head(n=5)

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,18.0,24.09,1.0,0,0,0,1,0
1,39.0,26.41,0.0,1,1,0,0,0
2,27.0,29.15,0.0,1,1,0,1,0
3,71.0,65.502135,13.0,1,1,0,1,0
4,28.0,38.06,0.0,1,0,0,1,0


In [170]:
# Apply the already-fitted scaler (no re-fitting) to the encoded validation features.
validation_data_scaled = scaler.transform(X=validation_data_processed)

In [171]:
# Display the scaled validation matrix returned by scaler.transform.
validation_data_scaled

array([[-1.49780685, -1.06228573, -0.01460684, -1.01490657, -0.50977291,
        -0.57031528,  1.65242836, -0.56629467],
       [-0.01444691, -0.68328419, -0.85376971,  0.98531238,  1.96165779,
        -0.57031528, -0.60516996, -0.56629467],
       [-0.86208116, -0.23567029, -0.85376971,  0.98531238,  1.96165779,
        -0.57031528,  1.65242836, -0.56629467],
       [ 2.24591108,  5.70291392, 10.05534767,  0.98531238,  1.96165779,
        -0.57031528,  1.65242836, -0.56629467],
       [-0.79144497,  1.21989167, -0.85376971,  0.98531238, -0.50977291,
        -0.57031528,  1.65242836, -0.56629467],
       [ 2.17527489,  6.92098164,  8.37702192, -1.01490657,  1.96165779,
        -0.57031528,  1.65242836, -0.56629467],
       [-0.72080879,  0.2478834 ,  0.82455604, -1.01490657, -0.50977291,
         1.75341612, -0.60516996, -0.56629467],
       [ 0.19746165,  1.753271  , -0.01460684, -1.01490657, -0.50977291,
        -0.57031528, -0.60516996, -0.56629467],
       [ 0.62127877,  0.97729801

In [172]:
# Score the scaled validation rows with the fitted regressor.
validation_predictions = insurance_model.predict(X=validation_data_scaled)

In [173]:
# Print the raw predictions; the model can produce values below zero.
print(validation_predictions)

[ 5.08145323e+02  3.09475219e+04  2.79511577e+04  5.62912747e+04
  7.14781488e+03  5.79103386e+04  6.86674598e+03  1.32008289e+04
  1.25622278e+04  1.60103318e+04  2.47955042e+03  1.41312263e+04
  1.12910614e+04  1.17636998e+04  2.72753388e+03  3.88834298e+03
  4.21541033e+04  6.31746715e+04  5.90067368e+04  1.12519210e+04
 -3.74694268e+01  1.27956086e+04  3.22729435e+04  1.19103531e+04
  9.62603298e+03  5.20132736e+03  5.79979893e+04  3.20779862e+03
  1.16521876e+04  1.04440590e+04  6.33921457e+03  2.73393018e+04
  3.07973229e+04  1.31006769e+04  3.20955549e+04  1.38471222e+04
  5.81292369e+04  1.42619327e+04 -6.13811082e+01  2.93137915e+04
  3.00891218e+04  1.17745982e+04  3.74138961e+03  5.99121028e+04
  5.92043604e+03  3.98596861e+04  6.73235648e+04  3.10107650e+04
  1.51906124e+04  3.55183846e+04]


In [174]:
# Store the predictions on the validation frame as the 'predicted_charges' column.
validation_data['predicted_charges'] = validation_predictions

In [175]:
# Apply a floor to the predictions: any value under 1000 is raised to 1000.
below_floor = validation_data['predicted_charges'] < 1000
validation_data.loc[below_floor, 'predicted_charges'] = 1000

In [176]:
# Preview the first five validation rows together with their floored predictions.
validation_data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,predicted_charges
0,18.0,female,24.09,1.0,no,southeast,1000.0
1,39.0,male,26.41,0.0,yes,northeast,30947.521922
2,27.0,male,29.15,0.0,yes,southeast,27951.157717
3,71.0,male,65.502135,13.0,yes,southeast,56291.274683
4,28.0,male,38.06,0.0,no,southeast,7147.814884
