In [1]:
import pandas as pd

In [11]:
# Load the insurance dataset from CSV into a DataFrame and preview the
# first ten rows.
df = pd.read_csv(filepath_or_buffer="/data/insurance.csv")
df.head(n=10)

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [24]:
from sklearn import model_selection, preprocessing, linear_model

In [70]:
# Name of the dependent variable (target / label) column.
label = "charges"

# Independent variables (features / predictors): every column except the
# target, with categorical columns one-hot encoded into numeric indicators.
# drop_first=True drops the first level of each categorical column because
# it is implied by the remaining levels.
X = pd.get_dummies(df.drop(columns=[label]), drop_first=True)

# Target vector.
y = df[label]

# Hold out 30% of the rows as the test set. random_state pins the shuffle so
# the same train/test partition is reproduced on every run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X.values,
    y,
    test_size=0.3,
    random_state=1,
)

# Fraction of all rows that ended up in the training set.
print("size of training:", len(X_train) / len(X))

# Z-score every feature column. The per-column mean and standard deviation are
# learned from the training rows only and then applied to those same rows.
scaler = preprocessing.StandardScaler()
X_train_std = scaler.fit_transform(X_train)

# Summary statistics of the standardized training features (mean ~ 0, std ~ 1).
# NOTE: this is not the last expression in the cell, so its result is computed
# but not rendered; move it to its own cell to actually see the table.
pd.DataFrame(X_train_std).describe()

# Apply the training-set scaling parameters to the test rows.
X_test_std = scaler.transform(X_test)

# Ordinary least squares regression on the standardized features.
est = linear_model.LinearRegression().fit(X_train_std, y_train)

# Predictions for both partitions, used by the evaluation cells that follow.
y_train_pred = est.predict(X_train_std)
y_test_pred = est.predict(X_test_std)

size of training: 0.6995515695067265


In [71]:
# Actual vs. predicted target values for the test rows. The residual is defined
# here as prediction minus actual, so a positive value means over-prediction.
summary = pd.DataFrame({"actual": y_test, "prediction": y_test_pred}).assign(
    residual=lambda frame: frame.prediction - frame.actual
)
summary

Unnamed: 0,actual,prediction,residual
559,1646.42970,4610.315541,2963.885841
1087,11353.22760,12887.893880,1534.666280
1020,8798.59300,12573.948752,3775.355752
460,10381.47870,13197.836626,2816.357926
802,2103.08000,629.337182,-1473.742818
298,38746.35510,32357.257584,-6389.097516
481,9304.70190,12853.778438,3549.076538
616,11658.11505,12273.662540,615.547490
763,3070.80870,3865.164045,794.355345
750,19539.24300,29904.111392,10364.868392


In [72]:
# Pair each feature name with its fitted regression coefficient.
# NOTE(review): X.columns and est.coef_ must have the same length, so this cell
# only works while X and est come from the same model cell; later cells in this
# notebook reassign both names.
pd.DataFrame({"feature": X.columns, "coefficient": est.coef_})

Unnamed: 0,feature,coefficient
0,age,3528.982731
1,bmi,1961.655208
2,children,421.550164
3,gender_male,-141.35911
4,smoker_yes,9733.786883
5,region_northwest,-129.545886
6,region_southeast,-414.541483
7,region_southwest,-379.095344


In [65]:
# Intercept of the fitted regression. The model is fit on z-scored
# (zero-mean) training features, so for ordinary least squares this is the
# training-set mean of the target.
est.intercept_

13276.698553898497

In [31]:
import numpy as np

In [33]:
# Mean absolute error: average magnitude of the residuals in `summary`.
mae = summary.residual.abs().mean()
mae

4139.932064766016

In [34]:
# Mean squared error: average of the squared residuals in `summary`.
mse = (summary.residual ** 2).mean()
mse

36761456.35201327

In [35]:
# Root mean squared error: the square root of the MSE, which brings the error
# back to the same units as the target.
rmse = np.sqrt(mse)
rmse

6063.122656850451

In [36]:
# Mean of the training targets; this is the prediction of the mean-only
# baseline used when computing R2.
y_train_mean = y_train.mean()

In [39]:
# Total sum of squares: squared error of the baseline that always predicts the
# training mean.
sst = ((y_train - y_train_mean) ** 2).sum()
# Sum of squared errors of the fitted model on the training rows.
sse = ((y_train - y_train_pred) ** 2).sum()
# Share of the baseline's squared error that the model leaves unexplained.
sse / sst

0.2454442507366839

In [40]:
# Coefficient of determination: one minus the ratio of the model's squared
# error (sse) to the squared error of the mean-only baseline (sst). It uses
# whatever sse/sst currently hold; in notebook order those are the
# training-set sums from the preceding cell.
r2 = 1 - sse/sst
r2

0.7545557492633161

In [42]:
# R2 on the held-out test rows, computed by hand.
# The sums use their own *_test names so the training-set sst/sse (read by the
# `r2 = 1 - sse/sst` cell) are not overwritten; re-running that cell after this
# one therefore still yields the training R2.
y_test_mean = np.mean(y_test)
# Squared error of the baseline that always predicts the test mean.
sst_test = np.sum((y_test_mean - y_test) ** 2)
# Squared error of the model's test predictions.
sse_test = np.sum((y_test_pred - y_test) ** 2)
r2_test = 1 - sse_test/sst_test
r2_test

0.740598931692721

In [43]:
from sklearn import metrics

In [69]:
# Cross-check the hand-computed metrics against scikit-learn's implementations.
# R2 for each partition:
print("R2 training:", metrics.r2_score(y_true=y_train, y_pred=y_train_pred))
print("R2 test:", metrics.r2_score(y_true=y_test, y_pred=y_test_pred))

# RMSE for each partition (square root of the mean squared error):
print("Rmse training:", np.sqrt(metrics.mean_squared_error(y_true=y_train, y_pred=y_train_pred)))
print("Rmse: testing", np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_test_pred)))

R2 training: 0.754558730164975
R2 test: 0.7406196610211806
Rmse training: 6039.56519596171
Rmse: testing 6062.8803930600625


In [52]:
# dependent variable/target varible/label
label = "charges"

# independepent variables/features/predictors
X = df.drop(columns=[label])

X["high_bmi"] = np.where(X.bmi>30, 1, 0)
X["high_age"] = np.where(X.age>50, 1, 0)

# vector for target variable
y = np.log(df[label])

# one hot encoding to conver categorical features into numeric
# drop_first: remove first categorical feature ... it is redundant
X = pd.get_dummies(X, drop_first=True)

# Creating trainnig and test sets
# test_size is 30% of the whole
# random_state: to reprduce the same combination of training and test records
X_train, X_test, y_train, y_test = model_selection.train_test_split(X.values, 
                                                                    
                                                                    y, test_size = 0.3
                                                        , random_state = 1)

print("size of training:", len(X_train)/len(X))

# We want to calcualate z score for each column 
scaler = preprocessing.StandardScaler()

# We calcualte mean and std dev for each column
scaler.fit(X_train)

# Calculating the z scores
# purpose of z scoring is make mean = 0 and std = 1 for each column
X_train_std = scaler.transform(X_train)

# Displaying the mean and standard deviation of the stadandarized features
pd.DataFrame(X_train_std).describe()

# Applying the same transformation on the test data
X_test_std = scaler.transform(X_test)

# building a regression model
est = linear_model.LinearRegression()
est.fit(X_train_std, y_train)

# prediction on training and test data
y_train_pred = est.predict(X_train_std)
y_test_pred = est.predict(X_test_std)

print("R2 training:", metrics.r2_score(y_train, y_train_pred))
print("R2 test:", metrics.r2_score(y_test, y_test_pred))

print("Rmse training:", np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
print("Rmse: testing", np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

size of training: 0.6995515695067265
R2 training: 0.7597949596783904
R2 test: 0.7890454142619109
Rmse training: 0.4509134476136262
Rmse: testing 0.4212505908115039


In [73]:
# dependent variable/target varible/label
label = "charges"

# independepent variables/features/predictors
X = df.drop(columns=[label])

X["high_bmi"] = np.where(X.bmi>30, 1, 0)
X["high_age"] = np.where(X.age>50, 1, 0)

# vector for target variable
y = np.log(df[label])

# one hot encoding to conver categorical features into numeric
# drop_first: remove first categorical feature ... it is redundant
X = pd.get_dummies(X, drop_first=True)

# Creating trainnig and test sets
# test_size is 30% of the whole
# random_state: to reprduce the same combination of training and test records
X_train, X_test, y_train, y_test = model_selection.train_test_split(X.values, 
                                                                    
                                                                    y, test_size = 0.3
                                                        , random_state = 1)

print("size of training:", len(X_train)/len(X))

# We want to calcualate z score for each column 
scaler = preprocessing.StandardScaler()

poly = preprocessing.PolynomialFeatures(degree=2, include_bias=False)


X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

scaler.fit(X_train_poly)
# Calculating the z scores
# purpose of z scoring is make mean = 0 and std = 1 for each column
X_train_std = scaler.transform(X_train_poly)


# Displaying the mean and standard deviation of the stadandarized features
pd.DataFrame(X_train_std).describe()

# Applying the same transformation on the test data
X_test_std = scaler.transform(X_test_poly)

# building a regression model
est = linear_model.LinearRegression()
est.fit(X_train_std, y_train)

# prediction on training and test data
y_train_pred = est.predict(X_train_std)
y_test_pred = est.predict(X_test_std)

print("R2 training:", metrics.r2_score(y_train, y_train_pred))
print("R2 test:", metrics.r2_score(y_test, y_test_pred))

print("Rmse training:", np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
print("Rmse: testing", np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

size of training: 0.6995515695067265
R2 training: 0.8371423307076078
R2 test: 0.8747843201744885
Rmse training: 0.37128400233098263
Rmse: testing 0.32454535338893015


In [74]:
# (rows, columns) of the standardized training matrix currently in memory.
X_train_std.shape

(936, 65)

In [56]:
# Small worked example of a degree-2 polynomial expansion.
# Each row of `a` has three values (x0, x1, x2).
a = np.array([[1, 0, 1], [1, 2, 3], [-1, 2, 1], [2, 3, 4]])

# include_bias=False leaves out the constant column, so every row expands to
# x0, x1, x2, x0^2, x0*x1, x0*x2, x1^2, x1*x2, x2^2.
poly = preprocessing.PolynomialFeatures(include_bias=False, degree=2)
poly.fit_transform(a)

array([[ 1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  1.,  2.,  3.,  4.,  6.,  9.],
       [-1.,  2.,  1.,  1., -2., -1.,  4.,  2.,  1.],
       [ 2.,  3.,  4.,  4.,  6.,  8.,  9., 12., 16.]])

In [75]:
from sklearn import pipeline

In [78]:
# dependent variable/target varible/label
label = "charges"

# independepent variables/features/predictors
X = df.drop(columns=[label])

X["high_bmi"] = np.where(X.bmi>30, 1, 0)
X["high_age"] = np.where(X.age>50, 1, 0)

# vector for target variable
y = np.log(df[label])

# one hot encoding to conver categorical features into numeric
# drop_first: remove first value of a categorical feature ... it is redundant
X = pd.get_dummies(X, drop_first=True)

# Creating trainnig and test sets
# test_size is 30% of the whole
# random_state: to reprduce the same combination of training and test records
X_train, X_test, y_train, y_test = model_selection.train_test_split(X.values, 
                                                                    
                                                                    y, test_size = 0.3
                                                        , random_state = 1)


pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.LinearRegression())
])

pipe.fit(X_train, y_train)

# prediction on training and test data
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

print("R2 training:", metrics.r2_score(y_train, y_train_pred))
print("R2 test:", metrics.r2_score(y_test, y_test_pred))

print("Rmse training:", np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
print("Rmse: testing", np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

R2 training: 0.8371423307076078
R2 test: 0.8747843201744885
Rmse training: 0.37128400233098263
Rmse: testing 0.32454535338893015
