In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("/data/insurance.csv")
df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df.region.unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [24]:
target = "charges"
X = df.drop(columns=[target]) 
# Feature matrix, does not contain the target var

y = df[target] 
# label - dependent variable

In [27]:
X_dummy = pd.get_dummies(X, drop_first=True) 
# One hot encoding for categorical variables
X_dummy.head()

Unnamed: 0,age,bmi,children,gender_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,0,1,0,0,1
1,18,33.77,1,1,0,0,1,0
2,28,33.0,3,1,0,0,1,0
3,33,22.705,0,1,0,1,0,0
4,32,28.88,0,1,0,1,0,0


z_score = (x - x.mean())/x.std(), for every column x independently

Z_score gives 0 mean and 1 standard deviation for each column

For some algorithm, z-scoring the features is mandatory (KMean, PCA)
For some, it is recommended - gradient descent 
For some, it is not required (decision tree, random forest)

Main motivation for z-scoring is to speed up the computation during training.


In [7]:
import numpy as np

In [104]:
x = np.array([1000, 10000, 20000, 1000000])
x

array([   1000,   10000,   20000, 1000000])

In [105]:
np.mean(x)

257750.0

In [11]:
x - 4.6

array([-0.6, -2.6, -4.6,  3.4,  4.4])

In [12]:
(x - 4.6) ** 2

array([ 0.36,  6.76, 21.16, 11.56, 19.36])

In [14]:
var = ((x - 4.6) ** 2).mean()
var

11.84

In [16]:
std = np.sqrt(var)
std

3.4409301068170506

In [17]:
np.std(x)

3.4409301068170506

In [106]:
z = (x - x.mean())/np.std(x)
z

array([-0.59905607, -0.57805703, -0.55472475,  1.73183785])

In [107]:
np.mean(z)

-5.551115123125783e-17

In [108]:
np.std(z)

0.9999999999999999

In [None]:
sqrt(((x - x.mean()) ** 2).mean())

In [22]:
from sklearn import preprocessing

In [33]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_dummy)
X_std = pd.DataFrame(scaler.transform(X_dummy))
X_std.describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,-1.853691e-16,-1.869041e-16,2.7216230000000002e-17,-7.567439e-17,2.142448e-16,-2.846087e-16,1.186561e-16,1.481127e-16
std,1.000374,1.000374,1.000374,1.000374,1.000374,1.000374,1.000374,1.000374
min,-1.509965,-2.412011,-0.9086137,-1.010519,-0.5074631,-0.5664179,-0.6113237,-0.5664179
25%,-0.8691547,-0.7164063,-0.9086137,-1.010519,-0.5074631,-0.5664179,-0.6113237,-0.5664179
50%,-0.01474046,-0.0432088,-0.07876719,0.9895908,-0.5074631,-0.5664179,-0.6113237,-0.5664179
75%,0.8396738,0.6611572,0.7510793,0.9895908,-0.5074631,-0.5664179,1.635795,-0.5664179
max,1.765289,3.685522,3.240619,0.9895908,1.970587,1.765481,1.635795,1.765481


In [34]:
from sklearn import model_selection

In [109]:
# splitting the data into training and test sets. 
# training dataset - we build the model
# testing dataset - we validate or test the model performance
X_train, X_test, y_train, y_test = \
    model_selection.train_test_split(X_dummy, y
                     , test_size = 0.3 # controls the size of test data ... it a number between 0 and 1
                     , random_state = 1 # reproduce the the randomness
                    )

In [71]:
X_train.shape

(936, 8)

In [72]:
X_test.shape

(402, 8)

In [74]:
936/len(X)

0.6995515695067265

In [70]:
np.random.seed(1)
np.random.random(5)

array([4.17022005e-01, 7.20324493e-01, 1.14374817e-04, 3.02332573e-01,
       1.46755891e-01])

In [75]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_train) # calculates the mean and std for each column
X_train_std = scaler.transform(X_train) # calculates the z -score based 
X_test_std = scaler.transform(X_test) # calculates the z-score by re-using the mean and std of the training dataset

In [77]:
pd.DataFrame(X_train_std, columns=X_dummy.columns)

Unnamed: 0,age,bmi,children,gender_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,0.797152,-0.702114,-0.904002,0.976766,-0.509664,1.788102,-0.600387,-0.587220
1,-1.271085,-0.703758,-0.085679,-1.023787,-0.509664,-0.559252,-0.600387,1.702939
2,-0.985811,-0.733340,-0.904002,0.976766,-0.509664,-0.559252,-0.600387,-0.587220
3,0.797152,-0.414510,1.550967,-1.023787,-0.509664,-0.559252,1.665591,-0.587220
4,0.868471,1.482037,-0.085679,0.976766,-0.509664,-0.559252,-0.600387,1.702939
5,-0.985811,1.749098,-0.904002,-1.023787,-0.509664,-0.559252,-0.600387,-0.587220
6,-0.843174,0.117970,-0.904002,-1.023787,1.962076,-0.559252,-0.600387,1.702939
7,1.724293,-0.608437,-0.904002,-1.023787,1.962076,1.788102,-0.600387,-0.587220
8,-1.485041,0.525546,-0.904002,-1.023787,-0.509664,-0.559252,1.665591,-0.587220
9,-1.485041,-1.248563,-0.904002,0.976766,-0.509664,-0.559252,-0.600387,-0.587220


In [78]:
from sklearn import linear_model

In [80]:
lr = linear_model.LinearRegression() # helps find the theta values 
lr.fit(X_train_std, y_train) # Number of theta depends on the features in X

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [81]:
lr.coef_, lr.intercept_

(array([3528.98273114, 1961.65520818,  421.55016415, -141.35911014,
        9733.78688309, -129.54588605, -414.54148288, -379.09534369]),
 13276.698553898505)

In [82]:
pd.DataFrame({"feature": X_dummy.columns
            , "coeffficient": lr.coef_})

Unnamed: 0,feature,coeffficient
0,age,3528.982731
1,bmi,1961.655208
2,children,421.550164
3,gender_male,-141.35911
4,smoker_yes,9733.786883
5,region_northwest,-129.545886
6,region_southeast,-414.541483
7,region_southwest,-379.095344


In [94]:
"y_hat = " + str(lr.intercept_) + " + " + " + ".join(["%f %s" % (theta, feature) 
          for feature, theta  in zip(X_dummy.columns, lr.coef_)])

'y_hat = 13276.698553898505 + 3528.982731 age + 1961.655208 bmi + 421.550164 children + -141.359110 gender_male + 9733.786883 smoker_yes + -129.545886 region_northwest + -414.541483 region_southeast + -379.095344 region_southwest'

In [95]:
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [97]:
y_train_pred = lr.predict(X_train_std)
y_test_pred = lr.predict(X_test_std)

In [100]:
result = pd.DataFrame({"actual": y_test, "prediction": y_test_pred})
result["error"] = result.actual - result.prediction
result.head()

Unnamed: 0,actual,prediction,error
559,1646.4297,4610.315541,-2963.885841
1087,11353.2276,12887.89388,-1534.66628
1020,8798.593,12573.948752,-3775.355752
460,10381.4787,13197.836626,-2816.357926
802,2103.08,629.337182,1473.742818


In [101]:
SSE = (result.error ** 2).sum() # Squared sum of errors
SSE

14778105453.509336

In [102]:
MSE = SSE / len(result)
MSE

36761456.352013275

In [103]:
RMSE = np.sqrt(MSE)
RMSE

6063.122656850452

In [110]:
SST = ((y_test - y_train.mean()) ** 2).sum()

In [113]:
print(SSE/SST)

"""
What is expected?
A. Positive
B. < 1 
""";

0.2594002695488974


In [114]:
r2 = 1 - SSE/SST # Expected value range between 0 to 1. 1 is the best value.
r2 # Goodness of the model

0.7405997304511026

In [115]:
from sklearn import metrics

In [123]:
training_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
training_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)

print("Training rmse: ", training_rmse)
print("Test rmse: ", test_rmse)
print("Training r2: ", training_r2)
print("Test r2: ", test_r2)

Training rmse:  6039.6018713245385
Test rmse:  6063.122656850452
Training r2:  0.7545557492633161
Test r2:  0.7405989316927211


In [124]:
target = "charges"
# Feature matrix, does not contain the target var
X = df.drop(columns=[target]) 

# label - dependent variable
y = df[target] 

# One hot encoding for categorical variables
X_dummy = pd.get_dummies(X, drop_first=True) 

# splitting the data into training and test sets. 
# training dataset - we build the model
# testing dataset - we validate or test the model performance
X_train, X_test, y_train, y_test = \
    model_selection.train_test_split(X_dummy, y
                     , test_size = 0.3 # controls the size of test data ... it a number between 0 and 1
                     , random_state = 1 # reproduce the the randomness
                    )
                    
scaler = preprocessing.StandardScaler()
scaler.fit(X_train) # calculates the mean and std for each column
X_train_std = scaler.transform(X_train) # calculates the z -score based 
X_test_std = scaler.transform(X_test) # calculates the z-score by re-using the mean and std of the training dataset


lr = linear_model.LinearRegression() # helps find the theta values 
lr.fit(X_train_std, y_train) # Number of theta depends on the features in X

# Calculate the prediction
y_train_pred = lr.predict(X_train_std)
y_test_pred = lr.predict(X_test_std)

training_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
training_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)

print("Training rmse: ", training_rmse)
print("Test rmse: ", test_rmse)
print("Training r2: ", training_r2)
print("Test r2: ", test_r2)




Training rmse:  6039.6018713245385
Test rmse:  6063.122656850452
Training r2:  0.7545557492633161
Test r2:  0.7405989316927211
