![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)

# <center> Machine Learning Methods </center>
## <center> Exercise 04 - Insurance Regression - Solution </center>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/MachineLearningMethod/Exercises/Exercise04_Regression_Solution.ipynb)

In [1]:
#-- Widen the notebook container to 80% of the browser window.
#-- `display` and `HTML` come from the public `IPython.display` module
#-- (importing `display` from `IPython.core.display` is deprecated).
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
import numpy             as np
import pandas            as pd
import seaborn           as sns
import matplotlib.pyplot as plt

### Get data:
From Kaggle: https://www.kaggle.com/mirichoi0218/insurance

In [3]:
#-- Load the insurance table (relative path, resolved against the working directory):
sDataFile = 'insurance.csv'
dData     = pd.read_csv(sDataFile)
dData

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


#### Rename the columns:

In [4]:
#-- Relabel the columns (positional: the list order must match the column order of the loaded table):
lColNames     = ['Age', 'Sex', 'BMI', 'NumberOfChildren', 'Smoker', 'Region', 'Charges']
dData.columns = lColNames
dData

Unnamed: 0,Age,Sex,BMI,NumberOfChildren,Smoker,Region,Charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


#### Create dummy variables:

In [5]:
#-- One-hot encode the categorical columns, dropping the first level of each.
#-- The dtype is pinned so the dummies are numeric 0/1 on every pandas version
#-- (newer versions default to boolean dummies, which would turn the mixed-dtype
#-- `.values` array used later into an object array):
dData = pd.get_dummies(dData, drop_first=True, dtype=np.uint8)

#-- Move charges to the last column:
lCols = dData.columns.drop('Charges').tolist() + ['Charges']
dData = dData[lCols]
dData

Unnamed: 0,Age,BMI,NumberOfChildren,Sex_male,Smoker_yes,Region_northwest,Region_southeast,Region_southwest,Charges
0,19,27.900,0,0,1,0,0,1,16884.92400
1,18,33.770,1,1,0,0,1,0,1725.55230
2,28,33.000,3,1,0,0,1,0,4449.46200
3,33,22.705,0,1,0,1,0,0,21984.47061
4,32,28.880,0,1,0,1,0,0,3866.85520
...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,1,0,1,0,0,10600.54830
1334,18,31.920,0,0,0,0,0,0,2205.98080
1335,18,36.850,0,0,0,0,1,0,1629.83350
1336,21,25.800,0,0,0,0,0,1,2007.94500


In [6]:
#-- Features: the region dummies are left out
#-- (use `dData.drop(columns='Charges')` instead to keep every feature):
dTrainX = dData.drop(columns=['Charges', 'Region_northwest', 'Region_southeast', 'Region_southwest'])

#-- Work on explicit float copies, so the in-place standardization below never
#-- writes into `dData` through a shared buffer (and does not fail when pandas
#-- hands out read-only views):
mX = np.array(dTrainX,          dtype=float)
vY = np.array(dData['Charges'], dtype=float)

#-- Standardize to zero mean and unit variance:
mX -= np.mean(mX, axis=0)
mX /= np.std (mX, axis=0)
vY -= np.mean(vY)
vY /= np.std (vY)

#-- `dTrainY` is the standardized target as a Series (same values as `vY`),
#-- so cells that fit on `dTrainY` and score against `vY` stay consistent:
dTrainY = pd.Series(vY, index=dData.index, name='Charges')

mX.shape, vY.shape

((1338, 5), (1338,))

### Exercise 1:
* Train a linear regressor (with or without polynomial features).
* Try to get the best $R^2$ score with $K=50$ fold cross-validation.
* Hint: Consider using $L^1$ regularization (lasso).

In [7]:
from sklearn.preprocessing   import PolynomialFeatures
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics         import r2_score
from sklearn.linear_model    import LinearRegression
from sklearn.pipeline        import make_pipeline

#-- Cross validation regression:
#-- One seeded splitter is shared by all degrees, so every model is scored on
#-- the same 50 folds and the table is reproducible between runs.
oKFold = KFold(n_splits=50, shuffle=True, random_state=0)

#-- Collect one record per degree and build the table once at the end:
lRes = []
for P in [1, 2, 3]:
    oPolyFit = make_pipeline(
        PolynomialFeatures(degree=P, include_bias=False),
        LinearRegression  ()
    )
    vHatY = cross_val_predict(oPolyFit, mX, vY, cv=oKFold)
    lRes.append({'P': P, 'R2': r2_score(vY, vHatY)})

dRes = pd.DataFrame(lRes, columns=['P', 'R2'])
dRes.sort_values(by='R2', ascending=False)

Unnamed: 0,P,R2
1,2.0,0.839179
2,3.0,0.834932
0,1.0,0.746876


In [8]:
from sklearn.linear_model import Lasso

#-- Cross validation regression with Lasso:
#-- One seeded splitter is shared by the whole grid, so every (P, lam) pair is
#-- scored on the same 50 folds and the table is reproducible between runs.
oKFold = KFold(n_splits=50, shuffle=True, random_state=0)

#-- Regularization strengths, log-spaced between 10^-2.5 and 10^0:
vLam = np.logspace(-2.5, 0, 15)

#-- Collect one record per grid point and build the table once at the end:
lRes = []
for P in [1, 2, 3]:
    for lam in vLam:
        oLasso = make_pipeline(
            PolynomialFeatures(degree=P, include_bias=False),
            Lasso             (alpha=lam, max_iter=10000)
        )
        vHatY = cross_val_predict(oLasso, mX, vY, cv=oKFold)
        lRes.append({'P': P, 'lam': lam, 'R2': r2_score(vY, vHatY)})

dRes = pd.DataFrame(lRes, columns=['P', 'lam', 'R2'])
dRes.sort_values(by='R2', ascending=False)

Unnamed: 0,P,lam,R2
30,3.0,0.003162,0.841141
33,3.0,0.010857,0.840906
32,3.0,0.007197,0.840858
31,3.0,0.004771,0.840777
34,3.0,0.016379,0.840272
16,2.0,0.004771,0.840236
17,2.0,0.007197,0.839849
15,2.0,0.003162,0.839682
18,2.0,0.010857,0.839266
35,3.0,0.024709,0.839084


### Exercise 2:
* Try non-parametric models.
* Hint: Consider splitting your data into two subsets and train two regressors (one for each set).
* Improve your $R^2$ score from exercise 1.

#### Regression tree:

In [9]:
from sklearn.tree import DecisionTreeRegressor

#-- Leave-one-out evaluation (one fold per sample) of a small regression tree.
#-- The tree is seeded (same seed as the tree of the split model) so that ties
#-- between equally good splits are broken the same way on every run:
oTree = DecisionTreeRegressor(max_leaf_nodes=11, random_state=0)
vHatY = cross_val_predict(oTree, mX, vY, cv=KFold(len(vY), shuffle=True))
print(f'Tree = {r2_score(vY, vHatY)}')

Tree = 0.8438097797190999


#### Split on smokers:
`SplitRegressor` trains a tree on the smokers and Lasso on the non-smokers:

In [10]:
from sklearn.base import BaseEstimator

class SplitRegressor(BaseEstimator):
    """Two-model regressor that routes each sample by its smoker feature.

    Rows whose column `smokerIdx` is strictly positive (smokers) are handled by
    a `DecisionTreeRegressor`; all other rows (non-smokers) are handled by a
    `Lasso` model.

    Parameters
    ----------
    dTreeArg : dict
        Keyword arguments forwarded to `DecisionTreeRegressor`.
    dLassoArg : dict
        Keyword arguments forwarded to `Lasso`.
    smokerIdx : int, default=4
        Column of the feature matrix holding the smoker indicator.
        NOTE(review): the `> 0` test assumes this column is positive for
        smokers only (e.g. a standardized 0/1 dummy) - confirm for new data.
    """

    def __init__(self, dTreeArg, dLassoArg, smokerIdx=4):
        self.dTreeArg   = dTreeArg
        self.dLassoArg  = dLassoArg
        self.smokerIdx  = smokerIdx

        #-- Kept so the sub-models exist right after construction;
        #-- `fit` rebuilds them from the current parameters.
        self.oTree      = DecisionTreeRegressor(**dTreeArg)
        self.oLasso     = Lasso                (**dLassoArg)

    def _smokerMask(self, mX):
        """Return the boolean mask of the rows routed to the tree (smokers)."""
        return mX[:,self.smokerIdx] > 0

    def fit(self, mX, vY):
        """Fit the tree on the smoker rows and the Lasso on the remaining rows.

        Each subset is passed to its estimator's `fit` as is.
        Returns `self`, following the scikit-learn convention.
        """
        mX   = np.asarray(mX)
        vY   = np.asarray(vY)
        vIdx = self._smokerMask(mX)

        #-- Rebuild the sub-models so that parameters changed after construction
        #-- (e.g. through `set_params`) are actually used:
        self.oTree  = DecisionTreeRegressor(**self.dTreeArg)
        self.oLasso = Lasso                (**self.dLassoArg)

        #-- Smokers:
        self.oTree .fit(mX[vIdx,:],  vY[vIdx])

        #-- Non-smokers:
        self.oLasso.fit(mX[~vIdx,:], vY[~vIdx])

        return self

    def predict(self, mX):
        """Predict each row with the model of its group.

        Returns a float vector with one prediction per row of `mX`.
        """
        mX    = np.asarray(mX)
        vIdx  = self._smokerMask(mX)
        vHatY = np.full(mX.shape[0], np.nan)

        #-- Each sub-model is queried only when its group is non-empty
        #-- (a test fold may hold a single sample):
        if vIdx.any():
            vHatY[vIdx]  = self.oTree .predict(mX[vIdx,:])
        if not vIdx.all():
            vHatY[~vIdx] = self.oLasso.predict(mX[~vIdx,:])

        return vHatY

    def get_params(self, deep=True):
        """Return the constructor arguments (`deep` is accepted but not used)."""
        return {'dTreeArg': self.dTreeArg, 'dLassoArg': self.dLassoArg, 'smokerIdx': self.smokerIdx}


In [11]:
#-- Hyper-parameters of the two sub-models:
dTreeArg  = dict(max_leaf_nodes=11, random_state=0)
dLassoArg = dict(alpha=0.010, max_iter=1000)

#-- Evaluate the split model with one fold per sample:
oReg      = SplitRegressor(dTreeArg, dLassoArg)
oFolds    = KFold(len(vY), shuffle=True)
vHatY     = cross_val_predict(oReg, mX, vY, cv=oFolds)
print(f'Split = {r2_score(vY, vHatY)}')

Split = 0.8531773120840362


### Gradient boosting:
We have not learned this method yet.

In [12]:
from sklearn.ensemble import GradientBoostingRegressor

#-- Fit and score on the same standardized arrays (`mX`, `vY`) as the other
#-- models, instead of relying on `dTrainY` happening to hold the standardized
#-- target. The model and the splitter are seeded for reproducible results:
oGBR  = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, random_state=0)
vHatY = cross_val_predict(oGBR, mX, vY, cv=KFold(n_splits=50, shuffle=True, random_state=0))
print(f'GBR = {r2_score(vY, vHatY)}')

GBR = 0.8622714429226772
