In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge

from sklearn.metrics import r2_score

In [3]:
from sklearn.ensemble import RandomForestRegressor

In [4]:
df = pd.read_csv('./data/Credit.csv')

In [5]:
df.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Own,Student,Married,Region,Balance
0,14.891,3606,283,2,34,11,No,No,Yes,South,333
1,106.025,6645,483,3,82,15,Yes,Yes,Yes,West,903
2,104.593,7075,514,4,71,11,No,No,No,West,580
3,148.924,9504,681,3,36,11,Yes,No,No,West,964
4,55.882,4897,357,2,68,16,No,No,Yes,South,331


In [6]:
cols = df.columns

In [7]:
cols

Index(['Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education', 'Own',
       'Student', 'Married', 'Region', 'Balance'],
      dtype='object')

In [8]:
region_codes = {'North': 1, 'South': 2, 'East': 3, 'West': 4}

In [9]:
region_c = [region_codes[r] for r in df['Region']]

In [10]:
married_c = [1 if c == 'Yes' else 0 for c in df['Married'] ]
student_c =[1 if c == 'Yes' else 0 for c in df['Married'] ]
own_c = [1 if c == 'Yes' else 0 for c in df['Married'] ]

In [11]:
df['Region'] = region_c
df['Married'] = married_c
df['Student'] = student_c
df['Own'] = own_c
df.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Own,Student,Married,Region,Balance
0,14.891,3606,283,2,34,11,1,1,1,2,333
1,106.025,6645,483,3,82,15,1,1,1,4,903
2,104.593,7075,514,4,71,11,0,0,0,4,580
3,148.924,9504,681,3,36,11,0,0,0,4,964
4,55.882,4897,357,2,68,16,1,1,1,2,331


In [12]:
# The target is to predict the Median House Value from the 8 features collected
X = df[['Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education', 'Own',
       'Student', 'Married', 'Region']]
y = df[['Balance' ]]
len(X), len(y)

(400, 400)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Linear Regression Model

In [14]:
reg = LinearRegression()

In [15]:
reg.fit(X_train, y_train)

LinearRegression()

In [16]:
pred_linReg = reg.predict(X_test)

In [17]:
r2_linReg = r2_score(y_test, pred_linReg)
r2_linReg

0.866476314394418

In [18]:
linReg = {'r2_linReg': r2_linReg}

## Lasso Models L1 Regularization

In [19]:
reg_lso = Lasso()

In [20]:
reg_lso.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


Lasso()

In [21]:
pred_lso = reg_lso.predict(X_test)

In [22]:
r2_lasso = r2_score(y_test, pred_lso)
r2_lasso

0.8666219046655093

In [23]:
lasso = {'r2_lasso': r2_lasso}

### Reduce the amount of L1 Regularization

In [24]:
reg_lso_2 = Lasso(alpha=0.5)

In [25]:
reg_lso_2.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


Lasso(alpha=0.5)

In [26]:
pred_lso_2 = reg_lso_2.predict(X_test)

In [27]:
r2_lasso_2 = r2_score(y_test, pred_lso_2)
r2_lasso_2

0.8665646484121408

In [28]:
lasso_2 = {'r2_lasso_2': r2_lasso_2}

## Ridge Model L2 Regularization

In [29]:
reg_ridge = Ridge()

In [30]:
reg_ridge.fit(X_train, y_train)

Ridge()

In [31]:
pred_ridge = reg_ridge.predict(X_test)

In [32]:
r2_ridge = r2_score(y_test, pred_ridge)
r2_ridge

0.8664884109167852

In [33]:
ridge = {'r2_ridge': r2_ridge}

### Reduced Ridge Regression

In [34]:
reg_ridge_2 = Ridge(alpha=0.5)

In [35]:
reg_ridge_2.fit(X_train, y_train)

Ridge(alpha=0.5)

In [36]:
pred_ridge_2 = reg_ridge_2.predict(X_test)

In [37]:
r2_ridge_2 = r2_score(y_test, pred_ridge_2)
r2_ridge_2

0.8664823932417149

In [38]:
ridge_2 = {'r2_ridge_2': r2_ridge_2}

## Compare non-linear model: Random Forest

In [39]:
reg_rf = RandomForestRegressor()

In [40]:
reg_rf.fit(X_train, y_train)
pred_reg_rf = reg_rf.predict(X_test)
r2_reg_rf = r2_score(y_test, pred_reg_rf)
r2_reg_rf

  reg_rf.fit(X_train, y_train)


0.8599866293456915

In [41]:
r2_rf = {'r2_reg_rf': r2_reg_rf}

## Compare the various models

In [42]:
obj = [linReg, lasso, lasso_2, ridge, ridge_2, r2_rf]

In [43]:
o2 = {}
names = []
for d in obj:
    for k,v in d.items():
        names.append(k)
        o2[k] = v

In [44]:
o2

{'r2_linReg': 0.866476314394418,
 'r2_lasso': 0.8666219046655093,
 'r2_lasso_2': 0.8665646484121408,
 'r2_ridge': 0.8664884109167852,
 'r2_ridge_2': 0.8664823932417149,
 'r2_reg_rf': 0.8599866293456915}

In [45]:
d2 = pd.DataFrame(o2, index=['R2'])

In [46]:
d2 = d2.T
d2

Unnamed: 0,R2
r2_linReg,0.866476
r2_lasso,0.866622
r2_lasso_2,0.866565
r2_ridge,0.866488
r2_ridge_2,0.866482
r2_reg_rf,0.859987


In [47]:
d3 = d2.sort_values(by='R2')
d3

Unnamed: 0,R2
r2_reg_rf,0.859987
r2_linReg,0.866476
r2_ridge_2,0.866482
r2_ridge,0.866488
r2_lasso_2,0.866565
r2_lasso,0.866622
