## Predicting the Optimal APR for e-Car
### Nomis Solutions - LT 12

### Part 2 - Ridge Regression

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from tqdm.autonotebook import tqdm
from tqdm import trange
from time import sleep
from sklearn.model_selection import learning_curve, GridSearchCV


### Reading Data

In [2]:
raw = pd.read_excel('NomisB.xlsx', na_values=' ')

In [3]:
print(raw.shape)
raw.columns

(208085, 12)


Index(['Tier', 'FICO', 'Approve Date', 'Term', 'Amount', 'Previous Rate',
       'Car  Type', 'Competition rate', 'Outcome', 'Rate', 'Cost of Funds',
       'Partner Bin'],
      dtype='object')

In [5]:
df = raw.copy()

# Previous Rate NA = 0
df = df.fillna(0)

# Drop date
df = df.drop('Approve Date', axis=1)

# Partner Bin is categorical
df['Partner Bin'] = df['Partner Bin'].astype('category')
df = pd.get_dummies(df)

# Drop Amount that is too small
df = df[df.Amount>10]

### Segmenting Data based on Tiers

In [6]:
# combi = (Outcome, Tier)
combi = [(1,1),(1,2),(1,3),(1,4),(0,1),(0,2),(0,3),(0,4)]

Xy = {i : { 
            'X' : df.groupby(['Outcome', 'Tier']).get_group(i).drop(['Outcome','Rate'], axis=1),
            'y' : df.groupby(['Outcome', 'Tier']).get_group(i).Rate
          }
      for i in combi}

In [20]:
def svr_reg(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)
    SV=LinearSVR(epsilon=0.2, C=2.0)
    SV.fit(X_train,y_train)
    y_pred_SVR = SV.predict(X_test)
    return [SV.score(X_train, y_train), SV.score(X_test, y_test)]

In [21]:
X1 = Xy[1,1]['X'].drop('Tier', axis=1)
y1 = Xy[1,1]['y']

X2 = Xy[1,2]['X'].drop('Tier', axis=1)
y2 = Xy[1,2]['y']

X3 = Xy[1,3]['X'].drop('Tier', axis=1)
y3 = Xy[1,3]['y']

X4 = Xy[1,4]['X'].drop('Tier', axis=1)
y4 = Xy[1,4]['y']

### Linear Support Vector Regression

In [24]:
cols = ['Train Accuracy', 'Test Accuracy']
df2 = pd.DataFrame(columns=cols)

df2.loc[1] = svr_reg(X1, y1)
df2.loc[2] = svr_reg(X2, y2)
df2.loc[3] = svr_reg(X3, y3)
df2.loc[4] = svr_reg(X4, y4)
df2




Unnamed: 0,Train Accuracy,Test Accuracy
1,0.321532,0.309449
2,-3.430527,-3.344043
3,-5.187751,-4.775225
4,-0.026164,-0.032138


In [29]:
def svr_reg_rbf(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1 )
    SV=SVR(kernel='rbf', C=1.0, epsilon=0.2)
    SV.fit(X_train,y_train)
    y_pred_SVR = SV.predict(X_test)
    return ['Tier', SV.score(X_train, y_train), SV.score(X_test, y_test)]

### Non-Linear Support Vector Regression (RBF)

In [32]:
cols = ['Tier','Train Accuracy', 'Test Accuracy']
df2 = pd.DataFrame(columns=cols)

df2.loc[1] = svr_reg_rbf(X1, y1)
df2.loc[2] = svr_reg_rbf(X2, y2)
df2.loc[3] = svr_reg_rbf(X3, y3)
df2.loc[4] = svr_reg_rbf(X4, y4)
df2



Unnamed: 0,Tier,Train Accuracy,Test Accuracy
1,Tier,0.846327,0.078933
2,Tier,0.873798,0.010525
3,Tier,0.742217,-0.012593
4,Tier,0.662752,0.001385


In [33]:
def svr_reg_poly(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1 )
    SV=SVR(kernel='poly', C=1.0, epsilon=0.2)
    SV.fit(X_train,y_train)
    y_pred_SVR = SV.predict(X_test)
    return ['Tier', SV.score(X_train, y_train), SV.score(X_test, y_test)]

In [None]:
cols = ['Tier','Train Accuracy', 'Test Accuracy']
df3 = pd.DataFrame(columns=cols)

df3.loc[1] = svr_reg_poly(X1, y1)
df3.loc[2] = svr_reg_poly(X2, y2)
df3.loc[3] = svr_reg_poly(X3, y3)
df3.loc[4] = svr_reg_poly(X4, y4)
df3



**Rejecting SVR since the accuracy is lower than the other regression techniques**