## Predicting the Optimal APR for e-Car
### Nomis Solutions - LT 12

### Part 2 - Ridge Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

### Reading Data

In [2]:
# Load the NomisB workbook; cells that contain a single space are parsed as missing (NaN).
raw = pd.read_excel('NomisB.xlsx', na_values=' ')

In [3]:
# Preview the first rows of the raw sheet.
raw.head()

Unnamed: 0,Tier,FICO,Approve Date,Term,Amount,Previous Rate,Car Type,Competition rate,Outcome,Rate,Cost of Funds,Partner Bin
0,1,743,2002-07-01,36,19100.0,,N,4.95,1,4.85,1.8388,1
1,1,752,2002-07-01,60,36460.08,,N,5.65,1,5.49,1.8388,1
2,1,778,2002-07-01,48,12000.0,,U,5.85,1,5.85,1.8388,3
3,2,724,2002-07-01,60,19290.0,,N,5.65,1,5.39,1.8388,3
4,2,700,2002-07-01,72,24461.12,,N,6.25,1,6.99,1.8388,3


In [4]:
# Report (rows, columns), then list the column labels.
# NOTE: the 'Car  Type' label contains two consecutive spaces (see the output below).
print(raw.shape)
raw.columns

(208085, 12)


Index(['Tier', 'FICO', 'Approve Date', 'Term', 'Amount', 'Previous Rate',
       'Car  Type', 'Competition rate', 'Outcome', 'Rate', 'Cost of Funds',
       'Partner Bin'],
      dtype='object')

In [None]:
# Build the modelling frame from the raw sheet as a single pipeline.
df = (
    raw.copy()
    # Missing values become 0. Intended for Previous Rate, but note that
    # fillna(0) is applied to every column of the frame.
    .fillna(0)
    # The approval date is not used as a feature.
    .drop(columns='Approve Date')
    # Partner Bin is categorical, so it is one-hot encoded by get_dummies
    # together with any remaining non-numeric columns.
    .assign(**{'Partner Bin': lambda frame: frame['Partner Bin'].astype('category')})
    .pipe(pd.get_dummies)
    # Keep only rows whose Amount is greater than 10.
    .loc[lambda frame: frame.Amount > 10]
)

### Segmenting Data based on Outcome and Tier

In [14]:
# combi = (Outcome, Tier)
combi = [(1,1),(1,2),(1,3),(1,4),(0,1),(0,2),(0,3),(0,4)]

# Group once and reuse the GroupBy object; rebuilding the grouping for every
# 'X' and every 'y' repeats the same work for each of the eight segments.
grouped = df.groupby(['Outcome', 'Tier'])

# Xy[(outcome, tier)] -> {'X': feature columns, 'y': Rate} for that segment.
# Outcome and Rate are removed from the features: Outcome is constant within
# a segment and Rate is the target.
Xy = {
    key: {
        'X': segment.drop(['Outcome', 'Rate'], axis=1),
        'y': segment.Rate,
    }
    for key, segment in ((k, grouped.get_group(k)) for k in combi)
}

### Applying Ridge Regression

In [44]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

In [45]:
from sklearn.preprocessing import MinMaxScaler
# Shared MinMaxScaler instance; the per-tier cells below refit it with fit_transform.
# NOTE(review): train_ridge compares the name `scaler` against the strings 'ON'/'OFF';
# while an estimator instance is bound here, neither comparison is true.
scaler = MinMaxScaler()

In [None]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    """Impute missing values in a DataFrame, column by column.

    Columns of dtype object are imputed with the most frequent value
    in column.

    Columns of other types are imputed with mean of column.

    An object column that has no non-missing values has no most frequent
    value; its fill value is NaN, so the column is left unchanged.
    """

    def __init__(self):
        # No hyper-parameters: the fill values are learned in fit().
        pass

    def fit(self, X, y=None):
        """Learn one fill value per column of X and store them in self.fill.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns define the fill values.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """

        def _fill_value(column):
            if column.dtype == np.dtype('O'):
                counts = column.value_counts()
                # value_counts() drops missing entries, so an all-missing
                # column yields an empty result; indexing it would raise.
                return counts.index[0] if len(counts) else np.nan
            return column.mean()

        self.fill = pd.Series([_fill_value(X[c]) for c in X],
                              index=X.columns)

        return self

    def transform(self, X, y=None):
        """Return a copy of X with missing values replaced by the learned fill values."""
        return X.fillna(self.fill)



# X_transition = pd.DataFrame(df_UPCAT)
# X_clean = DataFrameImputer().fit_transform(X_transition)

# X_clean.isnull().sum()

In [46]:
def _plot_coef_magnitudes(mean_coefs, feature_names):
    """Draw a horizontal bar chart of |coefficient|, sorted ascending and labelled by feature."""
    # Ridge.coef_ is 1-D for a 1-D target and 2-D for a 2-D target;
    # atleast_2d(...)[0] yields the first target's coefficients in both cases.
    magnitudes = np.abs(np.atleast_2d(mean_coefs)[0])
    order = np.argsort(magnitudes)
    positions = np.arange(len(magnitudes))
    fig, ax = plt.subplots(figsize=(3,5))
    ax.barh(positions, magnitudes[order])
    ax.set_yticks(positions)
    ax.set_yticklabels(np.asarray(feature_names)[order])
    return ax


def train_ridge(X, y, n_trials=None, alphas=None):
    """Sweep the Ridge regularisation strength over repeated train/test splits.

    For every seed in range(n_trials) the data is split 75/25 with
    train_test_split(random_state=seed), and one Ridge model is fitted per
    alpha. Test scores (Ridge.score, i.e. R^2) are averaged over the trials
    for each alpha.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Target values.
    n_trials : int, optional
        Number of random splits. When omitted, the notebook-level name
        `Number_trials` is used, which must be defined before the call.
    alphas : sequence of float, optional
        Regularisation strengths to try. Defaults to the grid used in this
        notebook.

    Returns
    -------
    list
        ['Ridge', best mean test score, 'C = <alpha with the best mean test score>'].

    Raises
    ------
    ValueError
        If n_trials is smaller than 1 or alphas is empty.

    Notes
    -----
    A coefficient-magnitude chart is drawn only when the notebook-level name
    `scaler` equals the string 'ON' (labels taken from `X_clean.columns`) or
    'OFF' (labels taken from `X.columns`). The plotted coefficients are the
    mean over every trial and every alpha, not those of the best alpha.
    """
    if alphas is None:
        alphas = [1e-8, 1e-4, 1e-3, 1e-2, 0.1, 0.2,0.4, 0.75, 1, 1.5, 3, 5, 10, 15,  20, 100, 300, 1000, 5000]
    if n_trials is None:
        n_trials = Number_trials
    if n_trials < 1:
        raise ValueError('n_trials must be at least 1')
    if len(alphas) == 0:
        raise ValueError('alphas must not be empty')

    test_scores = []   # one row per trial, one column per alpha
    all_coefs = []     # coefficients of every fitted model

    for seed in range(n_trials):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)
        trial_scores = []
        for alpha in alphas:
            ridge = Ridge(alpha=alpha).fit(X_train, y_train)
            trial_scores.append(ridge.score(X_test, y_test))
            all_coefs.append(ridge.coef_)
        test_scores.append(trial_scores)

    mean_coefs = np.mean(all_coefs, axis=0)
    score = np.mean(test_scores, axis=0)

    # `scaler` and `X_clean` are read from notebook scope.
    if scaler == 'ON':
        _plot_coef_magnitudes(mean_coefs, X_clean.columns)
    elif scaler == 'OFF':
        _plot_coef_magnitudes(mean_coefs, X.columns)

    return ['Ridge', np.amax(score), \
            'C = {0}'.format(alphas[np.argmax(score)])]

In [61]:
# Min-max scale the features (the scaler is refit on this data), then run the
# Ridge sweep. X_1 / y_1 are not defined in the visible cells; presumably the
# tier-1 segment.
# This section evaluates Ridge, so train_ridge is called; train_logistic is
# not defined anywhere in this notebook.
X_1 = scaler.fit_transform(X_1)
train_ridge(X_1, y_1)

['Logistic (Ridge)', 0.75806238637038, 'C = 1']

In [62]:
# Min-max scale the features (the scaler is refit on this data), then run the Ridge sweep.
# X_2 / y_2 are not defined in the visible cells; presumably the tier-2 segment.
X_2 = scaler.fit_transform(X_2)
train_ridge(X_2, y_2)

['Logistic (Ridge)', 0.37408844412330033, 'C = 5']

In [63]:
# Min-max scale the features (the scaler is refit on this data), then run the Ridge sweep.
# X_3 / y_3 are not defined in the visible cells; presumably the tier-3 segment.
X_3 = scaler.fit_transform(X_3)
train_ridge(X_3, y_3)

['Logistic (Ridge)', 0.3824678499314161, 'C = 1e-08']

In [64]:
# Min-max scale the features (the scaler is refit on this data), then run the Ridge sweep.
# X_4 / y_4 are not defined in the visible cells; presumably the tier-4 segment.
X_4 = scaler.fit_transform(X_4)
train_ridge(X_4, y_4)

['Logistic (Ridge)', 0.2824667820736828, 'C = 1e-08']

| Tier | Accuracy (Ridge, mean test R²) | Best Parameter |
| ------------- |:-------------:| -----:|
| 1 | 0.75806238637038 | C = 1 |
| 2 | 0.37408844412330033 | C = 5 |
| 3 | 0.3824678499314161 | C = 1e-08 |
| 4 | 0.2824667820736828 | C = 1e-08 |

**Ridge is rejected because its accuracy (test R²) is lower than that of the other regression techniques.**