## Predicting the Optimal APR for e-Car
### Nomis Solutions - LT 12

### Part 2 - Random Forest Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from tqdm.autonotebook import tqdm
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV



### Reading Data

In [9]:
# Load the workbook; cells holding a single space are read as missing (NaN).
raw = pd.read_excel(io='NomisB.xlsx', na_values=[' '])

In [157]:
# Quick look at the loaded sheet: (rows, columns) first, then the column labels.
print(raw.shape)
raw.columns

(208085, 12)


Index(['Tier', 'FICO', 'Approve Date', 'Term', 'Amount', 'Previous Rate',
       'Car  Type', 'Competition rate', 'Outcome', 'Rate', 'Cost of Funds',
       'Partner Bin'],
      dtype='object')

In [158]:
# Build the modelling frame from `raw` in one pass; `raw` itself is left untouched,
# so this cell can be re-run safely.
df = (
    raw
    # Zero-fill every missing value. This is what turns a missing Previous Rate
    # into 0, but note that it applies to ALL columns, not just that one.
    .fillna(0)
    # The approval date is not used as a feature.
    .drop(columns='Approve Date')
    # Partner Bin is a label, not a quantity: make it categorical so that
    # get_dummies one-hot encodes it (together with any other object-typed column).
    .astype({'Partner Bin': 'category'})
    .pipe(pd.get_dummies)
    # Keep only rows whose Amount exceeds 10; smaller amounts are treated as too small.
    .loc[lambda frame: frame.Amount > 10]
)

### Segmenting Data by Outcome and Tier

In [159]:
# Segment keys are (Outcome, Tier) pairs.
combi = [(1,1),(1,2),(1,3),(1,4),(0,1),(0,2),(0,3),(0,4)]

# Group once and reuse the grouping for every key, instead of re-grouping the
# whole frame twice per segment.
segments = df.groupby(['Outcome', 'Tier'])


def _features_and_target(segment):
    """Split one (Outcome, Tier) segment into model inputs and the Rate target.

    Returns a dict with:
      'X' - the segment without the Outcome, Rate and Tier columns
      'y' - the segment's Rate column
    """
    return {
        'X': segment.drop(['Outcome', 'Rate', 'Tier'], axis=1),
        'y': segment.Rate,
    }


# Xy[(outcome, tier)] -> {'X': features, 'y': target}
Xy = {key: _features_and_target(segments.get_group(key)) for key in combi}

In [162]:
# Scratch check of the tier labels (1-4) that the loops below iterate over.
list(range(1,5))

[1, 2, 3, 4]

### Applying Random Forest

In [163]:
# Hyper-parameter grid searched for every tier (5 depths x 3 forest sizes).
param_grids = {
    'max_depth': [2, 5, 10, 20, 30],
    'n_estimators': [10, 20, 50],
}

# One tuned random forest per tier, trained on the Outcome == 1 segment only.
# results[(1, tier)] holds the fitted search, its best parameters and its
# score on the held-out 25 % split.
results = {}
for i in tqdm(range(1,5)):
    segment = Xy[(1,i)]
    X, y = segment['X'], segment['y']

    # Fixed seed so the 75/25 split is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0
    )

    est = RandomForestRegressor(random_state=0)
    gs_cv = GridSearchCV(est, param_grids, n_jobs=-1, cv=5)
    gs_cv.fit(X_train, y_train)

    # NOTE: no `scoring` is passed, so `score` is the regressor's default
    # metric (R^2) rather than a classification accuracy, despite the 'acc' key.
    results[(1,i)] = dict(
        model=gs_cv,
        best_params=gs_cv.best_params_,
        acc=gs_cv.score(X_test, y_test),
    )

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

In [164]:
# For every tier, re-price the Outcome == 0 rows with the model trained on the
# Outcome == 1 rows of the same tier, then count a row as "captured" when the
# predicted rate is below both its recorded Rate and its Competition rate.
add_profit = {}
for i in tqdm(range(1,5)):
    features = Xy[(0,i)]['X']
    predicted = np.round(results[(1,i)]['model'].predict(features), 2)

    # Features, original Rate and predicted rate side by side on a fresh 0..n-1 index.
    exp = pd.concat(
        [
            features.reset_index(drop=True),
            Xy[(0,i)]['y'].reset_index(drop=True),
            pd.DataFrame({'Predicted rate': predicted}),
        ],
        axis=1,
    )

    below_quoted = exp['Predicted rate'] < exp['Rate']
    below_competition = exp['Predicted rate'] < exp['Competition rate']
    temp = exp[below_quoted & below_competition]

    captured = temp.shape[0]
    # Spread over cost of funds, in percentage points (divided by 100 below).
    spread = temp['Predicted rate'] - temp['Cost of Funds']

    add_profit[(0,i)] = {
        'Captured': captured,
        'Captured pct': round(captured / exp.shape[0] * 100, 1),
        # Amount x term in years (Term / 12) x spread as a fraction.
        'Profit': (temp.Amount * temp.Term / 12 * spread / 100).sum(),
    }

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

In [165]:
# Total additional profit over all tiers, expressed in millions.
sum(tier_stats['Profit'] for tier_stats in add_profit.values()) / 1000000

212.72276645833642

### Additional Profit per Tier: Outcome 0 Loans Re-priced with the Outcome 1 Models

In [166]:
# Per-tier breakdown behind the total above; keys are (Outcome, Tier) = (0, tier).
add_profit

{(0, 1): {'Captured': 41147,
  'Captured pct': 54.1,
  'Profit': 188583765.8242477},
 (0, 2): {'Captured': 3843, 'Captured pct': 10.7, 'Profit': 22309107.90925261},
 (0, 3): {'Captured': 341, 'Captured pct': 1.1, 'Profit': 1824160.624836115},
 (0, 4): {'Captured': 1, 'Captured pct': 0.0, 'Profit': 5732.0999999999985}}

In [167]:
# Held-out score of each tier's tuned model, in insertion (tier 1..4) order.
[tier_result['acc'] for tier_result in results.values()]

[0.9020883779290694,
 0.8821695815569348,
 0.8592910813705846,
 0.8553037883010983]

| Tier | Test-set R² (RF) |
| :---: | :---: |
| 1 | 0.9020883779290694 |
| 2 | 0.8821695815569348 |
| 3 | 0.8592910813705846 |
| 4 | 0.8553037883010983 |

**The Random Forest Regression scores (test-set R²) are slightly lower than those of the Gradient Boosting method.**