### ASSIGNMENT 2

In [4]:
import pandas as pd
import numpy as np
df_universalBank = pd.read_csv('UniversalBank.csv')

df_universalBank.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [5]:
df_universalBank['ZIP Code'].nunique()
df_universalBank.shape

(5000, 14)

In [6]:
# dropping unwanted columns
df_universalBank = df_universalBank.drop(columns=['ID','ZIP Code'],axis=1)
df_universalBank.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [7]:
# creating X (independent features) and Y (target)
X = df_universalBank.drop(columns=['Personal Loan'])
y = df_universalBank['Personal Loan']

In [83]:
# Splitting the data into training and testing (using stratified split to get better results)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)


In [66]:
# function to calculate profit, using this function as custom scorer in grid search
def profit_scorer(clf, X, y_true):
  y_pred = clf.predict(X)
  profit_matrix = {'TP': 90, 'FP': -10, 'TN':0, 'FN': -90}
  TP = ((y_pred == 1) & (y_true == 1)).sum()
  FP = ((y_pred == 1) & (y_true == 0)).sum()
  TN = ((y_pred == 0) & (y_true == 0)).sum()
  FN = ((y_pred == 0) & (y_true == 1)).sum()
  profit = (profit_matrix['TP']*TP +
             profit_matrix['FP']*FP +
             profit_matrix['TN']*TN +
             profit_matrix['FN']*FN)
  return profit

In [1]:
import numpy as np
np.logspace(-2,2,5)

array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])

### Logistic regression


In [67]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
grid={"C":np.logspace(-2,2,5), "penalty":["l1","l2"]}# l1 lasso l2 ridge

logistic = LogisticRegression(solver='liblinear')

log_clf = GridSearchCV(logistic,grid,cv=5,scoring=profit_scorer)

print('Best Parameters: ', log_clf.best_params_)


Best Parameters:  {'C': 10.0, 'penalty': 'l1'}


In [68]:
best_log_clf = log_clf.best_estimator_
best_log_model = best_log_clf.fit(X_train,y_train)
y_pred = best_log_model.predict(X_test)


from sklearn.metrics import confusion_matrix
cf = confusion_matrix(y_test,y_pred)

profit_matrix = {'TP': 90, 'FP': -10, 'TN':0, 'FN': -90}
num_TP = cf[1][1]
num_FP = cf[0][1]
num_TN = cf[0][0]
num_FN = cf[1][0]
net_profit = (profit_matrix['TP']*num_TP +
             profit_matrix['FP']*num_FP +
             profit_matrix['TN']*num_TN +
             profit_matrix['FN']*num_FN)
net_profit

3840

* Using Gridsearch we found that the Logistic regression model with 'C' = 10 and penalty 'l1' (Lasso regression) gives better profit when compared to other hyperparameter combinations. 
* The logisitic regression model built, gives a profit of $3840 on the 1500 members in the test dataset. 

### RandomForest

In [69]:
from sklearn.ensemble import RandomForestClassifier

# The parameters grid
grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
}

rfc = RandomForestClassifier()

rfc_clf = GridSearchCV(estimator=rfc, param_grid=grid, cv=5, scoring = profit_scorer)

rfc_clf.fit(X_train,y_train)


GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 10, 20, 30],
                         'n_estimators': [50, 100, 200]},
             scoring=<function profit_scorer at 0x7fc500491f70>)

In [77]:
print('Best Parameters for Random Forest Classifier: ', rfc_clf.best_params_)


Best Parameters for Random Forest Classifier:  {'max_depth': None, 'n_estimators': 200}


In [79]:
best_rfc_clf = rfc_clf.best_estimator_
best_rfc_model = best_rfc_clf.fit(X_train,y_train)
y_pred_rfc = best_rfc_model.predict(X_test)

In [80]:
from sklearn.metrics import confusion_matrix
cf_rfc = confusion_matrix(y_test,y_pred_rfc)

profit_matrix = {'TP': 90, 'FP': -10, 'TN':0, 'FN': -90}
num_TP = cf_rfc[1][1]
num_FP = cf_rfc[0][1]
num_TN = cf_rfc[0][0]
num_FN = cf_rfc[1][0]
net_profit_rfc = (profit_matrix['TP']*num_TP +
             profit_matrix['FP']*num_FP +
             profit_matrix['TN']*num_TN +
             profit_matrix['FN']*num_FN)
net_profit_rfc

9890

* Using GridSearch we found that the Randomforest model with n_estimators=200 and max_depth=None gives the most profit.
* The profit that can be made using the Randomforest model built is $9890 for the 1500 members in test dataset. 

### Conclusion:
#### Randomforest model does a better job in identifying the target customers and gives the best profit. 