In [81]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_selection import RFE
from sklearn import preprocessing
from sklearn import model_selection
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import  GridSearchCV

In [61]:
#Importing data
data = pd.read_csv('final_data.csv')

In [62]:
#Seperating into X and Y
X = data.iloc[:, 1: -1]
Y = data.iloc[:, -1]

In [63]:
#finding correlated features
correlated_features = set()
correlation_matrix = X.corr()

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.75:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

In [64]:
#Dropping features that have a high correlation and splitting into train and test sets
X = X.drop(correlated_features,axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

In [65]:
#Appplying Oversampling
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)


In [66]:
#Using RFE to eliminate features that are not useful
clf=RandomForestClassifier()
rfecv = RFE(estimator=clf)
rfecv.fit(X_train_res, y_train_res)


RFE(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                     class_weight=None, criterion='gini',
                                     max_depth=4, max_features=20,
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=50, n_jobs=None,
                                     oob_score=False, random_state=None,
                                     verbose=0, warm_start=False),
    n_features_to_select=None, step=1, verbose=0)

In [67]:
print('Optimal number of features: {}'.format(rfecv.n_features_))

Optimal number of features: 23


In [68]:
#Stripping features from the dataset
print(np.where(rfecv.support_ == False)[0])

X_train_res.drop(X_train_res.columns[np.where(rfecv.support_ == False)[0]], axis=1, inplace=True)

X_test.drop(X_test.columns[np.where(rfecv.support_ == False)[0]], axis=1, inplace=True)

[ 0 10 15 16 17 18 19 20 21 22 23 24 25 26 27 29 30 32 33 37 38 40 41]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [80]:
y_train_res

0         1
1         1
2         1
3         1
4         1
         ..
284323    0
284324    0
284325    0
284326    0
284327    0
Name: loan_condition, Length: 284328, dtype: int64

In [None]:
#Performing Grid Search to find most suitable parameters
parameters = {
    'n_estimators': [200,300,400,500],
    'max_depth' : [2,3,4,5,6,7,8,9],
    'max_features' : ['auto', 'sqrt', 'log2']
             }

scoreFunction = {"recall": "recall", "precision": "precision"}
MOD = RandomForestClassifier() 
random_search = GridSearchCV(MOD,
                                   param_grid = parameters, 
                                   scoring = scoreFunction,               
                                   refit = "precision",
                                   return_train_score = True,
                                   cv = 5, n_jobs=-1) 
random_search.fit(X_train_res, y_train_res)

In [None]:
random_search.best_params_

In [17]:
#Running and fitting the Randomforest Classifier
clf=RandomForestClassifier(n_estimators=300,max_depth=9,max_features='auto')
clf.fit(X_train_res,y_train_res)

y_pred=clf.predict(X_test)

In [None]:
#Finding out the metrics and scores from the model
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print('Precision:', precision)
print('Recall:', recall)
print('Accuracy:', acc)
print('Confusion Matrix:', cm)
