In [41]:
#imports
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [42]:
# Configuration: relative path to the modified UCI credit-card CSV file.
PATH = (
    "../Datasets/Modified/mod_UCI_Credit_Card.csv"
)

In [43]:
# Read the CSV located at PATH into a DataFrame.
df = pd.read_csv(filepath_or_buffer=PATH)

In [44]:
# Separate the label column from the features.
target_column = "Default payment"
y = df[target_column].values           # labels, taken out of the frame
X = df.drop(columns=[target_column])   # every remaining column is a feature

In [45]:
# Random forest with library-default hyper-parameters.
# The seed is pinned so that fitting this estimator (and the grid search
# that uses it) yields the same scores on every Restart & Run All.
rfc = RandomForestClassifier(random_state=42)

### Parameters:

__n_estimators__:int, default=100
The number of trees in the forest.

Changed in version 0.22: The default value of n_estimators changed from 10 to 100 in 0.22.

__criterion__: {"gini", "entropy"}, default="gini"
The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. Note: this parameter is tree-specific.

__max_depth__:int, default=None
The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain fewer than min_samples_split samples.

__min_samples_split__:int or float, default=2
The minimum number of samples required to split an internal node:

If int, then consider min_samples_split as the minimum number.

If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) is the minimum number of samples for each split.

In [46]:
# Hyper-parameter grid for the search: only the number of trees is tuned.
# Note that 90 is not among the candidate values.
params = {
    "n_estimators": [10, 20, 30, 40, 50, 60, 70, 80, 100, 110],
}

In [47]:
# Hold out 30% of the rows as the test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [48]:
# Carve a validation set off the front of the training split: the first
# N_VALIDATION rows become validation data, the remainder stays as training data.
# Caution: this rebinds X_train / y_train, so running the cell a second time
# without redoing the train/test split removes another N_VALIDATION rows.
N_VALIDATION = 4084
validation_rows = slice(None, N_VALIDATION)
training_rows = slice(N_VALIDATION, None)

# The right-hand sides are evaluated before any name is rebound, so both
# slices are taken from the same, un-shrunk training split.
X_validation, X_train = X_train[validation_rows], X_train[training_rows]
y_validation, y_train = y_train[validation_rows], y_train[training_rows]

In [49]:
# Grid search: every candidate in `params` is scored with 5-fold
# cross-validation on accuracy.
gridsearch = GridSearchCV(
    estimator=rfc,
    param_grid=params,
    scoring="accuracy",
    cv=5,        # 5 folds
    verbose=2,   # print progress while fitting
    n_jobs=-1,   # -1 uses ALL available CPU cores (-2 would leave one core free)
)

# Cross-validation is run on the training rows only, so X_validation and
# X_test remain untouched for the evaluations that follow.
result = gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   30.7s finished


In [50]:
# Summary of the grid search: winning parameters and the best mean
# cross-validation score.
print("The best parameters are :", result.best_params_)
print("The best accuracy is: {:.2f}%".format(result.best_score_ * 100))

# Score the best estimator on the manually held-out validation rows.
# NOTE: despite its name, `decision_tree` holds the tuned random forest.
decision_tree = result.best_estimator_
score = decision_tree.score(X_validation, y_validation)
# Labelled "validation" so this line cannot be confused with the test-set
# accuracy reported by the next cell.
print("The validation accuracy of the model is {:.2f}%".format(score * 100))

The best parameters are : {'n_estimators': 100}
The best accuracy is 81.58%:
The generalization accuracy of the model is 81.22%


In [51]:
# Last step: evaluate the best estimator from the grid search on the test split.
decision_tree = result.best_estimator_
score = decision_tree.score(X_test, y_test)
test_accuracy_pct = score * 100
report_template = "The generalization accuracy of the model is {:.2f}%"
print(report_template.format(test_accuracy_pct))

The generalization accuracy of the model is 81.27%


In [52]:
# Redo the 70/30 split with the same seed, rebinding X_train / y_train to the
# full training portion again.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [54]:
# Fresh, unfitted forest with 100 trees (the value the grid search reported
# as best). The seed is pinned so the cross-validation scores are repeatable.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

In [55]:
# 5-fold cross-validation of the forest on the complete dataset (X, y).
# `cross_val_score` is imported with the other dependencies in the imports cell.
scores = cross_val_score(rfc, X, y, cv=5)  # cv is the number of folds (k)
print(scores)

# Report the mean accuracy together with its standard deviation across folds,
# both scaled to percentage points.
mean_pct = scores.mean() * 100
std_pct = scores.std() * 100
print("Accuracy: {:.2f}% (+/- {:.2f})".format(mean_pct, std_pct))

[0.80260452 0.80551748 0.81388175 0.83016281 0.82142245]
Accuracy: 81.47% (+/- 1.02)
