In [11]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from IPython.display import Image
from six import StringIO
from sklearn.tree import export_graphviz

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [12]:
df = pd.read_csv("CleanDATA.csv")

In [13]:
df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,278,CASH_IN,330218.42,20866.0,351084.42,452419.57,122201.15,0
1,15,PAYMENT,11647.08,30370.0,18722.92,0.0,0.0,0
2,10,CASH_IN,152264.21,106589.0,258853.21,201303.01,49038.8,0
3,403,TRANSFER,1551760.63,0.0,0.0,3198359.45,4750120.08,0
4,206,CASH_IN,78172.3,2921331.58,2999503.88,415821.9,337649.6,0


In [14]:
df.shape

(100000, 8)

In [15]:
categorical_columns = []
for column in df.columns:
    if df[column].dtype == object and len(df[column].unique()) <= 50:
        categorical_columns.append(column)
label = LabelEncoder()
for column in categorical_columns:
    df[column] = label.fit_transform(df[column])

In [16]:
X = df.drop("isFraud", axis=1)
y = df["isFraud"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [17]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [18]:
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

In [19]:
print_score(tree_clf, X_train, y_train, X_test, y_test, train=True)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                 0      1  accuracy  macro avg  weighted avg
precision      1.0    1.0       1.0        1.0           1.0
recall         1.0    1.0       1.0        1.0           1.0
f1-score       1.0    1.0       1.0        1.0           1.0
support    69899.0  101.0       1.0    70000.0       70000.0
_______________________________________________
Confusion Matrix: 
 [[69899     0]
 [    0   101]]

Test Result:
Accuracy Score: 99.93%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999666   0.731707    0.9993      0.865687      0.999309
recall         0.999633   0.750000    0.9993      0.874816      0.999300
f1-score       0.999650   0.740741    0.9993      0.870195      0.999304
support    29960.000000  40.000000    0.9993  30000.000000  30000.000000
______________

In [20]:
params = {
    "criterion":("gini", "entropy"), 
    "splitter":("best", "random"), 
    "max_depth":(list(range(1, 20))), 
    "min_samples_split":[2, 3, 4], 
    "min_samples_leaf":list(range(1, 20)), 
}

tree_clf = DecisionTreeClassifier(random_state=42)
tree_cv = GridSearchCV(
    tree_clf, 
    params, 
    scoring="f1", 
    n_jobs=-1, 
    verbose=1, 
    cv=5
)

tree_cv.fit(X_train, y_train)
best_params = tree_cv.best_params_
print(f"Best paramters: {best_params})")

tree_clf = DecisionTreeClassifier(**best_params)
tree_clf.fit(X_train, y_train)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=True)
print_score(tree_clf, X_train, y_train, X_test, y_test, train=False)

Fitting 5 folds for each of 4332 candidates, totalling 21660 fits
Best paramters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'splitter': 'best'})
Train Result:
Accuracy Score: 99.97%
_______________________________________________
CLASSIFICATION REPORT:
                      0           1  accuracy     macro avg  weighted avg
precision      0.999685    0.987500  0.999671      0.993593      0.999668
recall         0.999986    0.782178  0.999671      0.891082      0.999671
f1-score       0.999836    0.872928  0.999671      0.936382      0.999652
support    69899.000000  101.000000  0.999671  70000.000000  70000.000000
_______________________________________________
Confusion Matrix: 
 [[69898     1]
 [   22    79]]

Test Result:
Accuracy Score: 99.95%
_______________________________________________
CLASSIFICATION REPORT:
                      0          1  accuracy     macro avg  weighted avg
precision      0.999600   0.933333  0.999533      0.96

In [21]:
rf_clf = RandomForestClassifier(n_estimators=100)
rf_clf.fit(X_train, y_train)

In [22]:
print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                 0      1  accuracy  macro avg  weighted avg
precision      1.0    1.0       1.0        1.0           1.0
recall         1.0    1.0       1.0        1.0           1.0
f1-score       1.0    1.0       1.0        1.0           1.0
support    69899.0  101.0       1.0    70000.0       70000.0
_______________________________________________
Confusion Matrix: 
 [[69899     0]
 [    0   101]]

Test Result:
Accuracy Score: 99.96%
_______________________________________________
CLASSIFICATION REPORT:
                      0         1  accuracy     macro avg  weighted avg
precision      0.999633   1.00000  0.999633      0.999816      0.999633
recall         1.000000   0.72500  0.999633      0.862500      0.999633
f1-score       0.999816   0.84058  0.999633      0.920198      0.999604
support    29960.000000  40.00000  0.999633  30000.000000  30000.000000
___________________

In [23]:
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {
    'n_estimators': n_estimators, 
    'max_features': max_features,
    'max_depth': max_depth, 
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf, 
    'bootstrap': bootstrap
}

rf_clf = RandomForestClassifier(random_state=42)
rf_cv = RandomizedSearchCV(
    estimator=rf_clf, 
    scoring='f1',
    param_distributions=random_grid, 
    n_iter=200, 
    cv=5, 
    verbose=1, 
    random_state=42,
    n_jobs=-1
)

rf_cv.fit(X_train, y_train)
rf_best_params = rf_cv.best_params_
print(f"Best paramters: {rf_best_params})")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


460 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
123 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\babur\Anaconda3\envs\phase1\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\babur\Anaconda3\envs\phase1\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\babur\Anaconda3\envs\phase1\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\babur\Anaconda3\envs\phase1\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in valid

Best paramters: {'n_estimators': 1600, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 70, 'bootstrap': False})


In [24]:
rf_clf = RandomForestClassifier(**rf_best_params)
rf_clf.fit(X_train, y_train)

print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                 0      1  accuracy  macro avg  weighted avg
precision      1.0    1.0       1.0        1.0           1.0
recall         1.0    1.0       1.0        1.0           1.0
f1-score       1.0    1.0       1.0        1.0           1.0
support    69899.0  101.0       1.0    70000.0       70000.0
_______________________________________________
Confusion Matrix: 
 [[69899     0]
 [    0   101]]

Test Result:
Accuracy Score: 99.96%
_______________________________________________
CLASSIFICATION REPORT:
                      0         1  accuracy     macro avg  weighted avg
precision      0.999633   1.00000  0.999633      0.999816      0.999633
recall         1.000000   0.72500  0.999633      0.862500      0.999633
f1-score       0.999816   0.84058  0.999633      0.920198      0.999604
support    29960.000000  40.00000  0.999633  30000.000000  30000.000000
___________________