In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# ML Algorithm & Metrics Libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline


from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Load the CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='creditcard.csv')
data.head(n=5)

In [None]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
data.shape

In [None]:
# Separate predictors from the target:
# every column except 'Class' (the label) and 'Time' is kept as a feature.
X = data.drop(columns=['Class', 'Time'])
# The list selector keeps Y as a one-column DataFrame rather than a Series.
Y = data.loc[:, ['Class']]

In [None]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.40,
    random_state=0,
)

In [None]:
# Standardise features: the scaler is fitted on the training rows only and
# the fitted scaler is then applied to both the training and the test rows.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Estimate the mutual information between each scaled training feature and the
# label. The estimator is randomised (it adds small noise to continuous
# features), so random_state is fixed to make the scores - and the feature
# selection based on them - repeatable across runs.
# np.ravel flattens the one-column y_train into the 1-D target the function
# expects.
mutual_information = mutual_info_classif(
    X_train,
    np.ravel(y_train),
    n_neighbors=5,
    copy=True,
    random_state=0,
)

# One-row heatmap: one cell per feature, annotated with its score.
fig, ax = plt.subplots(1, figsize=(28, 1))
sns.heatmap(
    mutual_information.reshape(1, -1),
    cmap='Blues',
    cbar=False,
    linewidths=1,
    annot=True,
    annot_kws={"size": 12},
    # Passing the names here ties each label to its own cell instead of
    # relabelling whatever ticks happen to exist afterwards.
    xticklabels=list(X.columns),
    ax=ax,
)
ax.set_yticks([])
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=12)
fig.suptitle("Variable Importance (mutual_info_classif)", fontsize=12, y=1.2)
fig.subplots_adjust(wspace=0.4)

From the plot above, we see that some variables are more important than others, so we'll select these for our classification.

The selected variables are: V3, V4, V7, V9, V10, V11, V12, V14, V16, V17, V18.

In [None]:
# Keep only the hand-picked subset of variables.
X_opt = data.loc[:, ['V3','V4','V7','V9','V10','V11','V12','V14','V16','V17','V18']]
# Same test fraction and seed as the earlier split; note that y_train and
# y_test are reassigned here.
Xopt_train, Xopt_test, y_train, y_test = train_test_split(
    X_opt,
    Y,
    test_size=0.40,
    random_state=0,
)
# Refit the existing scaler `sc` on the reduced training matrix, then scale
# both partitions with it.
sc.fit(Xopt_train)
Xopt_train = sc.transform(Xopt_train)
Xopt_test = sc.transform(Xopt_test)

In [None]:
def train_model(model):
    """Cross-validate a classifier on the training data and print its scores.

    Out-of-fold predictions for the notebook-level ``Xopt_train`` / ``y_train``
    are obtained with 5-fold ``cross_val_predict`` and compared with the true
    training labels.

    Parameters
    ----------
    model : sequence
        ``model[0]`` is the display name used in the printed lines and
        ``model[1]`` is the scikit-learn estimator to evaluate.

    Returns
    -------
    None
        Accuracy and precision are printed, not returned.
    """
    name, estimator = model[0], model[1]
    # y_train is a one-column DataFrame; flatten it once so neither the
    # per-fold fits nor the metric functions receive a column vector.
    labels = np.ravel(y_train)
    y_train_pred = cross_val_predict(estimator, Xopt_train, labels, cv=5)
    accuracy = accuracy_score(labels, y_train_pred)
    precision = precision_score(labels, y_train_pred)
    print(f'{name} Accuracy: {accuracy}')
    print(f'{name} Precision: {precision}')

In [None]:
# Cross-validated scores for Gaussian Naive Bayes.
gaussian_nb = ('Gaussian Naive Bayes', GaussianNB())
train_model(gaussian_nb)

In [None]:
# Cross-validated scores for a three-tree random forest. random_state is fixed
# so the printed scores are the same on every run.
train_model(('Random Forest', RandomForestClassifier(n_estimators=3, random_state=0)))

In [None]:
# Three-tree random forest with a fixed seed and verbosity level 0.
rfc = RandomForestClassifier(
    n_estimators=3,
    random_state=0,
    verbose=0,
)

In [None]:
# training
# y_train is a one-column DataFrame; flatten it to the 1-D target shape that
# fit() expects so scikit-learn does not emit a DataConversionWarning.
rfc.fit(Xopt_train, np.ravel(y_train))

In [None]:
# prediction
# Score the fitted forest on the held-out rows and format both metrics as
# percentages with two decimals.
pred_rfc = rfc.predict(Xopt_test)
acc_score = f"{accuracy_score(y_test, pred_rfc):.2%}"
pre_score = f"{precision_score(y_test, pred_rfc):.2%}"
print(f"The Accuracy Score for Random Forest Classifier Model is {acc_score} with a Precision Score of {pre_score}")

Randomized search over hyperparameters.

In [None]:
# Search space: each hyper-parameter is sampled from these discrete candidates.
parameter_grid = {
    'n_estimators': [1, 2, 3, 4, 5],
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1, 2, 3, 4, 5, 6, 7, 8],
}

# Only the 'clf' step of this pipeline is handed to the search below.
pipeline = Pipeline([
    ('clf', RandomForestClassifier(n_estimators=3, random_state=0))
])

# Number of parameter settings sampled from the grid.
number_models = 4
random_RandomForest_class = RandomizedSearchCV(
    estimator=pipeline['clf'],
    param_distributions=parameter_grid,
    n_iter=number_models,
    scoring='accuracy',
    n_jobs=2,
    cv=4,
    refit=True,
    return_train_score=True,
    # Fix the sampler's seed so the same candidates (and therefore the same
    # best_params_ / best_score_) are produced on every run.
    random_state=0,
)

# y_train is a one-column DataFrame; flatten it to the 1-D target shape that
# fit() expects so scikit-learn does not emit a DataConversionWarning.
random_RandomForest_class.fit(Xopt_train, np.ravel(y_train))
# refit=True means predict() uses the best estimator refitted on all training rows.
predictions = random_RandomForest_class.predict(Xopt_test)

In [None]:
# Report test accuracy of the refitted best model alongside the winning
# parameter setting and its mean cross-validated score.
acc_score_upd = f"{accuracy_score(y_test, predictions):.2%}"
print("Updated Accuracy Score: ", acc_score_upd)
print("Best params", random_RandomForest_class.best_params_)
print("Best score", random_RandomForest_class.best_score_)