In [None]:
# import the necessary libraries for the project.

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv).
import numpy as np # linear algebra.
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings 
warnings.filterwarnings("ignore")

In [None]:
# Load the credit-card transactions into a DataFrame.
# The path is relative to the notebook's working directory: the file
# 'creditcard.csv' is expected inside a 'Desktop' sub-folder of it.

df = pd.read_csv('Desktop/creditcard.csv')

In [None]:
# Preview the first five rows of the dataset.

df.head()

In [None]:
# Shape of the original dataset as a (rows, columns) tuple.

df.shape

In [None]:
# Column names of the dataset.

df.columns

In [None]:
# Summary statistics of the dataset's columns.

df.describe()

In [None]:
# More information about the data, for example the type of each column.

df.info()

In [None]:
# Count the missing values in each column.
# (The author's original note reports that the dataset has none.)

df.isnull().sum()

In [None]:
# Count the transactions of each class.
# Class == 0 marks a valid transaction, Class == 1 marks a fraud.

is_valid = df['Class'] == 0
is_fraud = df['Class'] == 1

valid = len(df[is_valid])
fraud = len(df[is_fraud])

# Ratio of fraud to valid transactions, expressed as a percentage.
Outlier_Fraction = (fraud / valid) * 100

for label, value in (
    ('OutlierFraction is :', Outlier_Fraction),
    ('Valid Transactions:', valid),
    ('Fraud Transactions:', fraud),
):
    print(label, value)

In [None]:
# Summary statistics of 'Amount' restricted to valid transactions (Class == 0).

print('Amount details of valid transaction')

valid_mask = df['Class'] == 0
valid_info = df[valid_mask]
valid_info['Amount'].describe()

In [None]:
# Summary statistics of 'Amount' restricted to fraud transactions (Class == 1).

print('Amount details of fraud transaction')

fraud_mask = df['Class'] == 1
fraud_info = df[fraud_mask]
fraud_info['Amount'].describe()

Note: Notice how imbalanced our original dataset is. Most of the transactions are non-fraud. If we use this dataframe as the basis for our predictive models and analysis, we might get a lot of errors, and our algorithms will probably overfit, since they will "assume" that most transactions are not fraud. But we don't want our model to assume; we want it to detect the patterns that signal fraud!



In [None]:
# Bar chart of the number of valid (Class == 0) and fraud (Class == 1)
# transactions, to visualise the class imbalance.

# Series.value_counts replaces the deprecated top-level pd.value_counts;
# sort_index() puts the bars in class order (0, then 1).
count_classes = df['Class'].value_counts(sort=True).sort_index()
count_classes.plot(kind='bar', rot=0, colormap='plasma')

plt.title("Fraud Class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Correlation heatmap of the original dataset.
# It only gives a quick overview; it is not used to find the outliers.

heatmap_options = dict(cmap='viridis', vmax=1, vmin=-0.5, square=True, linewidths=0.05)

corrmat = df.corr()
fig = plt.figure(figsize=(20, 13))
sns.heatmap(corrmat, **heatmap_options)
plt.show()

In [None]:
# Kernel density estimate of the 'Time' column.

sns.kdeplot(df['Time'])
plt.show()

In [None]:
# Kernel density estimate of the transaction 'Amount' column.

sns.kdeplot(df['Amount'])
plt.show()

Our Vi columns are already scaled, which is why we only scale the 'Amount' and 'Time' columns.
After this, we remove the old columns and replace them with the scaled ones, producing a new dataset with the right values.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise 'Amount' and 'Time'. The same scaler object is re-fitted for
# each column, so after this cell `stc` holds the fit for 'Time'.
stc = StandardScaler()

for source_col, scaled_col in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    column_2d = df[source_col].values.reshape(-1, 1)
    df[scaled_col] = stc.fit_transform(column_2d)

# The raw columns are no longer needed once their scaled versions exist.
df.drop(columns=['Amount', 'Time'], inplace=True)

# Pull the scaled columns out and re-insert them at the front of the frame.
scaled_amount = df['scaled_amount']
scaled_time = df['scaled_time']
df.drop(columns=['scaled_amount', 'scaled_time'], inplace=True)

for position, scaled_series in enumerate((scaled_amount, scaled_time)):
    df.insert(position, scaled_series.name, scaled_series)

df.head()

Now we will produce a sub-sample of the previous dataset, containing 492 randomly chosen valid transactions
and the 492 fraud transactions.
We must do this because, at the beginning of this notebook, we saw that the original dataframe was heavily imbalanced.
By doing this we help our algorithms better understand the patterns that determine whether a transaction is a fraud or not,
which is our target.


In [None]:
# Build a balanced sub-sample: every fraud row plus an equally sized random
# draw of valid rows.

# Shuffle the full dataset; the fixed seed makes the notebook reproducible.
df = df.sample(frac=1, random_state=42)

fraud_df = df[df['Class'] == 1]

# Draw the valid rows ONLY from Class == 0. Sampling from the whole index
# (as the previous version did) could pick fraud rows, which would then be
# duplicated in the concatenation and leave fewer valid rows than intended.
valid_df = df[df['Class'] == 0].sample(n=len(fraud_df), random_state=42)

normal_distributed_df = pd.concat([fraud_df, valid_df])

# Shuffle dataframe rows so the two classes are interleaved.
new_df = normal_distributed_df.sample(frac=1, random_state=42)

new_df.head()

In [None]:
# Rebuild the sub-sample from the fraud rows and the sampled rows, shuffle it
# with a fixed seed and show its shape.
# (The original note expects 984 rows: 492 valid and 492 fraud.)

subsample_parts = [fraud_df, valid_df]
normal_distributed_df1 = pd.concat(subsample_parts, axis=0)

new_df = normal_distributed_df1.sample(frac=1, random_state=42)

new_df.shape

In [None]:
# Bar chart of the class counts in the sub-sampled dataset.

print('Distribution of the Classes in the new dataset')

# Pass the column as the keyword `x`: recent seaborn releases only accept
# `data` positionally, so countplot('Class', data=...) raises a TypeError.
# The former bare sns.color_palette("Set2", 2) call was dropped: its return
# value was discarded, so it never affected the plot.
sns.countplot(x='Class', data=new_df)
print('\n')
plt.title('Equally Distributed Classes', fontsize=13)
plt.show()

In [None]:
# Share of each class in the sub-sampled dataset.
# The printed values are fractions between 0 and 1, despite the
# "Percentage" wording of the labels.

total_rows = len(new_df)
normal_rows = len(new_df[new_df['Class'] == 0])
fraud_rows = len(new_df[new_df['Class'] == 1])

print("Percentage of normal transactions: ", (normal_rows / total_rows))
print("Percentage of fraud transactions: ", (fraud_rows / total_rows))
print("Total number of transactions in resampled data: ", total_rows)

In [None]:
# Correlation heatmap of the full dataset, shown only for comparison;
# it is not the one used as a reference.

corr = df.corr()

fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr, ax=ax, cmap='viridis', annot_kws={'size': 20}, linewidths=0.05)
ax.set_title("Old Correlation Matrix \n (don't use for reference)", fontsize=13)
plt.show()

In [None]:
# Correlation heatmap of the balanced sub-sample; this is the one used as a
# reference.

corr1 = new_df.corr()

fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('SubSample Correlation Matrix \n (use for reference)', fontsize=13)
# Plot corr1 (the sub-sample matrix). The previous version plotted `corr`,
# the full-dataset matrix, so this figure repeated the "old" heatmap.
sns.heatmap(corr1, ax=ax, cmap='viridis', annot_kws={'size': 20}, linewidths=0.05)
plt.show()

In [None]:
# Per-class boxplots of the features the author singles out as positively
# correlated with 'Class' (a higher feature value would mean a higher chance
# of fraud). NOTE(review): V14 and V17 also appear among the "negative"
# features further down - confirm the grouping against the heatmap.

positive_features = ['V3', 'V7', 'V14', 'V17', 'V20']

f, axes = plt.subplots(ncols=5, figsize=(25, 15))

for panel, feature in zip(axes, positive_features):
    sns.boxplot(x='Class', y=feature, data=new_df, ax=panel)

print('Positive Correlation Boxtplos')
plt.show()

IQR METHOD 

In [None]:
# IQR-based outlier removal for feature V20.
# The quartiles come from the fraud rows only (Class == 1); the resulting
# cut-offs are then applied to every row of new_df, whatever its class.

v20_fraud = new_df.loc[new_df['Class'] == 1, 'V20'].values

q25, q75 = (np.percentile(v20_fraud, pct) for pct in (25, 75))

print('The 25th Quartile is :', q25)
print('The 75th Quantile is :', q75)
print('\n')

v20_iqr = q75 - q25

print('The IQR of V20 is :', v20_iqr)

# Fences sit 1.5 * IQR outside the quartiles.
v20_off = 1.5 * v20_iqr
v20_lower, v20_upper = q25 - v20_off, q75 + v20_off

print('The v20 that we will remove is :', v20_off)
print('The min point is :', v20_lower)
print('The max point is :', v20_upper)
print('\n')

# Fraud values outside the fences (used only for the count printed below).
outliers_v20 = [value for value in v20_fraud if value < v20_lower or value > v20_upper]

# Rows of either class whose V20 lies outside the fences are dropped.
v20_outside = (new_df['V20'] > v20_upper) | (new_df['V20'] < v20_lower)
new_df_v20 = new_df.drop(new_df[v20_outside].index)

print('The number of the outliers is : ', len(outliers_v20))
print('The number of transactions after the outliers removes is :', len(new_df_v20))
print('The new dataset after we remove the outliers of v20 is :', new_df_v20.shape)

# Author's note from an earlier run: 984 - 41 = 943.
# The exact numbers depend on the random sub-sample.

In [None]:
# Side-by-side V20 boxplots: left before, right after the IQR filtering.

f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 10))

for target_axis, frame in ((ax1, new_df), (ax2, new_df_v20)):
    sns.boxplot(x="Class", y="V20", data=frame, ax=target_axis)

print('Before and After we apply the IQR method and remove the outliers of V20')
print('\n')

In [None]:
# IQR-based outlier removal for feature V3.
# The quartiles come from the fraud rows only (Class == 1); the resulting
# cut-offs are then applied to every row of new_df, whatever its class.

v3_fraud = new_df.loc[new_df['Class'] == 1, 'V3'].values

q25, q75 = (np.percentile(v3_fraud, pct) for pct in (25, 75))

print('The 25th Quartile is :', q25)
print('The 75th Quantile is : ', q75)
print('\n')

v3_iqr = q75 - q25

print('The IQR of V3 is :', v3_iqr)
print('\n')

# Fences sit 1.5 * IQR outside the quartiles.
v3_off = 1.5 * v3_iqr
v3_lower, v3_upper = q25 - v3_off, q75 + v3_off

print('The v3 tha we will remove is :', v3_off)
print('The min point is :', v3_lower)
print('The max point is :', v3_upper)
print('\n')

# Fraud values outside the fences (used only for the count printed below).
outliers_v3 = [value for value in v3_fraud if value < v3_lower or value > v3_upper]

# Rows of either class whose V3 lies outside the fences are dropped.
v3_outside = (new_df['V3'] > v3_upper) | (new_df['V3'] < v3_lower)
new_df_v3 = new_df.drop(new_df[v3_outside].index)

print('The number of the outliers is : ', len(outliers_v3))
print('The number of transactions after the outliers removes is :', len(new_df_v3))
print('The new dataset after we remove the outliers of v3 is :', new_df_v3.shape)

# Author's note from an earlier run: 984 - 53 = 931.
# The exact numbers depend on the random sub-sample.

In [None]:
# Side-by-side V3 boxplots: left before, right after the IQR filtering.

f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 10))

for target_axis, frame in ((ax1, new_df), (ax2, new_df_v3)):
    sns.boxplot(x="Class", y="V3", data=frame, ax=target_axis)

print('Before and After we apply the IQR method and remove the outliers of V3')
print('\n')

In [None]:
# Per-class boxplots of the features the author singles out as negatively
# correlated with 'Class'. NOTE(review): for a negative correlation, lower
# feature values would go with a HIGHER chance of fraud - confirm the wording
# of the original note against the heatmap.

negative_features = ['V10', 'V12', 'V14', 'V16', 'V17']

f, axes = plt.subplots(ncols=5, figsize=(25, 10))

for panel, feature in zip(axes, negative_features):
    sns.boxplot(x='Class', y=feature, data=new_df, ax=panel)

print('Negative Correlation Boxtplos')
plt.show()

In [None]:
# IQR-based outlier removal for feature V10.
# The quartiles come from the fraud rows only (Class == 1); the resulting
# cut-offs are then applied to every row of new_df, whatever its class.

v10_fraud = new_df.loc[new_df['Class'] == 1, 'V10'].values

q25, q75 = (np.percentile(v10_fraud, pct) for pct in (25, 75))

print('The 25th Quartile is :', q25)
print('The 75th Quantile is :', q75)
print('\n')

v10_iqr = q75 - q25

print('The IQR of V10 is :', v10_iqr)

# Fences sit 1.5 * IQR outside the quartiles.
v10_off = 1.5 * v10_iqr
v10_lower, v10_upper = q25 - v10_off, q75 + v10_off

print('The v10 tha we will remove is :', v10_off)
print('The min point is :', v10_lower)
print('The max point is :', v10_upper)
print('\n')

# Fraud values outside the fences (used only for the count printed below).
outliers_v10 = [value for value in v10_fraud if value < v10_lower or value > v10_upper]

# Rows of either class whose V10 lies outside the fences are dropped.
v10_outside = (new_df['V10'] > v10_upper) | (new_df['V10'] < v10_lower)
new_df_v10 = new_df.drop(new_df[v10_outside].index)

print('The number of the outliers is : ', len(outliers_v10))
print('The number of transactions after the outliers removes is :', len(new_df_v10))
print('The new dataset after we remove the outliers of v10 is :', new_df_v10.shape)

In [None]:
# Side-by-side V10 boxplots: left before, right after the IQR filtering.

f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(13, 10))

for target_axis, frame in ((ax1, new_df), (ax2, new_df_v10)):
    sns.boxplot(x="Class", y="V10", data=frame, ax=target_axis)

print('Before and After we apply the IQR method and remove the outliers of V10')
print('\n')

In [None]:
# IQR-based outlier removal for feature V14.
# The quartiles come from the fraud rows only (Class == 1); the resulting
# cut-offs are then applied to every row of new_df, whatever its class.

v14_fraud = new_df.loc[new_df['Class'] == 1, 'V14'].values

q25, q75 = (np.percentile(v14_fraud, pct) for pct in (25, 75))

print('The 25th Quartile is :', q25)
print('The 75th Quantile is : ', q75)
print('\n')

v14_iqr = q75 - q25

print('The IQR of V14 is :', v14_iqr)

# Fences sit 1.5 * IQR outside the quartiles.
v14_off = 1.5 * v14_iqr
v14_lower, v14_upper = q25 - v14_off, q75 + v14_off

print('The v14 tha we will remove is :', v14_off)
print('The min point is :', v14_lower)
print('The max point is :', v14_upper)
print('\n')

# Fraud values outside the fences (used only for the count printed below).
outliers_v14 = [value for value in v14_fraud if value < v14_lower or value > v14_upper]

# Rows of either class whose V14 lies outside the fences are dropped.
v14_outside = (new_df['V14'] > v14_upper) | (new_df['V14'] < v14_lower)
new_df_v14 = new_df.drop(new_df[v14_outside].index)

print('\n')

print('The number of the outliers is : ', len(outliers_v14))
print('The number of transactions after the outliers removes is :', len(new_df_v14))
print('The new dataset after we remove the outliers of v14 is :', new_df_v14.shape)

# Author's note from an earlier run: 984 - 5 = 979.
# The exact numbers depend on the random sub-sample.

In [None]:
# Side-by-side V14 boxplots: left before, right after the IQR filtering.

f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 10))

for target_axis, frame in ((ax1, new_df), (ax2, new_df_v14)):
    sns.boxplot(x="Class", y="V14", data=frame, ax=target_axis)

print('Before and After we apply the IQR method and remove the outliers of V14')
print('\n')

In [None]:
# Feature matrix (every column except 'Class') and target vector used by the
# models below.
# NOTE(review): both are built from the full `df`, not from the balanced
# sub-sample `new_df` or the outlier-filtered frames created above - confirm
# that this is intended.
X = df.drop(['Class'], axis=1)
y = df['Class']

In [None]:
# Split the dataset into a training set (80%) and a test set (20%) with
# scikit-learn.

from sklearn.model_selection import train_test_split

# stratify=y keeps the fraud/valid ratio the same in both parts. With a
# heavily imbalanced target, a plain random split can leave the test set with
# a noticeably different (or very small) share of fraud rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.metrics import (confusion_matrix, precision_recall_curve, classification_report, 
                            precision_score, recall_score, accuracy_score,f1_score,roc_auc_score)

from sklearn.model_selection import cross_val_score

In [None]:
# ISOLATION FOREST TREE MODEL
# Unsupervised anomaly detector, fitted and evaluated on the full feature
# matrix X against the known labels y.

from sklearn.ensemble import IsolationForest

# A fixed random_state makes the result reproducible, as for the other models.
isf = IsolationForest(max_samples=len(X), random_state=42).fit(X)  # Fitting the model.

# IsolationForest.predict returns -1 for outliers and +1 for inliers, while
# our labels are valid = 0 and fraud = 1, so the predictions are remapped.
y_prediction = np.where(isf.predict(X) == -1, 1, 0)

errors = (y_prediction != y).sum()                 # Total number of errors is calculated.

print('The errors of the Isolation Forest model is ', errors)

# scikit-learn metrics take (y_true, y_pred) in that order. The previous
# version passed them swapped, which exchanges precision and recall and
# transposes the confusion matrix relative to its axis labels.
print('\n')
print("Model Accuracy:", round(accuracy_score(y, y_prediction), 2))
print("Model Precision:", round(precision_score(y, y_prediction), 2))
print("Model Recall:", round(recall_score(y, y_prediction), 2))
print("Model F1-Score:", round(f1_score(y, y_prediction), 2))
print("Model ROC:", round(roc_auc_score(y, y_prediction), 2))

labels = ['Valid', 'Fraud']

print(classification_report(y, y_prediction, target_names=labels))

# Rows are the true classes and columns the predicted ones, matching the
# axis labels set below.
conf_matrix = confusion_matrix(y, y_prediction)
print('\n')
plt.figure(figsize=(6, 6))
sns.heatmap(pd.DataFrame(conf_matrix), xticklabels=labels, yticklabels=labels,
            linewidths=0.05, annot=True, fmt="d", cmap='BuPu')

plt.title("Isolation Forest Classifier - Confusion Matrix")
plt.ylabel('True Value')
plt.xlabel('Predicted Value')
plt.show()

In [None]:
# RANDOM_FOREST_CLASSIFIER MODEL
# Train on the training split, then report the metrics and the confusion
# matrix on the test split.

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
Y_pred = rfc.predict(X_test)

rf_metrics = (
    ("Model Accuracy:", accuracy_score),
    ("Model Precision:", precision_score),
    ("Model Recall:", recall_score),
    ("Model F1-Score:", f1_score),
    ("Model ROC:", roc_auc_score),
)
for metric_label, metric_fn in rf_metrics:
    print(metric_label, round(metric_fn(y_test, Y_pred), 2))

labels = ['Valid', 'Fraud']
conf_matrix = confusion_matrix(y_test, Y_pred)

print(classification_report(y_test, Y_pred, target_names=labels))

# Confusion-matrix heatmap: rows are true classes, columns predicted ones.
plt.figure(figsize=(6, 6))
sns.heatmap(
    pd.DataFrame(conf_matrix),
    xticklabels=labels,
    yticklabels=labels,
    linewidths=0.05,
    annot=True,
    fmt="d",
    cmap='BuPu',
)
plt.title("Random Forest Classifier - Confusion Matrix")
plt.ylabel('True Value')
plt.xlabel('Predicted Value')
plt.show()

In [None]:
# LOGISTIC_REGRESSION MODEL
# Train on the training split, then report the metrics and the confusion
# matrix on the test split.

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)
Y_pred1 = logreg.predict(X_test)

logreg_metrics = (
    ("Model Accuracy:", accuracy_score),
    ("Model Precision:", precision_score),
    ("Model Recall:", recall_score),
    ("Model F1-Score:", f1_score),
)
for metric_label, metric_fn in logreg_metrics:
    print(metric_label, round(metric_fn(y_test, Y_pred1), 2))

labels = ['Valid', 'Fraud']
conf_matrix1 = confusion_matrix(y_test, Y_pred1)

print(classification_report(y_test, Y_pred1, target_names=labels))

# Confusion-matrix heatmap: rows are true classes, columns predicted ones.
plt.figure(figsize=(6, 6))
sns.heatmap(
    pd.DataFrame(conf_matrix1),
    annot=True,
    fmt='d',
    linewidths=0.05,
    cmap='BuPu',
    xticklabels=labels,
    yticklabels=labels,
)
plt.title('Logistic Regression - Confusion Matrix')
plt.ylabel('True Value')
plt.xlabel('Predicted Value')
plt.show()

In [None]:
# DECISION TREE CLASSIFIER MODEL
# Train on the training split, then report the metrics and the confusion
# matrix on the test split.

from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
Y_pred2 = dtc.predict(X_test)

tree_metrics = (
    ("Model Accuracy:", accuracy_score),
    ("Model Precision:", precision_score),
    ("Model Recall:", recall_score),
    ("Model F1-Score:", f1_score),
)
for metric_label, metric_fn in tree_metrics:
    print(metric_label, round(metric_fn(y_test, Y_pred2), 2))

labels = ['Valid', 'Fraud']
conf_matrix2 = confusion_matrix(y_test, Y_pred2)

print(classification_report(y_test, Y_pred2, target_names=labels))

# Confusion-matrix heatmap: rows are true classes, columns predicted ones.
plt.figure(figsize=(6, 6))
sns.heatmap(
    pd.DataFrame(conf_matrix2),
    annot=True,
    fmt='d',
    linewidths=0.05,
    cmap='BuPu',
    xticklabels=labels,
    yticklabels=labels,
)
plt.title('Decission Tree Classifier - Confusion Matrix')
plt.ylabel('True Value')
plt.xlabel('Predicted Value')
plt.show()

In [None]:
# NAIVE BAYES CLASSIFIER
# Train a Bernoulli naive Bayes model on the training split, then report the
# metrics and the confusion matrix on the test split.

from sklearn.naive_bayes import BernoulliNB

NB = BernoulliNB()
NB.fit(X_train, y_train)
Y_pred3 = NB.predict(X_test)
conf_matrix_nb = confusion_matrix(y_test, Y_pred3)

print("Model Accuracy:", round(accuracy_score(y_test, Y_pred3), 2))
print("Model Precision:", round(precision_score(y_test, Y_pred3), 2))
print("Model Recall:", round(recall_score(y_test, Y_pred3), 2))
print("Model F1-Score:", round(f1_score(y_test, Y_pred3), 2))

plt.figure(figsize=(6, 6))
labels = ['Valid', 'Fraud']

# Plot this model's own matrix instead of overwriting the decision tree's
# `conf_matrix2`.
sns.heatmap(pd.DataFrame(conf_matrix_nb), annot=True, fmt='d', linewidths=0.05, cmap='BuPu',
            xticklabels=labels, yticklabels=labels)

# Report on Y_pred3 (the naive Bayes predictions). The previous version
# passed Y_pred2, i.e. the decision tree's predictions.
print(classification_report(y_test, Y_pred3, target_names=labels))

plt.title('Naive Bayes Classifier - Confusion Matrix')
plt.ylabel('True Value')
plt.xlabel('Predicted Value')
plt.show()

In [None]:
# SVC CLASSIFIER MODEL
# Train a support vector classifier on the training split, then report the
# metrics and the confusion matrix on the test split.

from sklearn.svm import SVC

svc_clf = SVC()
svc_clf.fit(X_train, y_train)

Y_pred4 = svc_clf.predict(X_test)

conf_matrix_svm = confusion_matrix(y_test, Y_pred4)

print("Model Accuracy:", round(accuracy_score(y_test, Y_pred4), 2))
print("Model Precision:", round(precision_score(y_test, Y_pred4), 2))
print("Model Recall:", round(recall_score(y_test, Y_pred4), 2))
print("Model F1-Score:", round(f1_score(y_test, Y_pred4), 2))

plt.figure(figsize=(6, 6))
labels = ['Valid', 'Fraud']

# Plot the SVC's own confusion matrix. The previous version plotted
# `conf_matrix2`, which belongs to an earlier model.
sns.heatmap(pd.DataFrame(conf_matrix_svm), annot=True, fmt='d', linewidths=0.05, cmap='BuPu',
            xticklabels=labels, yticklabels=labels)

# Report on Y_pred4 (the SVC predictions). The previous version passed
# Y_pred2, i.e. the decision tree's predictions.
print(classification_report(y_test, Y_pred4, target_names=labels))

plt.title('SVC Classifier - Confusion Matrix')
plt.ylabel('True Value')
plt.xlabel('Predicted Value')
plt.show()