# Fraud Detection Project
### Ali Ebrahimi

## Part 1 : Reading Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the prepared fraud dataset into a DataFrame.
DF_Fraud = pd.read_csv('C:/Users/Ali/Desktop/UOttawa/Job/Interview/BrainFinance/data_scientist_exercise-master/fraud_prep.csv', sep= ',')

# Feature matrix = every column except the last; label vector = the last column.
# The builtin `float` is used because the `np.float` alias was removed in
# NumPy 1.24 and raises AttributeError there.
X = np.asarray(DF_Fraud.iloc[:, 0:-1], dtype=float)
y = np.asarray(DF_Fraud.iloc[:, -1], dtype=float)

# Show how many rows fall into each class, with readable labels.
class_names = {0:'Not Fraud', 1:'Fraud'}
print(DF_Fraud.Class.value_counts().rename(index = class_names))

DF_Fraud.describe()

## Part 2 : Visualizing Dataset

### Distributions

In [None]:
# Draw one histogram per DataFrame column on a single large figure.
hist_figsize = (20, 20)
DF_Fraud.hist(figsize=hist_figsize)
plt.show()

### Class

In [None]:
# Scatter the first two feature columns, one series per class value.
legit_rows = (y == 0)
fraud_rows = (y == 1)
plt.scatter(X[legit_rows, 0], X[legit_rows, 1], label="Not Fraud")
plt.scatter(X[fraud_rows, 0], X[fraud_rows, 1], c='r', label="Fraud")
plt.legend()
plt.show()

## Part 3 : Standardizing Time & Amount, Resampling (Synthetic Minority Over-sampling Technique (SMOTE)) & Visualizing Distributions

In [None]:
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import Normalizer

#data_norm = DF_Fraud.copy() 
# Standardize the Time and Amount columns in place, then summarize the frame.
scale_columns = ['Time', 'Amount']
scaler = StandardScaler()
DF_Fraud[scale_columns] = scaler.fit_transform(DF_Fraud[scale_columns])
DF_Fraud.describe()

In [None]:
from imblearn.over_sampling import SMOTE

# Build the feature matrix from DF_Fraud *after* Time/Amount were standardized.
# The earlier `X` array was copied before scaling, so resampling it would train
# the models on unscaled features while X_test (taken from DF_Fraud) is scaled.
feature_frame = DF_Fraud.drop(columns=['Class', 'Pca-one', 'Pca-two'], errors='ignore')
X_scaled = feature_frame.to_numpy(dtype=float)

# Oversample the minority class with SMOTE.
# `kind=` and `fit_sample` no longer exist in current imbalanced-learn; plain
# SMOTE() with `fit_resample` is the replacement for the old 'regular' variant.
# NOTE(review): resampling happens on the full dataset, before the train/test
# split, so synthetic points can be derived from rows that later end up in
# X_test - consider resampling the training split only.
Resmpl = SMOTE(random_state=0)
X_Resmpl, y_Resmpl = Resmpl.fit_resample(X_scaled, y)

# Same two-column scatter as before, now on the balanced data.
plt.scatter(X_Resmpl[y_Resmpl == 0, 0], X_Resmpl[y_Resmpl == 0, 1], label="Not Fraud")
plt.scatter(X_Resmpl[y_Resmpl == 1, 0], X_Resmpl[y_Resmpl == 1, 1], label="Fraud",c='r')
plt.legend()
plt.show()

# Class counts after and before resampling (`pd.value_counts` is deprecated).
print(pd.Series(y_Resmpl).value_counts())
print(pd.Series(y).value_counts())

In [None]:
# Re-draw the per-column histograms from the current state of DF_Fraud.
hist_figsize = (20, 20)
DF_Fraud.hist(figsize=hist_figsize)
plt.show()

In [None]:
# Correlation matrix with rows ordered by their correlation with Class
# (strongest positive first); only the Class column is drawn.
corr = DF_Fraud.corr()
corr = corr.sort_values(by='Class', ascending=False)
correlation = corr.loc[:, ['Class']]

plt.figure(figsize=(12, 20))
sns.heatmap(correlation, annot=True)
plt.title('Heatmap of Correlation')

## Part 4: Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA

# Project the feature columns onto their first two principal components.
# The Class label is excluded so it cannot influence the projection it is later
# coloured by, and any existing 'Pca-*' columns are excluded so that re-running
# this cell does not feed earlier components back into the fit.
pca_features = DF_Fraud.drop(columns=['Class', 'Pca-one', 'Pca-two'], errors='ignore')
pca = PCA(n_components=2)
pca_result = pca.fit_transform(pca_features)

# Store the two components on the frame for plotting.
DF_Fraud['Pca-one'] = pca_result[:,0]
DF_Fraud['Pca-two'] = pca_result[:,1]

import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of the two stored PCA components, coloured by class.
plt.figure(figsize=(16, 10))

scatter_options = dict(
    x="Pca-one",
    y="Pca-two",
    hue="Class",
    legend="full",
    alpha=0.3,
)
sns.scatterplot(data=DF_Fraud, **scatter_options)
plt.show()

## Part 5: Splitting Test & Train

In [None]:
from sklearn.model_selection import train_test_split

# Select the label and the features by column name rather than by position.
# Positional slicing (last three columns = Class, Pca-one, Pca-two) only works
# when the PCA cell has run exactly as written; with names the split is correct
# whether or not the PCA columns are present.
y = DF_Fraud['Class']
X = DF_Fraud.drop(columns=['Class', 'Pca-one', 'Pca-two'], errors='ignore')

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X , y ,train_size=0.70, test_size=0.30, random_state=0)

## Part 6: Machine Learning Methods

### A - Supervised Learning

#### 6-1: KFold & Resampling (Synthetic Minority Over-sampling Technique (SMOTE))

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from imblearn.over_sampling import SMOTE

# Shared cross-validation splitter: 10 shuffled folds with a fixed seed.
Spilit = 10
kfold = KFold(n_splits=Spilit, shuffle=True, random_state=123)

# Per-model CV score arrays and their display names, filled by the cells below
# and consumed by the comparison box plot.
results, names = [], []

#### 6-2: Logistic Regression

##### Cross Validation

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, scored by ROC AUC over the shared folds.
LR = LogisticRegression()
LR_CV = cross_val_score(estimator=LR, X=X_Resmpl, y=y_Resmpl, scoring='roc_auc', cv=kfold)

# Record the fold scores for the final comparison plot.
names.append('Logistic Regression')
results.append(LR_CV)

LR_CV.mean()

##### Confusion Matrix

In [None]:
# Fit on the resampled data and predict the test split.
# NOTE(review): X_Resmpl is resampled from the full dataset before the split, so
# X_test is not independent of the training data - verify.
LR.fit(X_Resmpl, y_Resmpl)
LR_Predict = LR.predict(X_test)

# Confusion matrix drawn as a labelled heatmap.
class_names = ['Not_Fraud', 'Fraud']
matrix = confusion_matrix(y_test, LR_Predict)
dataframe = pd.DataFrame(matrix, index=class_names, columns=class_names)
sns.heatmap(dataframe, annot=True, cbar=None, cmap="Blues", fmt='g')
plt.title("Confusion Matrix")
plt.tight_layout()
plt.ylabel("True Class")
plt.xlabel("Predicted Class")
plt.show()

print(classification_report(y_test, LR_Predict, target_names=class_names))

# Unpack the 2x2 matrix (rows = true class, columns = predicted class).
(TN, FP), (FN, TP) = matrix
acc = (TP + TN) / (TP + FP + TN + FN)
rec = TP / (TP + FN)
spe = TN / (TN + FP)
pre = TP / (TP + FP)

print("Accuracy: %.3f" % acc)
print("Sensitivity (Recall): %.3f" % rec)
print("Specificity: %.3f" % spe)
print("Precision: %.3f" % pre)

#### 6-3: KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours with default settings, scored by ROC AUC over the shared folds.
KNN = KNeighborsClassifier()
KNN_CV = cross_val_score(estimator=KNN, X=X_Resmpl, y=y_Resmpl, scoring='roc_auc', cv=kfold)

# Record the fold scores for the final comparison plot.
names.append('KNN')
results.append(KNN_CV)

KNN_CV.mean()

##### Confusion Matrix

In [None]:
# Fit on the resampled data and predict the test split.
# NOTE(review): X_Resmpl is resampled from the full dataset before the split, so
# X_test is not independent of the training data - verify.
KNN.fit(X_Resmpl, y_Resmpl)
KNN_Predict = KNN.predict(X_test)

# Confusion matrix drawn as a labelled heatmap.
class_names = ['Not_Fraud', 'Fraud']
matrix = confusion_matrix(y_test, KNN_Predict)
dataframe = pd.DataFrame(matrix, index=class_names, columns=class_names)
sns.heatmap(dataframe, annot=True, cbar=None, cmap="Blues", fmt='g')
plt.title("Confusion Matrix")
plt.tight_layout()
plt.ylabel("True Class")
plt.xlabel("Predicted Class")
plt.show()

print(classification_report(y_test, KNN_Predict, target_names=class_names))

# Unpack the 2x2 matrix (rows = true class, columns = predicted class).
(TN, FP), (FN, TP) = matrix
acc = (TP + TN) / (TP + FP + TN + FN)
rec = TP / (TP + FN)
spe = TN / (TN + FP)
pre = TP / (TP + FP)

print("Accuracy: %.3f" % acc)
print("Sensitivity (Recall): %.3f" % rec)
print("Specificity: %.3f" % spe)
print("Precision: %.3f" % pre)

#### 6-4: Decision Tree

##### Cross Validation

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default settings, scored by ROC AUC over the shared folds.
CART = DecisionTreeClassifier()
CART_CV = cross_val_score(estimator=CART, X=X_Resmpl, y=y_Resmpl, scoring='roc_auc', cv=kfold)

# Record the fold scores for the final comparison plot.
names.append('Decision Tree')
results.append(CART_CV)

CART_CV.mean()

##### Confusion Matrix


In [None]:
# Fit on the resampled data and predict the test split.
# NOTE(review): X_Resmpl is resampled from the full dataset before the split, so
# X_test is not independent of the training data - verify.
CART.fit(X_Resmpl, y_Resmpl)
CART_Predict = CART.predict(X_test)

# Confusion matrix drawn as a labelled heatmap.
class_names = ['Not_Fraud', 'Fraud']
matrix = confusion_matrix(y_test, CART_Predict)
dataframe = pd.DataFrame(matrix, index=class_names, columns=class_names)
sns.heatmap(dataframe, annot=True, cbar=None, cmap="Blues", fmt='g')
plt.title("Confusion Matrix")
plt.tight_layout()
plt.ylabel("True Class")
plt.xlabel("Predicted Class")
plt.show()

print(classification_report(y_test, CART_Predict, target_names=class_names))

# Unpack the 2x2 matrix (rows = true class, columns = predicted class).
(TN, FP), (FN, TP) = matrix
acc = (TP + TN) / (TP + FP + TN + FN)
rec = TP / (TP + FN)
spe = TN / (TN + FP)
pre = TP / (TP + FP)

print("Accuracy: %.3f" % acc)
print("Sensitivity (Recall): %.3f" % rec)
print("Specificity: %.3f" % spe)
print("Precision: %.3f" % pre)

#### 6-5: SVC

##### Cross Validation

In [None]:
from sklearn.svm import SVC
# Support-vector classifier with default settings.
SVM = SVC()

# This model gets its own 3-fold splitter instead of the shared 10-fold one
# (presumably to keep the SVC run time manageable - confirm).
kfold_SVM = KFold(n_splits=3, shuffle=True, random_state=123)
SVM_CV = cross_val_score(estimator=SVM, X=X_Resmpl, y=y_Resmpl, scoring='roc_auc', cv=kfold_SVM)

# Record the fold scores for the final comparison plot.
names.append('SVC')
results.append(SVM_CV)

SVM_CV.mean()

##### Confusion Matrix


In [None]:
# Fit on the resampled data and predict the test split.
# NOTE(review): X_Resmpl is resampled from the full dataset before the split, so
# X_test is not independent of the training data - verify.
SVM.fit(X_Resmpl, y_Resmpl)
SVM_Predict = SVM.predict(X_test)

# Confusion matrix drawn as a labelled heatmap.
class_names = ['Not_Fraud', 'Fraud']
matrix = confusion_matrix(y_test, SVM_Predict)
dataframe = pd.DataFrame(matrix, index=class_names, columns=class_names)
sns.heatmap(dataframe, annot=True, cbar=None, cmap="Blues", fmt='g')
plt.title("Confusion Matrix")
plt.tight_layout()
plt.ylabel("True Class")
plt.xlabel("Predicted Class")
plt.show()

print(classification_report(y_test, SVM_Predict, target_names=class_names))

# Unpack the 2x2 matrix (rows = true class, columns = predicted class).
(TN, FP), (FN, TP) = matrix
acc = (TP + TN) / (TP + FP + TN + FN)
rec = TP / (TP + FN)
spe = TN / (TN + FP)
pre = TP / (TP + FP)

print("Accuracy: %.3f" % acc)
print("Sensitivity (Recall): %.3f" % rec)
print("Specificity: %.3f" % spe)
print("Precision: %.3f" % pre)

#### 6-6: Random Forest

##### Cross Validation

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default settings, scored by ROC AUC over the shared folds.
RF = RandomForestClassifier()
RF_CV = cross_val_score(estimator=RF, X=X_Resmpl, y=y_Resmpl, scoring='roc_auc', cv=kfold)

# Record the fold scores for the final comparison plot.
names.append('Random Forest')
results.append(RF_CV)

RF_CV.mean()

##### Confusion Matrix

In [None]:
# Fit on the resampled data and predict the test split.
# NOTE(review): X_Resmpl is resampled from the full dataset before the split, so
# X_test is not independent of the training data - verify.
RF.fit(X_Resmpl, y_Resmpl)
RF_Predict = RF.predict(X_test)

# Confusion matrix drawn as a labelled heatmap.
class_names = ['Not_Fraud', 'Fraud']
matrix = confusion_matrix(y_test, RF_Predict)
dataframe = pd.DataFrame(matrix, index=class_names, columns=class_names)
sns.heatmap(dataframe, annot=True, cbar=None, cmap="Blues", fmt='g')
plt.title("Confusion Matrix")
plt.tight_layout()
plt.ylabel("True Class")
plt.xlabel("Predicted Class")
plt.show()

print(classification_report(y_test, RF_Predict, target_names=class_names))

# Unpack the 2x2 matrix (rows = true class, columns = predicted class).
(TN, FP), (FN, TP) = matrix
acc = (TP + TN) / (TP + FP + TN + FN)
rec = TP / (TP + FN)
spe = TN / (TN + FP)
pre = TP / (TP + FP)

print("Accuracy: %.3f" % acc)
print("Sensitivity (Recall): %.3f" % rec)
print("Specificity: %.3f" % spe)
print("Precision: %.3f" % pre)

#### 6-7: Naive Bayes

##### Cross Validation

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, scored by ROC AUC over the shared folds.
NB = GaussianNB()
NB_CV = cross_val_score(estimator=NB, X=X_Resmpl, y=y_Resmpl, scoring='roc_auc', cv=kfold)

# Record the fold scores for the final comparison plot.
names.append('Naive Bayes')
results.append(NB_CV)

NB_CV.mean()

##### Confusion Matrix

In [None]:
# Fit on the resampled data and predict the test split.
# NOTE(review): X_Resmpl is resampled from the full dataset before the split, so
# X_test is not independent of the training data - verify.
NB.fit(X_Resmpl, y_Resmpl)
NB_Predict = NB.predict(X_test)

# Confusion matrix drawn as a labelled heatmap.
class_names = ['Not_Fraud', 'Fraud']
matrix = confusion_matrix(y_test, NB_Predict)
dataframe = pd.DataFrame(matrix, index=class_names, columns=class_names)
sns.heatmap(dataframe, annot=True, cbar=None, cmap="Blues", fmt='g')
plt.title("Confusion Matrix")
plt.tight_layout()
plt.ylabel("True Class")
plt.xlabel("Predicted Class")
plt.show()

print(classification_report(y_test, NB_Predict, target_names=class_names))

# Unpack the 2x2 matrix (rows = true class, columns = predicted class).
(TN, FP), (FN, TP) = matrix
acc = (TP + TN) / (TP + FP + TN + FN)
rec = TP / (TP + FN)
spe = TN / (TN + FP)
pre = TP / (TP + FP)

print("Accuracy: %.3f" % acc)
print("Sensitivity (Recall): %.3f" % rec)
print("Specificity: %.3f" % spe)
print("Precision: %.3f" % pre)

## Algorithm Comparison

In [None]:
# Box plot of the cross-validation scores collected in `results`,
# one box per model, labelled with the matching entry of `names`.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

### B - Unsupervised Learning
We can implement some methods, such as a scatter matrix, to figure out whether our problem can be solved using unsupervised methods like K-Means.

#### 6-8: K-Means

In [None]:
from sklearn.cluster import MiniBatchKMeans 

# Fit a seeded 10-cluster mini-batch k-means model on the resampled features.
kmeans = MiniBatchKMeans(random_state=0, n_clusters=10)
kmeans.fit(X=X_Resmpl)

#### 6-9: Testing Using Elbow Method

In [None]:
from sklearn.model_selection import train_test_split
# Elbow method: fit one model per candidate cluster count and plot its score.
clustno = range(1, 10)

# Seed every model the same way as the other MiniBatchKMeans calls in this
# notebook, so the curve is reproducible between runs.
kmeans = [MiniBatchKMeans(n_clusters=k, random_state=0) for k in clustno]

# Iterate over the models directly instead of indexing by position.
# score() is documented by scikit-learn as the negative of the k-means objective.
score = [model.fit(X_Resmpl).score(X_Resmpl) for model in kmeans]

plt.plot(clustno, score)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve');

In [None]:
from sklearn.metrics import accuracy_score

# Two clusters, to be read as a fraud / not-fraud predictor.
kmeans = MiniBatchKMeans(n_clusters=2, random_state=0)
kmeans.fit(X_Resmpl)
X_test_clusters_centers = kmeans.cluster_centers_

# Cluster ids are arbitrary: cluster 0 is not guaranteed to be class 0.
# Map every cluster to the majority class of its training members before
# comparing predictions with y_test; otherwise the accuracy and confusion
# matrix can come out inverted.
train_classes = np.asarray(y_Resmpl).astype(int)
cluster_to_class = np.zeros(kmeans.n_clusters, dtype=int)
for cluster_id in range(kmeans.n_clusters):
    member_classes = train_classes[kmeans.labels_ == cluster_id]
    if member_classes.size:  # an empty cluster keeps the default class 0
        cluster_to_class[cluster_id] = np.bincount(member_classes).argmax()

# Predicted class label for each test row.
y_pred = cluster_to_class[kmeans.predict(X_test)]

print(accuracy_score(y_test, y_pred))

In [None]:
# Raw confusion-matrix counts for the k-means predictions.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
matrix

In [None]:
# Confusion matrix for the k-means predictions, drawn as a labelled heatmap.
class_names = ['Not_Fraud', 'Fraud']
matrix = confusion_matrix(y_test, y_pred)
dataframe = pd.DataFrame(matrix, index=class_names, columns=class_names)
sns.heatmap(dataframe, annot=True, cbar=None, cmap="Blues", fmt='g')
plt.title("Confusion Matrix")
plt.tight_layout()
plt.ylabel("True Class")
plt.xlabel("Predicted Class")
plt.show()

# Per-class precision / recall / F1 summary.
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
# Class counts in the resampled labels and in the current `y`.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print(pd.Series(y_Resmpl).value_counts())
print(pd.Series(y).value_counts())