# Money Laundering Detection with various models and an interactive UI

## First, let's work on our data

In [None]:
# Imports: data handling (pandas / numpy) and plotting (matplotlib / seaborn).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and convergence warnings raised by the cells below.
warnings.filterwarnings('ignore')

: 

In [None]:
# Load the two source tables.
# Forward slashes are used on purpose: '\M' inside a normal string literal is
# an invalid escape sequence, and a backslash-separated path only resolves on
# Windows. 'data/...' is accepted on Windows, Linux and macOS alike.
data1=pd.read_csv('data/ML.csv')
data2=pd.read_csv('data/MLtag.csv')

: 

In [None]:
# Preview the first rows of data1 (the table read from ML.csv)
data1.head()

: 

In [None]:
# Preview the first rows of data2 (the table read from MLtag.csv)
data2.head()

: 

In [None]:
# Missing-value audit: number of NaN entries in each column of data1
data1.isna().sum()

: 

In [None]:
# Missing-value audit: number of NaN entries in each column of data2
data2.isna().sum()

: 

In [None]:
# Remove, in place, every row of data1 that holds at least one missing value
data1.dropna(axis=0, how='any', inplace=True)

: 

In [None]:
# Remove, in place, every row of data2 that holds at least one missing value
data2.dropna(axis=0, how='any', inplace=True)

: 

In [None]:
# Parse the 'date' column, then split it into separate calendar/time columns
data1['date'] = pd.to_datetime(data1['date'])
for part in ('year', 'month', 'day', 'hour', 'minute'):
    # each component is read from the .dt accessor and stored under its own name
    data1[part] = getattr(data1['date'].dt, part)
data1.head()

: 

In [None]:
# Replace the labels of typeofaction and typeoffraud with integer category codes
for col in ('typeofaction', 'typeoffraud'):
    data1[col] = data1[col].astype('category').cat.codes
data1.head()

: 

In [None]:
# Frequency of each levelofcrime value in data2
data2['levelofcrime'].value_counts()

: 

In [None]:
# Replace the levelofcrime labels in data2 with integer category codes
data2['levelofcrime'] = data2['levelofcrime'].astype('category').cat.codes
data2.head()

: 

In [None]:
# Frequency of each levelofcrime value in data2 (run again after the encoding step)
data2['levelofcrime'].value_counts()

: 

In [None]:
# Remove the typeoffraud column from data1 in place, then preview the result
data1.drop(columns='typeoffraud', inplace=True)
data1.head()

: 

In [None]:
# Share of each isfraud class in percent.
# normalize=True turns the raw counts into fractions of the total, which are
# then scaled to percentages as the heading of this cell promises.
data1['isfraud'].value_counts(normalize=True) * 100

: 

In [None]:
# Pairwise relationship plots for data1, coloured by the isfraud label
sns.set_style("whitegrid")
sns.pairplot(data=data1, hue="isfraud")
plt.show()

: 

In [None]:
# Frequency of each transaction type.
# The full column is passed (not .unique(), which contains every value exactly
# once and therefore gives each bar a height of 1). x= is given by keyword
# because recent seaborn versions treat the first positional argument as `data`.
sns.countplot(x=data1['typeofaction'])

: 

In [None]:
# Number of transactions per source id.
# The full column is passed (not .unique(), which would give each bar a height
# of 1); x= is given by keyword because recent seaborn versions treat the first
# positional argument as `data`. One bar is drawn per distinct id.
sns.countplot(x=data1['sourceid'])

: 

In [None]:
# Number of transactions per destination id.
# The full column is passed (not .unique(), which would give each bar a height
# of 1); x= is given by keyword because recent seaborn versions treat the first
# positional argument as `data`. One bar is drawn per distinct id.
sns.countplot(x=data1['destinationid'])

: 

In [None]:
# Number of transactions per distinct amount.
# The full column is passed (not .unique(), which would give each bar a height
# of 1); x= is given by keyword because recent seaborn versions treat the first
# positional argument as `data`. One bar is drawn per distinct amount.
sns.countplot(x=data1['amountofmoney'])

: 

In [None]:
# Number of transactions per distinct timestamp.
# The full column is passed (not .unique(), which would give each bar a height
# of 1); x= is given by keyword because recent seaborn versions treat the first
# positional argument as `data`. One bar is drawn per distinct timestamp.
sns.countplot(x=data1['date'])

: 

In [None]:
# Class balance of the isfraud label.
# The full column is passed (not .unique(), which contains every value exactly
# once and therefore gives each bar a height of 1). x= is given by keyword
# because recent seaborn versions treat the first positional argument as `data`.
sns.countplot(x=data1['isfraud'])

: 

In [None]:
# Remove the raw date column from data1 in place
data1.drop(columns='date', inplace=True)

: 

In [None]:
# Correlation matrix of the numeric and boolean columns, drawn as an annotated heatmap.
# Non-numeric columns are filtered out explicitly: pandas >= 2.0 no longer drops
# them silently inside DataFrame.corr() and raises instead. When every column is
# already numeric the result is unchanged.
plt.figure(figsize=(10,10))
sns.heatmap(data1.select_dtypes(include=['number', 'bool']).corr(),annot=True)

: 

In [None]:
# Separate the target column (isfraud) from the remaining feature columns
y = data1['isfraud']
X = data1.drop(columns='isfraud')


: 

# Let's start the model

In [None]:
# Univariate scoring: chi-squared statistic of each feature against the target
from sklearn.feature_selection import SelectKBest, chi2
bestfeatures = SelectKBest(score_func=chi2, k=9)
fit = bestfeatures.fit(X, y)
# one single-column frame for the feature names, one for their scores
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # name the two columns
# show the (up to) ten highest-scoring features
print(featureScores.nlargest(n=10, columns='Score'))

: 

In [None]:
# Histogram of the numeric column(s) of featureScores (the chi-squared scores)
featureScores.hist()

: 

In [None]:
# Restrict the feature matrix to four hand-picked columns; the target is unchanged
y = data1['isfraud']
X = data1.loc[:, ['typeofaction', 'amountofmoney', 'hour', 'minute']]

: 

In [None]:
#pca analysis: project the selected features onto their first two principal components
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data = principalComponents
                , columns = ['principal component 1', 'principal component 2'])
# principalDf gets a fresh 0..n-1 index, while data1 keeps its original row
# labels (which have gaps if dropna removed any rows). pd.concat aligns on the
# index, so reset it first: rows are then paired by position instead of
# producing NaN-padded, misaligned rows.
finalDf = pd.concat([principalDf, data1[['isfraud']].reset_index(drop=True)], axis = 1)
finalDf.head()

: 

In [None]:
# Scatter plot of the two principal components, one colour per isfraud class
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
targets = [0,1]
colors = ['r', 'g']
for target, color in zip(targets, colors):
    # boolean mask selecting the rows of the current class
    indicesToKeep = finalDf['isfraud'] == target
    subset = finalDf[indicesToKeep]
    ax.scatter(
        subset['principal component 1'],
        subset['principal component 2'],
        c=color,
        s=50,
    )
ax.legend(targets)
ax.grid()

: 

In [None]:
# Class counts of the target before any resampling
y.value_counts()

: 

In [None]:
#oversampling: balance the classes with SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2)
# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same 1-D array.
# NOTE(review): the resampling runs on the full data set, before the train/test
# split in the next cell, so synthetic points interpolated from test rows can end
# up in the training set. Consider fitting SMOTE on the training portion only.
X_res, y_res = sm.fit_resample(X, y.to_numpy())

: 

In [None]:
# Baseline random forest: 80/20 hold-out split of the resampled data, then test accuracy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=0
)
# fit() returns the estimator itself, so construction and training can be chained
rfc = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))

: 

In [None]:
# Confusion matrix of the random forest predictions on the test split
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') abbreviates larger counts to two significant digits.
sns.heatmap(cm, annot=True, fmt='d')

: 

In [None]:
# Exhaustive 3-fold grid search over random forest hyper-parameters
from sklearn.model_selection import GridSearchCV
param_grid = dict(
    bootstrap=[True],
    max_depth=[80, 90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)
rf = RandomForestClassifier()
# n_jobs=-1 uses all available cores; verbose=2 logs every fit
grid_search = GridSearchCV(rf, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

: 

In [None]:
# Random forest with fixed hyper-parameters
# (presumably copied from an earlier grid-search run - verify against best_params_)
rfc = RandomForestClassifier(
    n_estimators=200,
    max_depth=90,
    max_features=2,
    min_samples_split=8,
    min_samples_leaf=3,
    bootstrap=True,
)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))

: 

In [None]:
# Serialize the trained random forest (rfc) to classifier.pkl
import pickle
# The with-block guarantees the file is flushed and closed even if
# pickle.dump raises part-way through.
with open("classifier.pkl","wb") as pickle_out:
    pickle.dump(rfc, pickle_out)

: 

In [None]:
# Baseline XGBoost classifier with default settings, scored on the test split
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X=X_train, y=y_train)
y_pred = xgb.predict(X_test)
test_accuracy_xgb = accuracy_score(y_test, y_pred)
print(test_accuracy_xgb)

: 

In [None]:
# Plot the feature importances of the fitted XGBoost model (xgb)
from xgboost import plot_importance
from matplotlib import pyplot
plot_importance(xgb)
pyplot.show()

: 

In [None]:
# Confusion matrix of the XGBoost predictions on the test split
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') abbreviates larger counts to two significant digits.
sns.heatmap(cm, annot=True, fmt='d')

: 

In [None]:
# Exhaustive 3-fold grid search over XGBoost hyper-parameters
from sklearn.model_selection import GridSearchCV
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': list(range(3, 11)),              # 3 .. 10
    'n_estimators': list(range(100, 1001, 100)),  # 100, 200, .. 1000
}
xgb = XGBClassifier()
# n_jobs=-1 uses all available cores; verbose=2 logs every fit
grid_search = GridSearchCV(xgb, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

: 

In [None]:
# XGBoost with fixed hyper-parameters
# (presumably copied from an earlier grid-search run - verify against best_params_)
xgb = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.2,
)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print(accuracy_score(y_test, y_pred))

: 

In [None]:
# Serialize the trained XGBoost model (xgb) to classifier2.pkl
import pickle
# The with-block guarantees the file is flushed and closed even if
# pickle.dump raises part-way through.
with open("classifier2.pkl","wb") as pickle_out:
    pickle.dump(xgb, pickle_out)

: 

In [None]:
# Baseline logistic regression with default settings, scored on the test split
from sklearn.linear_model import LogisticRegression
# fit() returns the estimator itself, so construction and training can be chained
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(accuracy_score(y_test, y_pred))

: 

In [None]:
# Confusion matrix of the logistic regression predictions on the test split
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') abbreviates larger counts to two significant digits.
sns.heatmap(cm, annot=True, fmt='d')

: 

In [None]:
# Logistic regression coefficients: printed per feature position, then drawn as bars
importance = logreg.coef_[0]
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))
# x positions are simply 0 .. n_features-1
plt.bar(range(len(importance)), importance)
plt.show()

: 

In [None]:
# Exhaustive 3-fold grid search over logistic regression hyper-parameters
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    # The default lbfgs solver rejects penalty='l1', so every l1 candidate
    # would fail to fit and never be scored. liblinear supports both penalties.
    'solver': ['liblinear']
}
logreg = LogisticRegression()
# n_jobs=-1 uses all available cores; verbose=2 logs every fit
grid_search = GridSearchCV(estimator = logreg, param_grid = param_grid,
                            cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

: 

In [None]:
# Logistic regression with fixed hyper-parameters
# (presumably copied from an earlier grid-search run - verify against best_params_)
logreg = LogisticRegression(penalty='l2', C=0.001)
logreg.fit(X=X_train, y=y_train)
y_pred = logreg.predict(X_test)
print(accuracy_score(y_test, y_pred))

: 

In [None]:
# Serialize the trained logistic regression (logreg) to classifier3.pkl
import pickle
# The with-block guarantees the file is flushed and closed even if
# pickle.dump raises part-way through.
with open("classifier3.pkl","wb") as pickle_out:
    pickle.dump(logreg, pickle_out)

: 

In [None]:
# Baseline support vector classifier with default settings, scored on the test split
from sklearn.svm import SVC
# fit() returns the estimator itself, so construction and training can be chained
svm = SVC().fit(X_train, y_train)
y_pred = svm.predict(X_test)
print(accuracy_score(y_test, y_pred))

: 

In [None]:
# Confusion matrix of the SVM predictions on the test split
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') abbreviates larger counts to two significant digits.
sns.heatmap(cm, annot=True, fmt='d')

: 

In [None]:
# Exhaustive 3-fold grid search over the RBF-kernel SVM hyper-parameters
from sklearn.model_selection import GridSearchCV
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)
svm = SVC()
# n_jobs=-1 uses all available cores; verbose=2 logs every fit
grid_search = GridSearchCV(svm, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
grid_search.best_params_


: 

In [None]:
# SVM with fixed hyper-parameters
# (presumably copied from an earlier grid-search run - verify against best_params_)
svm = SVC(kernel='rbf', C=1, gamma=0.0001)
svm.fit(X=X_train, y=y_train)
y_pred = svm.predict(X_test)
print(accuracy_score(y_test, y_pred))

: 

In [None]:
# Serialize the trained SVM (svm) to classifier4.pkl
import pickle
# The with-block guarantees the file is flushed and closed even if
# pickle.dump raises part-way through.
with open("classifier4.pkl","wb") as pickle_out:
    pickle.dump(svm, pickle_out)

: 

In [None]:
# Baseline k-nearest-neighbours classifier with default settings, scored on the test split
from sklearn.neighbors import KNeighborsClassifier
# fit() returns the estimator itself, so construction and training can be chained
knn = KNeighborsClassifier().fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))

: 

In [None]:
# Confusion matrix of the KNN predictions on the test split
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') abbreviates larger counts to two significant digits.
sns.heatmap(cm, annot=True, fmt='d')

: 

In [None]:
# Exhaustive 3-fold grid search over KNN hyper-parameters
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_neighbors': list(range(3, 11)),         # 3 .. 10
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': list(range(10, 101, 10)),     # 10, 20, .. 100
}
knn = KNeighborsClassifier()
# n_jobs=-1 uses all available cores; verbose=2 logs every fit
grid_search = GridSearchCV(knn, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

: 

In [None]:
# KNN with fixed hyper-parameters
# (presumably copied from an earlier grid-search run - verify against best_params_)
knn = KNeighborsClassifier(
    n_neighbors=10,
    weights='distance',
    algorithm='auto',
    leaf_size=10,
)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))

: 

In [None]:
# Serialize the trained KNN model (knn) to classifier5.pkl
import pickle
# The with-block guarantees the file is flushed and closed even if
# pickle.dump raises part-way through.
with open("classifier5.pkl","wb") as pickle_out:
    pickle.dump(knn, pickle_out)

: 

In [None]:
# Nearest-neighbour lookup (this cell was originally headed "KNN feature importance").
# NOTE(review): kneighbors with return_distance=False returns, for each row of
# X_train, the indices of its 10 nearest training samples - it does not rank
# features, so the variable name `importance` is misleading.
importance = knn.kneighbors(X_train, n_neighbors=10, return_distance=False)
print(importance)

: 

In [None]:
# Plot the neighbour indices found for the first training sample.
# `importance` holds the output of knn.kneighbors(..., return_distance=False),
# i.e. indices of neighbouring training samples - not a feature importance - so
# ticks, labels and title describe what is actually drawn.
plt.figure(figsize=(10, 10))
plt.plot(importance[0], 'o')
# one tick per plotted neighbour instead of one per feature column
plt.xticks(range(len(importance[0])),rotation=90)
plt.ylabel('index of neighbouring training sample')
plt.xlabel('neighbour rank (0 = closest)')
plt.title('Nearest neighbours of the first training sample')
plt.show()

: 

In [None]:
#effect of neighbors on accuracy: train/test accuracy for k = 1 .. 8
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
for i, k in enumerate(neighbors):
    # A separate name is used for the throw-away models so the tuned `knn`
    # fitted earlier is not overwritten; the cross-validation comparison
    # further down evaluates `knn` and would otherwise score the k=8 model.
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train, y_train)
    train_accuracy[i] = knn_k.score(X_train, y_train)
    test_accuracy[i] = knn_k.score(X_test, y_test)
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label = 'Testing Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()

: 

In [None]:
# Neural network: fully connected binary classifier, assembled layer by layer
from keras.optimizers import Adam
import tensorflow as tf

Model1 = tf.keras.Sequential()
# input width equals the number of feature columns in X_train
Model1.add(tf.keras.layers.Dense(32, activation="relu", input_shape=(X_train.shape[-1],)))
Model1.add(tf.keras.layers.Dense(128, activation="relu"))
Model1.add(tf.keras.layers.Dropout(0.3))
Model1.add(tf.keras.layers.Dense(256, activation="relu"))
Model1.add(tf.keras.layers.Dropout(0.3))
# single sigmoid unit -> probability of the positive class
Model1.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# Compile: Adam optimizer (0.005 passed as its first argument), binary cross-entropy, accuracy metric
Model1.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(0.005),
    metrics=["acc"],
)

# Train for 100 epochs, holding back 10% of the training data for validation
history1 = Model1.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=2048,
    validation_split=0.1,
    verbose=2,
)

: 

In [None]:
# Training curves of the neural network: one figure for loss, one for accuracy.
# The curve labelled 'test' is the validation split taken during fit (val_* keys).
for metric in ('loss', 'acc'):
    plt.plot(history1.history[metric], label='train')
    plt.plot(history1.history['val_' + metric], label='test')
    plt.legend()
    plt.show()


: 

In [None]:
# Confusion matrix of the neural network on the test split
from sklearn.metrics import confusion_matrix
# threshold the predicted probabilities at 0.5 to obtain class decisions
y_pred = Model1.predict(X_test) > 0.5
cm = confusion_matrix(y_test, y_pred)
print(cm)

: 

In [None]:
# Serialize the trained neural network (Model1) to classifier7.pkl
import pickle
# NOTE(review): pickling a Keras model is not supported by every
# TensorFlow/Keras version; Model1.save(...) is the documented way to persist
# one - confirm that this file loads back correctly in the target environment.
# The with-block guarantees the file is closed even if pickle.dump raises.
with open("classifier7.pkl","wb") as pickle_out:
    pickle.dump(Model1, pickle_out)

: 

In [None]:
# 10-fold cross-validation on the training split for every scikit-learn style model:
# prints mean accuracy and its standard deviation, both in percent
from sklearn.model_selection import cross_val_score
for estimator, headline in (
    (rfc, "Random Forest Classifier Accuracy: {:.2f} %"),
    (xgb, "XGBOOST Accuracy: {:.2f} %"),
    (logreg, "Logistic Regression Accuracy: {:.2f} %"),
    (svm, "SVM Accuracy: {:.2f} %"),
    (knn, "KNN Accuracy: {:.2f} %"),
):
    accuracies = cross_val_score(estimator=estimator, X=X_train, y=y_train, cv=10)
    print(headline.format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

: 

In [None]:
# Summary of the training feature frame (columns, dtypes, non-null counts)
X_train.info()

: 

: 