In [None]:
import pandas as pd
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, log_loss,average_precision_score,confusion_matrix,ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from imblearn.over_sampling import SMOTE

In [None]:
## Read both input files as comma-separated text whose first row is the header
csv_options = dict(sep=',', header=0)
total_data = pd.read_csv('fraud_payment_data', **csv_options)
final_features = pd.read_csv('total_features', **csv_options)

In [None]:
## Keep only the rows with a strictly positive USD_amount and renumber them from 0
has_positive_amount=total_data['USD_amount']>0
total_data=total_data.loc[has_positive_amount].reset_index(drop=True)
total_data

In [None]:
## Display the feature table that was read from 'total_features'
final_features

In [None]:
##Model testing without accounting for class imbalance
log_reg = LogisticRegression(penalty=None)
xgb=XGBClassifier()
rf = RandomForestClassifier()

X=StandardScaler().fit_transform(train_features.values)
## fit the model
rf.fit(X,y_train)
xgb.fit(X,y_train)
log_reg.fit(X,y_train)
log_reg.coef_

In [None]:
train_features=final_features[0:1000000]
validate_features=final_features[1000000:1250000]
test_features=final_features[1250000:-1]

y_train=total_data['Label'][0:1000000]
y_validate=total_data['Label'][1000000:1250000]
y_test=total_data['Label'][1250000:-1]

In [None]:
##Validation
cutoff = 0.1
## Scale the validation features with the mean/variance of the training features that X
## was built from; re-fitting a scaler on the validation split would put the inputs on a
## different scale than the one the models were fitted on.
X_validate = StandardScaler().fit(train_features.values).transform(validate_features.values)

## Logistic regression labels come from thresholding its probabilities at `cutoff`;
## XGBoost and Random Forest keep the labels returned by their own .predict().
for model, model_name, threshold in ((log_reg, "Logistic Regression", cutoff),
                                     (xgb, "XGBoost", None),
                                     (rf, "Random Forest", None)):
    ## store the predicted probabilities (column 1 of predict_proba)
    y_prob = model.predict_proba(X_validate)[:,1]
    ## assign the value based on the cutoff when one is given
    y_pred = model.predict(X_validate) if threshold is None else 1*(y_prob >= threshold)
    cutoff_note = "" if threshold is None else f" with a cutoff of {threshold}"
    ## print the accuracy; PR AUC is computed from the probabilities, because hard 0/1
    ## labels only describe a single point of the precision-recall curve
    print(f"The validation accuracy for {model_name}{cutoff_note}",
          "is", np.sum(y_pred == y_validate)/len(y_validate), "and PR AUC is", average_precision_score(y_validate,y_prob))
    print(classification_report(y_validate, y_pred))
    cm = confusion_matrix(y_validate, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix for {model.__class__.__name__}')
    plt.show()

In [None]:
#Testing
cutoff = 0.1
## Scale the test features with the mean/variance of the training features that X was
## built from; re-fitting a scaler on the test split would put the inputs on a different
## scale than the one the models were fitted on.
X_test = StandardScaler().fit(train_features.values).transform(test_features.values)

## Logistic regression labels come from thresholding its probabilities at `cutoff`;
## XGBoost and Random Forest keep the labels returned by their own .predict().
for model, model_name, threshold in ((log_reg, "Logistic Regression", cutoff),
                                     (xgb, "XGBoost", None),
                                     (rf, "Random Forest", None)):
    ## store the predicted probabilities (column 1 of predict_proba)
    y_prob = model.predict_proba(X_test)[:,1]
    ## assign the value based on the cutoff when one is given
    y_pred = model.predict(X_test) if threshold is None else 1*(y_prob >= threshold)
    cutoff_note = "" if threshold is None else f" with a cutoff of {threshold}"
    ## print the accuracy; PR AUC is computed from the probabilities, because hard 0/1
    ## labels only describe a single point of the precision-recall curve
    print(f"The test accuracy for {model_name}{cutoff_note}",
          "is", np.sum(y_pred == y_test)/len(y_test), "and PR AUC is", average_precision_score(y_test,y_prob))
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix for {model.__class__.__name__}')
    plt.show()

In [None]:
##Using SMOTE for handling class imbalance
## SMOTE generates synthetic samples at random; the fixed seed makes the resampled
## training set (and every score computed from it) reproducible across runs.
smote=SMOTE(random_state=1)
X_res,y_res=smote.fit_resample(X,y_train)

In [None]:
## Fresh estimators for the SMOTE-resampled training data
log_reg = LogisticRegression(penalty=None)
## fixed seeds so that re-running the notebook reproduces the same fitted models
xgb=XGBClassifier(random_state=1)
rf = RandomForestClassifier(random_state=1)

## fit the model
rf.fit(X_res,y_res)
xgb.fit(X_res,y_res)
log_reg.fit(X_res,y_res)
log_reg.coef_

In [None]:
#Validation
cutoff = 0.9
## X_res was resampled from X, i.e. from the standardized training features, so the
## validation features are scaled with the mean/variance of train_features; re-fitting
## a scaler on the validation split would put the inputs on a different scale than the
## one the models were fitted on.
X_validate = StandardScaler().fit(train_features.values).transform(validate_features.values)

## Logistic regression labels come from thresholding its probabilities at `cutoff`;
## XGBoost and Random Forest keep the labels returned by their own .predict().
for model, model_name, threshold in ((log_reg, "Logistic Regression", cutoff),
                                     (xgb, "XGBoost", None),
                                     (rf, "Random Forest", None)):
    ## store the predicted probabilities (column 1 of predict_proba)
    y_prob = model.predict_proba(X_validate)[:,1]
    ## assign the value based on the cutoff when one is given
    y_pred = model.predict(X_validate) if threshold is None else 1*(y_prob >= threshold)
    cutoff_note = "" if threshold is None else f" with a cutoff of {threshold}"
    ## print the accuracy; PR AUC is computed from the probabilities, because hard 0/1
    ## labels only describe a single point of the precision-recall curve
    print(f"The validation accuracy for {model_name}{cutoff_note}",
          "is", np.sum(y_pred == y_validate)/len(y_validate), "and PR AUC is", average_precision_score(y_validate,y_prob))
    print(classification_report(y_validate, y_pred))
    cm = confusion_matrix(y_validate, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix for {model.__class__.__name__}')
    plt.show()

In [None]:
##Test
## Same cutoff as the SMOTE validation cell (0.9). The earlier value of 0.1 matched the
## cutoff of the models trained without SMOTE, so the test score was not computed at the
## operating point that was examined on the validation split.
## NOTE(review): confirm 0.9 is the intended cutoff for the SMOTE logistic regression.
cutoff = 0.9
## X_res was resampled from X, i.e. from the standardized training features, so the test
## features are scaled with the mean/variance of train_features; re-fitting a scaler on
## the test split would put the inputs on a different scale than the one the models were
## fitted on.
X_test = StandardScaler().fit(train_features.values).transform(test_features.values)

## Logistic regression labels come from thresholding its probabilities at `cutoff`;
## XGBoost and Random Forest keep the labels returned by their own .predict().
for model, model_name, threshold in ((log_reg, "Logistic Regression", cutoff),
                                     (xgb, "XGBoost", None),
                                     (rf, "Random Forest", None)):
    ## store the predicted probabilities (column 1 of predict_proba)
    y_prob = model.predict_proba(X_test)[:,1]
    ## assign the value based on the cutoff when one is given
    y_pred = model.predict(X_test) if threshold is None else 1*(y_prob >= threshold)
    cutoff_note = "" if threshold is None else f" with a cutoff of {threshold}"
    ## print the accuracy; PR AUC is computed from the probabilities, because hard 0/1
    ## labels only describe a single point of the precision-recall curve
    print(f"The test accuracy for {model_name}{cutoff_note}",
          "is", np.sum(y_pred == y_test)/len(y_test), "and PR AUC is", average_precision_score(y_test,y_prob))
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix for {model.__class__.__name__}')
    plt.show()

In [None]:
##Collecting the fraud rows of the training features, to be duplicated against class imbalance
## y_train is attached as a temporary Label column (pandas aligns it on the index), the
## rows with Label equal to 1 are kept, and the helper column is removed again.
dupe=(
    train_features
    .assign(Label=y_train)
    .loc[lambda frame: frame.Label==1]
    .drop(columns='Label')
)
dupe

In [None]:
## Training features followed by 10 extra copies of the rows in `dupe`.
## A single concat builds the frame once; concatenating inside a loop re-copied the
## whole, ever-growing frame on every pass while producing the same rows in the same order.
n_copies=10
train_features_dupe=pd.concat([train_features]+[dupe]*n_copies)
train_features_dupe

In [None]:
## Labels for the enlarged training set: the original training labels followed by a 1.0
## for every row that was appended after the original training features
n_appended=len(train_features_dupe)-len(train_features)
y_train_dupe=np.concatenate([np.asarray(y_train),np.ones(n_appended)])
y_train_dupe

In [None]:
## Standardize the enlarged training features
X_dupe=StandardScaler().fit_transform(train_features_dupe.values)
## Refit the three existing estimators on the enlarged data, in the order rf, xgb, log_reg
for estimator in (rf,xgb,log_reg):
    estimator.fit(X_dupe,y_train_dupe)
## show the fitted logistic regression coefficients
log_reg.coef_

In [None]:
##Validation
cutoff = 0.5
## X_dupe was standardized with the statistics of train_features_dupe, so the validation
## features are scaled with that same mean/variance; re-fitting a scaler on the
## validation split would put the inputs on a different scale than the one the models
## were fitted on.
X_validate = StandardScaler().fit(train_features_dupe.values).transform(validate_features.values)

## Logistic regression labels come from thresholding its probabilities at `cutoff`;
## XGBoost and Random Forest keep the labels returned by their own .predict().
for model, model_name, threshold in ((log_reg, "Logistic Regression", cutoff),
                                     (xgb, "XGBoost", None),
                                     (rf, "Random Forest", None)):
    ## store the predicted probabilities (column 1 of predict_proba)
    y_prob = model.predict_proba(X_validate)[:,1]
    ## assign the value based on the cutoff when one is given
    y_pred = model.predict(X_validate) if threshold is None else 1*(y_prob >= threshold)
    cutoff_note = "" if threshold is None else f" with a cutoff of {threshold}"
    ## print the accuracy; PR AUC is computed from the probabilities, because hard 0/1
    ## labels only describe a single point of the precision-recall curve
    print(f"The validation accuracy for {model_name}{cutoff_note}",
          "is", np.sum(y_pred == y_validate)/len(y_validate), "and PR AUC is", average_precision_score(y_validate,y_prob))
    print(classification_report(y_validate, y_pred))
    cm = confusion_matrix(y_validate, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix for {model.__class__.__name__}')
    plt.show()

In [None]:
##Test
cutoff = 0.5
## X_dupe was standardized with the statistics of train_features_dupe, so the test
## features are scaled with that same mean/variance; re-fitting a scaler on the test
## split would put the inputs on a different scale than the one the models were fitted on.
X_test = StandardScaler().fit(train_features_dupe.values).transform(test_features.values)

## Logistic regression labels come from thresholding its probabilities at `cutoff`;
## XGBoost and Random Forest keep the labels returned by their own .predict().
for model, model_name, threshold in ((log_reg, "Logistic Regression", cutoff),
                                     (xgb, "XGBoost", None),
                                     (rf, "Random Forest", None)):
    ## store the predicted probabilities (column 1 of predict_proba)
    y_prob = model.predict_proba(X_test)[:,1]
    ## assign the value based on the cutoff when one is given
    y_pred = model.predict(X_test) if threshold is None else 1*(y_prob >= threshold)
    cutoff_note = "" if threshold is None else f" with a cutoff of {threshold}"
    ## print the accuracy; PR AUC is computed from the probabilities, because hard 0/1
    ## labels only describe a single point of the precision-recall curve
    print(f"The test accuracy for {model_name}{cutoff_note}",
          "is", np.sum(y_pred == y_test)/len(y_test), "and PR AUC is", average_precision_score(y_test,y_prob))
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix for {model.__class__.__name__}')
    plt.show()

In [None]:
##Manually downsampling the non-fraud data points to deal with class imbalance
## Draw the 20,000-row sample from the training rows labelled 0 only. Sampling from all
## training rows could also pick fraud rows, which then appeared a second time through
## `dupe` (the rows with Label == 1) that is appended below.
## NOTE(review): relies on y_train and train_features having the same length and row order.
non_fraud_features=train_features[y_train.values==0]
features_down=non_fraud_features.sample(n=20000,random_state=1)
## every sampled row is non-fraud, so its label is 0
y_train_down=np.zeros(len(features_down))
train_down=pd.concat([features_down,dupe],ignore_index=True)
## the rows appended after the sample all come from `dupe`, so they are labelled 1
y_down=np.append(y_train_down,np.ones(len(train_down)-len(features_down)))
## class balance of the downsampled training labels
plt.hist(y_down)
plt.show()

In [None]:
## Standardize the downsampled training features
X_down=StandardScaler().fit_transform(train_down.values)
## Refit the three existing estimators on the downsampled data, in the order rf, xgb, log_reg
for estimator in (rf,xgb,log_reg):
    estimator.fit(X_down,y_down)
## show the fitted logistic regression coefficients
log_reg.coef_

In [None]:
##Validation
cutoff = 0.5
## X_down was standardized with the statistics of train_down, so the validation features
## are scaled with that same mean/variance; re-fitting a scaler on the validation split
## would put the inputs on a different scale than the one the models were fitted on.
X_validate = StandardScaler().fit(train_down.values).transform(validate_features.values)

## Logistic regression labels come from thresholding its probabilities at `cutoff`;
## XGBoost and Random Forest keep the labels returned by their own .predict().
for model, model_name, threshold in ((log_reg, "Logistic Regression", cutoff),
                                     (xgb, "XGBoost", None),
                                     (rf, "Random Forest", None)):
    ## store the predicted probabilities (column 1 of predict_proba)
    y_prob = model.predict_proba(X_validate)[:,1]
    ## assign the value based on the cutoff when one is given
    y_pred = model.predict(X_validate) if threshold is None else 1*(y_prob >= threshold)
    cutoff_note = "" if threshold is None else f" with a cutoff of {threshold}"
    ## print the accuracy; PR AUC is computed from the probabilities, because hard 0/1
    ## labels only describe a single point of the precision-recall curve
    print(f"The validation accuracy for {model_name}{cutoff_note}",
          "is", np.sum(y_pred == y_validate)/len(y_validate), "and PR AUC is", average_precision_score(y_validate,y_prob))
    print(classification_report(y_validate, y_pred))
    cm = confusion_matrix(y_validate, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix for {model.__class__.__name__}')
    plt.show()

In [None]:
##Test
cutoff = 0.5
## X_down was standardized with the statistics of train_down, so the test features are
## scaled with that same mean/variance; re-fitting a scaler on the test split would put
## the inputs on a different scale than the one the models were fitted on.
X_test = StandardScaler().fit(train_down.values).transform(test_features.values)

## Logistic regression labels come from thresholding its probabilities at `cutoff`;
## XGBoost and Random Forest keep the labels returned by their own .predict().
for model, model_name, threshold in ((log_reg, "Logistic Regression", cutoff),
                                     (xgb, "XGBoost", None),
                                     (rf, "Random Forest", None)):
    ## store the predicted probabilities (column 1 of predict_proba)
    y_prob = model.predict_proba(X_test)[:,1]
    ## assign the value based on the cutoff when one is given
    y_pred = model.predict(X_test) if threshold is None else 1*(y_prob >= threshold)
    cutoff_note = "" if threshold is None else f" with a cutoff of {threshold}"
    ## print the accuracy; PR AUC is computed from the probabilities, because hard 0/1
    ## labels only describe a single point of the precision-recall curve
    print(f"The test accuracy for {model_name}{cutoff_note}",
          "is", np.sum(y_pred == y_test)/len(y_test), "and PR AUC is", average_precision_score(y_test,y_prob))
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix for {model.__class__.__name__}')
    plt.show()

In [None]:
# PCA experiment on the training features
# Step 1: standardize the training features
scaled_data = StandardScaler().fit_transform(train_features)

# Step 2: fit a 2-component PCA on the standardized data, then project that same data
pca = PCA(n_components=2)
pca.fit(scaled_data)
transformed_data = pca.transform(scaled_data)

# Step 3: print every stage of the pipeline, followed by the explained variance
report_lines = (
    ("Original data:\n", train_features),
    ("\nScaled data:\n", scaled_data),
    ("\nTransformed data (2 principal components):\n", transformed_data),
    ("\nExplained variance ratio per component:", pca.explained_variance_ratio_),
    ("Total explained variance:", np.sum(pca.explained_variance_ratio_)),
)
for heading, payload in report_lines:
    print(heading, payload)



In [None]:
# Display the first two PCA components, colored by the training label
pca_df = pd.DataFrame(data=transformed_data, columns=['PC1', 'PC2'])
# y_train is aligned to pca_df on the index when it is assigned as a column
pca_df['target'] = y_train
fig = px.scatter(
    pca_df,
    x='PC1',
    y='PC2',
    color='target',
    # the figure is a 2D scatter of two components, so the title says 2D rather than 3D
    title='Interactive 2D PCA of Transaction Dataset'
)
fig.show()

In [None]:
##Experimenting with LDA for separating the data
## Project the training features onto a single linear discriminant
lda = LinearDiscriminantAnalysis(n_components=1)
X_lda = lda.fit_transform(train_features, y_train)
print(X_lda.shape)

## Overlay one histogram of the projected values per class label
fig, ax = plt.subplots(figsize=(8, 4))
for class_label in (0, 1):
    ax.hist(X_lda[y_train == class_label], bins=20, alpha=0.7, label=f'Class {class_label}')
ax.set_title('LDA Projection onto 1D')
ax.set_xlabel('Linear Discriminant')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

In [None]:
##Correlation matrix of the training features together with the Label column
## The first 1,000,000 labels are placed next to the training features (joined on the index)
train_labels = total_data[:1000000]['Label']
features_with_label = pd.concat([train_features, train_labels], axis=1)
corr_matrix = features_with_label.corr()

## Draw the matrix as a heatmap without cell annotations
fig, ax = plt.subplots(figsize=(12,8))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()