### Imports

In [45]:
import pickle
import sys

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

### Pre-processing

In [2]:
# Load the raw transaction records from a CSV file given by a relative path.
TRANSACTIONS_CSV = "transactions.csv"
data = pd.read_csv(TRANSACTIONS_CSV)


In [3]:
# Discard the two zip-code columns before any further processing.
# errors="ignore" keeps this cell re-runnable: `data` is rebound to the result, so a
# second execution would otherwise raise KeyError because the columns are already gone.
data = data.drop(columns=["zipcodeOri", "zipMerchant"], errors="ignore")

In [4]:
# Preview the first five rows of the current frame.
data.head(n=5)

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud
0,0,'C1093826151','4','M','M348934600','es_transportation',4.55,0
1,0,'C352968107','2','M','M348934600','es_transportation',39.68,0
2,0,'C2054744914','4','F','M1823072687','es_transportation',26.89,0
3,0,'C1760612790','3','M','M348934600','es_transportation',17.25,0
4,0,'C757503768','5','M','M348934600','es_transportation',35.72,0


In [5]:
from imblearn.over_sampling import SMOTE

In [11]:
# Clean, label-encode and scale the transaction frame.
cols_to_change = ['customer', 'age', 'gender', 'merchant', 'category']

# The raw values are wrapped in single quotes (e.g. 'C1093826151'); strip them with
# the vectorised string accessor instead of the deprecated DataFrame.applymap.
# The dtype guard makes the cell safe to re-run: once a column has been label-encoded
# it is numeric and has no quotes left to remove.
for col in cols_to_change:
    if not pd.api.types.is_numeric_dtype(data[col]):
        data[col] = data[col].str.replace("'", "", regex=False)

# Label-encode every categorical column. The fitted encoders are kept so the integer
# codes can be translated back with encoders[col].inverse_transform(...).
encoders = {}
for col in ['gender', 'age', 'category', 'customer', 'merchant']:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    encoders[col] = encoder

# Standardise the transaction amount to zero mean / unit variance.
# NOTE(review): the scaler is fitted on the full data set, i.e. before the train/test
# split, so test-set statistics influence the scaled training values.
data['amount'] = StandardScaler().fit_transform(data[['amount']]).ravel()


In [13]:
# Separate the `fraud` label column from the remaining predictor columns.
target_column = 'fraud'
y = data[target_column]
X = data.drop(columns=[target_column])

In [14]:
# SMOTE oversampler with a fixed seed so the synthetic samples are reproducible.
smote_settings = {"random_state": 42}
sm = SMOTE(**smote_settings)

In [15]:
# Oversample the complete data set and wrap the resampled target in a DataFrame.
# NOTE(review): this resamples before any train/test split; confirm it is still needed.
resampled_full = sm.fit_resample(X, y)
X_res = resampled_full[0]
y_res = pd.DataFrame(resampled_full[1])

In [17]:
# Preview the first five rows of the encoded and scaled frame.
data.head(n=5)

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud
0,0,210,4,2,30,12,-0.299276,0
1,0,2753,2,2,30,12,0.016067,0
2,0,2285,4,1,18,12,-0.098742,0
3,0,1650,3,2,30,12,-0.185275,0
4,0,3585,5,2,30,12,-0.01948,0


In [20]:
# Plain-text dump of the frame (pandas truncates the middle rows automatically).
print(str(data))

        step  customer  age  gender  merchant  category    amount  fraud
0          0       210    4       2        30        12 -0.299276      0
1          0      2753    2       2        30        12  0.016067      0
2          0      2285    4       1        18        12 -0.098742      0
3          0      1650    3       2        30        12 -0.185275      0
4          0      3585    5       2        30        12 -0.019480      0
...      ...       ...  ...     ...       ...       ...       ...    ...
594638   179      1639    3       1        18        12 -0.155832      0
594639   179      3369    4       1        18        12  0.115256      0
594640   179       529    2       1        31         2 -0.138687      0
594641   179      1083    5       2        18        12 -0.210319      0
594642   179      3304    4       1        18        12 -0.098383      0

[594643 rows x 8 columns]


### Train

In [26]:
# Rebuild the feature matrix and the target from the pre-processed frame.
X = data.drop(columns=['fraud'])
# Bind the target to `y`, the name the train/test split cell actually reads. Defining
# only `Y` here made that cell depend on a `y` left over from an earlier cell, which
# breaks under Restart & Run All if that cell is removed or reordered.
y = data['fraud']
Y = y  # alias kept for the cells that reference the capitalised name

In [28]:
# Show the feature matrix followed by the target, one after the other.
print(X, Y, sep="\n")

        step  customer  age  gender  merchant  category    amount
0          0       210    4       2        30        12 -0.299276
1          0      2753    2       2        30        12  0.016067
2          0      2285    4       1        18        12 -0.098742
3          0      1650    3       2        30        12 -0.185275
4          0      3585    5       2        30        12 -0.019480
...      ...       ...  ...     ...       ...       ...       ...
594638   179      1639    3       1        18        12 -0.155832
594639   179      3369    4       1        18        12  0.115256
594640   179       529    2       1        31         2 -0.138687
594641   179      1083    5       2        18        12 -0.210319
594642   179      3304    4       1        18        12 -0.098383

[594643 rows x 7 columns]
0         0
1         0
2         0
3         0
4         0
         ..
594638    0
594639    0
594640    0
594641    0
594642    0
Name: fraud, Length: 594643, dtype: int64


In [29]:
# Hold out 30% of the transactions for testing.
# stratify=y keeps the fraud rate the same in both splits; the target looks heavily
# imbalanced, so an unstratified split can shift the minority share.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Later cells add feature columns to these frames; explicit copies make that safe and
# avoid pandas' SettingWithCopyWarning on the slices returned by the split.
X_train = X_train.copy()
X_test = X_test.copy()

In [35]:
# Build a customer-merchant graph from the training transactions only, so nothing from
# the test split leaks into the graph features.
#
# `customer` and `merchant` are both label-encoded integers with overlapping ranges.
# Using the raw codes as node ids would merge e.g. customer 30 and merchant 30 into a
# single node, so every node is namespaced as (kind, id).
edge_pairs = X_train[['customer', 'merchant']].drop_duplicates()  # one edge per pair
G = nx.Graph()
G.add_edges_from(
    (('customer', cust), ('merchant', merch))
    for cust, merch in edge_pairs.itertuples(index=False, name=None)
)

# Exact betweenness runs a shortest-path search from every node and can take a long
# time. Leave this as None for the exact value, or set it to an integer (e.g. 500) to
# estimate betweenness from that many randomly sampled source nodes.
BETWEENNESS_SAMPLE_SIZE = None
betweenness_k = (
    None
    if BETWEENNESS_SAMPLE_SIZE is None
    else min(BETWEENNESS_SAMPLE_SIZE, G.number_of_nodes())
)

# Calculate graph metrics (dicts keyed by the namespaced node tuples).
degree_centrality = nx.degree_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G, k=betweenness_k, seed=42)

graph_metrics = {
    'degree_centrality': degree_centrality,
    'closeness_centrality': closeness_centrality,
    'betweenness_centrality': betweenness_centrality,
}

# Work on copies so adding columns never writes into a view of the original frame.
X_train = X_train.copy()
X_test = X_test.copy()

# Attach each customer's centrality to its transactions. Customers that do not occur
# in the training graph (possible in the test split) get 0.
for feature_name, node_scores in graph_metrics.items():
    customer_scores = {
        node_id: score
        for (kind, node_id), score in node_scores.items()
        if kind == 'customer'
    }
    X_train[feature_name] = X_train['customer'].map(customer_scores).fillna(0)
    X_test[feature_name] = X_test['customer'].map(customer_scores).fillna(0)


shortest_path will return an iterator that yields
(node, path) pairs instead of a dictionary when source
and target are unspecified beginning in version 3.5

To keep the current behavior, use:

	dict(nx.shortest_path(G))


KeyboardInterrupt: 

In [None]:
# Oversample the minority class on the training split only; the test split is left
# untouched so evaluation sees the original class balance.
sm = SMOTE(random_state=42)
resampled_train = sm.fit_resample(X_train, y_train)
X_res, y_res = resampled_train


In [None]:
# Hyper-parameter search for a gradient-boosting classifier on the oversampled
# training data. Three metrics are recorded per candidate; no model is refit.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
}

# Search space.
# - "loss" is left at the estimator default (log-loss); the old "deviance" alias has
#   been removed from current scikit-learn releases.
# - criterion "mae" has been removed as well; "squared_error" is the remaining
#   alternative to "friedman_mse".
parameters = {
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth": [3, 5, 8],
    "max_features": ["log2", "sqrt"],
    "criterion": ["friedman_mse", "squared_error"],
    "subsample": [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators": [10, 70, 80],
}

# The full grid has several hundred thousand combinations, which is not feasible to
# fit exhaustively, so a fixed number of candidates is sampled from the same space.
# Raise N_SEARCH_ITER for a more thorough search.
N_SEARCH_ITER = 50
clf = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=42),  # seeded: subsample/max_features add randomness
    parameters,
    n_iter=N_SEARCH_ITER,
    scoring=scoring,
    refit=False,  # multi-metric search: results are only inspected, no best model is refit
    cv=2,
    n_jobs=-1,
    random_state=42,
)
# np.ravel accepts the target both as a Series and as a single-column DataFrame.
clf.fit(X_res, np.ravel(y_res))

# Convert the cross-validation results to a DataFrame for inspection.
df = pd.DataFrame.from_dict(clf.cv_results_)

# cv=2 produces two folds, reported as split0 and split1 for every metric.
split_score_columns = [
    'split0_test_accuracy', 'split1_test_accuracy',
    'split0_test_precision', 'split1_test_precision',
    'split0_test_recall', 'split1_test_recall',
]

# Held-out evaluation (classification_report / confusion_matrix on X_test) needs a
# fitted final model and belongs in the Evaluate section.

# Show the per-fold scores next to the sampled parameters, best mean recall first.
df.sort_values('mean_test_recall', ascending=False)[['params'] + split_score_columns]

## Evaluate