In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
import pickle

In [2]:
# Load the labelled training transactions and preview the first rows.
TRANSACTIONS_CSV = "../data/transactions_train.csv"
transactions = pd.read_csv(TRANSACTIONS_CSV)
transactions.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0


In [3]:
# Columns that must not be fed to the model: the target itself, the time step
# and the two account identifiers.
non_feature_columns = ["isFraud", "step", "nameOrig", "nameDest"]

# "Unnamed: 0" is the row index that was written into the CSV (see the preview
# above). It carries no transaction information and would not exist at
# inference time, so it is removed as well. `errors="ignore"` keeps this cell
# working if the file is ever loaded with the index column already consumed.
X = (
    transactions
    .drop(columns=non_feature_columns)
    .drop(columns=["Unnamed: 0"], errors="ignore")
)
Y = transactions["isFraud"]

# Fraud is a tiny fraction of the rows, so stratify to give both splits the
# same fraud rate, and fix the seed so the split is reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)

### Encode transaction types

In [4]:
type_encoder = LabelEncoder()
type_names = X_train["type"].unique()

# Fit on the observed transaction types plus the "<unknown>" sentinel, so the
# sentinel is a regular class and `classes_` keeps the ordering that `fit`
# produces. Appending to `classes_` after fitting would leave the array
# unsorted, because "<" sorts before the upper-case type names.
type_encoder.fit(np.append(type_names, "<unknown>"))

### Transform categorical features

In [5]:
# Replace the textual transaction type with its integer code. A new frame is
# built with `assign` (the column keeps its position) instead of writing into
# `X_train` in place, which can emit pandas' SettingWithCopyWarning on a frame
# returned by train_test_split.
X_train = X_train.assign(type=type_encoder.transform(X_train["type"]))

In [6]:
# Learn the scaling statistics from the training split only, then apply them.
# Note: X_train is rebound from a DataFrame to the scaled NumPy array.
scaler = StandardScaler()
X_train = scaler.fit(X_train).transform(X_train)

### Train models

In [7]:
# Small fully connected network. `random_state` pins weight initialisation and
# batch shuffling so that a Restart & Run All reproduces the same model.
model = MLPClassifier(
    hidden_layer_sizes=(8, 16, 16, 8),
    max_iter=20,
    alpha=1e-5,
    solver='adam',
    verbose=1,
    random_state=42,
)
model = model.fit(X_train, y_train)

Iteration 1, loss = 0.00481587
Iteration 2, loss = 0.00270293
Iteration 3, loss = 0.00251771
Iteration 4, loss = 0.00241294
Iteration 5, loss = 0.00237966
Iteration 6, loss = 0.00230319
Iteration 7, loss = 0.00225011
Iteration 8, loss = 0.00221838
Iteration 9, loss = 0.00216235
Iteration 10, loss = 0.00212774
Iteration 11, loss = 0.00208365
Iteration 12, loss = 0.00205291
Iteration 13, loss = 0.00207433
Iteration 14, loss = 0.00203197
Iteration 15, loss = 0.00201852
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


### Evaluate

In [8]:
# Map any transaction type the encoder does not know to the "<unknown>"
# sentinel, then encode. `where` keeps known values and substitutes the
# sentinel elsewhere; `assign` returns a new frame (column order unchanged)
# rather than mutating `X_test` in place, which can emit pandas'
# SettingWithCopyWarning on a frame returned by train_test_split.
is_known_type = X_test["type"].isin(type_encoder.classes_)
X_test = X_test.assign(
    type=type_encoder.transform(X_test["type"].where(is_known_type, "<unknown>"))
)

In [9]:
# Apply the scaler fitted on the training split (transform only, no refit).
# Note: X_test is rebound to the scaler's output here.
X_test = scaler.transform(X_test)

In [10]:
predictions = model.predict(X_test)

# Use the built-in `round` rather than the `.round()` method: it works whether
# the metric comes back as a NumPy scalar or as a plain Python float.
f1score = round(f1_score(y_test, predictions), 4)
accuracy = round(accuracy_score(y_test, predictions), 4)
cm = confusion_matrix(y_test, predictions)

# With so few fraud rows, accuracy is dominated by the majority class; the F1
# score and the confusion matrix are the informative numbers here.
print(f"F1Score: {f1score}")
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{cm}")
print()

F1Score: 0.8202
Accuracy: 0.9996
Confusion Matrix:
[[1268602      62]
 [    437    1138]]



### Store Artifacts

In [11]:
def dump_artifact(artifact, artifact_path):
    """Pickle ``artifact`` to ``artifact_path``.

    Missing parent directories of ``artifact_path`` are created first, so the
    call does not fail on a fresh checkout where the output folder is absent.

    Parameters
    ----------
    artifact : object
        Any picklable Python object.
    artifact_path : str or os.PathLike
        Destination file; an existing file is overwritten.
    """
    from pathlib import Path

    destination = Path(artifact_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with open(destination, 'wb') as handle:
        pickle.dump(artifact, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Bundle everything needed to reproduce preprocessing and prediction
# (type encoder, trained model, feature scaler) into a single pickle file.
artifacts = dict(
    type_encoder=type_encoder,
    model=model,
    scaler=scaler,
)
dump_artifact(artifacts, "../artifacts/artifacts.pkl")