# Train Test Split

In [None]:
# Load the intermediate dataset from the "Cleaned data" folder.
# index_col=None means no CSV column is promoted to the DataFrame index.
data = pd.read_csv(
    "Cleaned data/finalised_data.csv",
    index_col=None,
)

In [None]:
# Feature selection for numeric variables: keep only the int64/float64 columns.
numeric_columns = data.select_dtypes(
    include=['int64', 'float64'],
)
# Per-column variance, printed so that near-constant features can be spotted.
variance = numeric_columns.var()
print(variance)

Transaction Amount    73113.509492
Quantity                  2.001479
Customer Age            100.062697
Is Fraudulent             0.047616
Account Age Days      11419.962167
Transaction Hour         48.664510
Age Group                 1.163202
ShipBill                  0.090124
dtype: float64


In [None]:
# Variability check for categorical variables: keep only the object-dtype columns.
categorical_columns = data.select_dtypes(include=['object'])
# Print how often each distinct value occurs, one column at a time, so that
# columns lacking variance can be identified and removed.
for column, column_values in categorical_columns.items():
    print(column_values.value_counts())

Transaction ID
15d2e414-8735-46fc-9e02-80b472b2580f    1
a81a89d3-15db-498d-b7a2-715233d5c477    1
e7e0aafc-d2f2-418d-9b7f-cc05cb9b2a8b    1
89fbc566-0421-4859-be00-a4803d0ed21a    1
8406043a-d4e0-463e-8ca4-bc3492c87503    1
                                       ..
6d036b10-29b9-46db-8024-f0ddb8c357aa    1
76fbbad6-34ca-4f1f-a503-dd5013ececab    1
59a01e3a-1539-4d17-bd21-adbfcdbeedcd    1
96cdb681-8b01-4611-a1a5-28a4fac98af3    1
d1a811a2-6015-47fa-95e5-0a5282755932    1
Name: count, Length: 1472694, dtype: int64
Customer ID
d1b87f62-51b2-493b-ad6a-77e0fe13e785    1
befaa147-b877-47f3-85a9-1920711f36fe    1
4bdf4788-d8b9-4e8f-bb69-b5a068c59f71    1
f1e7c30f-d719-4dff-ba6a-4f08fe5d62da    1
3407fadb-fd04-4ebe-947f-9e8541bf68eb    1
                                       ..
b587a76d-5830-4b6f-833b-f7958f8d7647    1
7783a7ea-7a52-4c79-893b-2834b19647b9    1
1c94d0c8-34bc-4df2-b263-7b1f232601fd    1
100fa7e8-8b17-4775-aed0-df01a255ac06    1
637d5ed1-a4af-4234-a1d9-a3791b6f1bb6    1
Name: 

In [None]:
# Remove variables that are not used as model features.
columns_to_drop = [
    'Shipping Address',
    'Billing Address',
    'Transaction ID',
    'Customer ID',
    'Transaction Date',
    'IP Address',
    'Customer Location',
]
# errors='ignore' keeps this cell idempotent: because `data` is rebound to the
# reduced frame, re-running the cell would otherwise raise a KeyError for
# columns that were already removed by the first run.
data = data.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Number of missing values in each remaining column (isna is an alias of isnull).
missing_counts = data.isna().sum()
print(missing_counts)

Transaction Amount                  0
Quantity                            0
Customer Age                        0
Is Fraudulent                       0
Account Age Days                    0
Transaction Hour                    0
Age Group                           0
addressesMatch                      0
isPOBox                             0
ShipBill                            0
Payment Method_PayPal               0
Payment Method_bank transfer        0
Payment Method_credit card          0
Payment Method_debit card           0
Product Category_clothing           0
Product Category_electronics        0
Product Category_health & beauty    0
Product Category_home & garden      0
Product Category_toys & games       0
Device Used_desktop                 0
Device Used_mobile                  0
Device Used_tablet                  0
dtype: int64


In [None]:
# Train test split
# Features: every column except the target. Target: the 'Is Fraudulent' flag.
X = data.drop(columns='Is Fraudulent')
y = data['Is Fraudulent']
# stratify=y keeps the fraud / non-fraud ratio identical in both partitions.
# The target is rare (its variance of ~0.048 printed above corresponds to
# roughly 5% positives, assuming a 0/1 flag), so an unstratified split can
# leave the test set with a noticeably different fraud rate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)


# Model Creation and Evaluation


In [None]:
from numpy import mean, std
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import precision_recall_curve, auc, make_scorer

# Precision-recall AUC evaluation metric
def pr_auc(y_true, probas_pred):
    """Return the area under the precision-recall curve.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, passed straight to ``precision_recall_curve``.
    probas_pred : array-like
        Predicted scores/probabilities, passed straight to
        ``precision_recall_curve``.

    Returns
    -------
    float
        ``auc(recall, precision)`` for the computed curve.
    """
    # Thresholds are returned by the curve function but are not needed here.
    precision, recall, _thresholds = precision_recall_curve(y_true, probas_pred)
    # Recall is the x-axis and precision the y-axis of the curve.
    return auc(recall, precision)

In [None]:
# Create model evaluation function
def evaluate_model(X, y, model, n_splits=10, n_repeats=3, random_state=1, n_jobs=None):
    """Cross-validate ``model`` on ``(X, y)`` using the precision-recall AUC.

    Parameters
    ----------
    X : feature matrix accepted by ``cross_validate``.
    y : target vector accepted by ``cross_validate``.
    model : unfitted estimator exposing ``predict_proba``.
    n_splits : int, default 10
        Number of stratified folds per repetition.
    n_repeats : int, default 3
        Number of times the stratified k-fold procedure is repeated.
    random_state : int, default 1
        Seed for the fold assignment, so repeated calls use the same folds.
    n_jobs : int or None, default None
        Forwarded to ``cross_validate``; ``None`` keeps the original serial
        behaviour, ``-1`` uses all cores.

    Returns
    -------
    scores : the ``'test_score'`` entry of the ``cross_validate`` result
        (one PR-AUC value per fold).
    trained_model : the ``'estimator'`` entry of the ``cross_validate`` result
        (the estimators fitted on each fold).
    """
    import inspect

    # define evaluation procedure
    cv = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )

    # Define the evaluation metric. `needs_proba` was deprecated in
    # scikit-learn 1.4 and later removed; on releases without it the keyword is
    # swallowed by **kwargs and forwarded to pr_auc, which breaks scoring.
    # Detect the supported spelling from the signature instead of guessing.
    if "response_method" in inspect.signature(make_scorer).parameters:
        metric = make_scorer(pr_auc, response_method="predict_proba")
    else:
        metric = make_scorer(pr_auc, needs_proba=True)

    # Perform cross-validation, keeping the estimator fitted on every fold.
    results = cross_validate(
        model,
        X,
        y,
        scoring=metric,
        cv=cv,
        n_jobs=n_jobs,
        return_train_score=False,
        return_estimator=True,
    )

    # Extract scores and trained models
    scores = results['test_score']
    trained_model = results['estimator']
    return scores, trained_model

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier

# Define models
def get_models(random_state=0):
    """Build the candidate classifiers to compare.

    Parameters
    ----------
    random_state : int or None, default 0
        Seed passed to every estimator that has a stochastic component, so
        repeated runs of the notebook give the same scores. Pass ``None`` to
        get unseeded estimators.

    Returns
    -------
    models : list
        Unfitted estimators, in the same order as ``names``.
    names : list of str
        Short labels: 'Logistic', 'CART', 'KNN', 'BAG', 'RF', 'ET'.
    """
    candidates = [
        # Logistic regression fitted by SGD. SGD is sensitive to feature scale,
        # so the model is wrapped in a scaling pipeline (same step names as the
        # KNN pipeline below).
        ('Logistic', Pipeline(steps=[
            ('s', StandardScaler()),
            ('m', SGDClassifier(loss='log_loss', penalty='l2', alpha=0.0001,
                                max_iter=1000, tol=0.001,
                                random_state=random_state)),
        ])),
        # CART
        ('CART', DecisionTreeClassifier(random_state=random_state)),
        # KNN is distance based, so features are standardised first.
        ('KNN', Pipeline(steps=[
            ('s', StandardScaler()),
            ('m', KNeighborsClassifier()),
        ])),
        # Bagging
        ('BAG', BaggingClassifier(n_estimators=100, random_state=random_state)),
        # RF
        ('RF', RandomForestClassifier(n_estimators=100, random_state=random_state)),
        # ET
        ('ET', ExtraTreesClassifier(n_estimators=100, random_state=random_state)),
    ]
    names = [name for name, _ in candidates]
    models = [model for _, model in candidates]
    return models, names

In [None]:
# Instantiate models
models, names = get_models()
trained_models, results = list(), list()
# Evaluate each model. Cross-validation uses the training partition only, so
# that X_test / y_test stay unseen until the final evaluation; cross-validating
# on the full X, y would let the held-out rows influence model comparison.
for name, model in zip(names, models):
    # Evaluate models
    scores, trained_model = evaluate_model(X_train, y_train, model)
    results.append(scores)
    trained_models.append((name, trained_model))
    # Report mean and standard deviation of the PR-AUC across folds
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))



>Logistic 0.170 (0.104)




>CART 0.252 (0.005)




In [None]:
# Evaluation
# Confusion Matrix
# Need to ensure a high recall so that fraudulent transactions do not go undetected.
# Potentially also report the area under the curve.