In [5]:
pip install pyod

Collecting pyod
  Downloading pyod-2.0.3.tar.gz (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.6/169.6 kB[0m [31m534.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py) ... [?25ldone
[?25h  Created wheel for pyod: filename=pyod-2.0.3-py3-none-any.whl size=200476 sha256=14a74bbf9edcc5c944d310571c7374bb4a10e49a6fb22bfd6bd92c7f3a4a9c37
  Stored in directory: /Users/christopher/Library/Caches/pip/wheels/16/7c/ed/ebc8a4a1a530f7e955ed494a8c2085079f0ef34e3bc0735c97
Successfully built pyod
Installing collected packages: pyod
Successfully installed pyod-2.0.3
Note: you may need to restart the kernel to use updated packages.


In [7]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from pyod.models.iforest import IForest

In [9]:
df = pd.read_csv('data/fe_data_with_datateam_thresholds.csv')

In [11]:
X = df.drop(columns=['temp_fraud'], axis=1)
X.set_index('transaction_reference', inplace=True)
y = df['temp_fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### logisitic regression

In [14]:
sample_weights = X_train['confidence_score']

In [20]:
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train) # add "sample_weight=sample_weights" after y_train

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
y_pred = logmodel.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.97      0.89     52245
           1       0.90      0.60      0.72     27124

    accuracy                           0.84     79369
   macro avg       0.86      0.78      0.80     79369
weighted avg       0.85      0.84      0.83     79369



In [24]:
pd.DataFrame(confusion_matrix(y_test, y_pred), 
             index = ['Actual: False', 'Actual: True'],
             columns = ['Pred: False', 'Pred: True'])

Unnamed: 0,Pred: False,Pred: True
Actual: False,50455,1790
Actual: True,10898,16226


### random forest

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier(random_state=42) # assign class_weights as 1:2 ratio
rf_model.fit(X_train, y_train)

In [40]:
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     52245
           1       1.00      1.00      1.00     27124

    accuracy                           1.00     79369
   macro avg       1.00      1.00      1.00     79369
weighted avg       1.00      1.00      1.00     79369



In [42]:
pd.DataFrame(confusion_matrix(y_test, y_pred), 
             index = ['Actual: False', 'Actual: True'],
             columns = ['Pred: False', 'Pred: True'])

Unnamed: 0,Pred: False,Pred: True
Actual: False,52245,0
Actual: True,0,27124


### XGBoost

In [44]:
xgbmodel = xgb.XGBClassifier(
    objective='binary:logistic',  
    n_estimators=100,             # Number of boosting rounds
    learning_rate=0.1,            # Step size shrinkage
    max_depth=3,                  
    random_state=42,
    use_label_encoder=False,      
    eval_metric='logloss'          # Evaluation metric
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
xgbmodel.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [46]:
y_pred = xgbmodel.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     52245
           1       1.00      1.00      1.00     27124

    accuracy                           1.00     79369
   macro avg       1.00      1.00      1.00     79369
weighted avg       1.00      1.00      1.00     79369



### iForest

In [57]:
from sklearn.ensemble import IsolationForest

In [59]:
iso_forest = IsolationForest(n_estimators=100, contamination=0.2)
iso_forest.fit(X_train)

In [61]:
# predicting anomalies (1 for inliers, -1 for outliers)
predictions = iso_forest.predict(X_train)

# calculates anomaly scores (lower scores indicate more anomalous)
scores = iso_forest.decision_function(X_train)

In [63]:
print("Predictions:", predictions)
print("Anomaly scores:", scores)

Predictions: [ 1  1  1 ... -1  1  1]
Anomaly scores: [ 0.11556207  0.07398128  0.08297943 ... -0.00430782  0.04880406
  0.05822262]


In [67]:
# Manually label some data as outliers for verification (replace with your ground truth)
manual_outlier_indices = [6,7,8] #indices of outliers in data array.
manual_outlier_labels = np.zeros(len(X_train))
manual_outlier_labels[manual_outlier_indices] = 1 #1 is outlier, 0 is inlier.

num_outliers = np.sum(predictions == -1)
print(f"Number of outliers identified: {num_outliers}")

Number of outliers identified: 63495


In [None]:
# Convert Isolation Forest predictions to binary labels (1 for outliers, 0 for inliers)
binary_predictions = np.where(predictions == -1, 1, 0)

# Verify with actual data (if you have ground truth labels)
if np.any(manual_outlier_labels): #only run if manual labels exist.
    cm = confusion_matrix(manual_outlier_labels, binary_predictions)
    TN, FP, FN, TP = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)
    print(f"True Positives (TP): {TP}")
    print(f"True Negatives (TN): {TN}")
    print(f"False Positives (FP): {FP}")
    print(f"False Negatives (FN): {FN}")

    #Calculate additional metrics:
    accuracy = (TP+TN) / (TP+TN+FP+FN)
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1}")

else:
    print("\nManual outlier labels not provided. Cannot compute confusion matrix or metrics.")


In [69]:
iso_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42) #Contamination matches the expected outlier ratio.
iso_forest.fit(X_train)

# Predict anomalies on X_test
predictions = iso_forest.predict(X_test)

# Count the number of outliers identified in X_test
num_outliers = np.sum(predictions == -1)
print(f"Number of outliers identified in X_test: {num_outliers}")

Number of outliers identified in X_test: 4061


In [71]:
# Convert Isolation Forest predictions to binary labels (1 for outliers, 0 for inliers)
binary_predictions = np.where(predictions == -1, 1, 0)

# Verify with actual y_test data
cm = confusion_matrix(y_test, binary_predictions)
TN, FP, FN, TP = cm.ravel()

print("\nConfusion Matrix:")
print(cm)
print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")


Confusion Matrix:
[[52177    68]
 [23131  3993]]
True Positives (TP): 3993
True Negatives (TN): 52177
False Positives (FP): 68
False Negatives (FN): 23131


In [73]:
# classification report:
accuracy = (TP+TN) / (TP+TN+FP+FN)
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Accuracy: 0.70770703927226
Precision: 0.9832553558236887
Recall: 0.1472128004719068
F1-score: 0.2560846560846561
