In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
# --- Configuration -----------------------------------------------------------
DATA_PATH_FEATURES = '../data/elliptic_txs_features.csv'
DATA_PATH_CLASSES = '../data/elliptic_txs_classes.csv'
# Raw label strings in the classes file -> binary target (1 = fraud, 0 = licit).
# Any other value (e.g. 'unknown') is treated as unlabelled and dropped below.
LABEL_MAP = {'1': 1, '2': 0}
TEST_SIZE = 0.3
RANDOM_STATE = 42
MAX_ITER = 1000

# --- Load --------------------------------------------------------------------
df_classes = pd.read_csv(DATA_PATH_CLASSES)
# The features file is read without a header row, so columns are positional
# integers; only the first two are given names here.
df_features = pd.read_csv(DATA_PATH_FEATURES, header=None).rename(
    columns={0: 'txId', 1: 'time_step'}
)
df_merged = pd.merge(df_features, df_classes, on='txId', how='left')

# --- Clean labels ------------------------------------------------------------
# Map first, then keep only rows that actually received a label. Filtering on
# `!= 'unknown'` alone would let through NaN classes (a txId absent from the
# classes file after the left merge) and any unexpected label value; those
# become NaN in the target and only fail later, inside model.fit.
mapped_labels = df_merged['class'].map(LABEL_MAP)
is_labelled = mapped_labels.notna()
df_clean = df_merged[is_labelled].copy()
df_clean['class'] = mapped_labels[is_labelled].astype('int64')
if df_clean.empty:
    raise ValueError("No labelled transactions found after merging features with classes.")
print(f"Data Loaded! Total valid transactions: {len(df_clean)}")

# --- Features / target -------------------------------------------------------
# Identifier, target and time step are excluded from the model inputs.
X = df_clean.drop(columns=['txId', 'class', 'time_step'])
y = df_clean['class']

# --- Split -------------------------------------------------------------------
# NOTE(review): this is a plain random split. It is not stratified on `y` and it
# ignores `time_step`, so train and test rows can come from the same time steps.
# Consider `stratify=y` or a time_step-based split; either would change the
# reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# --- Train -------------------------------------------------------------------
print(f"Training Logistic Regression model on {len(X_train)} transactions...")
model = LogisticRegression(max_iter=MAX_ITER)
model.fit(X_train, y_train)

# --- Evaluate ----------------------------------------------------------------
print("\nModel Evaluation")
y_pred = model.predict(X_test)
# `labels` is passed explicitly so each entry of `target_names` is tied to its
# class id rather than to the implicit sorted order of the labels present.
print(
    classification_report(
        y_test,
        y_pred,
        labels=[0, 1],
        target_names=['Licit (0)', 'Fraud (1)'],
    )
)

Data Loaded! Total valid transactions: 46564
Training Logistic Regression model on 32594 transactions...

Model Evaluation
              precision    recall  f1-score   support

   Licit (0)       0.97      0.99      0.98     12587
   Fraud (1)       0.86      0.76      0.80      1383

    accuracy                           0.96     13970
   macro avg       0.91      0.87      0.89     13970
weighted avg       0.96      0.96      0.96     13970

