In [1]:
!pip install pandas numpy scikit-learn matplotlib seaborn joblib

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib

# Read datasets
df1 = pd.read_csv('creditcard_2023.csv')
df2 = pd.read_csv('fraudTrain.csv')

df1.head(), df2.head()

(   id        V1        V2        V3        V4        V5        V6        V7  \
 0   0 -0.260648 -0.469648  2.496266 -0.083724  0.129681  0.732898  0.519014   
 1   1  0.985100 -0.356045  0.558056 -0.429654  0.277140  0.428605  0.406466   
 2   2 -0.260272 -0.949385  1.728538 -0.457986  0.074062  1.419481  0.743511   
 3   3 -0.152152 -0.508959  1.746840 -1.090178  0.249486  1.143312  0.518269   
 4   4 -0.206820 -0.165280  1.527053 -0.448293  0.106125  0.530549  0.658849   
 
          V8        V9  ...       V21       V22       V23       V24       V25  \
 0 -0.130006  0.727159  ... -0.110552  0.217606 -0.134794  0.165959  0.126280   
 1 -0.133118  0.347452  ... -0.194936 -0.605761  0.079469 -0.577395  0.190090   
 2 -0.095576 -0.261297  ... -0.005020  0.702906  0.945045 -1.154666 -0.605564   
 3 -0.065130 -0.205698  ... -0.146927 -0.038212 -0.214048 -1.893131  1.003963   
 4 -0.212660  1.049921  ... -0.106984  0.729727 -0.161666  0.312561 -0.414116   
 
         V26       V27       V

In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def preprocess_for_logreg(df):
    df = df.copy()
    
    # Convert dates to datetime
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df['dob'] = pd.to_datetime(df['dob'])

    # Create age feature
    df['age'] = df['trans_date_trans_time'].dt.year - df['dob'].dt.year

    # Extract time-based features
    df['hour'] = df['trans_date_trans_time'].dt.hour
    df['day'] = df['trans_date_trans_time'].dt.day
    df['month'] = df['trans_date_trans_time'].dt.month
    df['weekday'] = df['trans_date_trans_time'].dt.weekday

    # Drop high-cardinality or ID-like columns
    df.drop(columns=['trans_date_trans_time', 'dob', 'trans_num', 'cc_num', 'unix_time',
                     'first', 'last', 'street'], inplace=True)

    # One-hot encode categorical variables
    categorical_cols = ['merchant', 'category', 'gender', 'city', 'state', 'job']
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    # Scale numeric features
    scaler = StandardScaler()
    numeric_cols = ['amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long', 'age', 'hour', 'day', 'month', 'weekday']
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df


In [10]:
def preprocess(df):
    df = df.dropna()
    if 'Time' in df.columns:
        df = df.drop(columns=['Time'])
    if 'Amount' in df.columns:
        df['Amount'] = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1))
    return df

df1 = preprocess(df1)
df2 = preprocess(df2)
df2 = preprocess_for_logreg(df2)

In [13]:
def train_models(df, test_size, label_column='Class', testing_file=''):
    X = df.drop(columns=[label_column])
    y = df[label_column]

    if test_size == 0:
        X_train = X
        y_train = y
        df_test = pd.read_csv(testing_file)
        df_test = preprocess(df_test)
        df_test = preprocess_for_logreg(df_test)
        X_test = df_test.drop(columns=[label_column])
        y_test = df_test[label_column]
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    models = {
        "Random Forest": RandomForestClassifier(random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=1000)
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results[name] = {
            "model": model,
            "classification_report": classification_report(y_test, y_pred, output_dict=True),
            "roc_auc": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
        }

    return results

#results2 = train_models(df2, 0.0, testing_file='fraudTest.csv', label_column='is_fraud')
results1 = train_models(df1, 0.3)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
def compare_results(results, dataset_name):
    print(f"Results for {dataset_name}:")
    for model_name, metrics in results.items():
        print(f"Model: {model_name}")
        print(f"ROC-AUC: {metrics['roc_auc']:.4f}")
        print("Classification Report:")
        print(pd.DataFrame(metrics['classification_report']).T)
        print("-" * 50)

compare_results(results1, "Dataset 1")
#compare_results(results2, "Dataset 2")

Results for Dataset 1:
Model: Random Forest
ROC-AUC: 1.0000
Classification Report:
              precision    recall  f1-score        support
0              0.999719  0.999953  0.999836   85295.000000
1              0.999953  0.999719  0.999836   85294.000000
accuracy       0.999836  0.999836  0.999836       0.999836
macro avg      0.999836  0.999836  0.999836  170589.000000
weighted avg   0.999836  0.999836  0.999836  170589.000000
--------------------------------------------------
Model: Logistic Regression
ROC-AUC: 0.9998
Classification Report:
              precision    recall  f1-score        support
0              0.996967  0.998265  0.997616   85295.000000
1              0.998263  0.996963  0.997613   85294.000000
accuracy       0.997614  0.997614  0.997614       0.997614
macro avg      0.997615  0.997614  0.997614  170589.000000
weighted avg   0.997615  0.997614  0.997614  170589.000000
--------------------------------------------------


In [15]:
# Save best models
joblib.dump(results1['Random Forest']['model'], 'model_dataset_rf_1.pkl')
joblib.dump(results1['Logistic Regression']['model'], 'model_dataset_lg_1.pkl')

#joblib.dump(results2['Random Forest']['model'], 'model_dataset_rf_2.pkl')
#joblib.dump(results2['Logistic Regression']['model'], 'model_dataset_lg_2.pkl')

['model_dataset_lg_1.pkl']