In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [42]:
train_data = pd.read_csv('/content/fraudTrain.csv')
test_data = pd.read_csv('/content/fraudTest.csv')

In [43]:
# Step 3: Combining datasets to ensure consistent preprocessing
combined_data = pd.concat([train_data, test_data], keys=['train', 'test'])

# Step 4: Encoding categorical variables
label_encoders = {}
for column in ['merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'job']:
    le = LabelEncoder()
    combined_data[column] = le.fit_transform(combined_data[column].astype(str))
    label_encoders[column] = le


In [44]:
# Step 5: Dropping unnecessary columns
combined_data = combined_data.drop(['trans_date_trans_time', 'dob', 'trans_num'], axis=1)

# Step 6: Separating the combined data back into train and test sets
train_data = combined_data.xs('train')
test_data = combined_data.xs('test')

# Step 7: Ensuring there are no missing values in y_train
y_train = train_data['is_fraud']
y_test = test_data['is_fraud']

In [45]:
# Checking for missing values in y_train and y_test
if y_train.isnull().sum() > 0:
    print("Warning: Missing values found in y_train. Imputing them.")
    y_train.fillna(y_train.mode()[0], inplace=True)

if y_test.isnull().sum() > 0:
    print("Warning: Missing values found in y_test. Imputing them.")
    y_test.fillna(y_test.mode()[0], inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train.fillna(y_train.mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test.fillna(y_test.mode()[0], inplace=True)


In [46]:
# Step 8: Spliting train data into X and y
X_train = train_data.drop(['is_fraud'], axis=1)
X_test = test_data.drop(['is_fraud'], axis=1)

# Step 9: Handling missing values in X_train and X_test
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)


In [47]:
# Step 10: Normalizing the feature columns
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 11: Initializing models
log_reg = LogisticRegression(max_iter=1000)
dec_tree = DecisionTreeClassifier()
rand_forest = RandomForestClassifier()

In [48]:
# Step 12: Training models
log_reg.fit(X_train, y_train)
dec_tree.fit(X_train, y_train)
rand_forest.fit(X_train, y_train)

In [49]:
# Step 13: Predicting and evaluate models
models = {'Logistic Regression': log_reg, 'Decision Tree': dec_tree, 'Random Forest': rand_forest}

for name, model in models.items():
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    print("AUC-ROC:", roc_auc_score(y_test, y_prob))

Model: Logistic Regression
              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00     11668
         1.0       0.00      1.00      0.00        29

    accuracy                           0.00     11697
   macro avg       0.50      0.50      0.00     11697
weighted avg       1.00      0.00      0.00     11697

AUC-ROC: 0.5413952691121015
Model: Decision Tree
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     11668
         1.0       0.46      0.21      0.29        29

    accuracy                           1.00     11697
   macro avg       0.73      0.60      0.64     11697
weighted avg       1.00      1.00      1.00     11697

AUC-ROC: 0.6031483101438654
Model: Random Forest
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     11668
         1.0       0.00      0.00      0.00        29

    accuracy                           1.00     11697
   