In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the dataset
data = pd.read_csv(r"D:\fraudTest.csv")

# Drop unnecessary columns
data = data.drop(columns=['Unnamed: 0', 'trans_date_trans_time', 'first', 'last', 'street', 'lat', 'long', 'trans_num', 'unix_time'])

# Separate features and target
X = data.drop(columns=['is_fraud'])
y = data['is_fraud']

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Create preprocessors for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Preprocess data
X_processed = preprocessor.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Initialize the models
log_reg = LogisticRegression()
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()

# Train the models
log_reg.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Predict and evaluate Logistic Regression
y_pred_log_reg = log_reg.predict(X_test)
print("Logistic Regression")
print(classification_report(y_test, y_pred_log_reg))
print("Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("ROC AUC:", roc_auc_score(y_test, y_pred_log_reg))

# Predict and evaluate Decision Tree
y_pred_decision_tree = decision_tree.predict(X_test)
print("\nDecision Tree")
print(classification_report(y_test, y_pred_decision_tree))
print("Accuracy:", accuracy_score(y_test, y_pred_decision_tree))
print("ROC AUC:", roc_auc_score(y_test, y_pred_decision_tree))

# Predict and evaluate Random Forest
y_pred_random_forest = random_forest.predict(X_test)
print("\nRandom Forest")
print(classification_report(y_test, y_pred_random_forest))
print("Accuracy:", accuracy_score(y_test, y_pred_random_forest))
print("ROC AUC:", roc_auc_score(y_test, y_pred_random_forest))


Logistic Regression
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    110718
           1       0.45      0.06      0.11       426

    accuracy                           1.00    111144
   macro avg       0.72      0.53      0.55    111144
weighted avg       0.99      1.00      0.99    111144

Accuracy: 0.9961131505074498
ROC AUC: 0.53154111358663

Decision Tree
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    110718
           1       0.68      0.62      0.65       426

    accuracy                           1.00    111144
   macro avg       0.84      0.81      0.82    111144
weighted avg       1.00      1.00      1.00    111144

Accuracy: 0.9974087670049665
ROC AUC: 0.8116285276463056

Random Forest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    110718
           1       0.95      0.45      0.61       426

    accuracy        