In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load datasets
train_iden = pd.read_csv('/content/train_identity.csv.zip')
train_tran = pd.read_csv('/content/train_transaction.csv.zip')
test_iden = pd.read_csv('/content/test_identity.csv.zip')
test_tran = pd.read_csv('/content/test_transaction.csv.zip')

# Merge datasets on TransactionID
train_df = train_tran.merge(train_iden, how='left', on='TransactionID')
test_df = test_tran.merge(test_iden, how='left', on='TransactionID')

# Sample a subset for faster testing
train_df = train_df.sample(n=5000, random_state=42)
test_df = test_df.sample(n=5000, random_state=42)

# Split target and features
X = train_df.drop(columns=['isFraud', 'TransactionID'])
y = train_df['isFraud']

# Separate categorical and numerical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Data Preprocessing for Numerical and Categorical Data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Handle missing values
    ('scaler', StandardScaler())  # Feature scaling using Standardization
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Handle categorical data
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define model
model = RandomForestClassifier(random_state=42)

# Create a pipeline that bundles preprocessing and the model
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])

# Split data into train and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Additional classification performance metrics
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Detailed classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)


Accuracy: 0.9700
Confusion Matrix:
[[966   2]
 [ 28   4]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       968
           1       0.67      0.12      0.21        32

    accuracy                           0.97      1000
   macro avg       0.82      0.56      0.60      1000
weighted avg       0.96      0.97      0.96      1000

