##                                   Task 2 - Model Building and Training

## 1. Data Preparation

In [1]:
import pandas as pd

# Load datasets
fraud_data = pd.read_csv(r'C:\Users\MMM\Documents\10 Academy File\KAIM-Week-8-9\data\Fraud_Data.csv')

# Separate features and target for Fraud_Data
X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']

# Train-Test Split

In [2]:
from sklearn.model_selection import train_test_split

# Train-Test Split for Fraud_Data
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, test_size=0.3, random_state=42)

# Train-Test Split for Credit Card Data
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(
    X_credit, y_credit, test_size=0.3, random_state=42)


# 2. Model Selection 

# Models to Use
* Logistic Regression
* Decision Tree
* Random Forest
* Gradient Boosting
* Multi-Layer Perceptron (MLP)
* Convolutional Neural Network (CNN)
* Recurrent Neural Network (RNN)
* Long Short-Term Memory (LSTM)

We'll begin by building traditional machine learning models (Logistic Regression, Decision Tree, etc.) and then move to deep learning models (MLP, CNN, RNN, LSTM).

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

# Dictionary for machine learning models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'MLP': MLPClassifier()
}


# For deep learning models (CNN, RNN, LSTM), we need TensorFlow/Keras.

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam

# Example of CNN model
def create_cnn_model(input_shape):
    model = Sequential([
        Conv1D(32, kernel_size=3, activation='relu', input_shape=input_shape),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Example of LSTM model
def create_lstm_model(input_shape):
    model = Sequential([
        LSTM(64, input_shape=input_shape),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model


# 3. Model Training and Evaluation

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

def preprocess_data(X):
    # Encode categorical features using Label Encoding
    label_encoders = {}
    for column in X.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        X[column] = le.fit_transform(X[column].astype(str))
        label_encoders[column] = le
    
    # Impute missing values with the most frequent value
    imputer = SimpleImputer(strategy='most_frequent')
    X = imputer.fit_transform(X)
    
    return X, label_encoders

def train_and_evaluate(models, X_train, X_test, y_train, y_test):
    for name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        print(f'Model: {name}, Accuracy: {accuracy:.4f}')
        print(classification_report(y_test, predictions))

# Data Preparation for Fraud_Data
# Drop target and unnecessary columns if they exist
columns_to_drop_fraud = ['class', 'signup_time', 'purchase_time']
columns_to_drop_fraud = [col for col in columns_to_drop_fraud if col in fraud_data.columns]

X_fraud = fraud_data.drop(columns=columns_to_drop_fraud, errors='ignore') 
y_fraud = fraud_data['class'] if 'class' in fraud_data.columns else None

# Preprocess the features
X_fraud, fraud_label_encoders = preprocess_data(X_fraud)

# Train-test split for Fraud_Data
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.3, random_state=42)

# Data Preparation for Credit Card Data
# Drop target and unnecessary columns if they exist
columns_to_drop_credit = ['Class', 'Time']
columns_to_drop_credit = [col for col in columns_to_drop_credit if col in credit_data.columns]

X_credit = credit_data.drop(columns=columns_to_drop_credit, errors='ignore') 
y_credit = credit_data['Class'] if 'Class' in credit_data.columns else None

# Preprocess the features
X_credit, credit_label_encoders = preprocess_data(X_credit)

# Train-test split for Credit Card Data
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(X_credit, y_credit, test_size=0.3, random_state=42)

# Define models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Multi-Layer Perceptron (MLP)': MLPClassifier(max_iter=1000)
}

# Training and evaluating on Fraud_Data
if y_fraud is not None:
    print("Results on Fraud_Data:")
    train_and_evaluate(models, X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud)
else:
    print("Target column 'class' not found in Fraud_Data.")

# Training and evaluating on Credit Card Data
if y_credit is not None:
    print("Results on Credit Card Data:")
    train_and_evaluate(models, X_train_credit, X_test_credit, y_train_credit, y_test_credit)
else:
    print("Target column 'Class' not found in Credit Card Data.")


Results on Fraud_Data:
Model: Logistic Regression, Accuracy: 0.9070
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     41117
           1       0.00      0.00      0.00      4217

    accuracy                           0.91     45334
   macro avg       0.45      0.50      0.48     45334
weighted avg       0.82      0.91      0.86     45334



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: Decision Tree, Accuracy: 0.9040
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     41117
           1       0.49      0.57      0.53      4217

    accuracy                           0.90     45334
   macro avg       0.72      0.76      0.74     45334
weighted avg       0.91      0.90      0.91     45334

Model: Random Forest, Accuracy: 0.9564
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     41117
           1       0.98      0.54      0.70      4217

    accuracy                           0.96     45334
   macro avg       0.97      0.77      0.84     45334
weighted avg       0.96      0.96      0.95     45334

Model: Gradient Boosting, Accuracy: 0.9551
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     41117
           1       0.95      0.54      0.69      4217

    accuracy                           0.96     45334
   macr

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: Logistic Regression, Accuracy: 0.9993
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.87      0.64      0.74       136

    accuracy                           1.00     85443
   macro avg       0.93      0.82      0.87     85443
weighted avg       1.00      1.00      1.00     85443

Model: Decision Tree, Accuracy: 0.9992
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.73      0.80      0.76       136

    accuracy                           1.00     85443
   macro avg       0.86      0.90      0.88     85443
weighted avg       1.00      1.00      1.00     85443



KeyboardInterrupt: 