In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('../../src/data/e-commerce_processed_data.csv')

# Feature and Target Separation
X = data.drop(columns=['class'])
y = data['class']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Define a function to train and log models with MLflow
def train_and_log_model(model, model_name, X_train, y_train, X_test, y_test):
    with mlflow.start_run():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        # Log parameters, metrics, and model
        mlflow.log_param("model_name", model_name)
        mlflow.log_metrics({
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "f1_score": f1_score(y_test, y_pred)
        })
        mlflow.sklearn.log_model(model, model_name)

# Train and log models with MLflow
train_and_log_model(LogisticRegression(max_iter=1000, random_state=42), "Logistic Regression", X_train, y_train, X_test, y_test)
train_and_log_model(DecisionTreeClassifier(random_state=42), "Decision Tree", X_train, y_train, X_test, y_test)
train_and_log_model(RandomForestClassifier(random_state=42), "Random Forest", X_train, y_train, X_test, y_test)
train_and_log_model(GradientBoostingClassifier(random_state=42), "Gradient Boosting", X_train, y_train, X_test, y_test)
train_and_log_model(MLPClassifier(random_state=42), "Multi-Layer Perceptron (MLP)", X_train, y_train, X_test, y_test)

# Initialize and train the models
lr_model = LogisticRegression(max_iter=1000, random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)
mlp_model = MLPClassifier(random_state=42)

# Train the models
lr_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
mlp_model.fit(X_train, y_train)

# Predict and evaluate the models
models = {
    "Logistic Regression": lr_model,
    "Decision Tree": dt_model,
    "Random Forest": rf_model,
    "Gradient Boosting": gb_model,
    "Multi-Layer Perceptron": mlp_model
}

evaluation_results = []

for model_name, model in models.items():
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    evaluation_results.append({
        "Model": model_name,
        "Precision": report["weighted avg"]["precision"],
        "Recall": report["weighted avg"]["recall"],
        "F1-Score": report["weighted avg"]["f1-score"]
    })

# Create a DataFrame for evaluation results
evaluation_df = pd.DataFrame(evaluation_results)

# Display the evaluation results in tabular form
print(evaluation_df)

X_train shape: (120036, 11)
X_test shape: (30009, 11)
y_train shape: (120036,)
y_test shape: (30009,)




                    Model  Precision    Recall  F1-Score
0     Logistic Regression   0.958261  0.956246  0.950068
1           Decision Tree   0.911056  0.904495  0.907479
2           Random Forest   0.958261  0.956246  0.950068
3       Gradient Boosting   0.958261  0.956246  0.950068
4  Multi-Layer Perceptron   0.958261  0.956246  0.950068


In [5]:
# Display the tables
print("\nEvaluation Results:")
print(evaluation_df.to_string(index=False))


Evaluation Results:
                 Model  Precision   Recall  F1-Score
   Logistic Regression   0.958261 0.956246  0.950068
         Decision Tree   0.911056 0.904495  0.907479
         Random Forest   0.958261 0.956246  0.950068
     Gradient Boosting   0.958261 0.956246  0.950068
Multi-Layer Perceptron   0.958261 0.956246  0.950068


Import Necessary Libraries for Neural Networks

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, LSTM, SimpleRNN
from tensorflow.keras.utils import to_categorical

Data Preparation for Neural Networks

In [6]:
# Normalize the features
X_train = X_train / X_train.max()
X_test = X_test / X_test.max()

# One-hot encode the target
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

Convolutional Neural Network (CNN)

In [None]:
# Build the CNN model
cnn_model = Sequential([
    Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax')
])

# Compile the model
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
cnn_model.fit(X_train.values.reshape(-1, X_train.shape[1], 1), y_train, epochs=10, batch_size=32, validation_data=(X_test.values.reshape(-1, X_test.shape[1], 1), y_test))

# Evaluate the model
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test.values.reshape(-1, X_test.shape[1], 1), y_test)
print(f"CNN Accuracy: {cnn_accuracy}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4ms/step - accuracy: 0.9065 - loss: 0.4333 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 2/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 4ms/step - accuracy: 0.9058 - loss: 0.3122 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 3/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.9067 - loss: 0.3100 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 4/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.9059 - loss: 0.3120 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 5/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.9056 - loss: 0.3126 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 6/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.9061 - loss: 0.3115 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 7/10

Recurrent Neural Network (RNN)

In [8]:
# Build the RNN model
rnn_model = Sequential([
    SimpleRNN(64, input_shape=(X_train.shape[1], 1), activation='relu'),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax')
])

# Compile the model
rnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
rnn_model.fit(X_train.values.reshape(-1, X_train.shape[1], 1), y_train, epochs=10, batch_size=32, validation_data=(X_test.values.reshape(-1, X_test.shape[1], 1), y_test))

# Evaluate the model
rnn_loss, rnn_accuracy = rnn_model.evaluate(X_test.values.reshape(-1, X_test.shape[1], 1), y_test)
print(f"RNN Accuracy: {rnn_accuracy}")

  super().__init__(**kwargs)


Epoch 1/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 4ms/step - accuracy: 0.9072 - loss: 0.4331 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 2/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4ms/step - accuracy: 0.9070 - loss: 0.3094 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 3/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.9067 - loss: 0.3101 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 4/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.9079 - loss: 0.3075 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 5/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 4ms/step - accuracy: 0.9063 - loss: 0.3110 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 6/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.9054 - loss: 0.3131 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 7/10

Long Short-Term Memory (LSTM)

In [9]:
# Build the LSTM model
lstm_model = Sequential([
    LSTM(64, input_shape=(X_train.shape[1], 1), activation='relu'),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax')
])

# Compile the model
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
lstm_model.fit(X_train.values.reshape(-1, X_train.shape[1], 1), y_train, epochs=10, batch_size=32, validation_data=(X_test.values.reshape(-1, X_test.shape[1], 1), y_test))

# Evaluate the model
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test.values.reshape(-1, X_test.shape[1], 1), y_test)
print(f"LSTM Accuracy: {lstm_accuracy}")

Epoch 1/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 7ms/step - accuracy: 0.9059 - loss: 0.4341 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 2/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 7ms/step - accuracy: 0.9060 - loss: 0.3117 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 3/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 7ms/step - accuracy: 0.9065 - loss: 0.3105 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 4/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 7ms/step - accuracy: 0.9052 - loss: 0.3136 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 5/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 7ms/step - accuracy: 0.9058 - loss: 0.3121 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 6/10
[1m3752/3752[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 7ms/step - accuracy: 0.9076 - loss: 0.3081 - val_accuracy: 0.9066 - val_loss: 0.3103
Epoch 7/10

Collect and Display Evaluation Results in Tabular Form

In [10]:
# Collect evaluation results for neural networks
nn_evaluation_results = [
    {"Model": "CNN", "Accuracy": cnn_accuracy},
    {"Model": "RNN", "Accuracy": rnn_accuracy},
    {"Model": "LSTM", "Accuracy": lstm_accuracy}
]

# Create a DataFrame for evaluation results
nn_evaluation_df = pd.DataFrame(nn_evaluation_results)

# Combine evaluation results from all models
all_evaluation_df = pd.concat([evaluation_df, nn_evaluation_df], ignore_index=True)

# Display the combined evaluation results in tabular form
print("\nEvaluation Results for All Models:")
print(all_evaluation_df.to_string(index=False))


Evaluation Results for All Models:
                 Model  Precision   Recall  F1-Score  Accuracy
   Logistic Regression   0.958261 0.956246  0.950068       NaN
         Decision Tree   0.911056 0.904495  0.907479       NaN
         Random Forest   0.958261 0.956246  0.950068       NaN
     Gradient Boosting   0.958261 0.956246  0.950068       NaN
Multi-Layer Perceptron   0.958261 0.956246  0.950068       NaN
                   CNN        NaN      NaN       NaN  0.906628
                   RNN        NaN      NaN       NaN  0.906628
                  LSTM        NaN      NaN       NaN  0.906628
