In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from tensorflow.keras import layers, models
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load dataset function
def load_data(file_path):
    if os.path.exists(file_path):
        return pd.read_csv(file_path)
    else:
        raise FileNotFoundError(f"{file_path} not found!")

# Preprocess data function
def preprocess_historic_data(df):
    label_encoder = LabelEncoder()
    df['success_indicator'] = label_encoder.fit_transform(df['success_indicator'])
    df = pd.get_dummies(df, columns=['category', 'main_promotion', 'color'], 
                        prefix=['category', 'promotion', 'color'], drop_first=True)
    return df, label_encoder

# Split data function
def split_data(df):
    X = df.drop(columns=['item_no', 'success_indicator'])
    y = df['success_indicator']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    return X_train, X_test, y_train, y_test

# Logistic Regression model function
def train_logistic_regression(X_train, y_train):
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    return model

# XGBoost model function
def train_xgboost(X_train, y_train):
    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    model.fit(X_train, y_train)
    return model

# ANN model function
def create_ann_model(input_shape):
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(input_shape,)))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def train_ann_model(X_train, y_train):
    model = create_ann_model(X_train.shape[1])
    model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=0)
    return model

# Evaluation function
def evaluate_model(model, X_test, y_test, model_type="sklearn"):
    if model_type == "sklearn":
        y_pred = model.predict(X_test)
    else:
        y_pred = (model.predict(X_test) > 0.5).astype("int32")
    
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))

    # Perform cross-validation
    if model_type == "sklearn":
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_val_score(model, X_test, y_test, cv=kf, scoring='accuracy')
        print("Cross-Validation Accuracy Scores:", cv_scores)
        print("Mean CV Accuracy:", cv_scores.mean())
    else:
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = []
        for train_index, val_index in kf.split(X_test):
            X_val_train, X_val_test = X_test.iloc[train_index], X_test.iloc[val_index]
            y_val_train, y_val_test = y_test.iloc[train_index], y_test.iloc[val_index]
            val_loss, val_accuracy = model.evaluate(X_val_test, y_val_test, verbose=0)
            cv_scores.append(val_accuracy)
        print("Cross-Validation Accuracy Scores:", cv_scores)
        print("Mean CV Accuracy:", np.mean(cv_scores))

# Main function for model selection
if __name__ == "__main__":
    # Load and preprocess data
    file_path = "DSW_ML_Test/historic.csv"
    df = load_data(file_path)
    df_processed, label_encoder = preprocess_historic_data(df)
    
    # Split data
    X_train, X_test, y_train, y_test = split_data(df_processed)
    
    # Train Logistic Regression model
    print("\n--- Logistic Regression ---")
    logreg_model = train_logistic_regression(X_train, y_train)
    evaluate_model(logreg_model, X_test, y_test, model_type="sklearn")
    
    # Train XGBoost model
    print("\n--- XGBoost ---")
    xgb_model = train_xgboost(X_train, y_train)
    evaluate_model(xgb_model, X_test, y_test, model_type="sklearn")
    
    # Train ANN model
    print("\n--- Artificial Neural Network ---")
    ann_model = train_ann_model(X_train, y_train)
    evaluate_model(ann_model, X_test, y_test, model_type="ann")



--- Logistic Regression ---
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.67      0.73       563
           1       0.84      0.91      0.87      1037

    accuracy                           0.83      1600
   macro avg       0.82      0.79      0.80      1600
weighted avg       0.82      0.83      0.82      1600

Accuracy Score: 0.825625
Cross-Validation Accuracy Scores: [0.815625 0.803125 0.803125 0.840625 0.809375]
Mean CV Accuracy: 0.8143750000000001

--- XGBoost ---
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.73      0.76       563
           1       0.86      0.90      0.88      1037

    accuracy                           0.84      1600
   macro avg       0.83      0.81      0.82      1600
weighted avg       0.84      0.84      0.84      1600

Accuracy Score: 0.8375
Cross-Validation Accuracy Scores: [0.80625  0.784375 0.76875  0.81875  0.803125]
Mean 

## Model Selection Summary

After evaluating three different models — Logistic Regression, XGBoost, and Artificial Neural Network (ANN) — based on their performance in terms of accuracy and cross-validation scores, **Logistic Regression** was chosen as the final model. The key reasons for selecting Logistic Regression are:

1. **Consistency in Performance**: Logistic Regression demonstrated the highest cross-validation mean accuracy (0.8144), which indicates better generalization capability on unseen data compared to the other models.
   
2. **Simplicity**: Logistic Regression is a simpler and more interpretable model compared to XGBoost and ANN, making it a more practical choice for this problem. It also requires less computational power and is easier to implement and debug.

3. **Competitive Accuracy**: Although XGBoost had a slightly higher accuracy on the test set, the difference was marginal. Logistic Regression showed very competitive performance while maintaining its simplicity.

In conclusion, Logistic Regression was selected for its balance between simplicity, accuracy, and consistency in cross-validation scores.
