In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score, KFold
from scikeras.wrappers import KerasClassifier
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [16]:
# Read the historic and the prediction-input CSV files into DataFrames
historic_df = pd.read_csv(r"DSW_ML_Test/historic.csv")
prediction_df = pd.read_csv(r"DSW_ML_Test/prediction_input.csv")

# Descriptive statistics (count, mean, std, quartiles, ...) of the 'stars' column
historic_stats = historic_df['stars'].describe()

# Per-column count of missing values in each dataset
historic_missing = historic_df.isna().sum()
prediction_missing = prediction_df.isna().sum()

In [7]:
# Preprocessing steps for the historic data with One-Hot Encoding and Label Encoding
def preprocess_historic_data(df):
    """Label-encode the target and one-hot encode the categorical features.

    The input frame is not modified: the encoded target is written to a new
    frame, so the function can be called repeatedly on the same raw data
    (e.g. when the notebook cell is re-run) and always sees the original
    string labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'success_indicator', 'category',
        'main_promotion' and 'color'.

    Returns
    -------
    tuple
        ``(processed_df, label_encoder)`` where ``processed_df`` is a new
        DataFrame with an integer 'success_indicator' column and dummy
        columns prefixed 'category', 'promotion' and 'color', and
        ``label_encoder`` is the fitted LabelEncoder (usable to map the
        integer codes back to the original labels).
    """
    # LabelEncoder assigns codes in sorted class order, so with the labels
    # 'flop' and 'top' this yields flop = 0, top = 1.
    label_encoder = LabelEncoder()
    encoded_target = label_encoder.fit_transform(df['success_indicator'])

    # assign() returns a new frame; the caller's df keeps its raw labels.
    df_encoded = df.assign(success_indicator=encoded_target)

    # One-hot encode the categorical columns. drop_first=True drops the first
    # level of each column, which then acts as the implicit reference level.
    df_encoded = pd.get_dummies(
        df_encoded,
        columns=['category', 'main_promotion', 'color'],
        prefix=['category', 'promotion', 'color'],
        drop_first=True,
    )

    return df_encoded, label_encoder

# Apply preprocessing to historic data
historic_df_processed, label_encoder = preprocess_historic_data(historic_df)

# Separate features (X) and target (y); 'item_no' is excluded from the features
X = historic_df_processed.drop(columns=['item_no', 'success_indicator'])
y = historic_df_processed['success_indicator']

# Split data into training and testing sets (80% training, 20% testing).
# stratify=y keeps the class proportions the same in both splits.
# train_test_split comes from the import cell at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display the shape of the training and test sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6400, 18), (1600, 18), (6400,), (1600,))

In [8]:
# Fit a logistic-regression model on the training split
# (fit() returns the estimator itself, so construction and training can be chained)
logreg_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predict the held-out test rows
y_pred_logreg = logreg_model.predict(X_test)

# Score the predictions: per-class report first, then overall accuracy
logreg_report = classification_report(
    y_test,
    y_pred_logreg,
    target_names=['flop', 'top'],
)
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)

In [17]:
# Show the test accuracy and the report string together (displayed as a tuple,
# so the report's newlines appear escaped)
logreg_accuracy , logreg_report

(0.825625,
 '              precision    recall  f1-score   support\n\n        flop       0.80      0.67      0.73       563\n         top       0.84      0.91      0.87      1037\n\n    accuracy                           0.83      1600\n   macro avg       0.82      0.79      0.80      1600\nweighted avg       0.82      0.83      0.82      1600\n')

In [10]:
# Show the classification report string on its own (bare expression, so its repr is displayed)
logreg_report

'              precision    recall  f1-score   support\n\n        flop       0.80      0.67      0.73       563\n         top       0.84      0.91      0.87      1037\n\n    accuracy                           0.83      1600\n   macro avg       0.82      0.79      0.80      1600\nweighted avg       0.82      0.83      0.82      1600\n'

In [11]:
# Inspect the training feature matrix
X_train

Unnamed: 0,stars,category_Hoodie,category_Polo-Shirt,category_Sweatshirt,category_T-Shirt,category_Tunic,promotion_Category_Highlight,promotion_Display_Ad_Campaign,promotion_Frontpage_Header,color_Blue,color_Brown,color_Green,color_Multi-Color,color_Orange,color_Pink,color_Red,color_White,color_Yellow
7485,1.9,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False
7826,4.5,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False
4681,4.6,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False
4203,2.5,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True
5097,3.3,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1237,3.6,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False
601,4.8,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False
858,4.3,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False
6883,4.7,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False


In [12]:
# Logistic regression: build, fit on the training split, predict the test split
logreg_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
)
logreg_model.fit(X_train, y_train)
y_pred_logreg = logreg_model.predict(X_test)

# Overall accuracy and per-class precision / recall / F1 on the test split
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
logreg_report = classification_report(
    y_test,
    y_pred_logreg,
    target_names=['flop', 'top'],
)

# Report the results
print(f'Logistic Regression Accuracy: {logreg_accuracy}')
print(f'Classification Report:\n{logreg_report}')

Logistic Regression Accuracy: 0.825625
Classification Report:
              precision    recall  f1-score   support

        flop       0.80      0.67      0.73       563
         top       0.84      0.91      0.87      1037

    accuracy                           0.83      1600
   macro avg       0.82      0.79      0.80      1600
weighted avg       0.82      0.83      0.82      1600



In [15]:
# 5-fold cross-validation (shuffled, fixed seed) on the training split only
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# One accuracy value per fold; cross_val_score fits a fresh clone of the model for each fold
logreg_cv_scores = cross_val_score(
    logreg_model,
    X_train,
    y_train,
    cv=kf,
    scoring='accuracy',
)

# Per-fold scores plus their mean and spread
print(f"Logistic Regression Cross-Validation Scores: {logreg_cv_scores}")
print(f"Mean CV Accuracy: {logreg_cv_scores.mean()}")
print(f"Standard Deviation of CV Accuracy: {logreg_cv_scores.std()}")

Logistic Regression Cross-Validation Scores: [0.80546875 0.8171875  0.83515625 0.81484375 0.81484375]
Mean CV Accuracy: 0.8175000000000001
Standard Deviation of CV Accuracy: 0.009702609185162544


In [24]:
# XGBoost classifier: build and fit on the training split
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Predict the held-out test rows and score them
y_pred_xgb = xgb_model.predict(X_test)
xgb_report = classification_report(
    y_test,
    y_pred_xgb,
    target_names=['flop', 'top'],
)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)

# Test-set results
print(f'XGBoost Accuracy: {xgb_accuracy}')
print(f'Classification Report:\n{xgb_report}')

# 5-fold cross-validation (shuffled, fixed seed) on the training split
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
xgb_cv_scores = cross_val_score(
    xgb_model,
    X_train,
    y_train,
    cv=kf,
    scoring='accuracy',
)

# Per-fold scores plus their mean and spread
print(f"XGBoost Cross-Validation Scores: {xgb_cv_scores}")
print(f"Mean CV Accuracy: {xgb_cv_scores.mean()}")
print(f"Standard Deviation of CV Accuracy: {xgb_cv_scores.std()}")


XGBoost Accuracy: 0.8375
Classification Report:
              precision    recall  f1-score   support

        flop       0.79      0.73      0.76       563
         top       0.86      0.90      0.88      1037

    accuracy                           0.84      1600
   macro avg       0.83      0.81      0.82      1600
weighted avg       0.84      0.84      0.84      1600

XGBoost Cross-Validation Scores: [0.83671875 0.84296875 0.85078125 0.84453125 0.83359375]
Mean CV Accuracy: 0.8417187500000001
Standard Deviation of CV Accuracy: 0.0060434623768167986


In [34]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Create a simple ANN model
def create_ann_model(input_shape):
    """Build and compile a small fully connected binary classifier.

    Parameters
    ----------
    input_shape : int
        Number of input features per sample.

    Returns
    -------
    A compiled Sequential model with the layers
    Dense(64, relu) -> Dense(32, relu) -> Dense(1, sigmoid), using the
    'adam' optimizer, 'binary_crossentropy' loss and the 'accuracy' metric.
    """
    # Declare the input with an explicit Input layer rather than passing
    # `input_shape=` to the first Dense layer; recent Keras versions warn
    # about the latter form.
    model = models.Sequential([
        layers.Input(shape=(input_shape,)),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation='sigmoid'),  # Binary classification
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define the ANN model
# Seed the Python, NumPy and TensorFlow RNGs first so that weight initialisation
# and batch shuffling repeat across runs (same seed value as the other models).
tf.keras.utils.set_random_seed(42)
ann_model = create_ann_model(X_train.shape[1])

# Train the model, holding out 20% of the training rows for validation
ann_history = ann_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
ann_loss, ann_accuracy = ann_model.evaluate(X_test, y_test)
print(f'ANN Test Accuracy: {ann_accuracy}')

# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

# Re-seed so the fold results do not depend on how much randomness the
# training run above consumed.
tf.keras.utils.set_random_seed(42)

for train_index, val_index in kf.split(X_train):
    # kf.split yields positional indices, hence .iloc rather than .loc
    X_train_kf, X_val_kf = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[val_index]

    # A fresh, untrained model per fold so no weights carry over between folds
    model_kf = create_ann_model(X_train.shape[1])
    model_kf.fit(X_train_kf, y_train_kf, epochs=20, batch_size=32, verbose=0)

    # Evaluate the model on the validation set
    val_loss, val_accuracy = model_kf.evaluate(X_val_kf, y_val_kf, verbose=0)
    cv_scores.append(val_accuracy)

print(f"K-Fold Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {np.mean(cv_scores)}")
print(f"Standard Deviation of CV Accuracy: {np.std(cv_scores)}")


# Hyperparameter tuning with GridSearchCV (using a wrapper for Keras)


def create_model(optimizer='adam'):
    """Build and compile the ANN used for hyperparameter search.

    The input width is read from the notebook-level ``X_train`` (its number
    of columns), so ``X_train`` must be defined before this is called.

    Parameters
    ----------
    optimizer : str, default 'adam'
        Optimizer passed to ``model.compile``.

    Returns
    -------
    A compiled Sequential model with the layers
    Dense(64, relu) -> Dense(32, relu) -> Dense(1, sigmoid), using
    'binary_crossentropy' loss and the 'accuracy' metric.
    """
    # Declare the input with an explicit Input layer rather than passing
    # `input_shape=` to the first Dense layer; recent Keras versions warn
    # about the latter form.
    model = models.Sequential([
        layers.Input(shape=(X_train.shape[1],)),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap the Keras builder as a scikit-learn estimator.
# `model=` is the current SciKeras argument; `build_fn=` is its deprecated alias.
model = KerasClassifier(model=create_model, epochs=20, batch_size=32, verbose=0)

# Define the grid of hyperparameters to search.
# The `model__` prefix routes the value to create_model(optimizer=...).
# NOTE(review): a bare 'optimizer' key sets the wrapper's own compile-time
# optimizer, which does not appear to be forwarded to a builder that compiles
# the model itself -- confirm against the installed SciKeras version.
param_grid = {
    'model__optimizer': ['adam', 'sgd'],
    'batch_size': [16, 32, 64],
}

# Perform Grid Search (3-fold CV over every optimizer / batch-size combination)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train, y_train)

# Summarize the results: best combination first, then every combination's mean CV score
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']

for mean, param in zip(means, params):
    print(f"{mean} with: {param}")


Epoch 1/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6127 - loss: 0.6241 - val_accuracy: 0.7977 - val_loss: 0.4746
Epoch 2/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8195 - loss: 0.4569 - val_accuracy: 0.8281 - val_loss: 0.4484
Epoch 3/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8319 - loss: 0.4323 - val_accuracy: 0.8391 - val_loss: 0.4345
Epoch 4/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8370 - loss: 0.4265 - val_accuracy: 0.8391 - val_loss: 0.4325
Epoch 5/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8409 - loss: 0.4152 - val_accuracy: 0.8438 - val_loss: 0.4276
Epoch 6/20
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8513 - loss: 0.4124 - val_accuracy: 0.8469 - val_loss: 0.4267
Epoch 7/20
[1m160/160[0m 