<a href="https://colab.research.google.com/github/AkramAzid1/Adidas-Sales-Classification/blob/main/FinalTest_AdidasSale.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

UPLOAD FILE

In [None]:
import pandas as pd
from google.colab import files

# Open the Colab upload dialog; the returned mapping is keyed by file name.
uploaded = files.upload()

# Fail fast with a clear message when nothing was uploaded (e.g. the dialog
# was cancelled) instead of leaving `df` undefined for the following cells.
if not uploaded:
    raise ValueError("No file was uploaded; upload the dataset workbook and re-run this cell.")

# Load the workbook into `df`. When several files are uploaded the last one is
# used (the same file the previous loop ended up with), so report that choice
# explicitly rather than overwriting `df` silently.
uploaded_names = list(uploaded.keys())
fn = uploaded_names[-1]
if len(uploaded_names) > 1:
    print(f"Multiple files uploaded; using the last one: {fn}")
df = pd.read_excel(fn)

Saving updated_dataset.xlsx to updated_dataset.xlsx


DATA PREVIEW

In [None]:
# Show the first 15 rows of the loaded dataset.
print("Full Dataset Preview:")
print(df.head(15))

# Print how many rows each category holds for the two categorical columns
# that are later used as model features.
for heading, column in (
    ("\n Product Category Counts:", 'Product'),
    ("\n Retailer Counts:", 'Retailer'),
):
    print(heading)
    print(df[column].value_counts())


Full Dataset Preview:
         Retailer  Retailer ID Invoice Date        State            City  \
0         Walmart      1128299   2021-06-17      Florida         Orlando   
1       West Gear      1128299   2021-07-16    Louisiana     New Orleans   
2   Sports Direct      1197831   2021-08-25      Alabama      Birmingham   
3   Sports Direct      1197831   2021-08-27      Alabama      Birmingham   
4   Sports Direct      1197831   2021-08-21      Alabama      Birmingham   
5       West Gear      1185732   2021-01-11     Missouri       St. Louis   
6   Sports Direct      1185732   2021-11-17  Mississippi         Jackson   
7   Sports Direct      1197831   2021-01-23  Mississippi         Jackson   
8       West Gear      1185732   2021-04-03    Wisconsin       Milwaukee   
9     Foot Locker      1185732   2020-03-10     New York        New York   
10    Foot Locker      1185732   2021-01-18     Michigan         Detroit   
11      West Gear      1185732   2021-02-11     Missouri       St.

DATA CLEANING

In [None]:
# Clean the dataset in a single chain:
#   - drop every row that has a missing value in any column,
#   - drop exact duplicate rows,
#   - renumber the index from 0 so it is contiguous again.
df = (
    df
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Sanity check: per-column count of remaining missing values (expected to be all zeros).
print("Any missing values left?")
print(df.isna().sum())

Any missing values left?
Retailer          0
Retailer ID       0
Invoice Date      0
State             0
City              0
Product           0
Price per Unit    0
Units Sold        0
Total Sales       0
SalesClass        0
dtype: int64


SPLIT FEATURES AND TARGET

In [None]:
# Build the model inputs: product type, retailer, price per unit and units sold.
# The two text columns are first mapped to integer codes.
from sklearn.preprocessing import LabelEncoder

# One encoder per column, kept around so the integer codes can be mapped back later.
# NOTE(review): LabelEncoder assigns arbitrary integer codes to nominal categories;
# scale-sensitive models (SVM, Gaussian NB) will treat them as ordered numbers.
# Consider one-hot encoding if that matters for the comparison.
le_product, le_retailer = LabelEncoder(), LabelEncoder()
df['Product_encoded'] = le_product.fit_transform(df['Product'])
df['Retailer_encoded'] = le_retailer.fit_transform(df['Retailer'])

# Feature matrix (four columns) and target vector.
feature_columns = [
    'Product_encoded',
    'Retailer_encoded',
    'Price per Unit',
    'Units Sold',
]
X = df[feature_columns]
y = df['SalesClass']


TRAIN/TEST SPLIT

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The split is stratified on the target so
# both parts keep the same class proportions, and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)


MODEL

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Decision Tree: trained on the raw (unscaled) features, seeded for reproducibility.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Gaussian Naive Bayes: also trained on the raw features.
nb_model = GaussianNB().fit(X_train, y_train)

# The SVM is trained on standardized features. The scaler learns its statistics
# from the training split only and is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM with an RBF kernel (handles non-linear decision boundaries).
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale').fit(X_train_scaled, y_train)


TEST SET EVALUATION

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def _report_on_test_set(title, model, X_eval):
    """Predict with `model` on `X_eval`, print its test metrics, and return the predictions.

    Prints the title followed by accuracy, the classification report and the
    confusion matrix, all computed against the global `y_test`.
    """
    predictions = model.predict(X_eval)
    print(title)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Classification Report:\n", classification_report(y_test, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    return predictions


# Decision Tree and Naive Bayes were trained on raw features, so they are
# evaluated on the raw test split.
y_pred_dt = _report_on_test_set("Decision Tree Results", dt_model, X_test)
y_pred_nb = _report_on_test_set("Naive Bayes Results", nb_model, X_test)

# The SVM was trained on standardized features and needs the scaled test split.
y_pred_svm = _report_on_test_set("Support Vector Machine (SVM) Results", svm_model, X_test_scaled)


Decision Tree Results
Accuracy: 0.9744121715076072
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97      1446
           1       0.97      0.98      0.97      1446

    accuracy                           0.97      2892
   macro avg       0.97      0.97      0.97      2892
weighted avg       0.97      0.97      0.97      2892

Confusion Matrix:
 [[1400   46]
 [  28 1418]]
Naive Bayes Results
Accuracy: 0.8896957123098201
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.97      0.90      1446
           1       0.96      0.81      0.88      1446

    accuracy                           0.89      2892
   macro avg       0.90      0.89      0.89      2892
weighted avg       0.90      0.89      0.89      2892

Confusion Matrix:
 [[1401   45]
 [ 274 1172]]
Support Vector Machine (SVM) Results
Accuracy: 0.9315352697095436
Classification Report:
               pr

HYPERPARAMETER OPTIMIZATION

In [None]:
# ---- Hyperparameter tuning with 5-fold cross-validated grid search ----
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import numpy as np


def _run_grid_search(label, estimator, param_grid, X_fit, y_fit):
    """Run a 5-fold grid search, print the best parameters and score, and return it.

    Args:
        label: Model name used in the printed summary lines.
        estimator: Unfitted scikit-learn estimator to tune.
        param_grid: Mapping of parameter name to the list of values to try.
        X_fit: Training features passed to the search.
        y_fit: Training labels passed to the search.

    Returns:
        The fitted GridSearchCV object.
    """
    # n_jobs=-1 evaluates candidates in parallel on all available cores; the
    # candidates and their scores are the same as in a serial run.
    grid = GridSearchCV(estimator, param_grid, cv=5, n_jobs=-1)
    grid.fit(X_fit, y_fit)
    print(f"Best {label} Params:", grid.best_params_)
    print(f"Best {label} Score:", grid.best_score_)
    return grid


# ---- Decision Tree Tuning ----
dt_params = {
    'criterion': ['gini', 'entropy'],  # or 'log_loss' for newer sklearn
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_grid = _run_grid_search(
    "Decision Tree", DecisionTreeClassifier(random_state=42), dt_params, X_train, y_train
)


# ---- Naive Bayes Tuning ----
# NOTE(review): the recorded run selected var_smoothing=1e-06, the upper edge of
# this grid, so the best value may lie above it; consider widening the range.
nb_params = {
    'var_smoothing': np.logspace(-9, -6, 5)
}

nb_grid = _run_grid_search("Naive Bayes", GaussianNB(), nb_params, X_train, y_train)


# ---- SVM Tuning ----
# Standardize first; the search below is fitted on the scaled training features,
# so svm_grid expects scaled input at prediction time.
# NOTE(review): the scaler is fitted on all of X_train before cross-validation,
# so every validation fold contributes to the scaling statistics. Wrapping
# StandardScaler + SVC in a Pipeline would avoid this, but changes the input
# svm_grid expects; confirm with downstream cells before switching.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

svm_grid = _run_grid_search("SVM", SVC(), svm_params, X_train_scaled, y_train)


Best Decision Tree Params: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Decision Tree Score: 0.9746555747741812
Best Naive Bayes Params: {'var_smoothing': np.float64(1e-06)}
Best Naive Bayes Score: 0.8953598550366527
Best SVM Params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best SVM Score: 0.9374536968399088
