In [3]:
#import libraries 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, roc_auc_score
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Load cleaned data
df = pd.read_csv('amazon_sales_cleaned.csv')
df['order_date'] = pd.to_datetime(df['order_date'])



In [4]:
# 1. Feature Engineering

# Temporal features already in data: year, month, day, day_of_week
# Encode categorical variables: product_category, customer_region, payment_method
le_cat = LabelEncoder()
df['cat_encoded'] = le_cat.fit_transform(df['product_category'])

le_region = LabelEncoder()
df['region_encoded'] = le_region.fit_transform(df['customer_region'])

le_payment = LabelEncoder()
df['payment_encoded'] = le_payment.fit_transform(df['payment_method'])

# Drop original categorical columns after encoding (keep for reference maybe)
# Also drop order_id, product_id (unless useful, but high cardinality)
feature_cols = ['price', 'discount_percent', 'quantity_sold', 'rating', 'review_count',
                'year', 'month', 'day', 'day_of_week', 'cat_encoded', 'region_encoded', 'payment_encoded']


In [5]:
# 2. Problem Formulation â€“ Select target variables
# We will build multiple models:
#   Regression: total_revenue (continuous)
#   Regression: quantity_sold (continuous)
#   Classification: high_value (binary: revenue > $1000)
#   Classification: product_category (multi-class)

# Target: total_revenue
X = df[feature_cols]
y_rev = df['total_revenue']

# Train-test split (regression)
X_train, X_test, y_train_rev, y_test_rev = train_test_split(X, y_rev, test_size=0.2, random_state=42)

# Scale features for models that require it
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline Models for Regression
models_reg = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42, eval_metric='rmse')
}

print("Regression Model Performance (RMSE on test set):\n")
for name, model in models_reg.items():
    if name == 'Linear Regression':
        model.fit(X_train_scaled, y_train_rev)
        pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train_rev)
        pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test_rev, pred))
    r2 = r2_score(y_test_rev, pred)
    print(f"{name:20} RMSE: {rmse:.2f}, R2: {r2:.3f}")

Regression Model Performance (RMSE on test set):

Linear Regression    RMSE: 182.36, R2: 0.875
Decision Tree        RMSE: 2.04, R2: 1.000
Random Forest        RMSE: 1.04, R2: 1.000
Gradient Boosting    RMSE: 19.10, R2: 0.999
XGBoost              RMSE: 5.98, R2: 1.000


In [6]:
# 3. Classification: High-Value Orders (binary) 

# Define threshold $1000
df['high_value'] = (df['total_revenue'] > 1000).astype(int)

y_hv = df['high_value']
X_train_hv, X_test_hv, y_train_hv, y_test_hv = train_test_split(X, y_hv, test_size=0.2, random_state=42, stratify=y_hv)

# Baseline classifiers
models_clf = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss')
}

print("\n\nClassification (High-Value) Accuracy:\n")
for name, model in models_clf.items():
    if name == 'Logistic Regression':
        # Scale for LR
        scaler_clf = StandardScaler()
        X_train_sc = scaler_clf.fit_transform(X_train_hv)
        X_test_sc = scaler_clf.transform(X_test_hv)
        model.fit(X_train_sc, y_train_hv)
        pred = model.predict(X_test_sc)
    else:
        model.fit(X_train_hv, y_train_hv)
        pred = model.predict(X_test_hv)
    acc = accuracy_score(y_test_hv, pred)
    print(f"{name:20} Accuracy: {acc:.3f}")



Classification (High-Value) Accuracy:

Logistic Regression  Accuracy: 0.972
Decision Tree        Accuracy: 1.000
Random Forest        Accuracy: 0.997
XGBoost              Accuracy: 0.999


In [7]:
# 4. Classification: Product Category (multi-class) 

y_cat = df['cat_encoded']  # already encoded
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(X, y_cat, test_size=0.2, random_state=42, stratify=y_cat)

# Multi-class models
models_multi = {
    'Logistic Regression (OvR)': LogisticRegression(multi_class='ovr', max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='mlogloss')
}

print("\n\nMulti-Class Classification (Category) Accuracy:\n")
for name, model in models_multi.items():
    if name.startswith('Logistic'):
        scaler_multi = StandardScaler()
        X_train_sc = scaler_multi.fit_transform(X_train_cat)
        X_test_sc = scaler_multi.transform(X_test_cat)
        model.fit(X_train_sc, y_train_cat)
        pred = model.predict(X_test_sc)
    else:
        model.fit(X_train_cat, y_train_cat)
        pred = model.predict(X_test_cat)
    acc = accuracy_score(y_test_cat, pred)
    print(f"{name:25} Accuracy: {acc:.3f}")




Multi-Class Classification (Category) Accuracy:

Logistic Regression (OvR) Accuracy: 1.000
Random Forest             Accuracy: 1.000
XGBoost                   Accuracy: 1.000
