
# Black Friday Purchase Prediction — PyCaret-Free (scikit-learn) Version

This notebook replaces **PyCaret** with a **pure scikit-learn + model zoo** workflow so it runs on **Python 3.10 / 3.11 / 3.12**.
It auto-detects numeric vs categorical columns, builds robust preprocessing with `ColumnTransformer`, compares a set of models by cross-validated RMSE,
picks the best model, reports its holdout metrics (RMSE, MAE, R²), and shows how to generate predictions on new data.

> **Assumptions:**  
> - Target column is **`Purchase`** (change `TARGET_COL` below if different).  
> - Training data path is `train.csv` in the same folder as this notebook.  
> - Optional libraries (XGBoost, LightGBM, CatBoost) are used **if installed**; otherwise they are skipped automatically.


In [1]:

# ✅ Works on Python 3.10+; no PyCaret required.
import json
import math
import os
import sys
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Core regressors
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Optional gradient-boosting libraries: each one is imported on a best-effort
# basis. When the import fails for any reason the class name is bound to None,
# and the matching *_available flag records whether the import succeeded.
try:
    from xgboost import XGBRegressor
except Exception:
    XGBRegressor = None
xgb_available = XGBRegressor is not None

try:
    from lightgbm import LGBMRegressor
except Exception:
    LGBMRegressor = None
lgb_available = LGBMRegressor is not None

try:
    from catboost import CatBoostRegressor
except Exception:
    CatBoostRegressor = None
ctb_available = CatBoostRegressor is not None

# Single seed shared by the split, the CV folds and every seeded estimator.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
print(f"Python: {sys.version.split()[0]} | xgboost={xgb_available} | lightgbm={lgb_available} | catboost={ctb_available}")


Python: 3.9.12 | xgboost=False | lightgbm=False | catboost=False


In [2]:

# === Load data ===
# DATA_PATH is the single place to point the notebook at the training CSV.
# It is resolved relative to the notebook's working directory, so the notebook
# no longer depends on one machine's absolute folder layout.
DATA_PATH = 'train.csv'  # Change if your file is elsewhere
TARGET_COL = 'Purchase'  # Change if your target is different

# pd.read_csv raises FileNotFoundError if DATA_PATH does not exist.
df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()


(550068, 12)


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [3]:

# Quick overview: per-column summary statistics (one row per column),
# followed by the columns with the most missing values.
overview_table = df.describe(include='all').T
display(overview_table.head(20))

null_counts = df.isna().sum().sort_values(ascending=False)
print("\nNull counts (top 20):\n", null_counts.head(20))


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
User_ID,550068.0,,,,1003028.842401,1727.591586,1000001.0,1001516.0,1003077.0,1004478.0,1006040.0
Product_ID,550068.0,3631.0,P00265242,1880.0,,,,,,,
Gender,550068.0,2.0,M,414259.0,,,,,,,
Age,550068.0,7.0,26-35,219587.0,,,,,,,
Occupation,550068.0,,,,8.076707,6.52266,0.0,2.0,7.0,14.0,20.0
City_Category,550068.0,3.0,B,231173.0,,,,,,,
Stay_In_Current_City_Years,550068.0,5.0,1,193821.0,,,,,,,
Marital_Status,550068.0,,,,0.409653,0.49177,0.0,0.0,0.0,1.0,1.0
Product_Category_1,550068.0,,,,5.40427,3.936211,1.0,1.0,5.0,8.0,20.0
Product_Category_2,376430.0,,,,9.842329,5.08659,2.0,5.0,9.0,15.0,18.0



Null counts (top 20):
 Product_Category_3            383247
Product_Category_2            173638
User_ID                            0
Product_ID                         0
Age                                0
Gender                             0
Occupation                         0
City_Category                      0
Marital_Status                     0
Stay_In_Current_City_Years         0
Product_Category_1                 0
Purchase                           0
dtype: int64


In [4]:

# === Features / Target split ===
# Fail early with a readable message if the configured target is missing.
assert TARGET_COL in df.columns, f"Target column '{TARGET_COL}' not found in data!"
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# Identify column types by dtype. Integer-coded IDs/categories (anything stored
# as int/float) land in num_cols; strings, categoricals and booleans in cat_cols.
num_cols = X.select_dtypes(include=['int64', 'float64', 'int32', 'float32']).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

# Columns whose dtype matches neither list (e.g. int8, datetime) would be
# discarded without notice by the ColumnTransformer's remainder='drop'.
# Surface them so the omission is a visible decision rather than a silent one.
unassigned_cols = [col for col in X.columns if col not in num_cols and col not in cat_cols]
if unassigned_cols:
    print(f"Warning: columns with unhandled dtypes will not be used as features: {unassigned_cols}")

print(f"Numeric cols: {len(num_cols)} | Categorical cols: {len(cat_cols)}")


Numeric cols: 6 | Categorical cols: 5


In [5]:

# === Preprocessing pipelines ===
# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=True, with_std=True))
])


def _make_dense_one_hot_encoder():
    """Return a dense-output OneHotEncoder that ignores unseen categories.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output``; the
    new name raises TypeError on older releases, so fall back to the legacy
    keyword there. Both spellings request the same dense output.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode.
# NOTE(review): the output is dense, so a high-cardinality column (the overview
# shows 3,631 distinct Product_ID values over ~550k rows) produces a very wide,
# memory-hungry matrix — confirm this fits in RAM before a full run.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', _make_dense_one_hot_encoder())
])

# Columns listed in neither num_cols nor cat_cols are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ],
    remainder='drop'
)


In [6]:

# === Model zoo (PyCaret-like compare_models) ===
# Candidates are registered in the order they will be evaluated. Every
# estimator that accepts a seed receives RANDOM_STATE.
# NOTE(review): SVR and KNN scale poorly with the number of rows; on the full
# training set they may dominate the runtime — confirm they are wanted.
models = {}

# Linear family
models['LinearRegression'] = LinearRegression()
models['Ridge'] = Ridge(random_state=RANDOM_STATE)
models['Lasso'] = Lasso(random_state=RANDOM_STATE, max_iter=20000)
models['ElasticNet'] = ElasticNet(random_state=RANDOM_STATE, max_iter=20000)

# Instance- and kernel-based
models['KNN'] = KNeighborsRegressor(n_neighbors=5)
models['SVR'] = SVR()

# Tree ensembles
models['RandomForest'] = RandomForestRegressor(n_estimators=400, random_state=RANDOM_STATE, n_jobs=-1)
models['ExtraTrees'] = ExtraTreesRegressor(n_estimators=400, random_state=RANDOM_STATE, n_jobs=-1)
models['GradientBoosting'] = GradientBoostingRegressor(random_state=RANDOM_STATE)

# Optional boosters are appended only when their import succeeded earlier.
if xgb_available:
    models['XGBRegressor'] = XGBRegressor(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        tree_method='hist',
    )

if lgb_available:
    models['LGBMRegressor'] = LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=63,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,
    )

if ctb_available:
    models['CatBoostRegressor'] = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.05,
        depth=8,
        random_state=RANDOM_STATE,
        verbose=False,
    )

print(f"Models configured: {list(models.keys())}")


Models configured: ['LinearRegression', 'Ridge', 'Lasso', 'ElasticNet', 'KNN', 'SVR', 'RandomForest', 'ExtraTrees', 'GradientBoosting']


In [None]:

# === Train/Test split ===
# Hold out 20% of the rows for the final evaluation of the selected model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# === Cross-validated comparison ===
# One shuffled 5-fold splitter shared by every model so scores are comparable.
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

def rmse_cv(estimator, X, y):
    """Return the per-fold RMSE of `estimator` on (X, y) using the shared `cv`.

    cross_val_score reports negated MSE (higher is better), so the sign is
    flipped before taking the square root. The result has one entry per fold.
    """
    scores = cross_val_score(estimator, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
    return np.sqrt(-scores)

results = []
fitted_models = {}

for name, model in models.items():
    # Pipeline.fit fits its steps in place, so give every pipeline its own copy
    # of the preprocessor; otherwise all entries in fitted_models would share
    # (and keep refitting) one and the same transformer object.
    pipe = Pipeline(steps=[('preprocess', clone(preprocessor)), ('model', model)])
    # Compute CV RMSE (primary metric)
    cv_rmse = rmse_cv(pipe, X_train, y_train)
    results.append({
        'model': name,
        'cv_rmse_mean': cv_rmse.mean(),
        'cv_rmse_std': cv_rmse.std()
    })
    # Fit on full train so the pipeline can be used for holdout prediction
    pipe.fit(X_train, y_train)
    fitted_models[name] = pipe

# Best (lowest mean CV RMSE) first; chained instead of mutating in place.
results_df = pd.DataFrame(results).sort_values('cv_rmse_mean').reset_index(drop=True)
results_df


In [None]:

# === Pick best model (lowest CV RMSE) ===
# results_df is sorted ascending by cv_rmse_mean, so the first row is the winner.
best_name = results_df.iloc[0]['model']
best_model = fitted_models[best_name]
print(f"Best model by CV RMSE: {best_name}")

# === Holdout evaluation ===
preds = best_model.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword is
# deprecated and absent from newer scikit-learn releases, whereas this form
# works on every version.
rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)
print({"RMSE": rmse, "MAE": mae, "R2": r2})


In [None]:

# === Feature importance (tree-based only) ===
# Best-effort: any failure while assembling the table is reported, not raised,
# so the rest of the notebook can still run.
try:
    model_step = best_model.named_steps['model']

    if hasattr(model_step, 'feature_importances_'):
        preprocess_step = best_model.named_steps['preprocess']
        # Look the fitted column lists up by transformer name instead of by
        # position, so reordering the ColumnTransformer entries cannot
        # mislabel features.
        fitted_cols = {t_name: cols for t_name, _, cols in preprocess_step.transformers_}

        # Feature names after encoding: numeric columns pass through under
        # their own names, categorical columns expand to one name per category.
        ohe = preprocess_step.named_transformers_['cat'].named_steps['onehot']
        cat_feature_names = ohe.get_feature_names_out(fitted_cols['cat'])
        num_feature_names = fitted_cols['num']
        feature_names = np.concatenate([num_feature_names, cat_feature_names])

        importances = model_step.feature_importances_
        fi = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values('importance', ascending=False).head(30)
        display(fi)
    else:
        print("Best model doesn't expose feature_importances_. Skipping.")
except Exception as e:
    print("Feature importance extraction skipped:", e)


In [None]:

# === Save results and model ===
import joblib, json, os

# Everything is written under ./artifacts (created on first run).
os.makedirs('artifacts', exist_ok=True)

# 1) Model comparison table, 2) the fitted best pipeline.
results_df.to_csv('artifacts/model_comparison.csv', index=False)
joblib.dump(best_model, 'artifacts/best_model.joblib')

# 3) JSON summary: text repr of the best pipeline, its holdout metrics as
#    plain Python floats (JSON-serializable), and the models compared.
summary = {}
summary['best_model'] = str(best_model)
summary['holdout'] = {'rmse': float(rmse), 'mae': float(mae), 'r2': float(r2)}
summary['models_tried'] = results_df['model'].tolist()

with open('artifacts/summary.json', 'w') as f:
    f.write(json.dumps(summary, indent=2))

print("Saved: artifacts/model_comparison.csv, artifacts/best_model.joblib, artifacts/summary.json")


In [None]:

# === Inference example ===
# To use the trained best model on new data (same schema as X):
# new_data = pd.read_csv('test.csv')
# preds_new = best_model.predict(new_data)
# pd.DataFrame({'Prediction': preds_new}).to_csv('artifacts/test_predictions.csv', index=False)
# print('Saved: artifacts/test_predictions.csv')
