In [35]:
# imports
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import root_mean_squared_error, r2_score


In [37]:
# load and merge data
train_df = pd.read_csv(
    "train.csv",
    parse_dates=["Date"],
    dtype={"StateHoliday": "str"}
)

store_df = pd.read_csv("store.csv", low_memory=False)

train_df = train_df.merge(store_df, on="Store", how="left")

# feature engg.
# Avoid division by zero
train_df["Sales_per_Customer"] = (
    train_df["Sales"] / train_df["Customers"].replace(0, 1)
)

train_df["DayOfWeek"] = train_df["Date"].dt.weekday
train_df["IsWeekend"] = (train_df["DayOfWeek"] >= 5).astype(int)
train_df["Month"] = train_df["Date"].dt.month

train_df["CompetitionDistance"] = pd.to_numeric(
    train_df["CompetitionDistance"], errors="coerce"
)

train_df["CompetitionDistance"] = (
    train_df["CompetitionDistance"]
    .fillna(train_df["CompetitionDistance"].median())
)

train_df["Log_CompetitionDistance"] = np.log1p(
    train_df["CompetitionDistance"]
)

train_df["Promo_StoreType"] = (
    train_df["Promo"].astype(str) + "_" + train_df["StoreType"]
)

# feature selection
features = [
    "Customers",
    "Sales_per_Customer",
    "Promo",
    "DayOfWeek",
    "IsWeekend",
    "Month",
    "StoreType",
    "Assortment",
    "Promo_StoreType",
    "Log_CompetitionDistance"
]
# Features and target
X = train_df.drop(columns=["Sales"])
y = train_df["Sales"]
# train/ validation split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=False
)

#X = train_df[features]
#y = train_df[Sales]


# SPEED BOOST — use only 20% of training data
X_train = X_train.sample(frac=0.2, random_state=42)
y_train = y_train.loc[X_train.index]
# Check shapes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape :", X_test.shape)
print("y_test shape :", y_test.shape)

# Check for categorical columns
cat_cols = X_train.select_dtypes(include="object").columns.tolist()
print("Categorical columns:", cat_cols)

# preprocessing pipeline
num_features = [
    "Customers",
    "Sales_per_Customer",
    "Log_CompetitionDistance",
    "DayOfWeek"
]

cat_features = [
    "Promo",
    "IsWeekend",
    "Month",
    "StoreType",
    "Assortment",
    "Promo_StoreType"
]

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features)
])

# Model training
rf_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=50,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    ))
])

print("Training model...")
rf_pipeline.fit(X_train, y_train)

# evaluation 
#preds = rf_pipeline.predict(X_val)
#rmse = mean_squared_error(y_val, preds, squared=False)
# Predict on test set
y_pred = rf_pipeline.predict(X_test)

rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE : {mae:.2f}")
print(f"R²  : {r2:.4f}")
print("Validation RMSE:", rmse)

# save model
os.makedirs("models", exist_ok=True)

joblib.dump(
    rf_pipeline,
    "models/best_sales_forecast_model.pkl"
)

print("Model saved to models/best_sales_forecast_model.pkl")

X_train shape: (162753, 22)
y_train shape: (162753,)
X_test shape : (203442, 22)
y_test shape : (203442,)
Categorical columns: ['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval', 'Promo_StoreType']
Training model...
RMSE: 96.42
MAE : 44.28
R²  : 0.9993
Validation RMSE: 96.41910576229598
Model saved to models/best_sales_forecast_model.pkl


In [4]:
print(os.getcwd())

C:\Users\LENOVO\NextHikes\Project 6 Forecasting Pharmaceuticals


In [5]:
os.makedirs("models", exist_ok=True)

joblib.dump(
    {"test": "file_write_check"},
    "models/test.pkl"
)

print("Saved test file")


Saved test file


In [6]:
rf_pipeline.fit(X_train, y_train)

import joblib, os
os.makedirs("models", exist_ok=True)

joblib.dump(
    rf_pipeline,
    "models/best_sales_forecast_model.pkl"
)

print("MODEL SAVED")


MODEL SAVED
