In [None]:
# data imports
import numpy as np
import pandas as pd

# visual imports
import matplotlib.pyplot as plt 
import seaborn as sns

# data processing imports
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# model imports 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# metrics imports
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Read the sales CSV into a DataFrame; the path is relative to the working directory.
df = pd.read_csv("data/Copy_of_sales-sales.csv")

In [None]:
# Trim leading/trailing whitespace from every column header, then preview the first rows.
df = df.rename(columns=str.strip)
df.head()

In [None]:
# Parse `date` (values that cannot be parsed become NaT), derive `year` from it,
# and normalise the store key to `store_id`. `rename` skips keys that are not
# present, so a frame without a `store_ID` column passes through unchanged.
df = (
    df.assign(date=lambda frame: pd.to_datetime(frame["date"], errors="coerce"))
    .assign(year=lambda frame: frame["date"].dt.year)
    .rename(columns={"store_ID": "store_id"})
)


In [None]:
# Separate the regression target (`sales`) from the remaining feature columns.
X = df.drop(columns="sales")
y = df.loc[:, "sales"]


In [None]:
# Every numeric column starts out as a numeric feature.
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = []

# Treat store_id as categorical regardless of the dtype it was read as.
# Checking X.columns (rather than numeric_features) matters: a string-typed
# store_id is not in numeric_features, so it would otherwise end up in neither
# list and never reach the model through the column-based preprocessing.
if "store_id" in X.columns:
    categorical_features.append("store_id")
    if "store_id" in numeric_features:
        numeric_features.remove("store_id")

In [None]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each feature list to its transformer. Columns in neither list are
# dropped (scikit-learn's default, spelled out here for the reader).
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline: the shared preprocessing step followed by ordinary least squares.
model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", LinearRegression()),
    ]
)

# Fit on the training split and predict the held-out rows.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [None]:
# Score the linear-regression predictions against the held-out targets.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE is the square root of MSE
r2 = r2_score(y_test, y_pred)

# The heading used to print "â€”": an em dash whose UTF-8 bytes had been decoded
# as cp1252. The \u2014 escape restores the dash and keeps this line ASCII-only,
# so it cannot be re-corrupted by another encoding round-trip.
print("Linear Regression \u2014 Store-Year Model")
print("MAE:", round(mae, 2))
print("RMSE:", round(rmse, 2))
print("R2:", round(r2, 4))

In [None]:
# Second model: the same `preprocess` object used by the linear pipeline,
# feeding a random forest of 3 trees built with 2 parallel jobs and a fixed seed.
rf_model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", RandomForestRegressor(n_estimators=3, n_jobs=2, random_state=42)),
    ]
)

# Fit on the training split and predict the held-out rows.
rf_pred = rf_model.fit(X_train, y_train).predict(X_test)

In [None]:
# Score the random-forest predictions. Note that this rebinds `mae`, `rmse`
# and `r2`, so the linear model's values are no longer available afterwards.
mae = mean_absolute_error(y_test, rf_pred)
rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
r2 = r2_score(y_test, rf_pred)

print("Random Forest")
print(f"MAE: {round(mae, 2)}")
print(f"RMSE: {round(rmse, 2)}")
print(f"R2: {round(r2, 4)}")