In [1]:
# Optimized Daily Sales Prediction (Linear Regression Only)
# Dataset: sales-sales.csv
# Goal: Use ALL columns and maximize performance using Linear Regression


In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [3]:
# Load dataset
df = pd.read_csv("sales-sales.csv")
df.columns = df.columns.str.strip().str.lower()

df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Time features
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day"] = df["date"].dt.day
df["weekofyear"] = df["date"].dt.isocalendar().week.astype(int)

df.head()


Unnamed: 0,unnamed: 0,store_id,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales,year,month,day,weekofyear
0,425390,366,4,2013-04-18,517,1,0,0,0,4422,2013,4,18,16
1,291687,394,6,2015-04-11,694,1,0,0,0,8297,2015,4,11,15
2,411278,807,4,2013-08-29,970,1,1,0,0,9729,2013,8,29,35
3,664714,802,2,2013-05-28,473,1,1,0,0,6513,2013,5,28,22
4,540835,726,4,2013-10-10,1068,1,1,0,0,10882,2013,10,10,41


In [4]:
# Use ALL columns except target
y = df["sales"]
X = df.drop(columns=["sales"])

X.head()


Unnamed: 0,unnamed: 0,store_id,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,year,month,day,weekofyear
0,425390,366,4,2013-04-18,517,1,0,0,0,2013,4,18,16
1,291687,394,6,2015-04-11,694,1,0,0,0,2015,4,11,15
2,411278,807,4,2013-08-29,970,1,1,0,0,2013,8,29,35
3,664714,802,2,2013-05-28,473,1,1,0,0,2013,5,28,22
4,540835,726,4,2013-10-10,1068,1,1,0,0,2013,10,10,41


In [5]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [6]:
# Preprocessing (handle numeric + categorical correctly)
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

# Treat store_id as categorical even if numeric
if "store_id" in numeric_features:
    numeric_features.remove("store_id")
    categorical_features.append("store_id")

numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])


In [7]:
# Train Linear Regression
model = Pipeline([
    ("preprocess", preprocess),
    ("model", LinearRegression())
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [8]:
# Evaluation
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Linear Regression Performance")
print("MAE:", round(mae, 2))
print("RMSE:", round(rmse, 2))
print("R2:", round(r2, 4))


Linear Regression Performance
MAE: 491.51
RMSE: 722.36
R2: 0.9647


In [9]:
# Feature Importance (coefficients)
feature_names = model.named_steps["preprocess"].get_feature_names_out()
coefs = model.named_steps["model"].coef_

importance_df = (
    pd.DataFrame({
        "feature": feature_names,
        "coef": coefs,
        "abs_coef": np.abs(coefs)
    })
    .sort_values("abs_coef", ascending=False)
    .reset_index(drop=True)
)

importance_df.head(20)


Unnamed: 0,feature,coef,abs_coef
0,cat__store_id_769,-13283.428224,13283.428224
1,cat__store_id_733,-11808.57736,11808.57736
2,cat__store_id_1097,-9628.919432,9628.919432
3,cat__store_id_842,7298.675044,7298.675044
4,cat__store_id_259,-7201.745283,7201.745283
5,cat__store_id_948,-7004.554516,7004.554516
6,cat__store_id_562,-6516.126084,6516.126084
7,cat__store_id_353,-6327.808127,6327.808127
8,cat__store_id_676,-5994.225628,5994.225628
9,cat__store_id_262,-5898.619452,5898.619452
