In [None]:
# IMPORTING MODULES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# LOADING DATASETS
# Every frame is read from a CSV in the current working directory.
sales_data, stores_data, train_data, test_data = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ('features.csv', 'stores.csv', 'train.csv', 'test.csv')
)
sample_submission = pd.read_csv('sampleSubmission.csv')

In [None]:
#MERGING THE DATASETS
# Step 1: attach store attributes to the feature rows (key: Store).
sales_data_merged = sales_data.merge(stores_data, how="inner", on="Store")
# Step 2: attach the training rows (key: Store + Date).
sales_data_merged_final = sales_data_merged.merge(
    train_data,
    how="inner",
    on=["Store", "Date"],
)
df = sales_data_merged_final
df.head(2)

## DATA PREPARATION

In [None]:
# Initial manipulation
# Both sides of the second merge carried an IsHoliday column, so pandas
# suffixed them _x/_y; keep the left one under the plain name.
df = df.drop(['IsHoliday_y'], axis=1)
df = df.rename(columns={'IsHoliday_x': 'IsHoliday'})
df['Date'] = pd.to_datetime(df['Date'])
df['IsHoliday'] = df['IsHoliday'].astype('int32')
# Encode the store type as an integer code. map + astype produces an explicit
# integer dtype instead of relying on Series.replace silently downcasting an
# object column (deprecated in pandas 2.2), and it fails loudly on an
# unexpected type letter instead of leaving a string in a model feature.
df['Type'] = df['Type'].map({'A': 0, 'B': 1, 'C': 2}).astype('int32')
# NOTE(review): casting Weekly_Sales to int32 discards any fractional part of
# the target - confirm this truncation is intended.
df = df.astype({
    'Store': 'int32',
    'Temperature': 'float32',
    'Fuel_Price': 'float32',
    'MarkDown1': 'float32',
    'MarkDown2': 'float32',
    'MarkDown3': 'float32',
    'MarkDown4': 'float32',
    'MarkDown5': 'float32',
    'CPI': 'float32',
    'Unemployment': 'float32',
    'Size': 'int32',
    'Dept': 'int32',
    'Weekly_Sales': 'int32'
})

In [None]:
#DATA CLEANING
# Zero-fill every missing value, then trade the Date column for numeric parts.
df = df.fillna(0)
parsed_dates = pd.to_datetime(df['Date'])
df = df.assign(
    Month=parsed_dates.dt.month,
    Year=parsed_dates.dt.year,
    Day=parsed_dates.dt.day,
).drop(columns=['Date'])

#Creating main event column
main_events = [(25, 11), (26, 11), (24, 12), (23, 12)]  # (day, month) pairs

# Pair each row's day with its month and flag the rows whose pair is listed above
day_month_pairs = pd.Series(list(zip(df['Day'], df['Month'])), index=df.index)
df['is_main_event'] = day_month_pairs.isin(main_events).astype(int)

In [None]:
# Fold MarkDown1/2/4/5 into a single total; MarkDown3 is not part of the sum
# and stays as its own column.
markdown_total = df['MarkDown1'] + df['MarkDown2'] + df['MarkDown4'] + df['MarkDown5']
df = df.drop(columns=['MarkDown1', 'MarkDown2', 'MarkDown4', 'MarkDown5'])
df['MarkDown'] = markdown_total

In [None]:
# Preview the first two rows of the prepared frame
df.head(2)

In [None]:
# Summary statistics of the prepared frame
df.describe()

In [None]:
# Sequential split: rows dated before 2012 train the models, 2012 rows evaluate them
series_keys = ['Store', 'Dept']                        # one sales series per store/department
chronological = series_keys + ['Year', 'Month', 'Day']  # Date was replaced by these parts


def _attach_sales_history(frame):
    """Add Lag_1, Lag_2, Rolling_Mean_4 and Rolling_Std_4 to ``frame`` and return it.

    ``frame`` must already be sorted so each (Store, Dept) series is contiguous
    and in date order. Every feature is built from Weekly_Sales of earlier rows
    only. The rolling window runs over the shifted column as a whole; the NaN
    that ``shift(1)`` leaves on the first row of each series keeps any 4-row
    window that would straddle two series at NaN.
    """
    previous_week = frame.groupby(series_keys)['Weekly_Sales'].shift(1)
    frame['Lag_1'] = previous_week
    frame['Lag_2'] = frame.groupby(series_keys)['Weekly_Sales'].shift(2)
    trailing_window = previous_week.rolling(window=4)
    frame['Rolling_Mean_4'] = trailing_window.mean()
    frame['Rolling_Std_4'] = trailing_window.std()
    return frame


# Training rows: lags come from the training period alone (no future leakage)
train_df = _attach_sales_history(df.loc[df['Year'] < 2012].sort_values(chronological))
test_df = df.loc[df['Year'] == 2012].sort_values(chronological)

# Test rows: prepend the last 4 training weeks of every series so the first
# 2012 rows have history to look back on, then recompute the features
history = train_df.groupby(series_keys).tail(4)
test_prep = _attach_sales_history(
    pd.concat([history, test_df]).sort_values(chronological)
)

# Discard the borrowed history rows again (keep only 2012)
test_df = test_prep.loc[test_prep['Year'] == 2012]

# Feature matrices and targets
train_x, train_y = train_df.drop(columns=['Weekly_Sales']), train_df['Weekly_Sales']
test_x, test_y = test_df.drop(columns=['Weekly_Sales']), test_df['Weekly_Sales']



In [None]:
# Preview the first two rows of the full frame
df.head(2)

## MODEL TRAINING

In [None]:
#Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error

rf1 = RandomForestRegressor(n_estimators=100, random_state=42)
# Columns withheld from the random forest; this includes every lag/rolling
# feature, so the forest never sees the NaNs those columns contain.
rf_drop_columns = ['MarkDown', 'Lag_1', 'Lag_2', 'Rolling_Mean_4', 'Rolling_Std_4', 'IsHoliday', 'Year', 'Fuel_Price']
rf1.fit(train_x.drop(rf_drop_columns, axis=1), train_y)
rf1_preds = rf1.predict(test_x.drop(rf_drop_columns, axis=1))

# scikit-learn metrics take (y_true, y_pred). MAE and RMSE are symmetric, so the
# printed numbers are unchanged, but the order now matches r2_score and the
# other model cells.
print(r2_score(test_y, rf1_preds))
print(mean_absolute_error(test_y, rf1_preds))
print(root_mean_squared_error(test_y, rf1_preds))

In [None]:
#XG Boost
from xgboost import XGBRegressor

# Columns withheld from the XGBoost model
xg_drop_columns = ['MarkDown', 'Fuel_Price', 'Temperature', 'CPI', 'Unemployment']
xg_train_features = train_x.drop(columns=xg_drop_columns)
xg_test_features = test_x.drop(columns=xg_drop_columns)

xg1 = XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.1, gamma=0)
xg1.fit(xg_train_features, train_y)
xg1_preds = xg1.predict(xg_test_features)

# R², MAE, RMSE on the 2012 hold-out, in that order
for metric in (r2_score, mean_absolute_error, root_mean_squared_error):
    print(metric(test_y, xg1_preds))

In [None]:
#LGBM
from lightgbm import LGBMRegressor

# Columns withheld from the LightGBM model
lgb_drop_columns = ['MarkDown', 'Year', 'IsHoliday', 'Type']
lgb_train_features = train_x.drop(columns=lgb_drop_columns)
lgb_test_features = test_x.drop(columns=lgb_drop_columns)

lgb1 = LGBMRegressor(
    num_leaves=100,
    n_estimators=200,
    min_child_samples=20,
    max_depth=20,
    learning_rate=0.05,
)
lgb1.fit(lgb_train_features, train_y)
lgb1_preds = lgb1.predict(lgb_test_features)

# R², MAE, RMSE on the 2012 hold-out, in that order
for metric in (r2_score, mean_absolute_error, root_mean_squared_error):
    print(metric(test_y, lgb1_preds))

In [None]:
# Weighted blend of the three baseline models' hold-out predictions
# (0.2 random forest, 0.4 XGBoost, 0.4 LightGBM)
final_preds = 0.2*rf1_preds + 0.4*xg1_preds + 0.4*lgb1_preds

# R², MAE, RMSE of the blend, in that order
for metric in (r2_score, mean_absolute_error, root_mean_squared_error):
    print(metric(test_y, final_preds))

## HYPERPARAMETER TUNING

In [None]:
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from lightgbm import LGBMRegressor

lgb = LGBMRegressor(random_state=42)
# Candidate values sampled by the randomized search
lgb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.3, 0.6, 0.9],
    'num_leaves': [31, 50, 100],
    'max_depth': [-1, 10, 20],
    'min_child_samples': [5, 10, 20]
}

# TimeSeriesSplit cuts the rows by position, so it only respects time when the
# rows are in chronological order. train_x is sorted by Store/Dept first, which
# made the folds split by store instead. Reorder by date (np.lexsort sorts on
# the LAST key first) so no fold trains on weeks later than those it validates on.
chrono_order = np.lexsort((train_x['Day'], train_x['Month'], train_x['Year']))
lgb_search_x = train_x.iloc[chrono_order].drop(lgb_drop_columns, axis=1)
lgb_search_y = train_y.iloc[chrono_order]
cv = TimeSeriesSplit(n_splits=5)


lgb_search = RandomizedSearchCV(
    lgb, lgb_params, n_iter=20, cv=cv, verbose=1,
    random_state=42, n_jobs=-1, scoring='neg_mean_absolute_error'
)

lgb_search.fit(lgb_search_x, lgb_search_y)
best_lgb = lgb_search.best_estimator_
# Use matching features on test set
def evaluate_model(model, test_x, test_y, name="Model", best_params=None):
    """Print R², MAE and RMSE of ``model`` on a held-out set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    test_x : feature matrix restricted to the columns the model was fitted on.
    test_y : true target values for ``test_x``.
    name : label used in the printed header.
    best_params : optional mapping of tuned hyper-parameters; printed after the
        metrics when given.

    Returns
    -------
    None. The results are only printed.
    """
    preds = model.predict(test_x)
    print(f"\n📊 {name} Performance on Test Set:")
    print("R² Score:", r2_score(test_y, preds))
    print("MAE:", mean_absolute_error(test_y, preds))
    print("RMSE:", root_mean_squared_error(test_y, preds))
    # This used to print lgb_search.best_params_ unconditionally, so the
    # XGBoost and Random Forest reports showed LightGBM's parameters.
    if best_params is not None:
        print(best_params)
evaluate_model(best_lgb, test_x.drop(lgb_drop_columns, axis=1), test_y, "LightGBM",
               best_params=lgb_search.best_params_)


In [None]:
# Hyper-parameters selected by the LightGBM randomized search
print(lgb_search.best_params_)

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit

# Define base model
xgb = XGBRegressor(random_state=42)

# Candidate values sampled by the randomized search
xgb_params = {
    'n_estimators': [150, 200, 300, 400],
    'learning_rate': [0.1, 0.3, 0.5, 0.7],
    'max_depth': [None, 3, 6, 10],
    'gamma': [0, 0.1, 0.3]
}

# TimeSeriesSplit cuts the rows by position, so it only respects time when the
# rows are in chronological order. train_x is sorted by Store/Dept first, which
# made the folds split by store instead. Reorder by date (np.lexsort sorts on
# the LAST key first) so no fold trains on weeks later than those it validates on.
chrono_order = np.lexsort((train_x['Day'], train_x['Month'], train_x['Year']))
xgb_search_x = train_x.iloc[chrono_order].drop(xg_drop_columns, axis=1)
xgb_search_y = train_y.iloc[chrono_order]
cv = TimeSeriesSplit(n_splits=5)

xgb_search = RandomizedSearchCV(
    xgb, xgb_params, n_iter=20, cv=cv, verbose=1,
    random_state=42, n_jobs=-1, scoring='neg_mean_absolute_error'
)

# Fit model (xg_drop_columns comes from the XGBoost baseline cell)
xgb_search.fit(xgb_search_x, xgb_search_y)

# Get best model (refit on all search rows with the winning parameters)
best_xgb = xgb_search.best_estimator_

# Evaluate on the 2012 hold-out
evaluate_model(best_xgb, test_x.drop(xg_drop_columns, axis=1), test_y, "XGBoost")

# Show best parameters
print("\nBest Parameters for XGBoost:")
print(xgb_search.best_params_)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit

# Define base model
rf = RandomForestRegressor(random_state=42)

# Candidate values sampled by the randomized search.
# max_features: 'auto' is no longer accepted by RandomForestRegressor in the
# scikit-learn versions that provide root_mean_squared_error, so every
# candidate using it failed to fit. For regressors 'auto' meant "use all
# features", which is spelled 1.0.
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0, 'sqrt', 'log2']
}

# TimeSeriesSplit cuts the rows by position, so it only respects time when the
# rows are in chronological order. train_x is sorted by Store/Dept first, which
# made the folds split by store instead. Reorder by date (np.lexsort sorts on
# the LAST key first) so no fold trains on weeks later than those it validates on.
chrono_order = np.lexsort((train_x['Day'], train_x['Month'], train_x['Year']))
rf_search_x = train_x.iloc[chrono_order].drop(rf_drop_columns, axis=1)
rf_search_y = train_y.iloc[chrono_order]
cv = TimeSeriesSplit(n_splits=5)

rf_search = RandomizedSearchCV(
    rf, rf_params, n_iter=20, cv=cv, verbose=1,
    random_state=42, n_jobs=-1, scoring='neg_mean_absolute_error'
)

# Fit model (rf_drop_columns comes from the Random Forest baseline cell)
rf_search.fit(rf_search_x, rf_search_y)

# Best model (refit on all search rows with the winning parameters)
best_rf = rf_search.best_estimator_

# Evaluate on the 2012 hold-out
evaluate_model(best_rf, test_x.drop(rf_drop_columns, axis=1), test_y, "Random Forest")

# Show best parameters
print("\nBest Parameters for Random Forest:")
print(rf_search.best_params_)


## PREDICTION

In [None]:
import pandas as pd
import numpy as np

# Build the submission file: rebuild the training-time features for test.csv,
# predict with the three baseline models and blend their outputs.

# Load all required datasets
features = pd.read_csv('features.csv')
stores = pd.read_csv('stores.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

main_events = [(25, 11), (26, 11), (24, 12), (23, 12)]  # (day, month) pairs
type_codes = {'A': 0, 'B': 1, 'C': 2}
sales_keys = ['Store', 'Dept']  # one sales series per store/department


def _preprocess_merged(merged):
    """Apply the training-time preprocessing to a frame already joined with features and stores.

    Returns a new frame sorted by Store, Dept, Date in which IsHoliday and Type
    are int32 codes, missing values are 0, and Month, Year, Day and
    is_main_event have been added. Row index labels are preserved.
    """
    merged = merged.copy()
    if 'IsHoliday_x' in merged.columns and 'IsHoliday_y' in merged.columns:
        merged = merged.drop('IsHoliday_y', axis=1)
        merged = merged.rename(columns={'IsHoliday_x': 'IsHoliday'})
    merged['Date'] = pd.to_datetime(merged['Date'])
    merged['IsHoliday'] = merged['IsHoliday'].astype('int32')
    # map + astype gives an explicit integer dtype (Series.replace downcasting is
    # deprecated in pandas 2.2) and fails loudly on an unknown type letter.
    merged['Type'] = merged['Type'].map(type_codes).astype('int32')
    merged = merged.fillna(0)
    merged['Month'] = merged['Date'].dt.month
    merged['Year'] = merged['Date'].dt.year
    merged['Day'] = merged['Date'].dt.day
    merged['day_month_tuple'] = list(zip(merged['Day'], merged['Month']))
    merged['is_main_event'] = merged['day_month_tuple'].isin(main_events).astype(int)
    merged = merged.drop(['day_month_tuple'], axis=1)
    return merged.sort_values(['Store', 'Dept', 'Date'])


# Merge features and stores with test. validate= guarantees one output row per
# test row, so test_merged keeps test's index labels (needed for the
# submission alignment below).
test_merged = pd.merge(test, features, on=['Store', 'Date'], how='left', validate='many_to_one')
test_merged = pd.merge(test_merged, stores, on='Store', how='left', validate='many_to_one')
assert len(test_merged) == len(test), "merging must not add or drop test rows"

# Preprocessing (same as training), then the same dtypes as the training frame
test_merged = _preprocess_merged(test_merged).astype({
    'Store': 'int32',
    'Temperature': 'float32',
    'Fuel_Price': 'float32',
    'MarkDown1': 'float32',
    'MarkDown2': 'float32',
    'MarkDown3': 'float32',
    'MarkDown4': 'float32',
    'MarkDown5': 'float32',
    'CPI': 'float32',
    'Unemployment': 'float32',
    'Size': 'int32',
    'Dept': 'int32'
})

# Training history is needed so the first test weeks have sales to look back on
train['Date'] = pd.to_datetime(train['Date'])
features['Date'] = pd.to_datetime(features['Date'])
train_full = pd.merge(train, features, on=['Store', 'Date'], how='left', validate='many_to_one')
train_full = pd.merge(train_full, stores, on='Store', how='left', validate='many_to_one')
train_full = _preprocess_merged(train_full)

# Combine train and test. ignore_index avoids duplicate index labels; the test
# rows keep the (sorted) order of test_merged.
train_full['source'] = 'train'
test_merged['source'] = 'test'
combined = pd.concat([train_full, test_merged], sort=False, ignore_index=True)
is_test_row = (combined['source'] == 'test').to_numpy()

# Add lags and rolling features. Rows of one series are NOT adjacent here (all
# train rows come first, then all test rows), so the rolling statistics must be
# computed inside each series; a plain .rolling() over the whole column would
# mix neighbouring series and leave the test rows NaN.
sales_by_series = combined.groupby(sales_keys)['Weekly_Sales']
combined['Lag_1'] = sales_by_series.shift(1)
combined['Lag_2'] = sales_by_series.shift(2)
prior_sales_by_series = combined.groupby(sales_keys)['Lag_1']
combined['Rolling_Mean_4'] = prior_sales_by_series.transform(lambda s: s.rolling(window=4).mean())
combined['Rolling_Std_4'] = prior_sales_by_series.transform(lambda s: s.rolling(window=4).std())
# NOTE(review): Weekly_Sales is unknown for test rows, so beyond the first test
# week(s) of a series these four features are NaN - confirm that is acceptable
# for the models that consume them.

# MarkDown total
combined['MarkDown'] = combined['MarkDown1'] + combined['MarkDown2'] + combined['MarkDown4'] + combined['MarkDown5']

# Drop unnecessary columns
drop_cols = ['MarkDown1', 'MarkDown2', 'MarkDown4', 'MarkDown5', 'Date', 'source']
combined = combined.drop(columns=drop_cols)

# Separate test portion
test_final = combined.loc[is_test_row].drop(columns=['Weekly_Sales'])
X_test = test_final

# Column order each model was fitted with. Defined here so the cell runs
# top-to-bottom on a fresh kernel instead of relying on later cells.
xg_feature_order = train_x.drop(xg_drop_columns, axis=1).columns.tolist()
rf_feature_order = train_x.drop(rf_drop_columns, axis=1).columns.tolist()
lgb_feature_order = train_x.drop(lgb_drop_columns, axis=1).columns.tolist()

# Predict with the baseline models from the MODEL TRAINING section.
# NOTE(review): the tuned best_xgb / best_rf / best_lgb estimators are not used
# here - confirm that is intended.
xg__preds = xg1.predict(X_test[xg_feature_order])
rf__preds = rf1.predict(X_test[rf_feature_order])
lgb__preds = lgb1.predict(X_test[lgb_feature_order])
final_preds = 0.4*xg__preds + 0.4*lgb__preds + 0.2*rf__preds

# Predictions are in test_merged's sorted order; label them with test_merged's
# index (the original test.csv row labels) so each one lands on its own row.
preds_in_file_order = pd.Series(final_preds, index=test_merged.index).sort_index()

# Save results
submission = pd.DataFrame({
    'StoreId': test['Store'],
    'Dept': test['Dept'],
    'Date': test['Date'],
    'Weekly_Sales': preds_in_file_order
})
submission.to_csv("submission.csv", index=False)


In [None]:
# Column names, in fitted order, that the XGBoost model receives
xg_feature_order = train_x.columns.drop(xg_drop_columns).tolist()
xg_feature_order

In [None]:
# Column names, in fitted order, that the random forest receives
rf_feature_order = train_x.columns.drop(rf_drop_columns).tolist()
rf_feature_order

In [None]:
# Column names, in fitted order, that the LightGBM model receives
lgb_feature_order = train_x.columns.drop(lgb_drop_columns).tolist()
lgb_feature_order

In [None]:
# Blended submission predictions plotted against their date
sns.scatterplot(data=submission, x='Date', y='Weekly_Sales')

In [None]:
# Hold-out actuals (green) with the random forest predictions (red) on the same axes
rf_axes = sns.scatterplot(x=test_x['Day'], y=test_y, color='green')
sns.scatterplot(x=test_x['Day'], y=rf1_preds, color='red', alpha=0.5, ax=rf_axes)

In [None]:
# Hold-out actuals (green) with the XGBoost predictions (red) on the same axes
xg_axes = sns.scatterplot(x=test_x['Day'], y=test_y, color='green')
sns.scatterplot(x=test_x['Day'], y=xg1_preds, color='red', alpha=0.5, ax=xg_axes)

In [None]:
# Hold-out actuals (green) with the LightGBM predictions (red) on the same axes
lgb_axes = sns.scatterplot(x=test_x['Day'], y=test_y, color='green')
sns.scatterplot(x=test_x['Day'], y=lgb1_preds, color='red', alpha=0.5, ax=lgb_axes)