In [None]:
import pandas as pd
import numpy as np

# **Loading data and preprocessing**

In [None]:
# Load the training rows, the rows to predict, and the per-store metadata.
# StateHoliday is read as str explicitly: the column appears to mix the
# integer 0 with letter codes, which otherwise makes pandas emit a mixed-type
# DtypeWarning and yields an inconsistent object column.
train = pd.read_csv('train.csv', dtype={'StateHoliday': str})
test = pd.read_csv('test.csv', dtype={'StateHoliday': str})
store = pd.read_csv('store.csv')


  train = pd.read_csv('train.csv')


In [None]:
# Attach the per-store metadata to every row. A left join on 'Store' keeps
# each row of train/test even when the store id has no match in `store`.
train = train.merge(store, how='left', on='Store')
test = test.merge(store, how='left', on='Store')


In [None]:
# Print the merged training frame's column dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 18 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Store                      1017209 non-null  int64  
 1   DayOfWeek                  1017209 non-null  int64  
 2   Date                       1017209 non-null  object 
 3   Sales                      1017209 non-null  int64  
 4   Customers                  1017209 non-null  int64  
 5   Open                       1017209 non-null  int64  
 6   Promo                      1017209 non-null  int64  
 7   StateHoliday               1017209 non-null  object 
 8   SchoolHoliday              1017209 non-null  int64  
 9   StoreType                  1017209 non-null  object 
 10  Assortment                 1017209 non-null  object 
 11  CompetitionDistance        1014567 non-null  float64
 12  CompetitionOpenSinceMonth  693861 non-null   float64
 13  CompetitionO

In [None]:
# Missing-value count per column of the training frame.
train.isna().sum()

Unnamed: 0,0
Store,0
DayOfWeek,0
Date,0
Sales,0
Customers,0
Open,0
Promo,0
StateHoliday,0
SchoolHoliday,0
StoreType,0


In [None]:
# Print the merged test frame's column dtypes and non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41088 entries, 0 to 41087
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Id                         41088 non-null  int64  
 1   Store                      41088 non-null  int64  
 2   DayOfWeek                  41088 non-null  int64  
 3   Date                       41088 non-null  object 
 4   Open                       41077 non-null  float64
 5   Promo                      41088 non-null  int64  
 6   StateHoliday               41088 non-null  object 
 7   SchoolHoliday              41088 non-null  int64  
 8   StoreType                  41088 non-null  object 
 9   Assortment                 41088 non-null  object 
 10  CompetitionDistance        40992 non-null  float64
 11  CompetitionOpenSinceMonth  25872 non-null  float64
 12  CompetitionOpenSinceYear   25872 non-null  float64
 13  Promo2                     41088 non-null  int

In [None]:
# Missing-value count per column of the test frame.
test.isna().sum()


Unnamed: 0,0
Id,0
Store,0
DayOfWeek,0
Date,0
Open,11
Promo,0
StateHoliday,0
SchoolHoliday,0
StoreType,0
Assortment,0


# **Feature Engineering**

In [None]:
# Re-check the per-column missing-value counts before imputing.
train.isna().sum()

Unnamed: 0,0
Store,0
DayOfWeek,0
Date,0
Sales,0
Customers,0
Open,0
Promo,0
StateHoliday,0
SchoolHoliday,0
StoreType,0


In [None]:
# Impute missing CompetitionDistance with ONE value, learned from the training
# data, and reuse it for the test set. Filling each frame with its own mean
# would encode the same "distance unknown" condition as two different numbers,
# so the model would see a value at prediction time it was never trained on.
competition_distance_fill = train['CompetitionDistance'].mean()
train['CompetitionDistance'] = train['CompetitionDistance'].fillna(competition_distance_fill)
test['CompetitionDistance'] = test['CompetitionDistance'].fillna(competition_distance_fill)


In [None]:
# Columns whose missing values are replaced by 0. Names that are not present
# in a frame are skipped by the membership check below.
cols_to_fill_0 = [
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2SinceWeek',
    'Promo2SinceYear',
    'CompetitionOpen',
    'Promo2Open',
]

for frame in (train, test):
    for name in cols_to_fill_0:
        if name in frame.columns:
            frame[name] = frame[name].fillna(0)

In [None]:
# Rows of the test set with an unknown Open flag are treated as open.
test['Open'] = test['Open'].fillna(1)

# Train only on rows where the store was open and actually sold something.
# The explicit .copy() makes the filtered frame independent of the original,
# so adding columns to it later does not raise SettingWithCopyWarning.
train = train[(train['Open'] != 0) & (train['Sales'] != 0)].copy()

In [None]:
def feature_engineering(df):
    """Add calendar, competition and Promo2 features to a merged sales frame.

    The frame is modified in place and the same object is returned.

    Reads the columns 'Date', 'CompetitionOpenSinceYear',
    'CompetitionOpenSinceMonth', 'Promo2SinceYear', 'Promo2SinceWeek',
    'PromoInterval', 'StateHoliday', 'StoreType' and 'Assortment'.

    Adds 'Year', 'Month', 'Day', 'WeekOfYear', 'CompetitionOpen' (months),
    'Promo2Open' (weeks), 'MonthStr' and 'IsPromo2Month', converts 'Date' to
    datetime, and replaces 'StateHoliday', 'StoreType' and 'Assortment' with
    integer codes (values outside the mapping become NaN).

    A value of 0 in 'CompetitionOpenSinceYear' / 'Promo2SinceYear' is treated
    as "no start date" and yields a duration of 0.
    """
    # 1. Calendar parts of the sale date.
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    iso = df['Date'].dt.isocalendar()
    df['WeekOfYear'] = iso['week']

    # 2. Calculate Competition Duration (in Months)
    df['CompetitionOpen'] = 12 * (df['Year'] - df['CompetitionOpenSinceYear']) + \
                            (df['Month'] - df['CompetitionOpenSinceMonth'])
    df.loc[df['CompetitionOpenSinceYear'] == 0, 'CompetitionOpen'] = 0
    df['CompetitionOpen'] = df['CompetitionOpen'].clip(lower=0)

    # 3. Calculate Promo2 Duration (in Weeks)
    # The ISO week number belongs to the ISO year, which differs from the
    # calendar year around New Year (e.g. 2014-12-29 is ISO week 1 of 2015).
    # Pairing the ISO week with the calendar year would be off by ~52 weeks
    # on those dates, so the ISO year is used here.
    df['Promo2Open'] = 52 * (iso['year'] - df['Promo2SinceYear']) + \
                       (df['WeekOfYear'] - df['Promo2SinceWeek'])
    df.loc[df['Promo2SinceYear'] == 0, 'Promo2Open'] = 0
    df['Promo2Open'] = df['Promo2Open'].clip(lower=0)

    # 4. Check if current month is in PromoInterval
    month_map = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',
                 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
    df['MonthStr'] = df['Month'].map(month_map)

    # Flag rows whose month abbreviation occurs in the PromoInterval text.
    # This is a substring test, so 'Sep' also matches an interval written as
    # 'Sept'. A missing interval stringifies to 'nan' and never matches.
    # Iterating the two columns directly avoids a slow row-wise DataFrame.apply
    # and also works on an empty frame.
    df['IsPromo2Month'] = [
        int(interval != 'None' and month in str(interval))
        for month, interval in zip(df['MonthStr'], df['PromoInterval'])
    ]

    # 5. Mapping Categorical Variables to Numbers
    # StateHoliday is stringified first so that integer 0 and '0' map alike.
    df['StateHoliday'] = df['StateHoliday'].astype(str).map({'0':0, 'a':1, 'b':2, 'c':3})
    df['StoreType'] = df['StoreType'].map({'a':1, 'b':2, 'c':3, 'd':4})
    df['Assortment'] = df['Assortment'].map({'a':1, 'b':2, 'c':3})

    return df


In [None]:
# Run the same feature pipeline over both frames (train first, then test).
train, test = (feature_engineering(frame) for frame in (train, test))

In [None]:
# Per-column missing-value counts after feature engineering, train then test.
for frame in (train, test):
    print(frame.isna().sum())


Store                             0
DayOfWeek                         0
Date                              0
Sales                             0
Customers                         0
Open                              0
Promo                             0
StateHoliday                      0
SchoolHoliday                     0
StoreType                         0
Assortment                        0
CompetitionDistance               0
CompetitionOpenSinceMonth         0
CompetitionOpenSinceYear          0
Promo2                            0
Promo2SinceWeek                   0
Promo2SinceYear                   0
PromoInterval                423292
Year                              0
Month                             0
Day                               0
WeekOfYear                        0
CompetitionOpen                   0
Promo2Open                        0
MonthStr                          0
IsPromo2Month                     0
dtype: int64
Id                               0
Store           

In [None]:
pip install xgboost


Collecting xgboost
  Downloading xgboost-3.2.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.29.3-py3-none-manylinux_2_18_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.2.0-py3-none-manylinux_2_28_x86_64.whl (131.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.7/131.7 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.29.3-py3-none-manylinux_2_18_x86_64.whl (289.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.8/289.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.29.3 xgboost-3.2.0


In [None]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Model inputs, in the column order handed to the regressor.
features = [
    'Store',
    'DayOfWeek',
    'Promo',
    'StateHoliday',
    'SchoolHoliday',
    'StoreType',
    'Assortment',
    'CompetitionDistance',
    'CompetitionOpen',
    'Promo2',
    'Promo2Open',
    'IsPromo2Month',
    'Year',
    'Month',
    'Day',
    'WeekOfYear',
]

# The target is modelled as log(1 + Sales).
y_train = np.log1p(train['Sales'])
X_train = train.loc[:, features]

In [None]:
# XGBoost does its own multithreading (n_jobs=-1 = all cores). Only ONE level
# is parallelised: running the search with n_jobs=-1 as well would start one
# all-core XGBoost fit per core and make the threads fight each other.
base_model = xgb.XGBRegressor(random_state=2026, n_jobs=-1)

# Candidate values per hyperparameter. The order inside each list is kept
# as-is because the sampler draws by position for a given random_state.
param_grid = {
    'n_estimators': [100,300, 400, 500],
    'learning_rate': [0.05,0.01, 0.1, 0.2],
    'max_depth': [6, 8, 12],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}
print("Setting up the Parameter Search...")
# NOTE(review): cv=3 on a regressor means plain, unshuffled K-fold. The rows
# are dated sales records, so folds may validate on dates earlier than some
# training rows -- consider a time-aware splitter; confirm row ordering first.
random_search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_grid,
    n_iter=10,             # Number of random parameter combinations to evaluate
    scoring='neg_mean_squared_error', # MSE on the log1p(Sales) target, negated so higher is better
    cv=3,                  # 3-fold cross-validation per combination (30 fits in total)
    verbose=2,             # Prints progress updates
    random_state=2025,     # Fixes which combinations are sampled
    n_jobs=1               # Fits run one after another; each fit already uses all cores
)
random_search.fit(X_train, y_train)
print("The absolute best parameters are:")
print(random_search.best_params_)

# Model refit on the full training data with the best sampled parameters.
best_model = random_search.best_estimator_

Setting up the Parameter Search...
Fitting 3 folds for each of 10 candidates, totalling 30 fits




The absolute best parameters are:
{'subsample': 0.8, 'n_estimators': 500, 'max_depth': 12, 'learning_rate': 0.05, 'colsample_bytree': 0.9}


In [None]:
X_test = test[features]

print("Making predictions...")
log_predictions = best_model.predict(X_test)

# The model was trained on log1p(Sales); expm1 maps back to the sales scale.
real_predictions = np.expm1(log_predictions)

# The model was fitted on open-store rows only and 'Open' is not one of its
# features, so it would predict positive sales even for closed stores.
# A closed store sells nothing: force those rows to 0.
is_closed = test["Open"].to_numpy() == 0
real_predictions = np.where(is_closed, 0.0, real_predictions)

submission = pd.DataFrame({
    "Id": test["Id"],
    "Sales": real_predictions
})

submission.to_csv("submission.csv", index=False)

print("SUCCESS! 'submission.csv' has been saved.")
print(submission.head())

Making predictions...
SUCCESS! 'submission.csv' has been saved.
   Id        Sales
0   1  4275.818848
1   2  7412.008789
2   3  9043.420898
3   4  6633.801758
4   5  7126.735840
