In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_percentage_error


In [3]:
train=pd.read_csv('train.csv', encoding='utf-8')
test =pd.read_csv('test.csv', encoding='utf-8')
test_org=test


In [4]:
train.head()

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0


In [5]:
print(train.isnull().sum())


id             0
date           0
country        0
store          0
product        0
num_sold    8871
dtype: int64


In [6]:
print(test.isnull().sum())


id         0
date       0
country    0
store      0
product    0
dtype: int64


In [7]:
numerical_cols = train.select_dtypes(include=['float64', 'int32']).columns.tolist()

In [8]:
numerical_cols

['num_sold']

In [9]:
# col='date'
def transform_date(df, col):
    df[col] = pd.to_datetime(df[col])
                             
    df[f'{col}_year'] = df[col].dt.year.astype('float64')
    df[f'{col}_month'] = df[col].dt.month.astype('float64')
    df[f'{col}_day'] = df[col].dt.day.astype('float64')
    df[f'{col}_day_of_week'] = df[col].dt.dayofweek.astype('float64')
    df[f'{col}_hour'] = df[col].dt.hour.astype('float64')
    df[f'{col}_minute'] = df[col].dt.minute.astype('float64')
    
    df[f'{col}_year_sin'] = np.sin(2 * np.pi * df[f'{col}_year'])
    df[f'{col}_year_cos'] = np.cos(2 * np.pi * df[f'{col}_year'])
    df[f'{col}_month_sin'] = np.sin(2 * np.pi * df[f'{col}_month'] / 12) 
    df[f'{col}_month_cos'] = np.cos(2 * np.pi * df[f'{col}_month'] / 12)
    return df

In [10]:
train = transform_date(train, 'date')
test = transform_date(test, 'date')

In [11]:
train.head()

Unnamed: 0,id,date,country,store,product,num_sold,date_year,date_month,date_day,date_day_of_week,date_hour,date_minute,date_year_sin,date_year_cos,date_month_sin,date_month_cos
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,,2010.0,1.0,1.0,4.0,0.0,0.0,-1.370366e-13,1.0,0.5,0.866025
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0,2010.0,1.0,1.0,4.0,0.0,0.0,-1.370366e-13,1.0,0.5,0.866025
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0,2010.0,1.0,1.0,4.0,0.0,0.0,-1.370366e-13,1.0,0.5,0.866025
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0,2010.0,1.0,1.0,4.0,0.0,0.0,-1.370366e-13,1.0,0.5,0.866025
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0,2010.0,1.0,1.0,4.0,0.0,0.0,-1.370366e-13,1.0,0.5,0.866025


In [12]:
numerical_cols = train.select_dtypes(include=['float64', 'int32']).columns.tolist()

In [13]:
numerical_cols

['num_sold',
 'date_year',
 'date_month',
 'date_day',
 'date_day_of_week',
 'date_hour',
 'date_minute',
 'date_year_sin',
 'date_year_cos',
 'date_month_sin',
 'date_month_cos']

In [16]:
categorical_columns = train.select_dtypes(include=['object', 'category']).columns

encoded_train = pd.get_dummies(train, columns=categorical_columns, drop_first=True)
correlation_matrix = encoded_train.corr()
correlation_with_target = correlation_matrix['num_sold'].sort_values(ascending=False)
print(correlation_with_target)

num_sold                      1.000000
country_Norway                0.444573
product_Kaggle                0.356331
store_Premium Sticker Mart    0.231209
product_Kaggle Tiers          0.197389
store_Stickers for Less       0.089933
country_Singapore             0.073334
date_day_of_week              0.069613
date_month_sin                0.014119
date_day                      0.001137
date_month_cos               -0.001781
country_Finland              -0.002843
date_month                   -0.006255
date_year                    -0.040462
id                           -0.040866
date                         -0.040936
date_year_sin                -0.047414
product_Kerneler Dark Mode   -0.075360
country_Italy                -0.139527
product_Kerneler             -0.145131
country_Kenya                -0.449873
date_hour                          NaN
date_minute                        NaN
date_year_cos                      NaN
Name: num_sold, dtype: float64


In [18]:
from sklearn.preprocessing import LabelEncoder

# Apply LabelEncoder to all categorical columns
label_encoders = {}  # To store the encoders for each column
for col in train.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    test[col] = le.fit_transform(test[col])
    label_encoders[col] = le  # Save the encoder for inverse transform if needed


In [19]:
train.drop(['date','id'], axis=1, inplace=True)
train = train.dropna()
test = test.dropna()
train.head()

Unnamed: 0,country,store,product,num_sold,date_year,date_month,date_day,date_day_of_week,date_hour,date_minute,date_year_sin,date_year_cos,date_month_sin,date_month_cos
1,0,0,1,973.0,2010.0,1.0,1.0,4.0,0.0,0.0,-1.370366e-13,1.0,0.5,0.866025
2,0,0,2,906.0,2010.0,1.0,1.0,4.0,0.0,0.0,-1.370366e-13,1.0,0.5,0.866025
3,0,0,3,423.0,2010.0,1.0,1.0,4.0,0.0,0.0,-1.370366e-13,1.0,0.5,0.866025
4,0,0,4,491.0,2010.0,1.0,1.0,4.0,0.0,0.0,-1.370366e-13,1.0,0.5,0.866025
5,0,2,0,300.0,2010.0,1.0,1.0,4.0,0.0,0.0,-1.370366e-13,1.0,0.5,0.866025


In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt  # Imported for plotting

# Assuming df1 is your original DataFrame with a column 'SalePrice'
# Replace df1 with your actual data if necessary


# Initialize StandardScaler objects
scaler_X = StandardScaler()
scaler_Y = StandardScaler()

# Separate features and target variable
X= train.drop(columns=['num_sold'])
y = train['num_sold']

# Fit and transform the feature data
X = scaler_X.fit_transform(X)

# Convert the scaled features back into a DataFrame with the original column names
#X = pd.DataFrame(X, columns=train.columns)

# Since SalePrice is the target, we scale it as well (for better model performance)

import optuna
from xgboost import XGBRegressor
def objective(trial):
    
    xgb_params = {
        'device': 'cpu',
        'n_estimators': trial.suggest_int('n_estimators', 200,2000) ,
        'learning_rate': trial.suggest_float('learning_rate',0.0001, 0.01), 
        'max_depth': trial.suggest_int('max_depth', 3,20), 
        'min_child_weight': trial.suggest_int('min_child_weight',5, 100),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0), 
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0), 
        'gamma': trial.suggest_float('gamma', 0.001,1.0), 
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 1.0),
        'enable_categorical':True,
        'random_state':42
        
    }
    model = XGBRegressor(**xgb_params)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    y_pred = model.predict(X_val)


    mape = mean_absolute_percentage_error(y_val, y_pred)

    return mape

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2025-01-02 07:43:08,271] A new study created in memory with name: no-name-4fa0818b-acce-4577-bd81-af60768d943b
[I 2025-01-02 07:43:26,345] Trial 0 finished with value: 3.1358257217396486 and parameters: {'n_estimators': 1819, 'learning_rate': 0.0008385374629910217, 'max_depth': 5, 'min_child_weight': 36, 'subsample': 0.8406112550786355, 'colsample_bytree': 0.6825392176691383, 'gamma': 0.12529970277091232, 'reg_alpha': 0.7283159843207773}. Best is trial 0 with value: 3.1358257217396486.
[I 2025-01-02 07:43:52,900] Trial 1 finished with value: 0.8506888365789037 and parameters: {'n_estimators': 1520, 'learning_rate': 0.009632549508514164, 'max_depth': 12, 'min_child_weight': 95, 'subsample': 0.48416122180114574, 'colsample_bytree': 0.4455042347590919, 'gamma': 0.16445086902919154, 'reg_alpha': 0.16704921584283233}. Best is trial 1 with value: 0.8506888365789037.
[I 2025-01-02 07:44:12,296] Trial 2 finished with value: 0.2284690505367954 and parameters: {'n_estimators': 1203, 'learning

In [22]:
study.best_params


{'n_estimators': 1240,
 'learning_rate': 0.006448802890074519,
 'max_depth': 16,
 'min_child_weight': 67,
 'subsample': 0.45295911361751967,
 'colsample_bytree': 0.9948772889555972,
 'gamma': 0.4765642517507076,
 'reg_alpha': 0.7714764479107313}

In [38]:
test

Unnamed: 0,id,date,country,store,product,date_year,date_month,date_day,date_day_of_week,date_hour,date_minute,date_year_sin,date_year_cos,date_month_sin,date_month_cos
0,230130,2017-01-01,0,0,0,2017.0,1.0,1.0,6.0,0.0,0.0,-9.700862e-13,1.0,5.000000e-01,0.866025
1,230131,2017-01-01,0,0,1,2017.0,1.0,1.0,6.0,0.0,0.0,-9.700862e-13,1.0,5.000000e-01,0.866025
2,230132,2017-01-01,0,0,2,2017.0,1.0,1.0,6.0,0.0,0.0,-9.700862e-13,1.0,5.000000e-01,0.866025
3,230133,2017-01-01,0,0,3,2017.0,1.0,1.0,6.0,0.0,0.0,-9.700862e-13,1.0,5.000000e-01,0.866025
4,230134,2017-01-01,0,0,4,2017.0,1.0,1.0,6.0,0.0,0.0,-9.700862e-13,1.0,5.000000e-01,0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98545,328675,2019-12-31,5,1,0,2019.0,12.0,31.0,1.0,0.0,0.0,3.510335e-13,1.0,-2.449294e-16,1.000000
98546,328676,2019-12-31,5,1,1,2019.0,12.0,31.0,1.0,0.0,0.0,3.510335e-13,1.0,-2.449294e-16,1.000000
98547,328677,2019-12-31,5,1,2,2019.0,12.0,31.0,1.0,0.0,0.0,3.510335e-13,1.0,-2.449294e-16,1.000000
98548,328678,2019-12-31,5,1,3,2019.0,12.0,31.0,1.0,0.0,0.0,3.510335e-13,1.0,-2.449294e-16,1.000000


In [42]:
from sklearn.model_selection import KFold, train_test_split

new_test = test[X_train.columns]
# Replace xgb_params with study.best_params
xgb_params = study.best_params
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# Add additional parameters that are not part of Optuna optimization but required
xgb_params.update({
    'device': 'gpu',                # Use GPU for training
    'enable_categorical': True,     # Enable categorical feature support
    'n_jobs': -1,                   # Use all available CPU threads
})

scores, xgb_test_preds = [], []

for i, (train_idx, val_idx) in enumerate(kfold.split(train)):
    print(f'Fold {i}')
    X_train, X_val = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]    

    xgb_model = XGBRegressor(**xgb_params)
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=500)

    y_preds = xgb_model.predict(X_val)

    mape_score = mean_absolute_percentage_error(y_val, y_preds)
    print('Mape Score is :', mape_score)
    scores.append(mape_score)
    xgb_test_preds.append(xgb_model.predict(new_test))

xgb_score = np.mean(scores)
xgb_std = np.std(scores)

print(f"Mean MAPE: {xgb_score}, Std MAPE: {xgb_std}")

Fold 0
[0]	validation_0-rmse:687.04909




[500]	validation_0-rmse:82.95207
[1000]	validation_0-rmse:62.60379
[1239]	validation_0-rmse:60.03471
Mape Score is : 0.06725231328364854
Fold 1
[0]	validation_0-rmse:685.26229




[500]	validation_0-rmse:81.36580
[1000]	validation_0-rmse:61.11079
[1239]	validation_0-rmse:58.54512
Mape Score is : 0.06767818960205661
Fold 2
[0]	validation_0-rmse:687.45004




[500]	validation_0-rmse:81.66830
[1000]	validation_0-rmse:61.67851
[1239]	validation_0-rmse:59.21199
Mape Score is : 0.0680644532089777
Fold 3
[0]	validation_0-rmse:687.91448




[500]	validation_0-rmse:84.44459
[1000]	validation_0-rmse:63.44758
[1239]	validation_0-rmse:60.66538
Mape Score is : 0.06787624519200722
Fold 4
[0]	validation_0-rmse:681.59130




[500]	validation_0-rmse:81.03479
[1000]	validation_0-rmse:61.54134
[1239]	validation_0-rmse:59.05680
Mape Score is : 0.06874128322126148
Mean MAPE: 0.06792249690159032, Std MAPE: 0.0004902109728453752


In [57]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

# Assuming df1 is your original DataFrame with a column 'SalePrice' and df2 is the new test dataset
# Ensure that df2 is loaded correctly
X1_test = test  # Load the new test set

# Align the columns of the test set with the columns of the training set (same features)
X1_test = X1_test[X_train.columns]  # Ensure same feature columns in both training and test set


# Make predictions with the trained model
test_predictions_scaled = xgb_model.predict(X1_test)

# If necessary, inverse transform the predictions (if they were scaled)
#test_predictions = scaler_Y.inverse_transform(test_predictions_scaled.reshape(-1, 1))

# Ensure 'Id' is correctly included as an integer column for saving the results

# Save the predictions to a CSV file, including the 'Id' column
predictions_df = pd.DataFrame({'id': test_org['id'], 'num_sold': test_predictions_scaled.flatten()})

#predictions_df['id'] = predictions_df['id'].astype('Int32')
predictions_df.to_csv('xgboost_predictions.csv', index=False)

print("Training on the full dataset is complete. Predictions saved!")


Training on the full dataset is complete. Predictions saved!
