In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

In [2]:
# Load each competition CSV from the current working directory into its own frame.
TRAIN_DF = pd.read_csv('train.csv')
TEST_DF = pd.read_csv('test.csv')
STORES_DF = pd.read_csv('stores.csv')
OIL_DF = pd.read_csv('oil.csv')
HOLIDAYS_DF = pd.read_csv('holidays_events.csv')
TRANSACTIONS_DF = pd.read_csv('transactions.csv')
# Backward-compatible alias: this frame was originally bound under the
# misspelled name, so keep it pointing at the same object.
TRANSATIONS_DF = TRANSACTIONS_DF

In [3]:
def get_info(df):
    """Print a plain-text overview of ``df`` and return ``None``.

    Sections are emitted in this order, each followed by a blank line:
    shape, column names, per-column null counts, dtypes, the first five
    rows, and ``describe()`` statistics.
    """
    # Each section is (heading, zero-arg producer). Producers are called one at
    # a time so a section is only computed right before it is printed.
    sections = (
        ("Shape: ", lambda: df.shape),
        ("Columns: ", lambda: df.columns.tolist()),
        ("Missing values:\n", lambda: df.isnull().sum()),
        ("Data types:\n", lambda: df.dtypes),
        ("First 5 rows:\n", lambda: df.head()),
        ("Descriptive statistics:\n", lambda: df.describe()),
    )
    for heading, produce in sections:
        print(f"{heading}{produce()}\n")
    
# Print the overview (shape, columns, nulls, dtypes, head, describe) for the training frame.
get_info(TRAIN_DF)


Shape: (3000888, 6)

Columns: ['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion']

Missing values:
id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
dtype: int64

Data types:
id               int64
date            object
store_nbr        int64
family          object
sales          float64
onpromotion      int64
dtype: object

First 5 rows:
   id        date  store_nbr      family  sales  onpromotion
0   0  2013-01-01          1  AUTOMOTIVE    0.0            0
1   1  2013-01-01          1   BABY CARE    0.0            0
2   2  2013-01-01          1      BEAUTY    0.0            0
3   3  2013-01-01          1   BEVERAGES    0.0            0
4   4  2013-01-01          1       BOOKS    0.0            0

Descriptive statistics:
                 id     store_nbr         sales   onpromotion
count  3.000888e+06  3.000888e+06  3.000888e+06  3.000888e+06
mean   1.500444e+06  2.750000e+01  3.577757e+02  2.602770e+00
std    8.662819e+05

For now, let's start with a baseline without any additions, even though we know the other .csv files can drastically improve our model's score.

In [4]:
# Partition the training columns by dtype: numeric vs. everything else.
num_columns = [*TRAIN_DF.select_dtypes(include=[np.number]).columns]
cat_columns = [*TRAIN_DF.select_dtypes(exclude=[np.number]).columns]

column_report = (
    f"Numerical columns: {num_columns}\n"
    f"Categorical columns: {cat_columns}\n"
)
print(column_report)

Numerical columns: ['id', 'store_nbr', 'sales', 'onpromotion']
Categorical columns: ['date', 'family']



In [5]:
def plot_numerical_distribution(df, columns):
    """Show one histogram with a KDE overlay for each name in ``columns``.

    Every column gets its own 10x5 figure, which is displayed immediately
    with ``plt.show()``. Nothing is returned.
    """
    for name in columns:
        _, ax = plt.subplots(figsize=(10, 5))
        sns.histplot(df[name], kde=True, ax=ax)
        ax.set_title(f'Distribution of {name}')
        ax.set_xlabel(name)
        ax.set_ylabel('Frequency')
        plt.show()
        
def plot_categorical_distribution(df, columns):
    """Show one count plot for each name in ``columns``.

    Every column gets its own 10x5 figure with x tick labels rotated by
    45 degrees; the figure is displayed immediately with ``plt.show()``.
    Nothing is returned.
    """
    for name in columns:
        _, ax = plt.subplots(figsize=(10, 5))
        sns.countplot(data=df, x=name, ax=ax)
        ax.set_title(f'Distribution of {name}')
        ax.set_xlabel(name)
        ax.set_ylabel('Count')
        # `ax` is the current axes here, so this rotates its x tick labels.
        plt.xticks(rotation=45)
        plt.show()
        
# The per-column plots are left disabled; uncomment to render them.
#plot_numerical_distribution(TRAIN_DF, num_columns)
#plot_categorical_distribution(TRAIN_DF, cat_columns)
row_count = len(TRAIN_DF)
print(f'Total Rows: {row_count}')

Total Rows: 3000888


In [6]:
# Null count per column of the training frame.
na_per_column = TRAIN_DF.isna().sum()
print(f'NA Count: {na_per_column}')

NA Count: id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
dtype: int64


In [7]:
# Features: every training column except the row id, the raw date string and the target.
X = TRAIN_DF.drop(columns=['id', 'date', 'sales'])
y = TRAIN_DF['sales']

# Shuffled 80/20 hold-out with a fixed seed.
# NOTE(review): the rows carry a `date`, so a random split mixes earlier and later
# days between train and test; a chronological hold-out may be a better proxy for
# forecasting performance — confirm the intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Route feature columns to a preprocessing branch by dtype (numeric vs. everything else).
# NOTE(review): `store_nbr` is an int64 identifier and therefore lands in the numeric
# branch — confirm that scaling it as a quantity is intended.
num_columns_X = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_columns_X = X_train.select_dtypes(exclude=[np.number]).columns.tolist()



# (name, transformer) steps for the numeric branch; wrapped in a Pipeline below.
numerical_pipeline = [
    ('imputer', SimpleImputer(strategy='mean')), # fill missing values with the column mean
    ('scaler', StandardScaler()) # standardize each feature (zero mean, unit variance)
]

# (name, transformer) steps for the non-numeric branch; wrapped in a Pipeline below.
categorical_pipeline = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')), # fill missing values with the most frequent value
    ('onehot', OneHotEncoder(handle_unknown= 'ignore')) # one-hot encode; categories unseen during fit are ignored at transform time
]

# Applies each branch to its column list and concatenates the results.
preprocessor = ColumnTransformer(
    transformers= [
        ('num', Pipeline(numerical_pipeline), num_columns_X),
        ('cat', Pipeline(categorical_pipeline), cat_columns_X)
    ]
)

def train_baseline_model(X_train, y_train, device='cuda:0'):
    """Fit the baseline pipeline: the shared ``preprocessor`` followed by XGBoost.

    Parameters
    ----------
    X_train
        Training features; must contain the columns the module-level
        ``preprocessor`` was configured with.
    y_train
        Target values aligned with ``X_train``.
    device : str, default 'cuda:0'
        XGBoost device string. The default keeps the original behaviour of
        training on GPU 0; pass ``'cpu'`` to train without a GPU.

    Returns
    -------
    The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'regressor'``.

    Notes
    -----
    * ``tree_method='hist'`` plus ``device`` replaces ``tree_method='gpu_hist'``
      and ``gpu_id``, which are deprecated since XGBoost 2.0.
    * The module-level ``preprocessor`` instance is used directly (not cloned),
      so fitting the returned pipeline also fits that shared object.
    """
    regressor = XGBRegressor(
        n_estimators=500,
        random_state=42,
        tree_method='hist',  # histogram algorithm; runs on whatever `device` selects
        device=device,
        n_jobs=-1,  # use all available CPU cores
    )
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', regressor),
    ])

    model.fit(X_train, y_train)

    return model

def evaluate_model(model, X_test, y_test):
    """Print the MSE and MAE of ``model``'s predictions on ``X_test``.

    Both metrics are computed against ``y_test`` before anything is printed.
    Nothing is returned.
    """
    predictions = model.predict(X_test)

    scores = {
        'Mean Squared Error': mean_squared_error(y_test, predictions),
        'Mean Absolute Error': mean_absolute_error(y_test, predictions),
    }

    for label, value in scores.items():
        print(f'{label}: {value}')

In [8]:
# Sanity check: show the column index of the training frame.
print(TRAIN_DF.columns)

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion'], dtype='object')


In [11]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import root_mean_squared_log_error

mlflow.set_experiment("store-sales-forecasting")

with mlflow.start_run():
    # Train the baseline and predict on the hold-out split.
    baseline_model = train_baseline_model(X_train, y_train)
    y_pred = baseline_model.predict(X_test)

    # RMSLE is computed on predictions floored at zero; the log-based metric
    # does not accept negative values.
    y_pred_clipped = np.clip(y_pred, 0, None)
    rmsle = root_mean_squared_log_error(y_test, y_pred_clipped)

    # MSE and MAE are computed on the raw (unclipped) predictions.
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Record the run: one parameter, the three metrics, then the fitted pipeline.
    mlflow.log_param("model_type", "XGBRegressor")
    for metric_name, metric_value in (("rmsle", rmsle), ("mse", mse), ("mae", mae)):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(baseline_model, "baseline_tree_model")



In [12]:
# Report the hold-out RMSLE computed inside the MLflow run.
print(f"RMSLE: {rmsle}")

RMSLE: 1.484578930356568
