In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e1/sample_submission.csv
/kaggle/input/playground-series-s5e1/train.csv
/kaggle/input/playground-series-s5e1/test.csv


In [2]:
# Read the competition's training and test tables (in that order) into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e1/train.csv',
        '/kaggle/input/playground-series-s5e1/test.csv',
    )
)

In [3]:
# Columns that preprocess() one-hot encodes.
categorical = [
    'country',
    'store',
    'product',
]

# Numeric columns: the target plus the date parts that preprocess() derives.
# This list is passed to preprocess() but is not read by it.
numerical = [
    'num_sold',
    'day',
    'month',
    'year',
    'day_of_week',
]

In [4]:
from sklearn.preprocessing import OneHotEncoder

def preprocess(df, categorical, numerical):
    """Return a model-ready copy of ``df`` with date parts and one-hot columns.

    Steps, in order:
      1. Back-fill missing values over the whole frame (each NaN takes the
         next available value further down the same column).
      2. Parse ``date`` and expand it into ``day``, ``month``, ``year`` and
         ``day_of_week`` columns, then drop ``date``.
      3. Replace every column in ``categorical`` with float 0/1 indicator
         columns named ``<column>_<category>``, appended after the remaining
         columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``date`` column and every column listed
        in ``categorical``. It is not modified.
    categorical : list of str
        Names of the columns to one-hot encode.
    numerical : list of str
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying the same index as ``df``.

    Notes
    -----
    The indicator columns are derived only from the categories present in
    ``df``. Two frames with different category sets therefore yield different
    columns, and callers must align them before fitting / predicting.
    A categorical value that is still missing after the back-fill produces an
    all-zero indicator row rather than a dedicated ``<column>_nan`` column.
    """
    # bfill() returns a new frame, so the caller's DataFrame stays untouched.
    # NOTE(review): the fill runs over the whole frame in row order, so a gap
    # takes its value from whatever row follows it - confirm that is the
    # intended imputation for the target column.
    df = df.bfill()

    # Convert to pandas datetimes so the .dt accessors are available.
    dates = pd.to_datetime(df['date'])

    # Swap the raw date for its calendar components.
    df = df.drop(columns=['date']).assign(
        day=dates.dt.day,
        month=dates.dt.month,
        year=dates.dt.year,
        day_of_week=dates.dt.dayofweek,  # 0 = Monday ... 6 = Sunday
    )

    # One-hot encode with pandas rather than OneHotEncoder(sparse=False): that
    # keyword was renamed to sparse_output in scikit-learn 1.2 and removed in
    # 1.4, where it raises TypeError. get_dummies yields the same column names
    # and order, and dtype=float keeps the indicators numeric (not bool).
    return pd.get_dummies(df, columns=list(categorical), dtype=float)

In [5]:
# Replace the raw training frame with its preprocessed version.
train = preprocess(df=train, categorical=categorical, numerical=numerical)



In [6]:
import xgboost as xgb

# Hyper-parameters for the gradient-boosted regressor, gathered in one place.
xgb_params = {
    'n_estimators': 600,
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': 2,
}

# NOTE: this rebinds `xgb` from the imported module to the estimator instance;
# the cells below call fit/predict through this name.
xgb = xgb.XGBRegressor(**xgb_params)

In [7]:
# Target: the num_sold column of the preprocessed training frame.
y_train = train['num_sold']

# Features: everything except the row identifier and the target itself.
non_feature_columns = ['id', 'num_sold']
x_train = train.drop(columns=non_feature_columns)

In [8]:
# Fit the regressor on the training features and target.
xgb.fit(X=x_train, y=y_train)

In [9]:
# Run the test frame through the same preprocessing as the training frame.
test = preprocess(test, categorical, numerical)

# preprocess() derives its one-hot columns from whichever frame it is given,
# so force the test features into the training layout (same columns, same
# order) before predicting. Indicators missing from the test set are filled
# with 0 and columns the model never saw are dropped; when both layouts
# already match this is a no-op.
x_test = test.drop(columns=['id']).reindex(columns=x_train.columns, fill_value=0)



In [10]:
# Predict the target for every row of the test feature frame.
preds = xgb.predict(X=x_test)

In [11]:
# Store the predictions on the preprocessed test frame as a num_sold column
# (this mutates `test` in place; x_test was built earlier and is unaffected).
test['num_sold'] = preds

In [12]:
# Submission table: the row id alongside its predicted num_sold, as a separate frame.
submission = test[['id', 'num_sold']].copy()

In [13]:
# Write the submission to the working directory without the index column.
submission.to_csv(path_or_buf='submission.csv', index=False)