In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder                                                                         
from sklego.preprocessing import RepeatingBasisFunction
import warnings

warnings.filterwarnings('ignore')

In [2]:
train_data = "train.csv"
test_data = "test.csv"
df = pd.read_csv(train_data)
test = pd.read_csv(test_data)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230130 entries, 0 to 230129
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   id        230130 non-null  int64  
 1   date      230130 non-null  object 
 2   country   230130 non-null  object 
 3   store     230130 non-null  object 
 4   product   230130 non-null  object 
 5   num_sold  221259 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 10.5+ MB


Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0


In [3]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98550 entries, 0 to 98549
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       98550 non-null  int64 
 1   date     98550 non-null  object
 2   country  98550 non-null  object
 3   store    98550 non-null  object
 4   product  98550 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


In [4]:
df['date'] = pd.to_datetime(df['date'])
test['date'] = pd.to_datetime(test['date'])

In [5]:
df.isnull().sum()
df['num_sold'].fillna(0, inplace=True)
df.isnull().sum()

id          0
date        0
country     0
store       0
product     0
num_sold    0
dtype: int64

In [6]:
X = df.drop(['num_sold', 'id'], axis=1)
y = df['num_sold']

In [7]:
y.head()

0      0.0
1    973.0
2    906.0
3    423.0
4    491.0
Name: num_sold, dtype: float64

In [8]:
#Adding number of day to apply radial basis function
X_calendar = pd.DataFrame(index=X['date'])
X["day_of_year"] = X_calendar.index.day_of_year
X["day_of_year"].head()

0    1
1    1
2    1
3    1
4    1
Name: day_of_year, dtype: int32

In [9]:
test_calendar = pd.DataFrame(index=test['date'])
test["day_of_year"] = test_calendar.index.day_of_year
test["day_of_year"].head()

0    1
1    1
2    1
3    1
4    1
Name: day_of_year, dtype: int32

In [10]:
rbf = RepeatingBasisFunction(n_periods=12,
                         	column="day_of_year",
                         	input_range=(1,365),
                         	remainder="drop")

rbf_month = rbf.fit_transform(X)
rbf_month_test = rbf.fit_transform(test)
pd.DataFrame(rbf_month).head()
pd.DataFrame(rbf_month).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230130 entries, 0 to 230129
Data columns (total 12 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       230130 non-null  float64
 1   1       230130 non-null  float64
 2   2       230130 non-null  float64
 3   3       230130 non-null  float64
 4   4       230130 non-null  float64
 5   5       230130 non-null  float64
 6   6       230130 non-null  float64
 7   7       230130 non-null  float64
 8   8       230130 non-null  float64
 9   9       230130 non-null  float64
 10  10      230130 non-null  float64
 11  11      230130 non-null  float64
dtypes: float64(12)
memory usage: 21.1 MB


In [11]:
pd.DataFrame(rbf_month_test).head()
pd.DataFrame(rbf_month_test).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98550 entries, 0 to 98549
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       98550 non-null  float64
 1   1       98550 non-null  float64
 2   2       98550 non-null  float64
 3   3       98550 non-null  float64
 4   4       98550 non-null  float64
 5   5       98550 non-null  float64
 6   6       98550 non-null  float64
 7   7       98550 non-null  float64
 8   8       98550 non-null  float64
 9   9       98550 non-null  float64
 10  10      98550 non-null  float64
 11  11      98550 non-null  float64
dtypes: float64(12)
memory usage: 9.0 MB


In [12]:
enc = OneHotEncoder(sparse_output=False)
one_hot = enc.fit_transform(X[['country', 'store', 'product']])
one_hot_test = enc.fit_transform(test[['country', 'store', 'product']])
print(one_hot)
pd.DataFrame(one_hot).info()

[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230130 entries, 0 to 230129
Data columns (total 14 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       230130 non-null  float64
 1   1       230130 non-null  float64
 2   2       230130 non-null  float64
 3   3       230130 non-null  float64
 4   4       230130 non-null  float64
 5   5       230130 non-null  float64
 6   6       230130 non-null  float64
 7   7       230130 non-null  float64
 8   8       230130 non-null  float64
 9   9       230130 non-null  float64
 10  10      230130 non-null  float64
 11  11      230130 non-null  float64
 12  12      230130 non-null  float64
 13  13      230130 non-null  float64
dtypes: float64(14)
memory usage: 24.6 MB


In [13]:
print(one_hot_test)
pd.DataFrame(one_hot_test).info()

[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98550 entries, 0 to 98549
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       98550 non-null  float64
 1   1       98550 non-null  float64
 2   2       98550 non-null  float64
 3   3       98550 non-null  float64
 4   4       98550 non-null  float64
 5   5       98550 non-null  float64
 6   6       98550 non-null  float64
 7   7       98550 non-null  float64
 8   8       98550 non-null  float64
 9   9       98550 non-null  float64
 10  10      98550 non-null  float64
 11  11      98550 non-null  float64
 12  12      98550 non-null  float64
 13  13      98550 non-null  float64
dtypes: float64(14)
memory usage: 10.5 MB


In [14]:
cat = pd.DataFrame(one_hot)
cat = cat.add_prefix('cat_')
dates = pd.DataFrame(rbf_month)
dates = dates.add_prefix('dates_')

cat_test = pd.DataFrame(one_hot_test)
cat_test = cat_test.add_prefix('cat_')
dates_test = pd.DataFrame(rbf_month_test)
dates_test = dates_test.add_prefix('dates_')

In [15]:
print("Prefixed DataFrame:\n", cat)

Prefixed DataFrame:
         cat_0  cat_1  cat_2  cat_3  cat_4  cat_5  cat_6  cat_7  cat_8  cat_9  \
0         1.0    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0    1.0   
1         1.0    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0   
2         1.0    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0   
3         1.0    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0   
4         1.0    0.0    0.0    0.0    0.0    0.0    1.0    0.0    0.0    0.0   
...       ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
230125    0.0    0.0    0.0    0.0    0.0    1.0    0.0    1.0    0.0    1.0   
230126    0.0    0.0    0.0    0.0    0.0    1.0    0.0    1.0    0.0    0.0   
230127    0.0    0.0    0.0    0.0    0.0    1.0    0.0    1.0    0.0    0.0   
230128    0.0    0.0    0.0    0.0    0.0    1.0    0.0    1.0    0.0    0.0   
230129    0.0    0.0    0.0    0.0    0.0    1.0    0.0    1.0    0.0    0.0   

        cat_10  ca

In [16]:
#training = pd.merge(dates, cat, how='outer',  suffixes=())
training = pd.concat([dates, cat], axis=1)
testing = pd.concat([dates_test, cat_test], axis=1)

In [17]:
training.head()
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230130 entries, 0 to 230129
Data columns (total 26 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   dates_0   230130 non-null  float64
 1   dates_1   230130 non-null  float64
 2   dates_2   230130 non-null  float64
 3   dates_3   230130 non-null  float64
 4   dates_4   230130 non-null  float64
 5   dates_5   230130 non-null  float64
 6   dates_6   230130 non-null  float64
 7   dates_7   230130 non-null  float64
 8   dates_8   230130 non-null  float64
 9   dates_9   230130 non-null  float64
 10  dates_10  230130 non-null  float64
 11  dates_11  230130 non-null  float64
 12  cat_0     230130 non-null  float64
 13  cat_1     230130 non-null  float64
 14  cat_2     230130 non-null  float64
 15  cat_3     230130 non-null  float64
 16  cat_4     230130 non-null  float64
 17  cat_5     230130 non-null  float64
 18  cat_6     230130 non-null  float64
 19  cat_7     230130 non-null  float64
 20  cat_

In [18]:
testing.head()
testing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98550 entries, 0 to 98549
Data columns (total 26 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   dates_0   98550 non-null  float64
 1   dates_1   98550 non-null  float64
 2   dates_2   98550 non-null  float64
 3   dates_3   98550 non-null  float64
 4   dates_4   98550 non-null  float64
 5   dates_5   98550 non-null  float64
 6   dates_6   98550 non-null  float64
 7   dates_7   98550 non-null  float64
 8   dates_8   98550 non-null  float64
 9   dates_9   98550 non-null  float64
 10  dates_10  98550 non-null  float64
 11  dates_11  98550 non-null  float64
 12  cat_0     98550 non-null  float64
 13  cat_1     98550 non-null  float64
 14  cat_2     98550 non-null  float64
 15  cat_3     98550 non-null  float64
 16  cat_4     98550 non-null  float64
 17  cat_5     98550 non-null  float64
 18  cat_6     98550 non-null  float64
 19  cat_7     98550 non-null  float64
 20  cat_8     98550 non-null  fl

In [19]:
# import XGBClassifier
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y= le.fit_transform(y)

# declare parameters
regr_xgb = XGBRegressor(colsample_bytree=0.2,
                       gamma=0.0,
                       learning_rate=0.05,
                       max_depth=6,
                       min_child_weight=1.5,
                       n_estimators=7200,
                       reg_alpha=0.9,
                       reg_lambda=0.6,
                       subsample=0.2,
                       seed=42,
                       silent=1)
            

# fit the classifier to the training data
regr_xgb.fit(training, y)



In [20]:
y_pred = regr_xgb.predict(testing)

In [25]:
pd.DataFrame(y_pred).head()
pred_result = pd.DataFrame(y_pred)

In [46]:
sample_file = "sample_submission.csv"
submission = pd.read_csv(sample_file)
submission = submission.drop(['num_sold'], axis = 1)

In [47]:
sample_submission_1 = pd.concat([submission, pred_result], axis=1)
sample_submission_1

Unnamed: 0,id,0
0,230130,41.531593
1,230131,903.434753
2,230132,743.252747
3,230133,386.475647
4,230134,476.799866
...,...,...
98545,328675,389.565460
98546,328676,2272.550781
98547,328677,1961.150757
98548,328678,1075.577881


In [48]:
answer = sample_submission_1.to_csv('answer.csv', index = False) 