In [34]:
from statsmodels.tsa.deterministic import DeterministicProcess
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy.signal import periodogram
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier # For modeling trends and seasonal features

In [35]:
# Datasets
train = pd.read_csv('data/train.csv')
stores = pd.read_csv('data/stores.csv')

print('TRAINING DATA \n')
print(train.info())
print('\n')
print('STORES METADATA \n')
print(stores.info())

TRAINING DATA 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   date         object 
 2   store_nbr    int64  
 3   family       object 
 4   sales        float64
 5   onpromotion  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 137.4+ MB
None


STORES METADATA 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   store_nbr  54 non-null     int64 
 1   city       54 non-null     object
 2   state      54 non-null     object
 3   type       54 non-null     object
 4   cluster    54 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 2.2+ KB
None


In [36]:
# convert date strings to datetime objects
train['date'] = pd.to_datetime(train['date'])

In [37]:
df = pd.merge(
    train,
    stores,
    on = 'store_nbr',
    how = 'left'
)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 10 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
 6   city         object        
 7   state        object        
 8   type         object        
 9   cluster      int64         
dtypes: datetime64[ns](1), float64(1), int64(4), object(4)
memory usage: 228.9+ MB


In [38]:
# Drop id feature
df = df.drop(columns={'id'},axis=1)

In [39]:
categorical_features = ['store_nbr','family','onpromotion','city','state','type','cluster']
# numerical_features = ['sales']

In [43]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

preprocessor = ColumnTransformer(
    transformers =[
        #('num',StandardScaler(), numerical_features),
        ('cat',OneHotEncoder(handle_unknown='ignore'),categorical_features)
    ]
)

model_pipeline = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression())
    ])

In [44]:
X = df.drop('sales',axis=1)
y = df['sales']

from sklearn.model_selection import train_test_split

X_train, X_cv, y_train, y_cv = train_test_split(X,y, test_size=0.3, random_state=57)

In [45]:
model_pipeline.fit(X_train,y_train)
predictions = model_pipeline.predict(X_cv)

In [48]:
predictions[predictions < 0]

array([-145.21610549, -103.66157502, -107.10261945, ...,  -99.69096292,
       -121.97906465,  -97.69776105], shape=(401081,))

In [46]:
from sklearn.metrics import root_mean_squared_log_error

rmsle = root_mean_squared_log_error(y_cv,predictions)
print(f'RMSLE: {rmsle: .4f}')

ValueError: Root Mean Squared Logarithmic Error cannot be used when targets contain values less than or equal to -1.

---

In [None]:


tot_sales = tot_sales.reset_index()  
tot_sales['date'] = pd.to_datetime(tot_sales['date'])
tot_sales = tot_sales.set_index('date').sort_index()

dp = DeterministicProcess(
    index = tot_sales.index,
    constant=True,
    order=1, # Linear trend
    seasonal=True,
    period=365, # Yearly seasonality
    drop=True
)

# Training features
X_train = dp.in_sample()
y = tot_sales['sales']

# Fit the linear model
model = LinearRegression()
model.fit(X_train,y)

# Make predictions on training data, for evaluation
y_pred = model.predict(X_train)

# Calculate training error (root mean squared error)
rmsle = np.sqrt(mean_squared_log_error(y,y_pred))
print(f'Training RMSLE: {rmsle:.2f}')

# Out-of-sample predictions
forecast_steps = 16
X_forecast = dp.out_of_sample(steps=forecast_steps)
forecast = model.predict(X_forecast)

# forecast dataframe
last_date = tot_sales.index[-1]
forecast_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), 
                                periods=forecast_steps, 
                                freq='D')

forecast_df = pd.DataFrame({
    'date' : forecast_dates,
    'forecast': forecast
})

print("\nForecast:")
print(forecast_df)