In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import xgboost as xgb 


In [2]:
folder_path = "C:\\Users\\user\\Documents\\DATASETS\\store-sales-time-series-forecasting"


In [3]:
# Load all CSV files
holidays = pd.read_csv(f"{folder_path}\\holidays_events.csv")
oil = pd.read_csv(f"{folder_path}\\oil.csv")
stores = pd.read_csv(f"{folder_path}\\stores.csv")
sample_submission = pd.read_csv(f"{folder_path}\\sample_submission.csv")
test = pd.read_csv(f"{folder_path}\\test.csv")
train = pd.read_csv(f"{folder_path}\\train.csv")
transactions = pd.read_csv(f"{folder_path}\\transactions.csv")


In [4]:
print(holidays.head())
print(oil.head())
print(stores.head())
print(sample_submission.head())
print(test.head())
print(train.head())
print(transactions.head())

         date     type    locale locale_name                    description  \
0  2012-03-02  Holiday     Local       Manta             Fundacion de Manta   
1  2012-04-01  Holiday  Regional    Cotopaxi  Provincializacion de Cotopaxi   
2  2012-04-12  Holiday     Local      Cuenca            Fundacion de Cuenca   
3  2012-04-14  Holiday     Local    Libertad      Cantonizacion de Libertad   
4  2012-04-21  Holiday     Local    Riobamba      Cantonizacion de Riobamba   

   transferred  
0        False  
1        False  
2        False  
3        False  
4        False  
         date  dcoilwtico
0  2013-01-01         NaN
1  2013-01-02       93.14
2  2013-01-03       92.97
3  2013-01-04       93.12
4  2013-01-07       93.20
   store_nbr           city                           state type  cluster
0          1          Quito                       Pichincha    D       13
1          2          Quito                       Pichincha    D       13
2          3          Quito                  

In [15]:
print(train.isnull().sum())
print(test.isnull().sum())
print(holidays.isnull().sum())
print(oil.isnull().sum())
print(stores.isnull().sum())
print(sample_submission.isnull().sum())
print(transactions.isnull().sum())

id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
dtype: int64
id             0
date           0
store_nbr      0
family         0
onpromotion    0
dtype: int64
date           0
type           0
locale         0
locale_name    0
description    0
transferred    0
dtype: int64
date          0
dcoilwtico    0
dtype: int64
store_nbr    0
city         0
state        0
type         0
cluster      0
dtype: int64
id       0
sales    0
dtype: int64
date            0
store_nbr       0
transactions    0
dtype: int64


In [10]:
# Remove rows where 'dcoilwtico' has missing values
oil = oil.dropna(subset=['dcoilwtico'])

# Check the data after removing the missing values
print(oil.head())


         date  dcoilwtico
1  2013-01-02       93.14
2  2013-01-03       92.97
3  2013-01-04       93.12
4  2013-01-07       93.20
5  2013-01-08       93.21


In [14]:
# Strip any extra spaces from column names
oil.columns = oil.columns.str.strip()

oil['dcoilwtico'] = oil['dcoilwtico'].fillna(oil['dcoilwtico'].mean())

print(train.head())


   id        date  store_nbr      family  sales  onpromotion
0   0  2013-01-01          1  AUTOMOTIVE    0.0            0
1   1  2013-01-01          1   BABY CARE    0.0            0
2   2  2013-01-01          1      BEAUTY    0.0            0
3   3  2013-01-01          1   BEVERAGES    0.0            0
4   4  2013-01-01          1       BOOKS    0.0            0


In [16]:
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

In [17]:
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train['weekday'] = train['date'].dt.weekday

test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['weekday'] = test['date'].dt.weekday


In [23]:
# Select the features for modeling (you may need to adjust this depending on the dataset)
features = ['year', 'month', 'day', 'weekday', 'onpromotion', 'store_nbr']
target = 'sales'
X_train = train[features]
y_train = train[target]
X_test = test[features]


In [24]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Train-test split (using a portion of the data for validation)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# XGBoost model setup
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.05)

# Train the model
model.fit(X_train_split, y_train_split)

# Predictions and evaluation
y_pred = model.predict(X_val_split)
mae = mean_absolute_error(y_val_split, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 318.3441326207091


In [25]:
y_test_pred = model.predict(X_test)

In [37]:
# Save final predictions to a CSV file
submission = pd.DataFrame({'id': test['id'], 'sales': y_test_pred})
submission.to_csv("submission.csv", index=False)


In [29]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split


In [30]:
# Select the features for modeling (you may need to adjust this depending on the dataset)
features = ['year', 'month', 'day', 'weekday', 'onpromotion', 'store_nbr']
target = 'sales'
X_train = train[features]
y_train = train[target]
X_test = test[features]

In [34]:
# Initialize Linear Regression model
lr_model = LinearRegression()

# Fit the model
lr_model.fit(X_train, y_train)

# Make predictions
lr_predictions = lr_model.predict(X_test)

# Calculate Mean Absolute Error
lr_mae = mean_absolute_error(y_train, lr_model.predict(X_train))
print(f"Linear Regression MAE: {lr_mae}")


Linear Regression MAE: 434.14093856601477


In [36]:
# Cross-validation for Linear Regression
lr_cv_scores = cross_val_score(lr_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print(f"Linear Regression Cross-validation MAE: {-lr_cv_scores.mean()}")


Linear Regression Cross-validation MAE: 451.86616011902163
