#  **Drug Sales Forecasting**

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

In [None]:
# Read the training CSV into `df`.
# NOTE(review): the same file is read into `df` again a few cells further down.
df = pd.read_csv('../input/rossmann-store-sales/train.csv')

In [None]:
# Bare expression: the notebook renders the frame as the cell output.
df

## Importing Necessary Libraries, Warning Filters, and Data

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

In [None]:
# Read the training CSV into `df`, the frame every later cell works on.
df = pd.read_csv('../input/rossmann-store-sales/train.csv')

In [None]:
# Read the store-level CSV into `df_store`.
# NOTE(review): `df_store` is not referenced again in the visible cells.
df_store = pd.read_csv('../input/rossmann-store-sales/store.csv')

## Sneak Peek at the Data

In [None]:
# Bare expression: the notebook renders the frame as the cell output.
df

In [None]:
# Print column names, dtypes and non-null counts of the raw frame.
df.info()

## Setting date time as index in sorted order

In [None]:
# Convert the date column to datetime format first, so the sort below orders
# rows by timestamp value rather than by comparing the raw strings.
df['Date'] = pd.to_datetime(df['Date'])
# A stable sort keeps rows that share a date in their original file order,
# which fixes which rows end up first/last and on each side of the later
# positional train/test split.
df.sort_values(by="Date", kind="mergesort", inplace=True)

In [None]:
# Set the date column as the index (in place, so `df` keeps its identity);
# the 'Date' column is removed from the columns by this call.
df.set_index('Date', inplace=True)
# Bare expression: render the re-indexed frame as the cell output.
df

## Dropping unnecessary columns

In [None]:
# Remove every column that the forecasting cells below do not read, in one
# call instead of one `drop` per column.
# NOTE(review): once 'Store' is gone, rows can no longer be told apart by
# store; if the file holds several stores per date, the later cells treat
# them as one interleaved series -- confirm this is intended.
unused_columns = [
    'DayOfWeek',
    'Store',
    'Customers',
    'Open',
    'Promo',
    'StateHoliday',
    'SchoolHoliday',
]
df = df.drop(columns=unused_columns)

In [None]:
# Print the remaining columns, dtypes and non-null counts after the drops.
df.info()

## Split Test Train Dataset

In [None]:
# Positional 80/20 split: the first 80% of rows (in the frame's current,
# date-sorted order) become the training set, the remainder the test set.
train_size = int(len(df) * 0.8)
# Independent deep copies, so neither set is a view onto `df`.
train_data, test_data = (
    part.copy(deep=True)
    for part in (df.iloc[:train_size], df.iloc[train_size:])
)

## Plotting sales data.

In [None]:
# Line plot of the 'Sales' column over the whole frame on a 20x6 inch figure.
fig, ax = plt.subplots(figsize=(20, 6))
df['Sales'].plot(ax=ax)
ax.set_xlabel('Day')
ax.set_ylabel('Sales')
ax.set_title('Sales over Time (Whole data)')
plt.show()

## Checking how the data looks at weekly, monthly, and yearly frequency

In [None]:
# Resample data to weekly frequency, summing up the sales
df_weekly = df.resample('W').sum()

# Axis label and title name the weekly aggregation, so this chart is not
# confused with the raw-data plot.
plt.figure(figsize=(20, 6))
df_weekly['Sales'].plot()
plt.xlabel('Week')
plt.ylabel('Sales')
plt.title('Weekly Sales over Time')
plt.show()

In [None]:
# Resample data to monthly frequency, summing up the sales
df_monthly = df.resample('M').sum()

plt.figure(figsize=(20, 6))
# Plot every monthly total; no row cap is applied because each row is
# already one month, so a 365-row limit has no calendar meaning here.
df_monthly['Sales'].plot()
plt.xlabel('Month')
plt.ylabel('Sales')
plt.title('Monthly Sales over Time')
plt.show()

In [None]:
# Resample data to yearly frequency, summing up the sales
df_yearly = df.resample('a').sum()

plt.figure(figsize=(20, 6))
# Plot every yearly total; no row cap is applied because each row is
# already one year, so a 365-row limit has no calendar meaning here.
df_yearly['Sales'].plot()
plt.xlabel('Year')
plt.ylabel('Sales')
plt.title('Yearly Sales over Time')
plt.show()

## A:- Simple Forecasting Models

## 1. Average_method
## 2. Naive_method
## 3. Seasonal_naive
## 4. Drift_method

In [None]:
def average_method(train_data, test_data):
    """Forecast every test period with the mean of the training values.

    Args:
        train_data: One-dimensional observed values (e.g. a pandas Series);
            reduced with ``np.mean``.
        test_data: Object whose ``index`` defines the forecast horizon. Its
            values are never read.

    Returns:
        pandas.Series indexed like ``test_data`` in which every entry equals
        the training mean.
    """
    # Build the Series directly rather than via DataFrame(...).squeeze():
    # squeeze() collapses a one-row frame to a bare scalar and leaves an
    # empty frame as a frame, so short horizons would not yield a Series.
    return pd.Series(np.mean(train_data), index=test_data.index)

def naive_method(train_data, test_data):
    """Forecast every test period with the last training observation.

    Args:
        train_data: pandas Series (or other ``.iloc``-indexable object) of
            observed values; must contain at least one row.
        test_data: Object whose ``index`` defines the forecast horizon. Its
            values are never read.

    Returns:
        pandas.Series indexed like ``test_data`` in which every entry equals
        ``train_data.iloc[-1]``.

    Raises:
        IndexError: If ``train_data`` is empty.
    """
    last_observation = train_data.iloc[-1]
    # Build the Series directly rather than via DataFrame(...).squeeze():
    # squeeze() collapses a one-row frame to a bare scalar and leaves an
    # empty frame as a frame, so short horizons would not yield a Series.
    return pd.Series(last_observation, index=test_data.index)

def seasonal_naive(train_data, test_data):
    """Forecast each test period from training values observed 52 weeks earlier.

    Every test timestamp is shifted back by 364 days (52 whole weeks, which
    keeps the weekday aligned) and truncated to midnight. Training rows whose
    index falls on one of those shifted dates are collected in training
    order and fitted to the horizon length.

    Args:
        train_data: pandas Series of observed values with a datetime index.
        test_data: Object with a ``DatetimeIndex`` defining the forecast
            horizon. Its values are never read.

    Returns:
        pandas.Series indexed like ``test_data``. When fewer training rows
        match than there are test periods, the matched values are repeated
        cyclically; when more match (e.g. duplicate dates in the training
        index), only the first ``len(test_data)`` are kept. Values follow
        training order and are not re-aligned to individual test dates.
        When no training row matches at all, every forecast is NaN.
    """
    # Use an explicit fixed lag: a calendar-year np.timedelta64 is not a
    # fixed duration, and pandas versions differ on whether they accept it
    # in index arithmetic.
    seasonal_lag = pd.Timedelta(days=364)
    lagged_dates = (test_data.index - seasonal_lag).normalize()
    matched = train_data[train_data.index.isin(lagged_dates)].to_numpy()

    horizon = len(test_data)
    if matched.size == 0:
        # No seasonal history for this horizon: report it as missing rather
        # than letting np.resize fabricate zero-valued forecasts.
        values = np.full(horizon, np.nan)
    else:
        # np.resize both repeats (too few matches) and truncates (too many),
        # so the result always has exactly one value per test period.
        values = np.resize(matched, horizon)

    # Constructing the Series directly keeps one-period horizons as a Series
    # (DataFrame.squeeze() would turn them into a scalar).
    return pd.Series(values, index=test_data.index)

def drift_method(train_data, test_data):
    """Forecast by extending the line through the first and last observations.

    The forecast for horizon ``h`` (1 for the first test period) is
    ``y_T + h * (y_T - y_1) / (T - 1)``, where ``y_1`` and ``y_T`` are the
    first and last training values and ``T`` is the number of training rows.

    Args:
        train_data: pandas Series (or other ``.iloc``-indexable object) of
            observed values; must contain at least one row.
        test_data: Object whose ``index`` defines the forecast horizon. Its
            values are never read.

    Returns:
        pandas.Series indexed like ``test_data`` holding the drift forecasts.

    Raises:
        IndexError: If ``train_data`` is empty.
    """
    n_obs = len(train_data)
    last_value = train_data.iloc[-1]
    # T observations span T - 1 steps, so that is the slope's denominator.
    # A single observation carries no trend; use a zero slope instead of
    # dividing by zero.
    if n_obs > 1:
        slope = (last_value - train_data.iloc[0]) / (n_obs - 1)
    else:
        slope = 0.0
    # Horizons start at 1: the first test period lies one step beyond the
    # last training observation, so it already includes one unit of drift.
    horizons = np.arange(1, len(test_data) + 1)
    return pd.Series(last_value + slope * horizons, index=test_data.index)

In [None]:
# Run the four baseline forecasters on the 'Sales' column. Each call gets the
# training series and the test series; the baseline functions defined in this
# notebook read only the test series' index and length, never its values.
average_pred = average_method(train_data["Sales"],test_data["Sales"])
# NOTE: the `naiive_pred` spelling is reused by the plotting and metric cells.
naiive_pred = naive_method(train_data["Sales"],test_data["Sales"])
seasonal_naive_pred = seasonal_naive(train_data["Sales"],test_data["Sales"])
drift_pred = drift_method(train_data["Sales"],test_data["Sales"])

## Simple model predictions

In [None]:
# Two stacked panels: the upper one shows train + test + all baseline
# forecasts, the lower one repeats everything except the training series.
fig, axes = plt.subplots(2, 1, figsize=(16, 6))

# (series, legend label, line colour) for each baseline, in drawing order.
baseline_lines = [
    (average_pred, "Average method", 'red'),
    (naiive_pred, "Naive method", 'purple'),
    (seasonal_naive_pred, 'Seasonal_Naive', 'green'),
    (drift_pred, 'Drift', 'orchid'),
]

upper_panel, lower_panel = axes
upper_panel.plot(train_data["Sales"], label='Train', color='blue')
for panel in (upper_panel, lower_panel):
    panel.plot(test_data["Sales"], label='Test', color='orange')
    for series, legend_label, line_colour in baseline_lines:
        panel.plot(series, label=legend_label, color=line_colour)
    panel.legend(loc='best')
plt.show()

## Root mean square error

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# Define RMSPE function
def rmspe(y_true, y_pred):
    """Return the RMSE of ``y_pred`` against ``y_true``, divided by mean(y_true).

    Note that, despite the name, no per-observation percentage error is
    formed: the root of ``mean_squared_error`` is computed first and then
    scaled once by the mean of ``y_true``.

    Args:
        y_true: Actual values; also supplies the scaling mean.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        The scaled error as computed above.
    """
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    scale = np.mean(y_true)
    return root_mse / scale

# Pass the held-out sales as y_true and each forecast as y_pred, matching the
# rmspe(y_true, y_pred) signature: the error is then scaled by the mean of
# the actual sales, not by the mean of the forecast. A Series (not a
# one-column DataFrame) is used so the scaling mean is a single number.
actual_sales = test_data["Sales"]
print(f"""
RMSPE for average method:    {rmspe(actual_sales, average_pred)}
RMSPE for Naive method:{rmspe(actual_sales, naiive_pred)}
RMSPE for Seasonal_Naive method:{rmspe(actual_sales, seasonal_naive_pred)}
RMSPE for Drift method:{rmspe(actual_sales, drift_pred)}""")

## B:- ARIMA

## ACF (Autocorrelation Function)
## PACF (Partial Autocorrelation Function)

In [None]:
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Side-by-side correlograms of the training sales, 50 lags each.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
training_sales = train_data['Sales']

# (plotting function, panel title), drawn left to right.
correlograms = (
    (plot_acf, 'ACF (Autocorrelation Function)'),
    (plot_pacf, 'PACF (Partial Autocorrelation Function)'),
)
for panel, (draw, heading) in zip(axes, correlograms):
    draw(training_sales, ax=panel, lags=50)  # Change lags as needed
    panel.set_title(heading)

plt.tight_layout()
plt.show()


## Making data Stationary and plot ACF & PACF again

In [None]:
# First-order difference of 'Sales'; the leading NaN produced by diff() is
# dropped.
# NOTE(review): this differences the full frame `df`, whereas the preceding
# ACF/PACF cell uses `train_data` -- confirm which one is intended.
df_diff = df['Sales'].diff().dropna()

# Plotting the differenced data
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 6))
df_diff.plot(ax=ax)
ax.set_xlabel('Date')
ax.set_ylabel('Differenced Sales')
ax.set_title('Differenced Sales over Time')
plt.show()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt

# Side-by-side correlograms of the differenced sales, 50 lags each.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (plotting function, panel title), drawn left to right.
correlograms = (
    (plot_acf, 'ACF (Autocorrelation Function) - Differenced Data'),
    (plot_pacf, 'PACF (Partial Autocorrelation Function) - Differenced Data'),
)
for panel, (draw, heading) in zip(axes, correlograms):
    draw(df_diff, ax=panel, lags=50)  # Change lags as needed
    panel.set_title(heading)

plt.tight_layout()
plt.show()

## KPSS test to check whether the data is stationary.

In [None]:
from statsmodels.tsa.stattools import kpss

# Perform the KPSS test on the differenced series (regression='c').
result = kpss(df_diff, regression='c')
print('KPSS Statistic:', result[0])
print('p-value:', result[1])
# Print the section header once, then one line per entry of the
# critical-values mapping held in result[3].
print('Critical Values:')
for key, value in result[3].items():
    print(f'   {key}, {value}')


## Training ARIMA model & Forecasting

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Fit ARIMA model with chosen p, d, q: order=(1, 1, 1) on the training
# 'Sales' series.
model = ARIMA(train_data['Sales'], order=(1, 1, 1))
model_fit = model.fit()

# Output model summary
print(model_fit.summary())


In [None]:
# Forecast for the next steps equal to the length of test data, i.e. one
# predicted value per test row, starting right after the training series.
forecast = model_fit.forecast(steps=len(test_data))


In [None]:
import matplotlib.pyplot as plt

# Single panel comparing training sales, held-out sales and the forecast;
# the forecast is drawn against the test index.
fig, axes = plt.subplots(1, 1, figsize=(16, 6))
test_dates = test_data.index

axes.plot(train_data["Sales"], label='Train', color='blue')
axes.plot(test_dates, test_data["Sales"], label='Test', color='orange')
axes.plot(test_dates, forecast, label='Forecast', color='green')

axes.set(
    xlabel='Date',
    ylabel='Sales',
    title='Sales Over Time: Train vs. Test vs. Forecast',
)
axes.legend()

plt.show()


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

def rmspe(y_true, y_pred):
    """Return the RMSE of ``y_pred`` against ``y_true``, divided by mean(y_true).

    Despite the name, no per-observation percentage error is formed: the
    root of ``mean_squared_error`` is scaled once by the mean of ``y_true``.

    Args:
        y_true: Actual values; also supplies the scaling mean.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        The scaled error as computed above.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    normaliser = np.mean(y_true)
    return np.sqrt(squared_error) / normaliser

# Score the ARIMA forecast with rmspe(): the actual test sales are passed
# first (y_true) and the forecast second (y_pred).
rmspe_value = rmspe(test_data['Sales'], forecast)
print(f"RMSPE for ARIMA model: {rmspe_value}")


In [None]:
import pmdarima as pm

# Automatically fit ARIMA model using auto_arima on the training sales,
# with seasonal terms disabled and the stepwise search enabled.
# NOTE: this rebinds `model`, replacing the ARIMA(1, 1, 1) object above.
model = pm.auto_arima(train_data['Sales'], seasonal=False, stepwise=True, suppress_warnings=True)
print(model.summary())

# Get the best p, d, q values: `model.order` unpacks into the three terms.
p, d, q = model.order
print(f"Optimal values: p={p}, d={d}, q={q}")


In [None]:
# Forecast one value per test row with the auto_arima model.
# NOTE: this rebinds `forecast`, replacing the ARIMA(1, 1, 1) forecast.
forecast = model.predict(n_periods=len(test_data))

# Plotting
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 1, figsize=(16, 6))
test_dates = test_data.index

axes.plot(train_data["Sales"], label='Train', color='blue')
axes.plot(test_dates, test_data["Sales"], label='Test', color='orange')
axes.plot(test_dates, forecast, label='Forecast', color='green')

axes.set(
    xlabel='Date',
    ylabel='Sales',
    title='Sales Over Time: Train vs. Test vs. Forecast',
)
axes.legend()

plt.show()
