# Importing Necessary Packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import pacf
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
import statsmodels.api as sm
from statsmodels.tsa.statespace.sarimax import SARIMAX, SARIMAXResults

# Data Preprocessing

In [None]:
# Load the Walmart weekly-sales CSV into the working DataFrame.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
csv_path = r"C:\Users\Arigala.Adarsh\Downloads\re1386870regardingcapstoneproject\Walmart (1).csv"
df = pd.read_csv(csv_path)

In [None]:
df

# Feature Name               &                                             Description

- **Store**           :                                  Store number
- **Date**         :                                      Week of Sales
- **Weekly_Sales** :                                      Sales for the given store in that week
- **Holiday_Flag**  :                                    If it is a holiday week
- **Temperature**    :                                    Temperature on the day of the sale
- **Fuel_Price**     :                                  Cost of the fuel in the region
- **CPI**  :                                    Consumer Price Index
- **Unemployment**    :                                   Unemployment Rate


# Exploratory Data Analysis(EDA)

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
df.duplicated()

In [None]:
df.duplicated().sum()

In [None]:
df.dtypes

In [None]:
df.Store.unique()

In [None]:
df.Store.nunique()

In [None]:
df.Holiday_Flag.unique()

#### Analyzing the data using visualization

In [None]:
# Number of rows per Holiday_Flag value.
# The variable is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x='Holiday_Flag', data=df)
plt.show()

In [None]:
sns.catplot(x='Holiday_Flag', y='Weekly_Sales', data=df)
plt.show()

In [None]:
# Mean weekly sales (pivot_table's default aggregation) for each distinct fuel price.
fuel_price = df.pivot_table(values="Weekly_Sales", index="Fuel_Price")
fuel_price.plot()
 

In [None]:
# Mean weekly sales (pivot_table's default aggregation) for each distinct temperature.
Temperature = df.pivot_table(values="Weekly_Sales", index="Temperature")
Temperature.plot()

In [None]:
# Mean weekly sales (pivot_table's default aggregation) for each distinct CPI value.
CPI = df.pivot_table(values="Weekly_Sales", index="CPI")
CPI.plot()

In [None]:
# Mean weekly sales (pivot_table's default aggregation) for each distinct unemployment rate.
Unemployment = df.pivot_table(values="Weekly_Sales", index="Unemployment")
Unemployment.plot()

### Holiday Vs Weekly Sales of all 45 stores

In [None]:
# One regression panel per store (9 x 5 grid): Holiday_Flag vs Weekly_Sales.
fig, axes = plt.subplots(9, 5, figsize=(30, 60))
for store_id, ax in enumerate(axes.flat, start=1):
    store_rows = df[df['Store'] == store_id]
    sns.regplot(x='Holiday_Flag', y='Weekly_Sales', data=store_rows, ax=ax)
    ax.set_title(f'Store:{store_id}')


### Temperature Vs Weekly Sales of all 45 stores

In [None]:
# Scatter of weekly sales against temperature over all rows.
ax = plt.gca()
ax.scatter(df['Temperature'], df['Weekly_Sales'])
ax.set(xlabel='Temperature', ylabel="Weekly_Sales")
plt.show()

In [None]:
# One regression panel per store (9 x 5 grid): Temperature vs Weekly_Sales.
fig, axes = plt.subplots(9, 5, figsize=(30, 60))
for store_id, ax in enumerate(axes.flat, start=1):
    store_rows = df[df['Store'] == store_id]
    sns.regplot(x='Temperature', y='Weekly_Sales', data=store_rows, ax=ax)
    ax.set_title(f'Store:{store_id}')

### Fuel Vs Weekly Sales of all 45 stores

In [None]:
# Scatter of weekly sales against fuel price over all rows.
ax = plt.gca()
ax.scatter(df['Fuel_Price'], df['Weekly_Sales'])
ax.set(xlabel='Fuel_Price', ylabel="Weekly_Sales")
plt.show()

In [None]:
# One regression panel per store (9 x 5 grid): Fuel_Price vs Weekly_Sales.
fig, axes = plt.subplots(9, 5, figsize=(30, 60))
for store_id, ax in enumerate(axes.flat, start=1):
    store_rows = df[df['Store'] == store_id]
    sns.regplot(x='Fuel_Price', y='Weekly_Sales', data=store_rows, ax=ax)
    ax.set_title(f'Store:{store_id}')

### CPI Vs Weekly Sales of all 45 stores

In [None]:
# Scatter of weekly sales against CPI over all rows.
ax = plt.gca()
ax.scatter(df["CPI"], df['Weekly_Sales'])
ax.set(xlabel='CPI', ylabel="Weekly_Sales")
plt.show()

In [None]:
# One regression panel per store (9 x 5 grid): CPI vs Weekly_Sales.
fig, axes = plt.subplots(9, 5, figsize=(30, 60))
for store_id, ax in enumerate(axes.flat, start=1):
    store_rows = df[df['Store'] == store_id]
    sns.regplot(x='CPI', y='Weekly_Sales', data=store_rows, ax=ax)
    ax.set_title(f'Store:{store_id}')

### Unemployment Vs Weekly Sales of all 45 stores

In [None]:
# Scatter of weekly sales against the unemployment rate over all rows.
ax = plt.gca()
ax.scatter(df['Unemployment'], df['Weekly_Sales'])
ax.set(xlabel='Unemployment', ylabel="Weekly_Sales")
plt.show()

In [None]:
# One regression panel per store (9 x 5 grid): Unemployment vs Weekly_Sales.
fig, axes = plt.subplots(9, 5, figsize=(30, 60))
for store_id, ax in enumerate(axes.flat, start=1):
    store_rows = df[df['Store'] == store_id]
    sns.regplot(x='Unemployment', y='Weekly_Sales', data=store_rows, ax=ax)
    ax.set_title(f'Store:{store_id}')

In [None]:
# Correlation heatmap of the numeric columns.
# `Date` is still a string column at this point (it is parsed further down),
# and recent pandas raises when asked to correlate non-numeric columns, so
# restrict the frame to numeric dtypes explicitly.
plt.figure(figsize=(20,8))
numeric_columns = df.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), annot=True)
plt.show()

# Insights on Walmart Dataset

### A) If the weekly sales are affected by the unemployment rate, if yes - which stores are suffering the most?

In [None]:
# Weekly sales (x) against unemployment rate (y).
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
plt.figure(figsize=(18,8))
sns.scatterplot(x='Weekly_Sales', y='Unemployment', data=df)
plt.show()

In [None]:
# Mean weekly sales per store (seaborn's default bar estimator).
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
plt.figure(figsize=(18,8))
sns.barplot(x='Store', y='Weekly_Sales', data=df)
plt.show()

In [None]:
# Mean unemployment rate per store (seaborn's default bar estimator).
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
plt.figure(figsize=(18,8))
sns.barplot(x='Store', y='Unemployment', data=df)
plt.show()

In [None]:
Total_by_store = df.groupby('Store').agg({'Weekly_Sales': 'sum', 'Unemployment': 'sum'})
Total_by_store 

In [None]:
# Per-store correlation between Weekly_Sales and Unemployment.
# Columns are selected with a list (tuple selection on a groupby is no longer
# supported by pandas), and `.xs` keeps only the Weekly_Sales-vs-Unemployment
# row of each store's 2x2 correlation matrix, dropping the trivial
# Unemployment-vs-Unemployment value of 1.0. The result is indexed by Store.
store_correlation = (
    df.groupby('Store')[['Weekly_Sales', 'Unemployment']]
      .corr()['Unemployment']
      .xs('Weekly_Sales', level=1)
)
# Store whose sales are most negatively correlated with unemployment.
print(store_correlation.idxmin())

- Stores **12, 28 and 38** have the highest unemployment, i.e. a summed unemployment rate of 1875.657.
- The effect of unemployment on sales is strongest for Store **38**.  

### B) If the weekly sales show a seasonal trend, when and what could be the reason?


In [None]:
# Seasonal decomposition (additive).
# `df['Weekly_Sales']` stacks the weekly series of every store one after the
# other, so decomposing it directly mixes stores together. Decompose the total
# sales per week instead, with a 52-week (yearly) period because the
# observations are weekly. seasonal_decompose needs at least two full periods.
total_weekly_sales = (
    df.assign(Date=pd.to_datetime(df['Date'], format='%d-%m-%Y'))
      .groupby('Date')['Weekly_Sales']
      .sum()
      .sort_index()
)
result = seasonal_decompose(total_weekly_sales, model='additive', period=52)

result.plot()
plt.show()

In [None]:
# Additive decomposition: place the seasonal, trend, residual and observed
# components of `result` side by side in one frame and preview the first rows.
new_df_add=pd.concat([result.seasonal,result.trend,result.resid,result.observed],axis=1)
new_df_add.columns=["seasonality","trend","residual","actual_values"]
new_df_add.head(5)


In [None]:
# Seasonal decomposition (multiplicative).
# Decompose the total sales per week rather than the stacked per-store rows,
# with a 52-week (yearly) period because the observations are weekly.
total_weekly_sales = (
    df.assign(Date=pd.to_datetime(df['Date'], format='%d-%m-%Y'))
      .groupby('Date')['Weekly_Sales']
      .sum()
      .sort_index()
)
result = seasonal_decompose(total_weekly_sales, model='multiplicative', period=52)
result.plot()
plt.show()

In [None]:
# Components of the most recent `result` (the multiplicative decomposition
# when the cells are run in order): seasonal, trend, residual and observed
# values side by side, previewing the first rows.
new_df_mul=pd.concat([result.seasonal,result.trend,result.resid,result.observed],axis=1)
new_df_mul.columns=["seasonality","trend","residual","actual_values"]
new_df_mul.head(5)


In [None]:
# Distribution of weekly sales: density histogram with a KDE overlay.
# `histplot` replaces the deprecated `distplot`; `stat='density'` keeps the
# y-axis on the density scale that `distplot` used.
sns.histplot(df['Weekly_Sales'], kde=True, stat='density')
plt.show()

 - Weekly sales show a seasonal trend... 

In [None]:
df.Holiday_Flag.value_counts()

In [None]:
Total_by_store = df.groupby(['Holiday_Flag'],as_index=False).agg({'Weekly_Sales': 'sum'})
Total_by_store 

In [None]:
# Total weekly sales for each Holiday_Flag value.
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
plt.figure(figsize=(18,8))
sns.barplot(x='Holiday_Flag', y='Weekly_Sales', data=Total_by_store)
plt.show()

- Here we can observe that total sales in non-holiday weeks are higher than in holiday weeks. Because these are sums, the gap also reflects how many weeks fall into each group.

In [None]:
Total_by_store = df.groupby(['Store','Holiday_Flag'],as_index=False).agg({'Weekly_Sales': 'sum','Temperature':"mean"})
Total_by_store 

In [None]:
Total_by_store[Total_by_store["Weekly_Sales"]==Total_by_store["Weekly_Sales"].max()] 

In [None]:
Total_by_store[Total_by_store["Weekly_Sales"]==Total_by_store["Weekly_Sales"].min()]

- We can observe that for Store 20, sales are higher in non-holiday weeks, when the temperature is lower.
- In holiday weeks, sales are lower and the temperature is higher.

### C) Does temperature affect the weekly sales in any manner?

In [None]:
# Temperature (x) against weekly sales (y).
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
plt.figure(figsize=(18,8))
sns.scatterplot(x='Temperature', y='Weekly_Sales', data=df)
plt.show()

In [None]:
# Mean temperature per store (seaborn's default bar estimator).
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
plt.figure(figsize=(18,8))
sns.barplot(x='Store', y='Temperature', data=df)
plt.show()

In [None]:
# Mean weekly sales per store (seaborn's default bar estimator).
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
plt.figure(figsize=(18,8))
sns.barplot(x='Store', y='Weekly_Sales', data=df)
plt.show()

In [None]:
# One regression panel per store (9 x 5 grid): Temperature vs Weekly_Sales.
fig, axes = plt.subplots(9, 5, figsize=(30, 50))
for store_id, ax in enumerate(axes.flat, start=1):
    store_rows = df[df['Store'] == store_id]
    sns.regplot(x='Temperature', y='Weekly_Sales', data=store_rows, ax=ax)
    ax.set_title(f'Store:{store_id}')

In [None]:
Total_by_temp = df.groupby('Temperature',as_index=False).agg({'Weekly_Sales': 'sum'})
Total_by_temp 

In [None]:
plt.plot(Total_by_temp['Temperature'],Total_by_temp['Weekly_Sales'])
plt.show()

In [None]:
Total_by_store = df.groupby(['Store'], as_index=False).agg({'Weekly_Sales': 'sum', 'Temperature': 'mean'})
sns.lineplot(x='Temperature', y='Weekly_Sales', data=Total_by_store)

- Temperature affects weekly sales: in the visualization above, weekly sales are high at lower temperatures
- and low at higher temperatures.

### D) How is the Consumer Price index affecting the weekly sales of various stores?


In [None]:
# Scatter of weekly sales against CPI over all rows.
ax = plt.gca()
ax.scatter(df["CPI"], df['Weekly_Sales'])
ax.set(xlabel='CPI', ylabel="Weekly_Sales")
plt.show()

In [None]:
# Mean CPI per store (seaborn's default bar estimator).
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
plt.figure(figsize=(18,8))
sns.barplot(x='Store', y='CPI', data=df)
plt.show()

In [None]:
# Mean weekly sales per store (seaborn's default bar estimator).
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
plt.figure(figsize=(18,8))
sns.barplot(x='Store', y='Weekly_Sales', data=df)
plt.show()

In [None]:
Total_by_store = df.groupby('Store').agg({'Weekly_Sales': 'sum', 'CPI': 'sum'})
Total_by_store 

In [None]:
Total_by_store[Total_by_store.CPI==Total_by_store.CPI.max()]

In [None]:
Total_by_store[Total_by_store.CPI==Total_by_store.CPI.min()]

- The Consumer Price Index (CPI) affects weekly sales: in the visualization above, where weekly sales are high the CPI is low,
- and where weekly sales are low the CPI is high.

### E) Top performing stores according to the historical data.


In [None]:
# Total sales per store, ranked from the highest to the lowest total.
Total_data_by_store = df.groupby('Store', as_index=False)['Weekly_Sales'].sum()
Top_sales_store = Total_data_by_store.sort_values('Weekly_Sales', ascending=False)
Top_sales_store

In [None]:
# The five stores with the highest total sales, shown with a fresh 0..4 index.
top_5_stores = Top_sales_store.iloc[:5]
top_5_stores.reset_index(drop=True)

- From above result we can observe the top performing stores.

### F) The worst performing store, and how significant is the difference between the highest and lowest performing stores.

In [None]:
Total_data_by_store=df.groupby('Store',as_index=False).agg({"Weekly_Sales":"sum","Temperature":"mean","CPI":"sum","Unemployment":"mean"})
Total_data_by_store

In [None]:
Lowest_Store=Total_data_by_store.loc[(Total_data_by_store.Weekly_Sales==Total_data_by_store.Weekly_Sales.min())] 
Lowest_Store

In [None]:
Highest_Store=Total_data_by_store.loc[(Total_data_by_store.Weekly_Sales==Total_data_by_store.Weekly_Sales.max())] 
Highest_Store

- From the above analysis, temperature and unemployment are high for the lowest-performing store, which affects its sales.

In [None]:
# Total sales of the worst and best store (first matching row of each frame)
lowest_sales = Lowest_Store['Weekly_Sales'].iloc[0]
highest_sales = Highest_Store['Weekly_Sales'].iloc[0]

# Gap between the best and the worst store's total sales
difference = highest_sales - lowest_sales

worst_store_id = Lowest_Store['Store'].iloc[0]
best_store_id = Highest_Store['Store'].iloc[0]
print("Worst Performing Store:", worst_store_id)
print("Best Performing Store:", best_store_id)
print("Difference between the highest and lowest store sales:", difference)


In [None]:
# Two-sample t-test for unequal variances (Welch's test) on the weekly sales
# of the best- and worst-performing stores.
# The samples must be the stores' weekly sales figures; testing the two store
# identifiers themselves (two single numbers) says nothing about sales.
from scipy.stats import ttest_ind

best_store_id = Highest_Store['Store'].values[0]
worst_store_id = Lowest_Store['Store'].values[0]
best_weekly_sales = df.loc[df['Store'] == best_store_id, 'Weekly_Sales']
worst_weekly_sales = df.loc[df['Store'] == worst_store_id, 'Weekly_Sales']

t_stat_2, p_val_2 = ttest_ind(best_weekly_sales, worst_weekly_sales, equal_var=False)
print(t_stat_2, p_val_2)

In [None]:
df.dtypes

##### **Converting Date data_type(object ) to datetime(datetime64) data_types**

In [None]:
df.Date=pd.to_datetime(df.Date,format="%d-%m-%Y")

In [None]:
df.Date.dtypes

In [None]:
df=df.set_index("Date")

In [None]:
df

In [None]:
df.shape

# Time Series Analysis

In [None]:
df_target=df.loc[:,['Store',"Weekly_Sales"]]

In [None]:
df_target

In [None]:
df_target.dtypes

In [None]:
from pmdarima import auto_arima

# `df` is indexed by Date here and holds one row per store per week, so
# `df["Weekly_Sales"]` is many store series stacked together, not one time
# series. Fit on the total sales per date instead.
# NOTE(review): no seasonal period `m` is passed; confirm whether m=52 was intended.
total_weekly_sales = df.groupby(level=0)["Weekly_Sales"].sum().sort_index()
model = auto_arima(total_weekly_sales, seasonal=True, stepwise=True, trace=True)

### Store_1 Dataset

In [None]:
result=adfuller(df_target[df_target['Store']==1]['Weekly_Sales']) 
result

#### From the above, we see that the p-value is close to 0, so we can conclude that the data is stationary


In [None]:
# Second element of the adfuller result is the p-value; 0.05 is the cut-off used here.
p_value = result[1]
is_stationary = p_value <= 0.05
print('Stationarity is present' if is_stationary else 'NO Stationarity is present')

In [None]:
store1=pd.DataFrame(df_target[df_target['Store']==1]['Weekly_Sales'])
store1=store1.sort_index()
store1

#### Visualizing Seasonality 

In [None]:
# Decompose store 1's weekly sales with an explicit 52-week (yearly) period.
# Selecting the sales column keeps this cell valid when it is re-run after the
# lag columns have been added to `store1`, and the explicit period removes the
# dependence on pandas inferring a frequency from the index.
decompose = seasonal_decompose(store1['Weekly_Sales'].dropna(), period=52)
decompose_plot = decompose.plot()

In [None]:
acf_plot=acf(store1["Weekly_Sales"])

In [None]:
# Autocorrelation plot of store 1's weekly sales.
# plot_acf computes the autocorrelation itself, so it must be given the series;
# passing already-computed ACF values would plot the ACF of the ACF.
plot_acf(store1["Weekly_Sales"], lags=20)
plt.show()

In [None]:
plot_pacf(store1["Weekly_Sales"],lags=20)
plt.show()

In [None]:
store1['lag1']=store1['Weekly_Sales'].diff()

In [None]:
store1['lag52']=store1['Weekly_Sales'].diff(52)

In [None]:
pacf_values=plot_pacf(store1['lag52'].dropna(),lags=20)

In [None]:
acf_values=plot_acf(store1['lag52'].dropna(),lags=20)

In [None]:
# From the above, pacf(p) ,acf(q) value is 0
pacf_values=sm.tsa.pacf(store1['Weekly_Sales'],nlags=20)
pacf_values

- The lag-1 partial autocorrelation, p[1] = 0.30, is treated as close to zero, so we take p = 0.

In [None]:
# Chronological split: the first 116 weeks train the model, the rest test it.
# The test set starts at position 116 so that no week falls between the sets.
train = store1.iloc[:116]['Weekly_Sales']
test = store1.iloc[116:]['Weekly_Sales']
 

#### Training the Model

In [None]:
model=SARIMAX(train,order=(0,1,0),seasonal_order=(0,1,0,52))
model_fit = model.fit()
model_fit.summary()

### Testing the Model

In [None]:
pred = model_fit.predict(start= len(train),end=len(train)+len(test)- 1,dynamic=True)
 

In [None]:
pred
 

In [None]:
plt.figure(figsize=(20,10))
plt.plot(store1['Weekly_Sales'],label="Original")
plt.plot(pred,label="Prediction")
plt.legend()
plt.show()

#### Forecasting for Next 12 Weekly_Sales

In [None]:
forecast=model_fit.forecast(steps=12)

In [None]:
plt.figure(figsize=(20,10))
plt.plot(store1['Weekly_Sales'],label="Original")
plt.plot(pred,label="Prediction")
plt.plot(forecast,label="Forecast")
plt.legend()
plt.show()

In [None]:
# Assuming 'test' contains the actual values from your test dataset
comparison = pd.DataFrame({'Actual': test, 'Predicted': pred})
comparison.dropna(inplace=True)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Error metrics of the store 1 predictions against the held-out actuals.
actual_sales = comparison['Actual']
predicted_sales = comparison['Predicted']
mae = mean_absolute_error(actual_sales, predicted_sales)
mse = mean_squared_error(actual_sales, predicted_sales)
rmse = np.sqrt(mse)
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')


In [None]:
### Store_2 Dataset

In [None]:
result=adfuller(df_target[df_target['Store']==2]['Weekly_Sales']) 
result

In [None]:
#### From the above, we see that that p value is close to 0 and we can conclude that the data is stationary


In [None]:
# Second element of the adfuller result is the p-value; 0.05 is the cut-off used here.
p_value = result[1]
is_stationary = p_value <= 0.05
print('Stationarity is present' if is_stationary else 'NO Stationarity is present')

In [None]:
store2=pd.DataFrame(df_target[df_target['Store']==2]['Weekly_Sales'])
store2=store2.sort_index()
store2

#### Visualizing Seasonality 

In [None]:
# Decompose store 2's weekly sales with an explicit 52-week (yearly) period.
# Selecting the sales column keeps this cell valid when it is re-run after the
# lag columns have been added to `store2`. The figure is assigned so that it is
# not also echoed as the cell's output value (which would draw it twice).
decompose = seasonal_decompose(store2['Weekly_Sales'].dropna(), period=52)
decompose_plot = decompose.plot()

In [None]:
plot_pacf(store2["Weekly_Sales"],lags=20)
plt.show()

In [None]:
# First difference and 52-week seasonal difference of store 2's own sales
# (the columns are named "lag", but they hold differences).
store2['lag1'] = store2['Weekly_Sales'].diff()
store2['lag52'] = store2['Weekly_Sales'].diff(52)

In [None]:
# Partial autocorrelation of store 2's 52-week-differenced sales
# (this section analyses store 2, so the series must come from `store2`).
pacf_values = plot_pacf(store2['lag52'].dropna(), lags=20)

In [None]:
pacf_values=sm.tsa.pacf(store2['Weekly_Sales'],nlags=20)
pacf_values

- The lag-1 partial autocorrelation, p[1] = 0.38, is treated as close to zero, so we take p = 0.

In [None]:
# From the above, pacf(p) ,acf(q) value is 0

acf_values=plot_acf(store2['lag52'].dropna(),lags=20)

In [None]:
 
train = store2.iloc[:round(len(store2)*0.7)]['Weekly_Sales']
 

#### Training the Model

In [None]:
model=SARIMAX(train,order=(0,1,0),seasonal_order=(0,1,0,52))
model_fit = model.fit()
model_fit.summary()

#### Testing the Model

In [None]:
pred = model_fit.predict(start= len(train),end=len(store2)- 1,dynamic=True)
 

In [None]:
pred

In [None]:
# Store 2: observed weekly sales with the test-period predictions overlaid.
plt.figure(figsize=(20,10))
plt.plot(store2['Weekly_Sales'],label="Original")
plt.plot(pred,label="Prediction")  # legend label spelling corrected
plt.legend(loc="best")
plt.show()

#### Forecasting for Next 12 Weekly_Sales

In [None]:
forecast=model_fit.forecast(steps=12)

In [None]:
plt.figure(figsize=(20,10)) 
plt.plot(store2['Weekly_Sales'],label="Original")
plt.plot(pred,label="Prediction")
plt.plot(forecast,label="Forecast")
plt.legend()
plt.show()

In [None]:
### Store_3 Dataset

In [None]:
result=adfuller(df_target[df_target["Store"]==3]['Weekly_Sales'])
result

In [None]:
#### From the above, we see that that p value is close to 0 and we can conclude that the data is stationary


In [None]:
# Second element of the adfuller result is the p-value; 0.05 is the cut-off used here.
adf_p_value = result[1]
print("Stationary is present" if adf_p_value < 0.05 else "Not Stationary is present")

In [None]:
# Weekly sales of store 3, sorted by date like the store 1 and store 2 frames
# so that the positional train/test split below is chronological.
store3 = pd.DataFrame(df_target[df_target["Store"]==3]["Weekly_Sales"])
store3 = store3.sort_index()
store3

#### Visualizing Seasonality 

In [None]:
# Decompose store 3's weekly sales with an explicit 52-week (yearly) period.
# Selecting the sales column keeps this cell valid when it is re-run after the
# lag columns have been added to `store3`.
decompose = seasonal_decompose(store3['Weekly_Sales'].dropna(), period=52)
decompose_plot = decompose.plot()

In [None]:
# Partial autocorrelation plot of store 3's weekly sales.
# The figure is stored under its own name: assigning it to `plot_pacf` would
# replace the imported function and break every later call to it.
pacf_figure = plot_pacf(store3['Weekly_Sales'], lags=20)

In [None]:
# Partial autocorrelation values of store 3's weekly sales.
# The sales column is selected explicitly so the call still receives a single
# series when this cell is re-run after the lag columns have been added.
pacf_values = sm.tsa.pacf(store3['Weekly_Sales'])
pacf_values

- The lag-1 partial autocorrelation, p[1] = 0.53, is treated as close to zero, so we take p = 0.

In [None]:
store3['lag1']=store3['Weekly_Sales'].diff()
store3['lag52']=store3['Weekly_Sales'].diff(52)

In [None]:
# From the above, pacf(p) ,acf(q) value is 0

acf_values = plot_acf(store3['lag52'].dropna(), lags=20)


In [None]:
 
# First 70% of store 3's own rows form the training set.
train = store3.iloc[:round(len(store3)*0.7)]['Weekly_Sales']
 

#### Training the Model

In [None]:
model=SARIMAX(train,order=(0,1,0),seasonal_order=(0,1,0,52))
model_fit=model.fit()
model_fit.summary()

#### Testing the Model

In [None]:
pred=model_fit.predict(start=len(train),end=len(store3)-1,dynamic=True)
 
pred

In [None]:
# Store 3: observed weekly sales with the test-period predictions overlaid.
plt.figure(figsize=(20,10))
plt.plot(store3['Weekly_Sales'],label="Original")
plt.plot(pred,label="prediction")
plt.legend()
plt.show()  # render the figure; an argument-less plt.plot() draws nothing

#### Forecasting for Next 12 Weekly_Sales

In [None]:
forecast=model_fit.forecast(steps=12)

In [None]:
plt.figure(figsize=(20,10))
plt.plot(store3['Weekly_Sales'],label="Original")
plt.plot(pred,label="prediction")
plt.plot(forecast,label="Forecast")
plt.legend()
plt.show()

In [None]:
### Store_4 Dataset

In [None]:
# Augmented Dickey-Fuller test on store 4's weekly sales
# (this is the store 4 section, so the filter must select store 4).
result = adfuller(df_target[df_target["Store"]==4]['Weekly_Sales'])
result

In [None]:
#### From the above, we see that that p value is close to 0 and we can conclude that the data is stationary


In [None]:
# Second element of the adfuller result is the p-value; 0.05 is the cut-off used here.
adf_p_value = result[1]
print("Stationary is present" if adf_p_value < 0.05 else "Not Stationary is present")

In [None]:
# Weekly sales of store 4, sorted by date like the store 1 and store 2 frames
# so that the positional train/test split below is chronological.
store4 = pd.DataFrame(df_target[df_target["Store"]==4]["Weekly_Sales"])
store4 = store4.sort_index()
store4

#### Visualizing Seasonality 

In [None]:
# Decompose store 4's weekly sales with an explicit 52-week (yearly) period.
# Selecting the sales column keeps this cell valid when it is re-run after the
# lag columns have been added to `store4`.
decompose = seasonal_decompose(store4['Weekly_Sales'].dropna(), period=52)
decompose_plot = decompose.plot()

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf  # Import the plot_pacf function
store4["lag1"] = store4["Weekly_Sales"].diff()
store4["lag52"] = store4["Weekly_Sales"].diff(52)

# Assign the result to a different variable, e.g., pacf_values
pacf_values = plot_pacf(store4["Weekly_Sales"].dropna(), lags=20)


In [None]:
# Partial autocorrelation values of store 4's weekly sales.
# `sm` comes from the notebook's import cell; it is not re-imported here.
pacf_values = sm.tsa.pacf(store4['Weekly_Sales'])
pacf_values

- The lag-1 partial autocorrelation, p[1] = 0.426, is treated as close to zero, so we take p = 0.

In [None]:
# From the above, pacf(p) ,acf(q) value is 0

acf_values = plot_acf(store4['lag52'].dropna(), lags=20)


In [None]:
 
train = store4.iloc[:round(len(store4)*0.7)]['Weekly_Sales']
 

#### Training the Model

In [None]:
# Fit the seasonal ARIMA model on store 4's training data.
# The fitted result must be bound to `model_fit`: the cells below read that
# name, which would otherwise still hold the previous store's fit.
model = SARIMAX(train, order=(0,1,0), seasonal_order=(0,1,0,52))
model_fit = model.fit()
model_fit.summary()

#### Testing the Model

In [None]:
pred=model_fit.predict(start=len(train),end=len(store4)-1,dynamic=True)
 
pred

In [None]:
plt.figure(figsize=(20,10))
plt.plot(store4['Weekly_Sales'],label="Original")
plt.plot(pred,label="prediction")
plt.legend()
plt.show()

#### Forecasting for Next 12 Weekly_Sales

In [None]:
forecast=model_fit.forecast(steps=12)

In [None]:
plt.figure(figsize=(20,10))
plt.plot(store4['Weekly_Sales'],label="Original")
plt.plot(pred,label="prediction")
plt.plot(forecast,label="Forecast")
plt.legend()
plt.show()

# Forecasting Weekly_Sales for 45 Stores

In [None]:
def _significant_order(conf_int, max_order=2):
    """Count the consecutive significant lags, starting at lag 1.

    Parameters
    ----------
    conf_int : sequence of (lower, upper) pairs
        Confidence interval for each lag, lag 0 first, as returned by
        ``sm.tsa.acf`` / ``sm.tsa.pacf`` when ``alpha`` is given.
    max_order : int, default 2
        Upper bound on the returned count, to keep the fitted model small.

    Returns
    -------
    int
        Number of leading lags (after lag 0) whose interval excludes zero.
    """
    order = 0
    # Lag 0 is the series' correlation with itself, so it is skipped.
    for lower, upper in conf_int[1:]:
        # A lag is significant when its confidence interval excludes zero.
        if order >= max_order or lower <= 0.0 <= upper:
            break
        order += 1
    return order


# Fit one seasonal ARIMA model per store, predict the held-out 30% and
# forecast 12 further weeks.
for i in range(1, 46):
    new_data = pd.DataFrame(df.loc[df["Store"] == i, "Weekly_Sales"]).sort_index()
    # 52-week seasonal difference, used to read off the non-seasonal orders.
    lag52 = new_data["Weekly_Sales"].diff(52).dropna()
    acf_values, acf_conf_int = sm.tsa.acf(lag52, nlags=20, alpha=0.05)
    pacf_values, pacf_conf_int = sm.tsa.pacf(lag52, nlags=20, alpha=0.05)
    # AR order p comes from the PACF, MA order q from the ACF.
    p = _significant_order(pacf_conf_int)
    q = _significant_order(acf_conf_int)
    train = new_data[:round(len(new_data) * 0.7)]
    # The seasonal part is kept at (0, 1, 0, 52), matching the per-store models
    # above: after seasonal differencing the training window is too short to
    # estimate seasonal AR/MA terms reliably.
    model = SARIMAX(train, order=(p, 1, q), seasonal_order=(0, 1, 0, 52))
    model_fit = model.fit()
    pred = model_fit.predict(start=len(train), end=len(new_data) - 1, dynamic=True)
    forecast = model_fit.forecast(steps=12)
    plt.figure(figsize=(20, 10))
    plt.plot(new_data['Weekly_Sales'], label='Given_sales')
    plt.plot(pred, label='Prediction')
    plt.plot(forecast, label="12 weeks forecast")
    plt.legend()
    plt.title(f'store {i}')
    plt.show()
    
  


- **From the Above visualization we can observe the next 12 weeks sales of the 45 Stores of Walmart.**