# **Importing Necessary Libraries and Data**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from prophet import Prophet
import matplotlib.dates as mdates



import os
# Print the full path of every file found under the Kaggle input directory,
# walking all sub-directories.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the four CSV tables in a fixed order and unpack them into their frames.
# The paths are on-disk locations and are kept exactly as given.
df_store, df_train, df_features, df_test = map(
    pd.read_csv,
    (
        '/kaggle/input/wallmart-sales-forecast-datasets/stores.csv',    # store data
        '/kaggle/input/wallmart-sales-forecast-datasets/train.csv',     # train set
        '/kaggle/input/wallmart-sales-forecast-datasets/features.csv',  # external information
        '/kaggle/input/wallmart-sales-forecast-datasets/test.csv',      # test set
    ),
)

# First Look at the Data and Merging DataFrames

In [None]:
# Print the .info() summary of each raw table, in the order they were loaded.
for raw_frame in (df_store, df_train, df_features, df_test):
    raw_frame.info()

In [None]:
# Parse 'Date' into datetimes in the three tables where this notebook uses it.
for dated_frame in (df_train, df_test, df_features):
    dated_frame['Date'] = pd.to_datetime(dated_frame['Date'])

# Cast 'Store' to int in all four tables; both columns serve as join keys
# when the tables are merged.
for keyed_frame in (df_train, df_test, df_features, df_store):
    keyed_frame['Store'] = keyed_frame['Store'].astype(int)

In [None]:
# Build one frame from the four tables. Every step is a left join, so all rows
# of df_train are kept and the other tables only contribute extra columns.
df = pd.merge(df_train, df_features, how='left', on=['Store', 'Date'])
df = pd.merge(df, df_store, how='left', on='Store')
# NOTE(review): df_test is joined onto the training rows here; the following
# cell reads the 'IsHoliday' column this step adds — confirm the join is intended.
df = pd.merge(df, df_test, how='left', on=['Store', 'Date', 'Dept'])
df.head(5)

In [None]:
# Collapse the three holiday columns produced by the merges into a single
# 'IsHoliday_' column: take 'IsHoliday_x', fall back to 'IsHoliday_y' where it
# is missing, then to 'IsHoliday'. The three source columns are then removed.
holiday_flag = (
    df['IsHoliday_x']
    .combine_first(df['IsHoliday_y'])
    .combine_first(df['IsHoliday'])
)
df = df.assign(IsHoliday_=holiday_flag).drop(
    columns=['IsHoliday_x', 'IsHoliday_y', 'IsHoliday']
)
df.head(5)

In [None]:
# Display the rows whose weekly sales are zero or negative.
df[df['Weekly_Sales'].le(0)]

In [None]:
# Keep only rows with strictly positive weekly sales, then show the
# remaining (rows, columns) count.
has_positive_sales = df['Weekly_Sales'].gt(0)
df = df[has_positive_sales]
df.shape

In [None]:
# Summary statistics of the merged frame after the sales filter above.
df.describe()

In [None]:
# Per-store mean of every numeric column (this includes the average weekly sales per store).
storemean=df.groupby(by='Store').mean(numeric_only=True)
storemean.head(3)

# Exploratory Data Analysis (EDA)

In [None]:
# Shared five-colour palette (hex codes) passed to the plots in this section.
colors = ['#FEB59A', '#CE3E12', '#7B6C28', '#4F6D7A', '#C0D6DF']

In [None]:
# One row per date and one column per store type; with no aggfunc given,
# pivot_table averages the weekly sales that fall into each cell.
monthly_sales_by_storetype = df.pivot_table(
    index="Date", columns="Type", values="Weekly_Sales"
)

# Draw one line per store type and style the resulting axes.
ax = monthly_sales_by_storetype.plot(figsize=(14, 6), color=colors)
ax.set_title("Sales By Store Type", fontsize=16)
ax.set_xlabel("")
ax.set_ylabel("")
sns.despine()
plt.tight_layout()
plt.show()

**Findings:** 

* Clear seasonality around holidays (notably Thanksgiving and Christmas)
* Store Type A consistently outperformed B and C in sales
* Store size correlates with type, affecting capacity and reach

In [None]:
fig, ax = plt.subplots(figsize=(14, 6))

# Drop rows lacking either the sales value or the department before plotting.
df_clean = df.dropna(subset=['Weekly_Sales', 'Dept'])

# One bar per department, with bar height given by the mean of its weekly sales.
sns.barplot(
    data=df_clean,
    x="Dept",
    y="Weekly_Sales",
    estimator='mean',
    palette=colors,
    ax=ax,
)

ax.set_title('Mean Sales by Department', fontsize=16)
ax.set_xlabel("")
ax.set_ylabel("")
sns.despine()
plt.tight_layout()
plt.show()

**Findings:**

* Certain departments (e.g., electronics, groceries) generate higher sales
* Opportunities to optimize underperforming departments

In [None]:
# Sum each MarkDown column per date. Rows missing any of the five MarkDown
# values are dropped before summing (behaviour kept from the original cell).
markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
markdown_by_date = df[['Date', *markdown_cols]].dropna().groupby('Date').sum()

# x_compat=True makes pandas draw the index as ordinary matplotlib dates.
# Without it, pandas may switch to its own period-based x-axis when the dates
# are regularly spaced, and the matplotlib date locator/formatter set below
# would then produce wrong tick labels.
ax = markdown_by_date.plot(color=colors, figsize=(15, 6), alpha=0.9, x_compat=True)

# One tick per month, labelled like "Nov 2011".
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))

ax.set_title("MarkDown Trends Over Time", fontsize=16)
ax.set_xlabel("")
ax.set_ylabel("")
ax.grid(False)
sns.despine()
plt.tight_layout()
plt.show()

**Findings:**

* Spikes in markdowns are often followed by increases in sales
* Suggests markdowns are used strategically around sales events and holidays

In [None]:
# Order the observations by temperature so that a rolling mean over rows
# averages sales across neighbouring temperatures.
df_temp = df[['Temperature', 'Weekly_Sales']].sort_values(by='Temperature')

# Rolling mean of both columns over a 20000-row window. No min_periods is
# given, so rows before the first full window come out as NaN.
df_temp_rolled = df_temp.rolling(window=20000).mean()

# Step 4: draw the chart of smoothed sales against smoothed temperature.
ax = df_temp_rolled.plot(
    x='Temperature', y='Weekly_Sales', color=colors, figsize=(15, 6)
)
ax.set_title("Sales By Temperature", fontsize=16)
ax.set_xlabel("")
ax.set_ylabel("")
ax.grid(False)
sns.despine()
plt.tight_layout()
plt.show()

**Findings:**

* Mild temperatures (around 40–50°F and 70–80°F) are associated with higher weekly sales, suggesting that moderate weather encourages more in-store shopping; these ranges may also coincide with holiday periods
* Extremely cold or hot temperatures (below 30°F or above 85°F) often coincide with reduced sales, possibly due to customers avoiding travel or adverse weather conditions

In [None]:
# Data preparation: CPI observations with their date and store type,
# ordered by date and indexed by it.
df_temp = (
    df[['CPI', 'Date', 'Type']]
    .dropna()
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by='Date')
    .set_index('Date')
)

# Smooth CPI separately for each store type with a 30-row rolling mean.
# The per-type pieces are collected in a list and concatenated once: growing a
# DataFrame with pd.concat inside the loop re-copies the accumulated rows on
# every iteration and starts from an empty frame, which recent pandas versions
# warn about. sort=False keeps the types in order of first appearance, the
# same order .unique() yields.
smoothed_parts = [
    group[['CPI']].rolling(window=30, min_periods=1).mean().assign(Type=store_type)
    for store_type, group in df_temp.groupby('Type', sort=False)
]
smoothed = pd.concat(smoothed_parts).reset_index()

fig, ax = plt.subplots(figsize=(20, 8))

sns.lineplot(
    data=smoothed, x='Date', y='CPI', hue='Type',
    palette=colors, linewidth=2, alpha=0.6, ax=ax,
)

# Overlay a dashed least-squares trend line for each store type.
for store_type, color in zip(smoothed['Type'].unique(), colors):
    subset = smoothed[smoothed['Type'] == store_type].dropna()

    # Regress CPI on the date expressed as an integer day ordinal.
    X = subset['Date'].map(pd.Timestamp.toordinal).values.reshape(-1, 1)
    y = subset['CPI'].values

    model = LinearRegression().fit(X, y)
    y_pred = model.predict(X)
    k = model.coef_[0]
    b = model.intercept_

    ax.plot(subset['Date'], y_pred, '--', color=color, linewidth=2)

    # Annotate the right-hand end of the trend line with its equation.
    # The '+' format flag prints the intercept with its own sign, so a negative
    # intercept reads "x-3.5" instead of the malformed "x+-3.5".
    x_pos = subset['Date'].iloc[-1]
    y_pos = y_pred[-1]
    ax.text(x_pos, y_pos, f'{store_type}: y={k:.5f}x{b:+.5f}', fontsize=10, color=color)

ax.set_title("Smoothed CPI with Trend by Store Type", fontsize=16)
ax.set_xlabel("")
ax.set_ylabel("")
ax.grid(False)
ax.legend(loc='upper left')
sns.despine()
plt.tight_layout()
plt.show()

**Findings:**

* The overall CPI trend is similar across store types, indicating that inflation and economic factors impact all store categories in comparable ways
* For Store Types A and B, there is a clear positive trend — the Consumer Price Index gradually increases over time, suggesting these store types are more sensitive to economic changes
* In contrast, Store Type C shows a less pronounced or nearly flat trend, possibly reflecting more stable conditions, differences in store size, or departmental composition within this type

In [None]:

# Data preparation: unemployment observations with their date and store type,
# ordered by date and indexed by it.
df_temp = (
    df[['Unemployment', 'Date', 'Type']]
    .dropna()
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by='Date')
    .set_index('Date')
)

# Smooth unemployment separately for each store type with a 30-row rolling
# mean. The per-type pieces are collected in a list and concatenated once:
# growing a DataFrame with pd.concat inside the loop re-copies the accumulated
# rows on every iteration and starts from an empty frame, which recent pandas
# versions warn about. sort=False keeps the types in order of first
# appearance, the same order .unique() yields.
smoothed_parts = [
    group[['Unemployment']].rolling(window=30, min_periods=1).mean().assign(Type=store_type)
    for store_type, group in df_temp.groupby('Type', sort=False)
]
smoothed = pd.concat(smoothed_parts).reset_index()

fig, ax = plt.subplots(figsize=(20, 8))

sns.lineplot(
    data=smoothed, x='Date', y='Unemployment', hue='Type',
    palette=colors, linewidth=2, alpha=0.6, ax=ax,
)

# Overlay a dashed least-squares trend line for each store type.
for store_type, color in zip(smoothed['Type'].unique(), colors):
    subset = smoothed[smoothed['Type'] == store_type].dropna()

    # Regress unemployment on the date expressed as an integer day ordinal.
    X = subset['Date'].map(pd.Timestamp.toordinal).values.reshape(-1, 1)
    y = subset['Unemployment'].values

    model = LinearRegression().fit(X, y)
    y_pred = model.predict(X)
    k = model.coef_[0]
    b = model.intercept_

    ax.plot(subset['Date'], y_pred, '--', color=color, linewidth=2)

    # Annotate the right-hand end of the trend line with its equation.
    # The '+' format flag prints the intercept with its own sign, so a negative
    # intercept reads "x-3.5" instead of the malformed "x+-3.5".
    x_pos = subset['Date'].iloc[-1]
    y_pos = y_pred[-1]
    ax.text(x_pos, y_pos, f'{store_type}: y={k:.5f}x{b:+.5f}', fontsize=10, color=color)

ax.set_title("Smoothed Unemployment with Trend by Store Type", fontsize=16)
ax.set_xlabel("")
ax.set_ylabel("")
ax.grid(False)
ax.legend(loc='upper left')
sns.despine()
plt.tight_layout()
plt.show()

**Findings:**

* Overall, the unemployment rate fluctuates but exhibits clear differences in trends across store types, generally showing a downward trend
* Store Types A and B display a nearly flat yet slightly downward trend, indicating relatively stable and improving unemployment conditions for these segments
* In contrast, Store Type C shows a pronounced downward trend, possibly reflecting stronger improvements in local employment conditions for this category

In [None]:
# Total sales per date, renamed to the 'ds' (timestamp) / 'y' (value) column
# pair that the model is fitted on.
df_prophet = (
    df.groupby('Date')['Weekly_Sales']
    .sum()
    .reset_index()
    .rename(columns={'Date': 'ds', 'Weekly_Sales': 'y'})
)

# Fit a Prophet model with default settings on the aggregated series.
model = Prophet()
model.fit(df_prophet)

# Extend the timeline by 365 periods and predict over history plus horizon.
# NOTE(review): no `freq` is passed, so the horizon uses Prophet's default
# step while the history has one row per 'Date' — confirm this is intended.
future = model.make_future_dataframe(periods=365)
forecast = model.predict(future)

main_color = "#4F6D7A"
fig, ax = plt.subplots(figsize=(14, 6))

# Point forecast line (the legend label is Russian for "Forecast").
ax.plot(forecast['ds'], forecast['yhat'], color=main_color, label='Прогноз')

# Shaded band between the lower and upper forecast bounds
# (the legend label is Russian for "Interval").
ax.fill_between(
    forecast['ds'],
    forecast['yhat_lower'],
    forecast['yhat_upper'],
    color=main_color,
    alpha=0.2,
    label='Интервал',
)

ax.set_title("Sales Forecast for the Year", fontsize=16)
ax.set_xlabel("")
ax.set_ylabel("")
ax.grid(False)
sns.despine()
ax.legend()
plt.tight_layout()
plt.show()

**Findings:**

* The Prophet model effectively forecasts total weekly sales for the upcoming year (365 days), capturing seasonal patterns and trends from historical data
* The forecast shows distinct seasonal peaks likely corresponding to holiday periods and sales events (e.g., Christmas, Black Friday)