Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.ensemble import RandomForestRegressor

Loading the dataset with pandas and printing its shape and first few rows to understand the structure of the dataset

In [None]:
# Historical weekly Walmart sales used for training; per the original author's
# note the data covers 2010-02-05 to 2012-11-01.
data = pd.read_csv('train.csv')

# Size of the table, followed by the first rows to get a feel for the schema.
print("shape:", data.shape)
print(data.head())

In this step, we convert the 'Date' column to a datetime format for easier handling, check for missing values, and inspect some general statistics to understand the distribution of the data

In [None]:
# Parse 'Date' into datetime values so the .dt accessor can be used later on.
data['Date'] = pd.to_datetime(data['Date'])

# Number of missing values in each column.
missing_per_column = data.isnull().sum()
print(missing_per_column)

# Summary statistics (count, mean, spread, quantiles) of the columns.
summary_stats = data.describe()
print(summary_stats)

In this plot, we visualize total weekly sales over time to identify peaks and dips, which can provide insight into the events that affect sales

In [None]:
# Sum Weekly_Sales over every row that shares the same date, giving one total
# per week.
sales_over_time = (
    data.groupby('Date')['Weekly_Sales']
    .sum()
    .reset_index()
)

# Draw on an explicit Axes instead of relying on pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(data=sales_over_time, x='Date', y='Weekly_Sales', ax=ax)
ax.set_title('Total Weekly Sales Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Weekly Sales')
ax.grid(True)
fig.tight_layout()
plt.show()


This plot compares the average weekly sales of 'Holiday' weeks and 'Non-Holiday' weeks

In [None]:
# Mean Weekly_Sales for each value of the IsHoliday flag (holiday week or not).
holiday_sales = (
    data.groupby('IsHoliday')['Weekly_Sales']
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(6, 5))
sns.barplot(data=holiday_sales, x='IsHoliday', y='Weekly_Sales', ax=ax)
ax.set_title('Average Weekly Sales: Holiday vs. Non-Holiday')
ax.set_xlabel('Is Holiday')
ax.set_ylabel('Average Weekly Sales')
plt.show()


In this block, we show the 10 stores in the data with the highest total sales

In [None]:
# Total sales per store, largest first.
store_sales = (
    data.groupby('Store')['Weekly_Sales']
    .sum()
    .reset_index()
    .sort_values(by='Weekly_Sales', ascending=False)
)
top_stores = store_sales.head(10)

fig, ax = plt.subplots(figsize=(10, 6))
# Passing order= keeps the bars in the same descending-sales order as
# top_stores instead of letting seaborn choose the category order.
sns.barplot(
    data=top_stores,
    x='Store',
    y='Weekly_Sales',
    order=top_stores['Store'],
    ax=ax,
)
ax.set_title('Top 10 Stores by Total Sales')
ax.set_xlabel('Store')
ax.set_ylabel('Total Sales')
plt.show()


This step is very important: a date is not a numerical value, so it cannot be used directly as a feature for training the model. We therefore break it down into numerical features such as year, month, week of year and day of week, and explicitly add them to the dataset as features to preserve the information carried by the date

In [None]:
# Break the datetime 'Date' column into numeric calendar parts that the model
# can consume as features.
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
# isocalendar().week comes back as a nullable UInt32 extension column. Cast it
# to a plain NumPy integer so it behaves like the other date parts and matches
# the plain int that is built for single-row predictions later on.
# (The cast raises if 'Date' contains missing values.)
data['WeekOfYear'] = data['Date'].dt.isocalendar().week.astype('int64')
data['DayOfWeek'] = data['Date'].dt.weekday  # Monday=0 ... Sunday=6
# NOTE(review): if every row is recorded on the same weekday, DayOfWeek is a
# constant column and carries no signal - check data['DayOfWeek'].nunique().
print(data.head())

Defining the feature and target columns required for model training

In [None]:
# Columns fed to the model as inputs.
feature_columns = [
    'Store',
    'Dept',
    'IsHoliday',
    'Year',
    'Month',
    'WeekOfYear',
    'DayOfWeek',
]

x = data[feature_columns]  # feature matrix
y = data['Weekly_Sales']   # prediction target

dividing the Data for the model training and testing purpose

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state fixes the shuffle seed so the same rows land in each split on
# every run, which keeps the results reproducible.
# NOTE(review): train_test_split shuffles rows, so test weeks are interleaved
# with training weeks; for forecasting future dates a date-based hold-out may
# give a more realistic error estimate.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

print(x_train.shape, x_test.shape)


Creating a model using the Random Forest Regressor algorithm and fitting it on the training data to learn patterns and make predictions of weekly sales

In [None]:
# Random forest of 100 trees; random_state makes the fitted forest reproducible
# and n_jobs=-1 uses all available CPU cores.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)

# Kept as the last expression so the notebook still displays the fitted model.
rf_model.fit(x_train, y_train)


Now testing the model's predictions on the test dataset

In [None]:
# Predict weekly sales for the held-out test rows; y_pred is compared against
# y_test in the evaluation and plotting cells that follow.
y_pred = rf_model.predict(x_test)

Evaluating the model by comparing its predicted values with the actual values (mean absolute error and R² score)

In [None]:
# Mean absolute error: average size of the prediction error, in sales units.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
# R^2: share of the variance in the actual sales explained by the predictions.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² Score: {r2:.4f}")


Plotting a scatter plot of actual vs. predicted weekly sales with a perfect-fit line to visually assess the accuracy of the model.
The closer a point lies to the line, the more accurate its prediction is

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.3, label='Predicted vs Actual')

# Perfect-fit reference line (y = x), spanning the full range covered by both
# the actual and the predicted values.
min_val = min(y_test.min(), y_pred.min())
max_val = max(y_test.max(), y_pred.max())
diagonal = [min_val, max_val]
ax.plot(diagonal, diagonal, color='green', linestyle='-', label='Perfect Fit (y=x)')

ax.set_xlabel('Actual Weekly Sales')
ax.set_ylabel('Predicted Weekly Sales')
ax.set_title('Actual vs Predicted Sales')
ax.legend()
ax.grid(alpha=0.3)
plt.show()




This plot shows how well the model tracks the actual sales over a sample of the first 50 test data points, highlighting
how closely the predictions follow the true values

In [None]:
# Put actual and predicted values side by side; resetting the index replaces
# the shuffled row labels from the split with positions 0..n-1.
comparison_df = pd.DataFrame(
    {'Actual': y_test, 'Predicted': y_pred}
).reset_index(drop=True)

# Only the first 50 test rows are drawn to keep the chart readable.
first_rows = comparison_df.iloc[:50]

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(first_rows['Actual'], label='Actual Weekly Sales', marker='o')
ax.plot(first_rows['Predicted'], label='Predicted Weekly Sales', marker='x')
ax.set_title('Actual vs Predicted Weekly Sales (First 50)')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Weekly Sales')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


Creating a user input interface to collect the inputs required to predict the weekly sales for a specific date

In [None]:
def _parse_holiday_flag(text):
    """Convert a True/False style answer into a bool.

    Accepts 'true'/'false', 't'/'f', 'yes'/'no', 'y'/'n' and '1'/'0',
    ignoring case and surrounding whitespace.

    Raises:
        ValueError: if the answer is none of the accepted values, so that a
            typo is reported instead of being silently read as "not a holiday".
    """
    answer = text.strip().lower()
    if answer in ('true', 't', 'yes', 'y', '1'):
        return True
    if answer in ('false', 'f', 'no', 'n', '0'):
        return False
    raise ValueError(f"Expected True or False for the holiday flag, got {text!r}")


# int() raises ValueError when the store/department entered is not a whole number.
store = int(input("Enter the store number: "))
dept = int(input("Enter the department number: "))
holiday_input = input("Is it holiday (True/False)? ").strip().lower()

# The model was trained on a boolean IsHoliday column, so the text answer has
# to be converted to a real bool (and rejected if it is not recognisable).
holiday = _parse_holiday_flag(holiday_input)

date_str = input("Enter the date (YYYY-MM-DD): ").strip()
# Enforce the prompted format so an ambiguous entry such as 01/02/2012 is
# rejected rather than guessed at.
date_obj = pd.to_datetime(date_str, format='%Y-%m-%d')

# Derive the same calendar features that were built for the training data.
year = date_obj.year
month = date_obj.month
# Index the ISO calendar tuple by position (year, week, weekday): this works
# whether or not the returned tuple exposes a .week attribute.
week_of_year = int(date_obj.isocalendar()[1])
day_of_week = date_obj.weekday()

# Single-row frame with the same column names and order as the training features.
new_input = pd.DataFrame({
    'Store': [store],
    'Dept': [dept],
    'IsHoliday': [holiday],
    'Year': [year],
    'Month': [month],
    'WeekOfYear': [week_of_year],
    'DayOfWeek': [day_of_week],
})

print(new_input)


Passing the user input, in the form of a model-ready DataFrame, to the model to predict the weekly sales of that week

In [None]:
# Run the trained forest on the single-row frame built from the user's answers.
# predict() returns an array with one value per input row, so element 0 is the
# estimate for the requested store, department and date.
predicted_sales = rf_model.predict(new_input)
print(f"Predicted Weekly Sales: {predicted_sales[0]:.2f}")
