In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [7]:
# Load the raw sales extract and parse the 'date' column into datetimes
# in the same step.
df = (
    pd.read_csv('indore_warehouse_sales.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [8]:
# Break the sale date into calendar parts (month, day of month, weekday
# number) that can be used directly as numeric features.
for part in ('month', 'day', 'dayofweek'):
    df[part] = getattr(df['date'].dt, part)

In [9]:
# Integer-encode product IDs. The fitted encoder is kept in `le_product`
# because later cells reuse it to transform and to persist the mapping.
le_product = LabelEncoder()
df = df.assign(product_id_enc=le_product.fit_transform(df['product_id']))

In [10]:
# Integer-encode product categories. The fitted encoder is kept in `le_cat`
# because later cells reuse it to transform and to persist the mapping.
le_cat = LabelEncoder()
df = df.assign(category_enc=le_cat.fit_transform(df['category']))


In [11]:
# Modelling frame: drop the raw date, the un-encoded product/category
# columns, the product name and the city, leaving the remaining columns
# as model inputs plus the target.
df_model = df.drop(
    ['date', 'product_id', 'product_name', 'category', 'city'],
    axis='columns',
)

In [12]:
# Target is units sold; every other column of df_model is a feature.
y = df_model['units_sold']
X = df_model.drop(columns=['units_sold'])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Baseline gradient-boosted regressor on the calendar / inventory / ID features.
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
model.fit(X_train, y_train)


In [14]:
# Score the baseline model on the held-out rows.
y_pred = model.predict(X_test)

for label, value in (
    ("MAE:", mean_absolute_error(y_test, y_pred)),
    ("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred))),
    ("R² Score:", r2_score(y_test, y_pred)),
):
    print(label, value)


MAE: 22.736873626708984
RMSE: 27.43627982587278
R² Score: -0.131056547164917


In [15]:
# Predict for a single product on a single date.
# Month, day and weekday are all derived from one timestamp so they cannot
# disagree with each other the way three hand-typed values can.
# 2025-07-20 reproduces the original inputs: month 7, day 20, Sunday (6).
future_date = pd.Timestamp('2025-07-20')

future_data = {
    'inventory': [80],
    'month': [future_date.month],
    'day': [future_date.day],
    'dayofweek': [future_date.dayofweek],
    'product_id_enc': [le_product.transform(['P001'])[0]],
    'category_enc': [le_cat.transform(['Electronics'])[0]]
}

# Select the columns in exactly the order of the training frame. Plain
# indexing (rather than reindex) raises a KeyError naming any training
# feature that is missing here instead of silently filling it with NaN.
future_df = pd.DataFrame(future_data)[list(X_train.columns)]
future_prediction = model.predict(future_df)

# Round to the nearest whole unit; int() on its own truncates toward zero.
print(f"Predicted units sold: {int(round(float(future_prediction[0])))}")


Predicted units sold: 89


In [16]:
import joblib

# Save the fitted model together with both label encoders; the encoders are
# what turn raw product IDs / category names into the integer codes used as
# model features, so they are persisted next to the model.
joblib.dump(model, 'walmart_model.pkl')
joblib.dump(le_product, "encoder_product.pkl")
joblib.dump(le_cat, "encoder_category.pkl")
# Load later (file names must match the dumps above):
# model = joblib.load('walmart_model.pkl')
# le_product = joblib.load("encoder_product.pkl")
# le_cat = joblib.load("encoder_category.pkl")


['encoder_category.pkl']

In [17]:
from pytrends.request import TrendReq

# Google Trends client with a US-English interface.
# NOTE(review): pytrends documents `tz` as an offset in minutes using the
# convention where US CST is 360, which would make IST -330 rather than
# 330 -- confirm which timezone is intended here.
pytrends = TrendReq(
    hl='en-US',
    tz=330,
)

In [18]:
# List of product-related search terms
keywords = ["AC Voltas", "Ceiling Fan", "Realme Narzo", "India Gate Rice", "Aashirvaad Atta"]

# One request per keyword; each non-empty result becomes a two-column frame
# (date + snake_case keyword column) collected here for merging afterwards.
keyword_frames = []

for kw in keywords:
    pytrends.build_payload([kw], cat=0, timeframe='2025-05-01 2025-07-01', geo='IN', gprop='')
    data = pytrends.interest_over_time()
    if data.empty:
        # Report the gap instead of dropping the keyword silently: later
        # cells expect one column per keyword.
        print(f"No Google Trends data returned for {kw!r}; skipping")
        continue
    column_name = kw.replace(" ", "_").lower()
    # Keep only the keyword's own series, then expose the 'date' index
    # as a regular column so it can serve as the merge key.
    keyword_frames.append(
        data[[kw]].rename(columns={kw: column_name}).reset_index()
    )

if keyword_frames:
    # Outer-join on date so a date missing from one keyword's result is kept.
    trends_data = keyword_frames[0]
    for frame in keyword_frames[1:]:
        trends_data = pd.merge(trends_data, frame, on='date', how='outer')
else:
    # Nothing came back at all: still provide a 'date' column so the
    # downstream `trends_data['date']` access and merge do not raise KeyError.
    trends_data = pd.DataFrame({'date': pd.to_datetime([])})


In [19]:
# Normalise the join key to datetime on both sides before merging.
for frame in (df, trends_data):
    frame['date'] = pd.to_datetime(frame['date'])

# Left join: every sales row is kept; rows whose date has no Google Trends
# entry get NaN in the trend columns.
df_merged = df.merge(trends_data, on='date', how='left')


In [20]:
# Zero-fill ONLY the Google Trends columns: sales rows whose date has no
# match in trends_data come out of the left join as NaN there. Other columns
# are left untouched so that a missing inventory or target value is not
# silently turned into a real-looking 0.
trend_cols = [col for col in trends_data.columns if col != 'date']
df_merged = df_merged.fillna({col: 0 for col in trend_cols})  # or use .interpolate()

# Same as before: feature engineering
df_merged['month'] = df_merged['date'].dt.month
df_merged['day'] = df_merged['date'].dt.day
df_merged['dayofweek'] = df_merged['date'].dt.dayofweek

# Label Encoding (transform only: reuses the encoders fitted earlier so the
# integer codes stay identical to the first model's)
df_merged['product_id_enc'] = le_product.transform(df_merged['product_id'])
df_merged['category_enc'] = le_cat.transform(df_merged['category'])

# Final dataset: base features followed by every trend column, selected by
# name rather than by position so it does not depend on 'date' being first.
feature_cols = ['inventory', 'month', 'day', 'dayofweek', 'product_id_enc', 'category_enc'] + trend_cols
X = df_merged[feature_cols]
y = df_merged['units_sold']

# Train/Test and model same as before


In [21]:
# Select final features (original + Google Trends).
# The trend columns are taken from trends_data instead of being typed out:
# a keyword that returned no data has no column, and a hard-coded name for
# it would raise a KeyError on the selection below. When every keyword
# returns data, the resulting order matches the keyword order.
feature_cols = [
    'inventory', 'month', 'day', 'dayofweek',
    'product_id_enc', 'category_enc',
] + [col for col in trends_data.columns if col != 'date']

X = df_merged[feature_cols]
y = df_merged['units_sold']


In [22]:
from sklearn.model_selection import train_test_split

# Same 80/20 seeded split as the baseline, now on the trend-augmented features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
from xgboost import XGBRegressor

# Refit with the same hyperparameters as the baseline so that any change in
# the metrics comes from the added trend features. This rebinds `model`.
xgb_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 5,
    "random_state": 42,
}
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)


In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Score the trend-augmented model on its held-out rows.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("R² Score:", r2)


MAE: 23.777271270751953
RMSE: 28.50018738802157
R² Score: -0.22047603130340576


In [25]:
# Persist the fetched Google Trends table (one row per date, one column per
# keyword). This writes trends_data as fetched; the NaN fill above was
# applied to df_merged, not to trends_data.
trends_data.to_csv(path_or_buf="trends.csv", index=False)
