In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from PIL import Image as PILImage
from IPython.display import display

# Location of the WebP banner image shown at the top of the notebook
image_filename = '/kaggle/input/salesforecast/Saleforecast.webp'

# Open the file inside a context manager so the underlying file handle is
# released as soon as the pixels have been decoded.
with PILImage.open(image_filename) as webp_image:
    # convert() returns a new, fully loaded RGB image that stays usable
    # after the source file is closed.
    rgb_image = webp_image.convert('RGB')

# Render the image in the cell output
display(rgb_image)

<h1 style="font-size: 35px; font-weight: bold; color: black;">Sales Forecasting with Time Series, ARIMA, SARIMA, LightGBM, Random Forest, LSTM</h1>


# Importing Necessary Libraries 

In [None]:
pip install pmdarima

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from tensorflow import keras 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, LSTM
import warnings
warnings.filterwarnings("ignore") # suppressing warnings

In [None]:
# Read train.csv of the sales-forecasting dataset and preview its first five rows
df = pd.read_csv('/kaggle/input/sales-forecasting/train.csv')
df.head()

In [None]:
df.tail()

In [None]:
# getting more information from the data
df.info()

In [None]:
# checking how many rows and columns in the dataset
df.shape

In [None]:
# statistical summary
df.describe()

In [None]:
# printing the column names in the data
df.columns

In [None]:
# data types of columns
df.dtypes

# Cleaning Dataset by Removing Missing Values & Duplicates

In [None]:
df.isnull().sum().sort_values(ascending=False)

In [None]:
df.duplicated().any() # use df.drop_duplicates(inplace=True) -> if any duplicates exist in the dataframe

In [None]:
# Drop every row that has a missing value in any column.
# NOTE: this mutates `df` in place, so all later cells see the reduced frame
# and the original rows can only be recovered by re-reading the CSV.
df.dropna(axis=0, inplace=True)

# Customer Segmentation Analysis

In [None]:
df.columns

In [None]:
# Show the ten rows with the largest 'Sales' value together with their
# customer segment and location columns.
sales_columns = ['Segment', 'Country', 'City', 'State', 'Sales']
features = df.loc[:, sales_columns]
top_10_sales = features.nlargest(10, 'Sales')

print('Top 10 Sales Revenue')
print(top_10_sales[sales_columns])

In [None]:
# Total sales per customer segment, largest first
segment_sales = df.groupby('Segment')['Sales'].sum().sort_values(ascending=False)

# Draw on a single explicit axes. The previous version selected slot 1 of a
# 1x3 grid twice on a 20x6 figure, leaving two thirds of the figure blank.
fig, ax = plt.subplots(figsize=(8, 6))
segment_sales.plot(kind='bar', color='green', ax=ax)
ax.set_title('Sales by Segment')
ax.set_xlabel('Segment')
ax.set_ylabel('Total Sales')
plt.show()

In [None]:
# Five cities with the highest total sales (named for what it holds: city
# totals, not segment totals)
city_sales = df.groupby('City')['Sales'].sum().sort_values(ascending=False).head(5)

# Single explicit axes instead of slot 2 of an otherwise empty 1x3 grid
fig, ax = plt.subplots(figsize=(8, 6))
city_sales.plot(kind='bar', color='purple', ax=ax)
ax.set_title('Top 5 Cities Based on Sales')
ax.set_xlabel('City')
ax.set_ylabel('Total Sales')
plt.show()

In [None]:
# Five states with the highest total sales (named for what it holds: state
# totals, not segment totals)
state_sales = df.groupby('State')['Sales'].sum().sort_values(ascending=False).head(5)

# Single explicit axes instead of slot 3 of an otherwise empty 1x3 grid
fig, ax = plt.subplots(figsize=(8, 6))
state_sales.plot(kind='bar', color='lightgreen', ax=ax)
ax.set_title('Top 5 States Based on Sales')
ax.set_xlabel('State')
ax.set_ylabel('Total Sales')
plt.show()

In [None]:
# Sum of sales for every (segment, product category) pair, shown as an annotated heatmap
heatmap_data = df.pivot_table(values='Sales', index='Segment', columns='Category', aggfunc='sum')

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt='.0f',
    cmap='viridis',
    cbar_kws={'label': 'Total Sales'},
    ax=ax,
)
ax.set_title("Segment Sales by Product Category")
plt.show()

# Analyzing Order Fulfilment Efficiency

In [None]:
# Parse both date columns from day/month/year strings into datetimes
for date_column in ('Order Date', 'Ship Date'):
    df[date_column] = pd.to_datetime(df[date_column], format='%d/%m/%Y')

# Whole days elapsed between placing an order and shipping it
df['Processing Time'] = (df['Ship Date'] - df['Order Date']).dt.days

# Median processing time per sub-category, shortest first
median_processing_time = (
    df.groupby('Sub-Category')['Processing Time']
    .median()
    .sort_values()
)
print('Median Processing Time for each Product Sub-Category')
print(median_processing_time)

In [None]:
# Bar chart of the median processing time of each product sub-category
fig, ax = plt.subplots(figsize=(12, 6))
median_processing_time.plot(kind='bar', color='yellow', ax=ax)
ax.set_title('Median Processing Time by Product Sub-Category')
ax.set_xlabel('Product Sub-Category')
ax.set_ylabel('Median Processing Time(Days)')
plt.show()

# Analyzing Sales Performance Trend

In [None]:
# Make sure 'Order Date' is a datetime column before extracting the year.
# The explicit day/month/year format matches the other parsing cells, so a
# raw string column can never be silently read as month/day/year; on a
# column that is already datetime the call leaves the values unchanged.
df['Order Date'] = pd.to_datetime(df['Order Date'], format='%d/%m/%Y')

# Calendar year of each order, used for the yearly aggregations below
df['Year'] = df['Order Date'].dt.year

In [None]:
# Total sales for every (year, category, sub-category) combination
yearly_sub_category_sales = df.groupby(
    ['Year', 'Category', 'Sub-Category'], as_index=False
)['Sales'].sum()

# Row label of the best-selling combination within each year
best_row_labels = yearly_sub_category_sales.groupby('Year')['Sales'].idxmax()

# Keep only those best-selling rows
highest_selling_product = yearly_sub_category_sales.loc[best_row_labels]

print('Best Performance Product Category and Sub Category for Each Year')
print(highest_selling_product[['Year', 'Category', 'Sub-Category', 'Sales']])


In [None]:
# One bar per year, coloured by the sub-category that sold best in that year
plt.figure(figsize=(12, 6))
sns.barplot(x='Year', y='Sales', hue='Sub-Category', data=highest_selling_product)
plt.title('Best Performing Product Sub Category for Each Year')
# The x axis carries the year; the sub-category is encoded by the hue/legend
plt.xlabel('Year')
plt.ylabel('Sales Revenue')
# Place the legend outside the plotting area so it does not cover the bars
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()


# Forecasting Sales with ARIMA

In [None]:
#pip install pmdarima

In [None]:
# Parse the order dates (day/month/year) and list them in chronological order
order_dates = pd.to_datetime(df["Order Date"], format='%d/%m/%Y')
df["Order Date"] = order_dates
sorted_date = order_dates.sort_values()
print(sorted_date)

In [None]:
from pmdarima import auto_arima

# Yearly sales totals per category; columns are Category, Order Date (the year) and Sales
sales_by_category = df.groupby(["Category", df["Order Date"].dt.year])["Sales"].sum().reset_index()


def forecast_sales(category_data):
    """Fit an auto-selected ARIMA model to one category and forecast one period ahead.

    Parameters
    ----------
    category_data : pandas.DataFrame
        Rows of a single category with the columns "Category", "Order Date"
        (used as the series index) and "Sales".

    Returns
    -------
    pandas.Series
        "Category" (taken from the first row) and "Forecasted_Sales_2019"
        (the one-step-ahead forecast).
        NOTE(review): the 2019 label assumes the last observed year is 2018 - confirm.
    """
    sales_series = category_data.set_index("Order Date")["Sales"]
    model = auto_arima(sales_series, seasonal=True, suppress_warnings=True, stepwise=True)
    forecast = model.predict(n_periods=1)
    return pd.Series({
        "Category": category_data["Category"].iloc[0],
        "Forecasted_Sales_2019": forecast.sum()
    })


# Forecast each category by iterating the groups explicitly. groupby(...).apply
# is avoided because passing the grouping column ("Category") to the applied
# function is deprecated, and forecast_sales reads that column.
forecasted_sales = pd.DataFrame(
    [
        forecast_sales(category_rows)
        for _category, category_rows in sales_by_category.groupby("Category")
    ]
)

# Format the forecast with thousands separators and two decimals for display
forecasted_sales["Forecasted_Sales_2019"] = forecasted_sales["Forecasted_Sales_2019"].apply(lambda x: '{:,.2f}'.format(x))

# Displaying the result
print("Forecasted Sales in 2019 for Each Product Category:")
print(forecasted_sales[["Category", "Forecasted_Sales_2019"]])

# Forecasting Sales with SARIMA

In [None]:
#pip install statsmodels

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX


# Monthly sales totals per category. The model below uses a seasonal period of
# 12, which only makes sense on monthly observations; yearly totals give a
# handful of points and no 12-step season to learn from.
sales_by_category = (
    df.groupby(["Category", pd.Grouper(key="Order Date", freq="MS")])["Sales"]
    .sum()
    .reset_index()
)


def forecast_sales(category_data):
    """Forecast the next 12 months of one category with SARIMA and return their total.

    Parameters
    ----------
    category_data : pandas.DataFrame
        Rows of a single category with the columns "Category", "Order Date"
        (month-start timestamps) and "Sales".

    Returns
    -------
    pandas.Series
        "Category" (taken from the first row) and "Forecasted_Sales_2019"
        (sum of the 12 monthly forecasts that follow the last observed month).
        NOTE(review): this equals calendar-year 2019 only if the series ends
        in December 2018 - confirm against the data.
    """
    sales_series = (
        category_data.set_index("Order Date")["Sales"]
        .sort_index()
        # Regular month-start frequency; months without any sale count as 0
        .asfreq("MS", fill_value=0.0)
    )
    sarima_model = SARIMAX(sales_series, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
    sarima_results = sarima_model.fit(disp=False)
    forecast = sarima_results.get_forecast(steps=12).predicted_mean
    return pd.Series({
        "Category": category_data["Category"].iloc[0],
        "Forecasted_Sales_2019": forecast.sum()
    })


# Forecast each category by iterating the groups explicitly. groupby(...).apply
# is avoided because passing the grouping column ("Category") to the applied
# function is deprecated, and forecast_sales reads that column.
forecasted_sales = pd.DataFrame(
    [
        forecast_sales(category_rows)
        for _category, category_rows in sales_by_category.groupby("Category")
    ]
)

# Format the forecast with thousands separators and two decimals for display
forecasted_sales["Forecasted_Sales_2019"] = forecasted_sales["Forecasted_Sales_2019"].apply(lambda x: '{:,.2f}'.format(x))

# Displaying the result
print("Forecasted Sales in 2019 for Each Product Category:")
print(forecasted_sales[["Category", "Forecasted_Sales_2019"]])


# Forecasting Sales with LightGBM

In [None]:
#pip install lightgbm scikit-learn

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Yearly sales totals per category; the "Order Date" column holds the integer year
sales_by_category = df.groupby(["Category", df["Order Date"].dt.year])["Sales"].sum().reset_index()
forecasted_sales_list = []  # one single-row DataFrame per category
target_year = 2020

# LightGBM settings shared by every per-category model
params = {
    "objective": "regression",
    "metric": "mse",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05
}
features = ["Year"]
target = "Sales"

# Fit one LightGBM regressor per category with the year as the only feature
for category in sales_by_category["Category"].unique():
    # "Order Date" already contains integer years. Passing those integers to
    # pd.to_datetime would read them as nanoseconds since the epoch and turn
    # every year into 1970, so the column is simply renamed. rename() also
    # returns a new frame, which avoids assigning into a slice.
    category_data = (
        sales_by_category[sales_by_category["Category"] == category]
        .rename(columns={"Order Date": "Year"})
    )

    # Chronological split (no shuffling); the held-out part is not used in this cell
    train_data, valid_data = train_test_split(category_data, test_size=0.2, shuffle=False)

    # NOTE(review): with only a few yearly rows per category the default
    # LightGBM leaf-size constraints may prevent any split - confirm the
    # model is not just returning the training mean.
    train_dataset = lgb.Dataset(train_data[features], label=train_data[target])
    model = lgb.train(params, train_dataset, num_boost_round=1000)

    forecast_data = pd.DataFrame({"Year": [target_year]})
    forecast = model.predict(forecast_data)

    # Append individual DataFrames to the list
    forecasted_sales_list.append(pd.DataFrame({
        "Category": [category],
        "Forecasted_Sales_2020": forecast.sum()
    }))

# Concatenate individual DataFrames into the final result
forecasted_sales = pd.concat(forecasted_sales_list, ignore_index=True)

# Format the Forecasted_Sales_2020 column
forecasted_sales["Forecasted_Sales_2020"] = forecasted_sales["Forecasted_Sales_2020"].apply(lambda x: '{:,.2f}'.format(x))

print("Forecasted Sales in 2020 for Each Product Category:")
print(forecasted_sales[["Category", "Forecasted_Sales_2020"]])

# Forecasting Sales with Random Forest

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Yearly sales totals per category; the "Order Date" column holds the integer year
sales_by_category = df.groupby(["Category", df["Order Date"].dt.year])["Sales"].sum().reset_index(name="Sales")

target_year = 2020
features = ["Year"]
target = "Sales"

# Collect one record per category and build the result frame once at the end,
# instead of concatenating onto an initially empty DataFrame on every iteration.
forecast_records = []

for category in sales_by_category["Category"].unique():
    # Rows of the current category, with the integer year exposed as "Year"
    category_data = (
        sales_by_category[sales_by_category["Category"] == category]
        .rename(columns={"Order Date": "Year"})
    )

    # Chronological split (no shuffling); the held-out part is not used in this cell
    train_data, valid_data = train_test_split(category_data, test_size=0.2, shuffle=False)

    # Fixed random_state keeps the forest reproducible between runs
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(train_data[features], train_data[target])

    # Predict the sales of the target year
    forecast_data = pd.DataFrame({"Year": [target_year]})
    forecast = model.predict(forecast_data)

    forecast_records.append({
        "Category": category,
        "Forecasted_Sales_2020": forecast[0]
    })

# Explicit columns keep the frame well-formed even when there are no categories
forecasted_sales = pd.DataFrame(forecast_records, columns=["Category", "Forecasted_Sales_2020"])

# Formatting the 'Forecasted_Sales_2020' column to have commas and two decimal places
forecasted_sales["Forecasted_Sales_2020"] = forecasted_sales["Forecasted_Sales_2020"].apply(lambda x: '{:,.2f}'.format(x))

# Printing the forecasted sales for each category for the year 2020
print("Forecasted Sales in 2020 for Each Product Category:")
print(forecasted_sales)


# Forecasting Sales with LSTM

In [None]:
#pip install tensorflow

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.utils import set_random_seed


# Yearly sales totals per category; the "Order Date" column holds the integer year
sales_by_category = df.groupby(["Category", df["Order Date"].dt.year])["Sales"].sum().reset_index()

target_year = 2020

# Seed Python, NumPy and the backend so the forecasts are reproducible
set_random_seed(42)

# One record per category; the result frame is built once after the loop
forecast_records = []

for category in sales_by_category["Category"].unique():
    category_data = (
        sales_by_category[sales_by_category["Category"] == category]
        .sort_values("Order Date")
    )

    # A one-step-ahead model needs at least one (previous, next) pair
    if len(category_data) < 2:
        print(f"Skipping {category}: fewer than two yearly observations")
        continue

    # Scale sales to [0, 1]; kept as an array so no slice of the frame is modified
    scaler = MinMaxScaler()
    scaled_sales = scaler.fit_transform(
        category_data["Sales"].to_numpy().reshape(-1, 1)
    ).ravel()

    # Supervised pairs: the sales of year t predict the sales of year t + 1.
    # (Training with X equal to y would only teach the identity mapping.)
    X = scaled_sales[:-1].reshape((-1, 1, 1))
    y = scaled_sales[1:]

    # Define and compile the LSTM model
    model = Sequential()
    model.add(LSTM(50, input_shape=(1, 1)))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')

    # Fit the model
    model.fit(X, y, epochs=100, batch_size=16, verbose=0)

    # Roll the model forward from the last observed year up to the target
    # year, feeding each prediction back in as the next input. The input is
    # always a scaled sales value: the scaler was fitted on sales, so the
    # target year itself must never be passed through it.
    last_year = int(category_data["Order Date"].max())
    steps_ahead = max(target_year - last_year, 1)
    current_value = float(scaled_sales[-1])
    for _step in range(steps_ahead):
        step_input = np.array([[[current_value]]])
        current_value = float(model.predict(step_input, verbose=0)[0, 0])

    # Back to the original sales scale
    forecast = scaler.inverse_transform([[current_value]])[0][0]

    forecast_records.append({
        "Category": category,
        "Forecasted_Sales_2020": forecast
    })

# Explicit columns keep the frame well-formed even when every category was skipped
forecasted_sales = pd.DataFrame(forecast_records, columns=["Category", "Forecasted_Sales_2020"])

# Format the 'Forecasted_Sales_2020' column
forecasted_sales["Forecasted_Sales_2020"] = forecasted_sales["Forecasted_Sales_2020"].apply(lambda x: '{:,.2f}'.format(x))

# Print the forecasted sales
print("Forecasted Sales in 2020 for Each Product Category:")
print(forecasted_sales)

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Yearly sales totals per category; the "Order Date" column holds the integer year
sales_by_category = df.groupby(["Category", df["Order Date"].dt.year])["Sales"].sum().reset_index()

target_year = 2020
features = ["Year"]
target = "Sales"

# LightGBM settings shared by every per-category model
params = {
    "objective": "regression",
    "metric": "mse",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05
}

forecast_records = []  # one forecast row per category
result = []            # one validation summary per category

# Each category's model is trained once and used for both the target-year
# forecast and the hold-out evaluation.
for category in sales_by_category["Category"].unique():
    # "Order Date" already contains integer years. Passing those integers to
    # pd.to_datetime would read them as nanoseconds since the epoch and turn
    # every year into 1970, so the column is simply renamed.
    category_data = (
        sales_by_category[sales_by_category["Category"] == category]
        .rename(columns={"Order Date": "Year"})
    )

    # Chronological split (no shuffling): the most recent rows form the validation set
    train_data, valid_data = train_test_split(category_data, test_size=0.2, shuffle=False)

    train_dataset = lgb.Dataset(train_data[features], label=train_data[target])
    model = lgb.train(params, train_dataset, num_boost_round=1000)

    # Forecast for the target year
    forecast_data = pd.DataFrame({
        "Year": [target_year]
    })
    forecast = model.predict(forecast_data)
    forecast_records.append({
        "Category": category,
        "Forecasted_Sales_2020": forecast.sum()
    })

    # Hold-out evaluation: absolute error and raw residuals on the validation rows
    true_values = valid_data[target]
    predicted_values = model.predict(valid_data[features])
    mae = mean_absolute_error(true_values, predicted_values)
    residuals = true_values - predicted_values
    result.append({
        "Category": category,
        "True_Values": true_values.values,
        "Predicted_Values": predicted_values,
        "MAE": mae,
        "Residual": residuals.values
    })

# Explicit columns keep the frame well-formed even when there are no categories
forecasted_sales = pd.DataFrame(forecast_records, columns=["Category", "Forecasted_Sales_2020"])

forecasted_sales["Forecasted_Sales_2020"] = forecasted_sales["Forecasted_Sales_2020"].apply(lambda x: '{:,.2f}'.format(x))
print("Forecasted Sales in 2020 for Each Product Category:")
print(forecasted_sales)

# Per-category validation report
for entry in result:
    print(f"Product Category: {entry['Category']}")
    print(f"True Values: {entry['True_Values']}")
    print(f"Predicted_Values: {entry['Predicted_Values']}")
    print(f"MAE: {entry['MAE']}")
    print(f"Residual: {entry['Residual']}")
