In [25]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Preprocessing and data cleaning:

In [17]:
# Load the historical demand records from the Kaggle input directory and
# preview the first rows.
csv_path = "/kaggle/input/productdemandforecasting/Historical Product Demand.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
0,Product_0993,Whse_J,Category_028,2012/7/27,100
1,Product_0979,Whse_J,Category_028,2012/1/19,500
2,Product_0979,Whse_J,Category_028,2012/2/3,500
3,Product_0979,Whse_J,Category_028,2012/2/9,500
4,Product_0979,Whse_J,Category_028,2012/3/2,500


* Let's figure out how the product codes are distributed:


In [48]:
# Number of rows recorded for each product code.
product_code_counts = df["Product_Code"].value_counts()

# Mean, minimum and maximum rows per product code, one value per line.
print(
    product_code_counts.mean(),
    product_code_counts.min(),
    product_code_counts.max(),
    sep="\n",
)

485.4513888888889
1
16936


In [81]:
# Flatten the value counts into a two-column frame so plotly can refer to the
# per-product row count by column name.
product_code_counts_df = product_code_counts.reset_index()
product_code_counts_df.columns = ['Product_Code', 'Count']

# Plotly creates and renders its own figure, so no matplotlib figure is opened
# here; the former plt.figure(...) call only produced an empty
# "Figure ... with 0 Axes" output.
fig = px.box(product_code_counts_df, x='Count', 
             title='Horizontal Box plot of Product_Code counts', 
             labels={'Count': 'Count', 'count': 'Frequency'})

fig.update_traces(hovertemplate='Count: %{x}<br>Frequency: %{y}')

fig.show()


<Figure size 1200x800 with 0 Axes>

In [82]:
# Histogram of how many rows each product code has.
# Plotly creates and renders its own figure, so no matplotlib figure is opened
# here; the former plt.figure(...) call only produced an empty
# "Figure ... with 0 Axes" output.
fig = px.histogram(product_code_counts_df, x='Count', 
                   title='Hist Plot of Product_Code Counts', 
                   labels={'Count': 'Count', 'count': 'Frequency'})

fig.update_traces(hovertemplate='Count: %{x}<br>Frequency: %{y}')
fig.show()

<Figure size 1200x800 with 0 Axes>

In [85]:
# Violin plot of the same per-product row counts.
violin_plot = px.violin(
    product_code_counts_df,
    x="Count",
    title="Violin Plot of Product_Code Counts",
    labels={"Count": "Count", "count": "Frequency"},
)
violin_plot.show()

In [75]:
# How many product codes appear in fewer than 99 rows.
product_code_counts_df.loc[product_code_counts_df["Count"].lt(99)].count()

Product_Code    562
Count           562
dtype: int64

In [7]:
# Missing values per column.
df.isna().sum()

Product_Code            0
Warehouse               0
Product_Category        0
Date                11239
Order_Demand            0
dtype: int64

In [8]:
# Percentage of rows whose Date is missing.
df["Date"].isna().sum() / len(df["Date"]) * 100

1.0718355863910547

*Conclusion:
Only about one percent of the rows have a missing date, so we can replace those values with a placeholder for now; if we later see no correlation, we can delete the corresponding rows.*

In [9]:
# Fill missing dates with a placeholder string. The result is assigned back to
# the column instead of calling fillna(..., inplace=True) on the df["Date"]
# selection: pandas deprecates that chained in-place pattern, and under
# Copy-on-Write it does not update `df`.
df["Date"] = df["Date"].fillna("0000/0/0")

In [10]:
# Re-check missing values per column after filling the Date column.
df.isna().sum()

Product_Code        0
Warehouse           0
Product_Category    0
Date                0
Order_Demand        0
dtype: int64

In [16]:
# Average number of rows per product code (same computation as the mean
# printed from product_code_counts earlier in the notebook).
df["Product_Code"].value_counts().mean()

485.4513888888889

In [24]:
# Group the feature columns (everything except the Order_Demand target) by date.
# The frame is built from `df` directly so this cell does not depend on `X`,
# which is only defined in a later cell, and the garbled method name is
# replaced with the intended groupby call.
grouped_df = df.drop(columns=["Order_Demand"]).groupby("Date")
# On a GroupBy object, head() returns the first rows of every group.
grouped_df.head()

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date
0,Product_0993,Whse_J,Category_028,2012/7/27
1,Product_0979,Whse_J,Category_028,2012/1/19
2,Product_0979,Whse_J,Category_028,2012/2/3
3,Product_0979,Whse_J,Category_028,2012/2/9
4,Product_0979,Whse_J,Category_028,2012/3/2
...,...,...,...,...
1020886,Product_1287,Whse_J,Category_019,2016/11/27
1025178,Product_1423,Whse_J,Category_019,2016/3/27
1028030,Product_1439,Whse_J,Category_019,2016/3/27
1038439,Product_0445,Whse_J,Category_015,2016/2/21


In [21]:
# Separate the predictors (every column except the target) from the target,
# keeping the target as a one-column DataFrame.
X = df.drop(columns=["Order_Demand"])
Y = df["Order_Demand"].to_frame()

In [22]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date
0,Product_0993,Whse_J,Category_028,2012/7/27
1,Product_0979,Whse_J,Category_028,2012/1/19
2,Product_0979,Whse_J,Category_028,2012/2/3
3,Product_0979,Whse_J,Category_028,2012/2/9
4,Product_0979,Whse_J,Category_028,2012/3/2


In [20]:
# Preview the first rows of the target frame.
Y.head()

Unnamed: 0,Order_Demand
0,100
1,500
2,500
3,500
4,500


In [None]:
# Summary of the Warehouse column.
df["Warehouse"].describe()

In [None]:
# Summary of the Product_Code column.
df["Product_Code"].describe()

In [None]:
# Summary of the Product_Category column.
df["Product_Category"].describe()

In [None]:
# Replace each product code with an integer label.
label_encoder = LabelEncoder()
X["Product_Code"] = label_encoder.fit_transform(X["Product_Code"])

# One-hot encode the warehouse and category columns.
# NOTE(review): the None prefix presumably leaves the warehouse dummy columns
# named after the raw values - confirm against the resulting column names.
X = pd.get_dummies(
    X,
    columns=["Warehouse", "Product_Category"],
    prefix=[None, "Product"],
)
X.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# DataFrame.size is the total number of elements (rows x columns), not the
# number of rows.
print(x_train.size, x_test.size, sep="\n")