## DEVELOPING A FORECASTING MODEL USING BUSINESS INTELLIGENCE TOOLS (BI)

## Apply random forest regression on each product separatedly 

In [1]:
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

pd.options.display.width = None
warnings.filterwarnings('ignore')
pd.options.display.max_rows = None
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # format numeric outputs

In [2]:
def generate_date_range(date_scaler, start_date, periods_num):

    dates = pd.date_range(start=start_date, periods=periods_num, freq='7D')
    dates = pd.to_numeric(dates)
    dates = date_scaler.transform(np.array(dates).reshape(-1,1))

    return dates

In [3]:
def get_date_range(start_date, periods_num):

    dates = pd.date_range(start = start_date, periods=periods_num, freq='7D')
    dates = pd.to_datetime(dates)

    return dates

In [4]:
df = pd.read_csv("weekly data.csv")
df['order_date'] = pd.to_datetime(df['order_date'])

In [5]:
frames = list()

for i in df['product_id'].unique():
    frame = df[df['product_id'] == i]
    frame = frame.sort_values(by='order_date')
    frames.append(frame)

df = pd.concat(frames, ignore_index=True)
df.head()

Unnamed: 0,order_date,product_id,quantities_sold,season
0,2019-09-01,8,4.0,Summer
1,2019-09-08,8,3.0,Summer
2,2019-09-15,8,0.0,Summer
3,2019-09-22,8,2.0,Summer
4,2019-09-29,8,2.0,Autumn


In [6]:
cor_list = list()
list_cor = list()
for i in df['product_id'].unique():
    info = dict()
    temp_f = df[df['product_id'] == i]

    temp_f['date'] = pd.to_datetime(temp_f['order_date'])
    temp_f['date'] = pd.to_numeric(temp_f['date'])
    temp_f['target'] = temp_f['quantities_sold']

    temp_f.drop(['order_date', 'product_id', 'season', 'quantities_sold'], axis=1, inplace=True)
    cor = temp_f.corr()

    info['product_id'] = [i]
    info['correlation with date'] = ['{:.2f}'.format(cor['date'][1])]
    
    if np.abs(cor['date'][1]) < 0.50:
        df.drop(df[df['product_id'] == i].index, inplace=True)
        
    list_cor.append(cor['date'][1])
    cor_list.append(pd.DataFrame(info))
pd.concat(cor_list, ignore_index=True).to_csv('correlation with date.csv', index=False)

In [None]:
fig = px.histogram(list_cor, nbins=20,title='Products correlation with date', text_auto=True)
fig.update_layout(bargap=0.4)
fig.update_xaxes(title_text='Correlation')
fig.update_yaxes(title_text='Count')
fig.show()

In [6]:
df.drop('season', axis=1, inplace=True)
df.head()

Unnamed: 0,order_date,product_id,quantities_sold
0,2019-09-01,8,4.0
1,2019-09-08,8,3.0
2,2019-09-15,8,0.0
3,2019-09-22,8,2.0
4,2019-09-29,8,2.0


In [13]:
df1 = df.copy()
df1.head()

Unnamed: 0,order_date,product_id,quantities_sold
0,2019-09-01,8,4.0
1,2019-09-08,8,3.0
2,2019-09-15,8,0.0
3,2019-09-22,8,2.0
4,2019-09-29,8,2.0


In [7]:
def plot_results(data_frame, model, scaler1, scaler2, product_id, p_num):
    frame = data_frame
    p_frame = frame[frame['product_id'] == product_id]
    p_frame['order_date'] = pd.to_datetime(p_frame['order_date'])
    p_frame = p_frame.sort_values(by='order_date')

    date = generate_date_range(scaler1, p_frame['order_date'].iloc[0], len(p_frame) + p_num)
    
    row = list()
    predicted_values = list()
    for j in date:
        row.append(j)
        arr = np.array(row).reshape(1, -1)
        predicted_values.append(scaler2.inverse_transform(np.array(model.predict(arr)[0]).reshape(1, -1))[0][0])
        row.clear()
    dates = get_date_range(p_frame['order_date'].iloc[0], len(p_frame) + p_num) 
    fig = px.line(x=p_frame['order_date'], y= p_frame['quantities_sold'], title='predicted values for prodcut id ' + str(product_id))
    fig.add_scatter(x=dates, y=predicted_values, name='predicted') 
    fig.update_xaxes(title='date (weeks)')
    fig.update_yaxes(title='quantity')
    fig.show()

In [8]:
def forecast_one_product(weekly_df, product_id):

    one_product = weekly_df[weekly_df['product_id'] == product_id]
    one_product = one_product.sort_values(by='order_date')

    sc1 = StandardScaler()
    sc2 = StandardScaler()

    sc1.fit(np.array(one_product['order_date']).reshape(-1, 1))
    sc2.fit(np.array(one_product['quantities_sold']).reshape(-1, 1))
    one_product.drop('product_id', axis=1, inplace=True)

    temp = sc1.transform(np.array(one_product['order_date']).reshape(-1,1))
    temp = pd.DataFrame(temp)
    temp.rename(columns={0:'order_date'}, inplace=True)

    new_df = pd.DataFrame()
    new_df = pd.concat([new_df, temp], axis=1)
    new_df['target'] = sc2.transform(np.array(one_product['quantities_sold']).reshape(-1,1)) 

    X_train = new_df.drop('target', axis=1)
    y_train = new_df['target']

    estimators = [i for i in range(50, 300, 10)]
    MSE_values = []

    for i in estimators:
        regressor = RandomForestRegressor(n_estimators=i, n_jobs=-1, random_state=42)
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_train)
        MSE = sc2.inverse_transform(np.array(mean_squared_error(y_train, y_pred)).reshape(1, -1))[0][0]
        MSE_values.append(MSE)

    regressor = RandomForestRegressor(n_estimators=estimators[MSE_values.index(np.min(MSE_values))], n_jobs=-1, random_state=42)
    regressor.fit(X_train, y_train)
    MSE = sc2.inverse_transform(np.array(mean_squared_error(y_train, y_pred)).reshape(1, -1))[0][0]
    
    print('Optimal got with {} esimators'.format(estimators[MSE_values.index(np.min(MSE_values))]))
    print('Training R^2 = {}'.format(regressor.score(X_train, y_train)))
    print('MSE = {}'.format(MSE))

    return regressor, sc1, sc2, product_id


In [None]:
for i in df['product_id'].unique():
    print(i)

In [15]:
product_model, scale1, scale2, prod_id = forecast_one_product(df, 15)  # df, product_id

Optimal got with 70 esimators
Training R^2 = 0.9569154669283246
MSE = 2060.581187667448


In [None]:
plot_results(df1, product_model, scale1, scale2, prod_id, 3)  # number predicted values

In [None]:
for i in df['product_id'].unique():
    product_model, scale1, scale2, prod_id = forecast_one_product(df, i)
    plot_results(df1, product_model, scale1, scale2, prod_id, 5)
    