In [1]:
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import statsmodels.api as sm

pd.options.display.width = None
warnings.filterwarnings('ignore')
pd.options.display.max_rows = None
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # format numeric outputs

In [2]:
def generate_date_range(date_scaler, start_date, periods_num):

    dates = pd.date_range(start = start_date, periods=periods_num, freq='1M')
    dates = pd.to_numeric(dates)
    dates = date_scaler.transform(np.array(dates).reshape(-1,1))

    return dates
  

In [3]:
def get_date_range(start_date, periods_num):

    dates = pd.date_range(start = start_date, periods=periods_num, freq='1M')
    dates = pd.to_datetime(dates)

    return dates

In [4]:
df = pd.read_csv('monthly data.csv')
df['order_date'] = pd.to_datetime(df['order_date'])

In [5]:
frames = list()

for i in df['product_id'].unique():
    frame = df[df['product_id'] == i]
    frame = frame.sort_values(by='order_date')
    frames.append(frame)

df = pd.concat(frames, ignore_index=True)
df.head()

Unnamed: 0,order_date,product_id,quantities_sold
0,2019-09-30,8,11.0
1,2019-10-31,8,62.0
2,2019-11-30,8,151.0
3,2019-12-31,8,588.0
4,2020-01-31,8,719.0


In [6]:
print('Number of products = {} products'.format(len(df['product_id'].unique())))

Number of products = 844 products


In [7]:
print('Size of data frame = {} records'.format(len(df)))

Size of data frame = 22296 records


# Calculate correlation with date

In [8]:
cor_list = list()
list_cor = list()
for i in df['product_id'].unique():
    info = dict()
    temp_f = df[df['product_id'] == i]

    temp_f['date'] = pd.to_datetime(temp_f['order_date'])
    temp_f['date'] = pd.to_numeric(temp_f['date'])
    temp_f['target'] = temp_f['quantities_sold']

    temp_f.drop(['order_date', 'product_id', 'quantities_sold'], axis=1, inplace=True)
    cor = temp_f.corr()

    info['product_id'] = [i]
    info['correlation with date'] = ['{:.2f}'.format(cor['date'][1])]
    
    if np.abs(cor['date'][1]) < 0.50:
        df.drop(df[df['product_id'] == i].index, inplace=True)
        
    list_cor.append(cor['date'][1])
    cor_list.append(pd.DataFrame(info))
pd.concat(cor_list, ignore_index=True).to_csv('correlation with date monthly.csv', index=False)

* We will ignore products that have correlation less then 0.50
* So, the number of products will decrease as there are prodcts have correlation less than 0.20

In [9]:
df['order_date'] = pd.to_numeric(df['order_date'])

In [10]:
sc1 = StandardScaler()
sc2 = StandardScaler()

sc1.fit(np.array(df['order_date']).reshape(-1, 1))
sc2.fit(np.array(df['quantities_sold']).reshape(-1, 1))

StandardScaler()

In [11]:
print('Number of products have correlation greater than or equal 0.50 = {} products'.format(len(df['product_id'].unique())))

Number of products have correlation greater than or equal 0.50 = 625 products


In [12]:
print('Size of data frame after cancelling products with no crrelation = {} records'.format(len(df)))

Size of data frame after cancelling products with no crrelation = 16698 records


In [None]:
fig = px.histogram(list_cor, nbins=20,title='Products correlation with date', text_auto=True)
fig.update_layout(bargap=0.4)
fig.update_xaxes(title_text='Correlation')
fig.update_yaxes(title_text='Count')
fig.show()

# Take 2 months from from each prodct to be used in testing

In [14]:
test_set = pd.DataFrame()
frames = list()

for i in df['product_id'].unique():
    frame = df[df['product_id'] == i]
    test_set = pd.concat([test_set, frame.tail(2)], ignore_index=True)
    frame.drop(frame.tail(2).index, inplace=True)
    frames.append(frame)

df = pd.concat(frames, ignore_index=True)

In [15]:
len(df), len(test_set)

(15448, 1250)

In [16]:
temp = sc1.transform(np.array(df['order_date']).reshape(-1,1))
temp = pd.DataFrame(temp)
temp.rename(columns={0:'order_date'}, inplace=True)

dummy1 = pd.get_dummies(df['product_id'])
dummy1.drop(8, axis=1, inplace=True)

new_df = pd.DataFrame()
new_df = pd.concat([new_df, temp, dummy1], axis=1)

new_df['target'] = sc2.transform(np.array(df['quantities_sold']).reshape(-1,1)) 
new_df.head()


Unnamed: 0,order_date,10,11,12,13,14,15,16,19,22,...,4383,4391,4394,4395,4396,4398,4407,4409,4415,target
0,-1.78375,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.30311
1,-1.65335,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.25943
2,-1.52716,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.18321
3,-1.39676,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.19105
4,-1.26636,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.30325


In [17]:
temp = sc1.transform(np.array(test_set['order_date']).reshape(-1, 1))
temp = pd.DataFrame(temp)
temp.rename(columns={0:'order_date'}, inplace=True)

dummy1 = pd.get_dummies(test_set['product_id'])
dummy1.drop(8, axis=1,inplace=True)

new_test_set = pd.DataFrame()
new_test_set = pd.concat([new_test_set, temp, dummy1], axis=1)
new_test_set['target'] = sc2.transform(np.array(test_set['quantities_sold']).reshape(-1,1))

new_test_set.head()

Unnamed: 0,order_date,10,11,12,13,14,15,16,19,22,...,4383,4391,4394,4395,4396,4398,4407,4409,4415,target
0,1.54776,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.54836
1,1.67816,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.10036
2,1.54776,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.14481
3,1.67816,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.083
4,1.54776,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.09414


In [18]:
X_train = new_df.drop('target', axis=1)
y_train = new_df['target']

X_test = new_test_set.drop('target', axis=1)
y_test = new_test_set['target']

len(X_train), len(X_test), len(y_train), len(y_test)

(15448, 1250, 15448, 1250)

# Fitting RandomForestRegressor

In [19]:
regressor = RandomForestRegressor(n_estimators=150, n_jobs=-1, random_state=42)
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=150, n_jobs=-1, random_state=42)

In [20]:
regressor.score(X_train, y_train)

0.7505713314071936

In [21]:
y_pred = regressor.predict(X_test)
mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)

(1.4712745638715947, 3.1321502417147307)

In [22]:
MAE = sc2.inverse_transform(np.array(mean_absolute_error(y_test, y_pred)).reshape(1, -1))[0][0]
MSE = sc2.inverse_transform(np.array(mean_squared_error(y_test, y_pred)).reshape(1, -1))[0][0]

print('MAE = {}\nMSE = {}'.format(MAE, MSE))

MAE = 2082.8275909471918
MSE = 4022.1218874022843


In [None]:
error = sc2.inverse_transform(np.array(y_test - y_pred).reshape(1,-1))
temp = list()
for i in error:
    for j in i:
        temp.append(j)

error = temp

fig = px.histogram(error, nbins=20,title='Monthly error (Random forest regression) test size ' + str(len(X_test)) + ' records',
                    text_auto=True)
fig.update_layout(bargap=0.4)
fig.update_xaxes(title_text='Error')
fig.update_yaxes(title_text='Count')
fig.show()

In [28]:
def predict_function(model, scaler1, scaler2, product_id):
    frame = pd.read_csv("monthly data.csv")
    p_frame = frame[frame['product_id'] == product_id]
    p_frame['order_date'] = pd.to_datetime(p_frame['order_date'])
    p_frame = p_frame.sort_values(by='order_date')

    date = generate_date_range(scaler1, p_frame['order_date'].iloc[0], len(p_frame)+2)
    p_id = product_id

    if p_id not in list(X_train.columns) and p_id != 8:
        print('Product is not found')
    else:
        row = list()
        predicted_values = list()
        for j in date:
            row.append(j)
            for i in X_train.columns:
                if i == p_id:
                    row.append(1)
                elif str(i).isnumeric():
                    row.append(0)
                    
            arr = np.array(row).reshape(1, -1)
            predicted_values.append(scaler2.inverse_transform(np.array(model.predict(arr)[0]).reshape(1, -1))[0][0])
            row.clear()

        dates = get_date_range(p_frame['order_date'].iloc[0], len(p_frame)+2)
        fig = px.line(x=p_frame['order_date'], y= p_frame['quantities_sold'], title='predicted values for prodcut id ' + str(p_id))
        fig.add_scatter(x=dates, y=predicted_values, name='predicted') 
        fig.update_xaxes(title='date')
        fig.update_yaxes(title='quantity')
        fig.show()

# Save the model and load it 

In [35]:
import pickle
pickle.dump(regressor, open('random_forest_625_products_monthly.pkl', 'wb'))

In [None]:
import pickle
loaded_model = pickle.load(open('random_forest_625_products_monthly.pkl', 'rb'))

In [None]:
for i in range(0, len(X_train.columns), 10):
    print(X_train.columns[i: i + 10])

In [None]:
predict_function(regressor, sc1, sc2, 3422)

### In random forests, there is no need for cross-validation or, Each tree is constructed using a different bootstrap sample from the original data.

# Using linear regression

In [40]:
df = pd.read_csv("monthly data.csv")
df['order_date'] = pd.to_datetime(df['order_date'])
df.head()

Unnamed: 0,order_date,product_id,quantities_sold
0,2019-09-30,8,11.0
1,2019-10-31,8,62.0
2,2019-11-30,8,151.0
3,2019-12-31,8,588.0
4,2020-01-31,8,719.0


In [41]:
df['order_date'] = pd.to_numeric(df['order_date'])

sc1 = StandardScaler()
sc2 = StandardScaler()

sc1.fit(np.array(df['order_date']).reshape(-1, 1))
sc2.fit(np.array(df['quantities_sold']).reshape(-1, 1))

StandardScaler()

In [42]:
print('Number of products = {}'.format(len(df['product_id'].unique())))

Number of products = 844


In [43]:
print('Size of data frame = {} records'.format(len(df)))

Size of data frame = 22296 records


In [44]:
cor_list = list()
list_cor = list()
for i in df['product_id'].unique():
    info = dict()
    temp_f = df[df['product_id'] == i]

    temp_f['date'] = pd.to_datetime(temp_f['order_date'])
    temp_f['date'] = pd.to_numeric(temp_f['date'])
    temp_f['target'] = temp_f['quantities_sold']

    temp_f.drop(['order_date', 'product_id', 'quantities_sold'], axis=1, inplace=True)
    cor = temp_f.corr()

    info['product_id'] = [i]
    info['correlation with date'] = ['{:.2f}'.format(cor['date'][1])]
    
    if np.abs(cor['date'][1]) < 0.50:
        df.drop(df[df['product_id'] == i].index, inplace=True)
        
    list_cor.append(cor['date'][1])
    cor_list.append(pd.DataFrame(info))
pd.concat(cor_list, ignore_index=True).to_csv('correlation with date monthly.csv', index=False)

In [45]:
print('Number of products have correlation greater than or equal 0.50 = {}  products'.format(len(df['product_id'].unique())))

Number of products have correlation greater than or equal 0.50 = 625  products


In [46]:
print('Size of data frame after cancelling products with no crrelation = {} records'.format(len(df)))

Size of data frame after cancelling products with no crrelation = 16698 records


# Take 2 months from from each prodct to be used in testing

In [47]:
test_set = pd.DataFrame()
frames = list()

for i in df['product_id'].unique():
    frame = df[df['product_id'] == i]
    frame = frame.sort_values(by='order_date')
    test_set = pd.concat([test_set, frame.tail(2)], ignore_index=True)
    frame.drop(frame.tail(2).index, inplace=True)
    frames.append(frame)

df = pd.concat(frames, ignore_index=True)
df.head()

Unnamed: 0,order_date,product_id,quantities_sold
0,1569801600000000000,8,11.0
1,1572480000000000000,8,62.0
2,1575072000000000000,8,151.0
3,1577750400000000000,8,588.0
4,1580428800000000000,8,719.0


In [48]:
len(df), len(test_set)

(15448, 1250)

In [49]:
df.head()

Unnamed: 0,order_date,product_id,quantities_sold
0,1569801600000000000,8,11.0
1,1572480000000000000,8,62.0
2,1575072000000000000,8,151.0
3,1577750400000000000,8,588.0
4,1580428800000000000,8,719.0


In [50]:
temp = sc1.transform(np.array(df['order_date']).reshape(-1, 1))
temp = pd.DataFrame(temp)
temp.rename(columns={0:'order_date'}, inplace=True)


dummy1 = pd.get_dummies(df['product_id'])
dummy1.drop(8, axis=1, inplace=True)

new_df = pd.DataFrame()
new_df = pd.concat([new_df, temp, dummy1], axis=1)
new_df['target'] = sc2.transform(np.array(df['quantities_sold']).reshape(-1, 1))
new_df.head(10)


Unnamed: 0,order_date,10,11,12,13,14,15,16,19,22,...,4383,4391,4394,4395,4396,4398,4407,4409,4415,target
0,-1.80119,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.27028
1,-1.66993,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.22049
2,-1.54291,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.13362
3,-1.41166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.29294
4,-1.28041,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.42081
5,-1.15763,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.41886
6,-1.02637,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.86494
7,-0.89936,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.4296
8,-0.7681,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.72634
9,-0.64108,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.97817


In [51]:
dummy1 = pd.get_dummies(test_set['product_id'])
dummy1.drop(8, axis=1,inplace=True)

temp = sc1.transform(np.array(test_set['order_date']).reshape(-1, 1))
temp = pd.DataFrame(temp)
temp.rename(columns={0:'order_date'}, inplace=True)

new_test_set = pd.DataFrame()
new_test_set = pd.concat([new_test_set, temp, dummy1], axis=1)
new_test_set['target'] = sc2.transform(np.array(test_set['quantities_sold']).reshape(-1,1))

new_test_set.head()

Unnamed: 0,order_date,10,11,12,13,14,15,16,19,22,...,4383,4391,4394,4395,4396,4398,4407,4409,4415,target
0,1.5521,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7.53863
1,1.68335,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.6088
2,1.5521,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.24023
3,1.68335,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.01942
4,1.5521,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.0321


In [53]:
X_train = new_df.drop('target', axis=1)
y_train = new_df['target']

X_test = new_test_set.drop('target', axis=1)
y_test = new_test_set['target']

len(X_train), len(X_test), len(y_train), len(y_test)


(15448, 1250, 15448, 1250)

In [54]:
lin_regressor = LinearRegression()
lin_regressor.fit(X_train, y_train)

LinearRegression()

In [55]:
lin_regressor.score(X_train, y_train)

0.6253849695281457

In [None]:
X2 = sm.add_constant(X_train)
est = sm.OLS(y_train, X2)
est2 = est.fit()
print(est2.summary())

In [57]:
y_pred = lin_regressor.predict(X_test)

mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)

(0.49978609239128235, 0.7523486156421124)

In [58]:
MAE = sc2.inverse_transform(np.array(mean_absolute_error(y_test, y_pred)).reshape(1, -1))[0][0]
MSE = sc2.inverse_transform(np.array(mean_squared_error(y_test, y_pred)).reshape(1, -1))[0][0]

print('MAE = {}\nMSE = {}'.format(MAE, MSE))

MAE = 799.9069025923925
MSE = 1058.6499122815628


In [59]:
lin_regressor.score(X_test, y_test)

0.6631964634384608

In [None]:
error = sc2.inverse_transform(np.array(y_test - y_pred).reshape(1,-1))
temp = list()
for i in error:
    for j in i:
        temp.append(j)

error = temp

fig = px.histogram(error, nbins=20,title='Monthly error (Linear regression)', text_auto=True)
fig.update_layout(bargap=0.4)
fig.update_xaxes(title_text='Error')
fig.update_yaxes(title_text='Count')
fig.show()

In [64]:
frame = pd.read_csv("monthly data.csv")
p_frame = frame[frame['product_id'] == 355]
p_frame['order_date'] = pd.to_datetime(p_frame['order_date'])
p_frame = p_frame.sort_values(by='order_date')
p_frame.head()

Unnamed: 0,order_date,product_id,quantities_sold
1891,2019-07-31,355,20.0
1892,2019-08-31,355,26.0
1893,2019-09-30,355,162.0
1894,2019-10-31,355,940.0
1895,2019-11-30,355,1025.0


In [65]:
date = generate_date_range(sc1, p_frame['order_date'].iloc[0], len(p_frame)+2)
p_id = 355

if p_id not in list(X_train.columns) and p_id != 8:
    print('Product is not found')
else:
    row = list()
    predicted_values = list()
    for j in date:
        row.append(j)
        for i in X_train.columns:
            if i == p_id:
                row.append(1)
            elif str(i).isnumeric():
                row.append(0)
                
        arr = np.array(row).reshape(1, -1)
        predicted_values.append(sc2.inverse_transform(np.array(lin_regressor.predict(arr)[0]).reshape(1,-1))[0][0])
        row.clear()

In [None]:
dates = get_date_range(p_frame['order_date'].iloc[0], len(p_frame) + 2)
fig = px.line(x=p_frame['order_date'], y= p_frame['quantities_sold'], title='predicted values for prodcut ' + str(p_id))
fig.add_scatter(x=dates, y=predicted_values, name='predicted') 
fig.update_xaxes(title='date')
fig.update_yaxes(title='quantity')
fig.show()

In [69]:
folds = KFold(n_splits = 50, shuffle = True, random_state = 100)
scores = cross_val_score(lin_regressor, X_train, y_train, scoring='r2', cv=folds)
np.mean(scores)   

0.5762910639200649

* 5 splits ==> 0.59
* 10 splits ==> 0.59
* 50 splits ==> 0.57