In [2]:
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import statsmodels.api as sm

pd.options.display.width = None
warnings.filterwarnings('ignore')
pd.options.display.max_rows = None
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # format numeric outputs

ModuleNotFoundError: No module named 'statsmodels'

In [2]:
def generate_date_range(date_scaler, start_date, periods_num):

    dates = pd.date_range(start = start_date, periods=periods_num, freq='7D')
    dates = pd.to_numeric(dates)
    dates = date_scaler.transform(np.array(dates).reshape(-1,1))

    return dates
  

In [3]:
def get_date_range(start_date, periods_num):

    dates = pd.date_range(start = start_date, periods=periods_num, freq='7D')
    dates = pd.to_datetime(dates)

    return dates

In [4]:
df = pd.read_csv("weekly data.csv")
df['order_date'] = pd.to_datetime(df['order_date'])

In [5]:
frames = list()

for i in df['product_id'].unique():
    frame = df[df['product_id'] == i]
    frame = frame.sort_values(by='order_date')
    frames.append(frame)

df = pd.concat(frames, ignore_index=True)
df.head()

Unnamed: 0,order_date,product_id,quantities_sold,season
0,2019-09-01,8,4.0,Summer
1,2019-09-08,8,3.0,Summer
2,2019-09-15,8,0.0,Summer
3,2019-09-22,8,2.0,Summer
4,2019-09-29,8,2.0,Autumn


In [6]:
print('Number of products = {} products'.format(len(df['product_id'].unique())))

Number of products = 843 products


In [7]:
print('Size of data frame = {} records'.format(len(df)))

Size of data frame = 93400 records


# Calculate correlation with date

In [8]:
cor_list = list()
list_cor = list()
for i in df['product_id'].unique():
    info = dict()
    temp_f = df[df['product_id'] == i]

    temp_f['date'] = pd.to_datetime(temp_f['order_date'])
    temp_f['date'] = pd.to_numeric(temp_f['date'])
    temp_f['target'] = temp_f['quantities_sold']

    temp_f.drop(['order_date', 'product_id', 'season', 'quantities_sold'], axis=1, inplace=True)
    cor = temp_f.corr()

    info['product_id'] = [i]
    info['correlation with date'] = ['{:.2f}'.format(cor['date'][1])]
    
    if np.abs(cor['date'][1]) < 0.50:
        df.drop(df[df['product_id'] == i].index, inplace=True)
        
    list_cor.append(cor['date'][1])
    cor_list.append(pd.DataFrame(info))
pd.concat(cor_list, ignore_index=True).to_csv('correlation with date.csv', index=False)

* We will ignore products that have correlation less then 0.50
* So, the number of products will decrease as there are prodcts have correlation less than 0.20

In [9]:
df['order_date'] = pd.to_numeric(df['order_date'])

### calculate mean and std of date and quantity sold

In [10]:
sc1 = StandardScaler()
sc2 = StandardScaler()

sc1.fit(np.array(df['order_date']).reshape(-1, 1))
sc2.fit(np.array(df['quantities_sold']).reshape(-1, 1))

StandardScaler()

In [11]:
print('Number of products have correlation greater than or equal 0.50 = {} products'.format(len(df['product_id'].unique())))

Number of products have correlation greater than or equal 0.50 = 521 products


In [12]:
print('Size of data frame after cancelling products with no crrelation = {} records'.format(len(df)))

Size of data frame after cancelling products with no crrelation = 58850 records


In [13]:
fig = px.histogram(list_cor, nbins=20,title='Products correlation with date', text_auto=True)
fig.update_layout(bargap=0.4)
fig.update_xaxes(title_text='Correlation')
fig.update_yaxes(title_text='Count')
fig.show()

In [14]:
df.drop('season', axis=1, inplace=True)
df.head()

Unnamed: 0,order_date,product_id,quantities_sold
0,1567296000000000000,8,4.0
1,1567900800000000000,8,3.0
2,1568505600000000000,8,0.0
3,1569110400000000000,8,2.0
4,1569715200000000000,8,2.0


### In random forests, there is no need for cross-validation or a separate test set to get an unbiased estimate of the test set error. It is estimated internally, during the run, as follows: Each tree is constructed using a different bootstrap sample from the original data [no need for test split].

In [15]:
len(df)

58850

In [16]:
temp = sc1.transform(np.array(df['order_date']).reshape(-1,1))
temp = pd.DataFrame(temp)
temp.rename(columns={0:'order_date'}, inplace=True)

dummy1 = pd.get_dummies(df['product_id'])
dummy1.drop(8, axis=1, inplace=True)

new_df = pd.DataFrame()
target = pd.DataFrame(sc2.transform(np.array(df['quantities_sold']).reshape(-1,1)))
target.rename(columns={0:'target'}, inplace=True)
new_df = pd.concat([new_df, temp, dummy1, target], axis=1)

new_df.head()


Unnamed: 0,order_date,10,11,12,15,16,19,22,28,30,...,4369,4371,4391,4394,4395,4396,4398,4409,4415,target
0,-1.89067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.30698
1,-1.86048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.31027
2,-1.83028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.32015
3,-1.80009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.31356
4,-1.76989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.31356


In [17]:
X_train = new_df.drop('target', axis=1)
y_train = new_df['target']

len(X_train), len(y_train)

(76700, 76700)

# Fitting RandomForestRegressor

# iteration 2 (optimal)

In [62]:
regressor = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
regressor.fit(X_train, y_train)

RandomForestRegressor(n_jobs=-1, random_state=42)

In [63]:
regressor.score(X_train, y_train)

0.7609760253652331

In [64]:
y_pred = regressor.predict(X_train)
mean_absolute_error(y_train, y_pred), mean_squared_error(y_train, y_pred)

(0.30299958608702365, 0.23902397463476693)

In [65]:
MAE = sc2.inverse_transform(np.array(mean_absolute_error(y_train, y_pred)).reshape(1, -1))[0][0]
MSE = sc2.inverse_transform(np.array(mean_squared_error(y_train, y_pred)).reshape(1, -1))[0][0]

print('MAE = {}\nMSE = {}'.format(MAE, MSE))


MAE = 189.20941593417166
MSE = 169.78430108768498


# end iteration 2

# iteration 4

In [59]:
regressor = RandomForestRegressor(n_estimators=120, n_jobs=-1, random_state=42)
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=120, n_jobs=-1, random_state=42)

In [60]:
regressor.score(X_train, y_train)

0.7661300733560219

In [None]:
y_pred = regressor.predict(X_train)
mean_absolute_error(y_train, y_pred), mean_squared_error(y_train, y_pred)

In [61]:
MAE = sc2.inverse_transform(np.array(mean_absolute_error(y_train, y_pred)).reshape(1, -1))[0][0]
MSE = sc2.inverse_transform(np.array(mean_squared_error(y_train, y_pred)).reshape(1, -1))[0][0]

print('MAE = {}\nMSE = {}'.format(MAE, MSE))

MAE = 189.50804710479434
MSE = 171.2855093619376


# end iteration  4

# iteration 1

In [39]:
regressor = RandomForestRegressor(n_estimators=80, n_jobs=-1, random_state=42)
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=80, n_jobs=-1, random_state=42)

In [40]:
regressor.score(X_train, y_train)

0.7580872589994763

In [41]:
y_pred = regressor.predict(X_train)
mean_absolute_error(y_train, y_pred), mean_squared_error(y_train, y_pred)

(0.3032226128672256, 0.24191274100052376)

In [42]:
MAE = sc2.inverse_transform(np.array(mean_absolute_error(y_train, y_pred)).reshape(1, -1))[0][0]
MSE = sc2.inverse_transform(np.array(mean_squared_error(y_train, y_pred)).reshape(1, -1))[0][0]

print('MAE = {}\nMSE = {}'.format(MAE, MSE))



MAE = 189.2771342524639
MSE = 170.66142624750267


# end iteration 1

# iteartion 3

In [55]:
regressor = RandomForestRegressor(n_estimators=70, n_jobs=-1, random_state=42)
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=70, n_jobs=-1, random_state=42)

In [56]:
regressor.score(X_train, y_train)

0.7560318735277025

In [57]:
y_pred = regressor.predict(X_train)
mean_absolute_error(y_train, y_pred), mean_squared_error(y_train, y_pred)

(0.3039831124079887, 0.24396812647229751)

In [58]:
MAE = sc2.inverse_transform(np.array(mean_absolute_error(y_train, y_pred)).reshape(1, -1))[0][0]
MSE = sc2.inverse_transform(np.array(mean_squared_error(y_train, y_pred)).reshape(1, -1))[0][0]

print('MAE = {}\nMSE = {}'.format(MAE, MSE))


MAE = 189.50804710479434
MSE = 171.2855093619376


# end iteration 3

In [19]:
def predict_function(model, scaler1, scaler2, product_id):
    frame = pd.read_csv("weekly data.csv")
    p_frame = frame[frame['product_id'] == product_id]
    p_frame['order_date'] = pd.to_datetime(p_frame['order_date'])
    p_frame = p_frame.sort_values(by='order_date')

    date = generate_date_range(scaler1, p_frame['order_date'].iloc[0], len(p_frame) + 4)  # 1 month extra
    p_id = product_id

    if p_id not in list(X_train.columns) and p_id != 8:
        print('Product is not found')
    else:
        row = list()
        predicted_values = list()
        for j in date:
            row.append(j)
            for i in X_train.columns:
                if i == p_id:
                    row.append(1)
                elif str(i).isnumeric():
                    row.append(0)
                    
            arr = np.array(row).reshape(1, -1)
            predicted_values.append(scaler2.inverse_transform(np.array(model.predict(arr)[0]).reshape(1, -1))[0][0])
            row.clear()

        dates = get_date_range(p_frame['order_date'].iloc[0], len(p_frame) + 4)  # 1 month extra
        fig = px.line(x=p_frame['order_date'], y= p_frame['quantities_sold'], title='predicted values for prodcut id ' + str(p_id))
        fig.add_scatter(x=dates, y=predicted_values, name='predicted') 
        fig.update_xaxes(title='date')
        fig.update_yaxes(title='quantity')
        fig.show()

# Save the model and load it 

In [33]:
import pickle
pickle.dump(regressor, open('random_forest_521_products_weekly.pkl', 'wb'))

In [20]:
import pickle
loaded_model = pickle.load(open('random_forest_521_products_weekly.pkl', 'rb'))

In [None]:
for i in range(0, len(X_train.columns), 10):
    print(X_train.columns[i : i + 10])

In [None]:
ids = [15, 16, 355, 2999, 3422, 3705, 8, 3788, 3805, 10]  # products ids to test results

for i in ids:
    predict_function(loaded_model, sc1, sc2, i)

In [None]:
predict_function(regressor, sc1, sc2, 4415)

In [None]:
predict_function(regressor, sc1, sc2, 4271)

In [None]:
predict_function(regressor, sc1, sc2, 4106)

In [None]:
for i in X_train.columns:
    predict_fucntion()

# Using linear regression

In [4]:
df = pd.read_csv("weekly data.csv")
df['order_date'] = pd.to_datetime(df['order_date'])
df.head()

Unnamed: 0,order_date,product_id,quantities_sold,season
0,2021-01-03,8,569.0,Winter
1,2020-04-19,8,94.0,Spring
2,2020-04-12,8,318.0,Spring
3,2020-04-05,8,124.0,Spring
4,2020-03-29,8,156.0,Spring


In [5]:
df['order_date'] = pd.to_numeric(df['order_date'])

sc1 = StandardScaler()
sc2 = StandardScaler()

sc1.fit(np.array(df['order_date']).reshape(-1, 1))
sc2.fit(np.array(df['quantities_sold']).reshape(-1, 1))

StandardScaler()

In [6]:
temp_f = df[df['product_id'] == 8]

temp_f['date'] = pd.to_datetime(temp_f['order_date'])
temp_f['date'] = pd.to_numeric(temp_f['date'])
temp_f['target'] = temp_f['quantities_sold']

temp_f.drop(['order_date', 'product_id', 'season', 'quantities_sold'], axis=1, inplace=True)
temp_f.corr()

Unnamed: 0,date,target
date,1.0,0.86998
target,0.86998,1.0


In [6]:
print('Number of products = {}'.format(len(df['product_id'].unique())))

Number of products = 843


In [7]:
print('Size of data frame = {} records'.format(len(df)))

Size of data frame = 93400 records


In [8]:
cor_list = list()
list_cor = list()
for i in df['product_id'].unique():
    info = dict()
    temp_f = df[df['product_id'] == i]

    temp_f['date'] = pd.to_datetime(temp_f['order_date'])
    temp_f['date'] = pd.to_numeric(temp_f['date'])
    temp_f['target'] = temp_f['quantities_sold']

    temp_f.drop(['order_date', 'product_id', 'season', 'quantities_sold'], axis=1, inplace=True)
    cor = temp_f.corr()

    info['product_id'] = [i]
    info['correlation with date'] = ['{:.2f}'.format(cor['date'][1])]
    
    if np.abs(cor['date'][1]) < 0.50:
        df.drop(df[df['product_id'] == i].index, inplace=True)
        
    list_cor.append(cor['date'][1])
    cor_list.append(pd.DataFrame(info))
pd.concat(cor_list, ignore_index=True).to_csv('correlation with date.csv', index=False)

In [9]:
print('Number of products have correlation greater than or equal 0.50 = {}  products'.format(len(df['product_id'].unique())))

Number of products have correlation greater than or equal 0.50 = 521  products


In [10]:
print('Size of data frame after cancelling products with no crrelation = {} records'.format(len(df)))

Size of data frame after cancelling products with no crrelation = 58850 records


# Take 2 months from from each prodct to be used in testing

In [11]:
test_set = pd.DataFrame()
frames = list()

for i in df['product_id'].unique():
    frame = df[df['product_id'] == i]
    frame = frame.sort_values(by='order_date')
    test_set = pd.concat([test_set, frame.tail(8)], ignore_index=True)
    frame.drop(frame.tail(8).index, inplace=True)
    frames.append(frame)

df = pd.concat(frames, ignore_index=True)
df.head()

Unnamed: 0,order_date,product_id,quantities_sold,season
0,1567296000000000000,8,4.0,Summer
1,1567900800000000000,8,3.0,Summer
2,1568505600000000000,8,0.0,Summer
3,1569110400000000000,8,2.0,Summer
4,1569715200000000000,8,2.0,Autumn


In [12]:
len(df), len(test_set)

(54682, 4168)

In [13]:
df.head()

Unnamed: 0,order_date,product_id,quantities_sold,season
0,1567296000000000000,8,4.0,Summer
1,1567900800000000000,8,3.0,Summer
2,1568505600000000000,8,0.0,Summer
3,1569110400000000000,8,2.0,Summer
4,1569715200000000000,8,2.0,Autumn


In [14]:
temp = sc1.transform(np.array(df['order_date']).reshape(-1, 1))
temp = pd.DataFrame(temp)
temp.rename(columns={0:'order_date'}, inplace=True)


dummy1 = pd.get_dummies(df['product_id'])
dummy1.drop(8, axis=1, inplace=True)

new_df = pd.DataFrame()
new_df = pd.concat([new_df, temp, dummy1], axis=1)
new_df['target'] = sc2.transform(np.array(df['quantities_sold']).reshape(-1, 1))
new_df.head(10)


Unnamed: 0,order_date,10,11,12,15,16,19,22,28,30,...,4369,4371,4391,4394,4395,4396,4398,4409,4415,target
0,-1.92375,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.25044
1,-1.89322,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.2545
2,-1.86268,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.26668
3,-1.83215,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.25856
4,-1.80162,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.25856
5,-1.77108,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.2545
6,-1.74055,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.22201
7,-1.71001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.11237
8,-1.67948,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.22607
9,-1.64894,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.2545


In [15]:
dummy1 = pd.get_dummies(test_set['product_id'])
dummy1.drop(8, axis=1,inplace=True)

temp = sc1.transform(np.array(test_set['order_date']).reshape(-1, 1))
temp = pd.DataFrame(temp)
temp.rename(columns={0:'order_date'}, inplace=True)

new_test_set = pd.DataFrame()
new_test_set = pd.concat([new_test_set, temp, dummy1], axis=1)
new_test_set['target'] = sc2.transform(np.array(test_set['quantities_sold']).reshape(-1,1))

new_test_set.head()

Unnamed: 0,order_date,10,11,12,15,16,19,22,28,30,...,4369,4371,4391,4394,4395,4396,4398,4409,4415,target
0,1.52663,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8.99244
1,1.55716,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.73847
2,1.5877,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8.64319
3,1.61823,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9.00462
4,1.64876,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8.50512


In [16]:
X_train = new_df.drop('target', axis=1)
y_train = new_df['target']

X_test = new_test_set.drop('target', axis=1)
y_test = new_test_set['target']

len(X_train), len(X_test), len(y_train), len(y_test)


(54682, 4168, 54682, 4168)

In [17]:
lin_regressor = LinearRegression()
lin_regressor.fit(X_train, y_train)

LinearRegression()

In [18]:
lin_regressor.score(X_train, y_train)

0.58176949827943

In [None]:
X2 = sm.add_constant(X_train)
est = sm.OLS(y_train, X2)
est2 = est.fit()
print(est2.summary())

In [20]:
lin_regressor.intercept_

2.2582679668004797

In [None]:
lin_regressor.coef_

In [22]:
y_pred = lin_regressor.predict(X_test)

mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)

(0.6051801907729015, 1.7219824284237664)

In [23]:
MAE = sc2.inverse_transform(np.array(mean_absolute_error(y_test, y_pred)).reshape(1, -1))[0][0]
MSE = sc2.inverse_transform(np.array(mean_squared_error(y_test, y_pred)).reshape(1, -1))[0][0]

print('MAE = {}\nMSE = {}'.format(MAE, MSE))

MAE = 214.6909657491331
MSE = 489.6964220269747


In [24]:
lin_regressor.score(X_test, y_test)

0.559367503533901

In [None]:
error = sc2.inverse_transform(np.array(y_test - y_pred).reshape(1,-1))
temp = list()
for i in error:
    for j in i:
        temp.append(j)

error = temp

fig = px.histogram(error, nbins=20,title='Weekly MAE (Linear regression)', text_auto=True)
fig.update_layout(bargap=0.4)
fig.update_xaxes(title_text='Error')
fig.update_yaxes(title_text='Count')
fig.show()

In [29]:
frame = pd.read_csv("weekly data.csv")
p_frame = frame[frame['product_id'] == 16]
p_frame['order_date'] = pd.to_datetime(p_frame['order_date'])
p_frame = p_frame.sort_values(by='order_date')
p_frame.head()

Unnamed: 0,order_date,product_id,quantities_sold,season
884,2019-09-01,16,49.0,Summer
921,2019-09-08,16,217.0,Summer
920,2019-09-15,16,40.0,Summer
919,2019-09-22,16,14.0,Summer
918,2019-09-29,16,6.0,Autumn


In [30]:
date = generate_date_range(sc1, p_frame['order_date'].iloc[0], len(p_frame)+12)
p_id = 16

if p_id not in list(X_train.columns) and p_id != 8:
    print('Product is not found')
else:
    row = list()
    predicted_values = list()
    for j in date:
        row.append(j)
        for i in X_train.columns:
            if i == p_id:
                row.append(1)
            elif str(i).isnumeric():
                row.append(0)
                
        arr = np.array(row).reshape(1, -1)
        predicted_values.append(sc2.inverse_transform(np.array(lin_regressor.predict(arr)[0]).reshape(1,-1))[0][0])
        row.clear()

In [None]:
dates = get_date_range(p_frame['order_date'].iloc[0], len(p_frame)+12)
fig = px.line(x=p_frame['order_date'], y= p_frame['quantities_sold'], title='predicted values for prodcut ' + str(p_id))
fig.add_scatter(x=dates, y=predicted_values, name='Predicted') 
fig.update_xaxes(title='date')
fig.update_yaxes(title='quantity')
fig.show()

In [102]:
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)
scores = cross_val_score(lin_regressor, X_train, y_train, scoring='r2', cv=folds)
np.mean(scores)   

0.575303519509476

* 5 splits ==> 0.57
* 10 splits ==> 0.57
* 50 splits ==> 0.56