## Demand Prediction using Machine Learning

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import calendar
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.metrics import mean_squared_error,mean_absolute_error , r2_score
import lightgbm as lgb
from sklearn import preprocessing 
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
import seaborn as sns
%matplotlib inline

In [None]:
# Load the raw order lines and the city -> region lookup from local pickles.
# NOTE: read_pickle can execute code embedded in the file; only load trusted files.
df_initial = pd.read_pickle('./DemandDataFile', compression='infer')
df_region = pd.read_pickle('./RegionDataFile', compression='infer')
# Inner join: order lines whose CITY has no CITY_NAME match in the region table are dropped.
df_initial = pd.merge(df_initial, df_region, how='inner', right_on=['CITY_NAME'], left_on=['CITY'])
df_initial = df_initial.drop(['CITY_NAME'], axis=1)

# Exclude rows whose product name matches any of these patterns
# (presumably packaging supplies rather than sellable products - confirm).
df_initial = df_initial[~df_initial['PRODUCT_NAME'].str.contains("Small Flyers|Large Flyers|Meter Bubble Wrap|Bundle of 50 Boxes|Wrap", na=False)]
df_initial = df_initial.rename(columns={'ORDER_DATE': 'DATE'})
# Parse the dates BEFORE sorting so the order is chronological even when the
# raw column holds strings that do not sort lexicographically by date.
df_initial['DATE'] = pd.to_datetime(df_initial['DATE'])
df_initial = df_initial.sort_values('DATE', ascending=True)

# Left join keeps every order line; SKUs without a review row get NaN review columns.
df_reviews = pd.read_csv('./ProductReviews.csv')
df_initial = pd.merge(df_initial, df_reviews, how='left', right_on=['COD_SKU_CONFIG'], left_on=['SKU'])
df_initial = df_initial.drop(['COD_SKU_CONFIG'], axis=1)

In [None]:
df_fraud = pd.read_csv('./FradulentOrders.csv',dtype={'ORDER_NR': str})

# Drop every order line whose order number appears in the fraud list.
# .copy() gives an independent frame so the column assignment below is not
# performed on a view of the previous df_initial.
df_initial = df_initial[~df_initial.COD_ORDER_NR.isin(df_fraud.ORDER_NR)].copy()

# Orders from Sindh/Balochistan are mapped to the Karachi warehouse, everything
# else (including rows with a missing region) to Lahore. A single vectorised
# assignment replaces the chained `df.loc[:, col][mask] = value` form, which
# writes to a temporary object and is not guaranteed to update the frame.
servedFromKarachi = df_initial['REGION_NAME'].isin(['Sindh','Balochistan'])
df_initial['WareHouse'] = np.where(servedFromKarachi, 'Karachi', 'Lahore')
#df_initial['WareHouse'].value_counts()

In [None]:
def isHoliday(x):
    """Return 1 when `x` is one of the dates in `df_hday18.Date`, else 0.

    Reads the module-level `df_hday18` frame, which must be defined before
    this function is called.
    """
    return 1 if x in df_hday18.Date.values else 0

In [None]:
# Holiday calendar used by isHoliday(); the Date column is parsed to datetime
# so it can be compared against the order dates.
df_hday18 = pd.read_csv('./Holidays2018.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)


In [None]:
#df_initial[df_initial.IsHoliday == 1].head()
# Display the (rows, columns) of the order frame at this point of the notebook.
df_initial.shape

In [None]:
# Per-SKU median of UNIT_PRICE, broadcast back onto every order line of that
# SKU and coerced to a numeric dtype.
skuMedianPrice = df_initial.groupby('SKU')['UNIT_PRICE'].transform('median')
df_initial['MedianPrice'] = pd.to_numeric(skuMedianPrice)

In [None]:
# Build one ' | '-separated text field from the product name, brand and the
# four category levels. The pieces are stripped column by column and joined
# with the vectorised Series.str.cat instead of a Python-level row-wise
# apply; missing pieces become '' (the row-wise join raised TypeError on NaN).
catTextCols = ['PRODUCT_NAME','BRAND_NAME','CATEGORY_LEVEL_1', 'CATEGORY_LEVEL_2','CATEGORY_LEVEL_3','CATEGORY_LEVEL_4']
catTextStripped = df_initial[catTextCols].apply(lambda col: col.str.strip())
df_initial['CatConcat'] = catTextStripped[catTextCols[0]].str.cat(
    catTextStripped[catTextCols[1:]], sep=' | ', na_rep='')
df_initial.shape

In [None]:
def compute_shift(df,dateCol,groupCol):
    """Add lagged daily demand columns D1..D5 to `df`.

    For every group in `groupCol` the 'Quantity' series is laid out on a
    daily calendar; `Dk` is the quantity observed k calendar days before the
    row's date, or 0 when nothing was recorded on that day.

    Args:
        df: frame with a datetime column `dateCol`, a 'Quantity' column and
            `groupCol`; each (group, date) pair must occur only once.
        dateCol: name of the date column.
        groupCol: name of the column identifying each series.

    Returns:
        A new frame with the columns D1..D5 appended.

    Note: the helper column 'group_no' is written to the frame passed in and
    is only removed from the returned frame.
    """
    df['group_no'] = df.groupby([groupCol]).ngroup()

    # One column per group, one row per calendar day (gaps become NaN).
    daily = (
        df[[dateCol, 'Quantity', 'group_no']]
        .set_index(['group_no', dateCol])
        .unstack('group_no')
        .resample('D')
        .asfreq()
    )

    for lag in range(1, 6):
        lagged = daily.shift(lag).fillna(0).astype(int).stack('group_no')['Quantity']
        df = df.join(lagged.rename('D{}'.format(lag)), on=[dateCol, 'group_no'])

    return df.drop(columns=['group_no'])

In [None]:
def log_inf(x):
    """Return log(1 + x) for positive `x`; anything else maps to log1p(0) == 0."""
    return np.log1p(x if x > 0 else 0)

def is_bundle(x):
    """Return 1 when the text `x` contains 'Bundle', 'Pack' or '+', else 0."""
    bundleMarkers = ('Bundle', 'Pack', '+')
    return int(any(marker in x for marker in bundleMarkers))


In [None]:
def product_Gender(x):
    """Classify a product/category text as 'MEN', 'WOMAN' or 'NEUTRAL'.

    The checks are case-sensitive substring tests, evaluated in this order:
    'Men' -> 'MEN'; 'Woman', 'Women' or 'Jewellery' -> 'WOMAN'; otherwise
    'NEUTRAL'.

    The plural 'Women' is matched as well as 'Woman': without it a text such
    as "Brooch For Women" was classified as 'NEUTRAL'.
    """
    if 'Men' in x:
        return 'MEN'
    if any(marker in x for marker in ('Woman', 'Women', 'Jewellery')):
        return 'WOMAN'
    return 'NEUTRAL'
    
def is_Grocery(x):
    """Return 1 when the text `x` contains the substring 'Grocer', else 0."""
    return int('Grocer' in x)

def is_Baby(x):
    """Return 1 when the text `x` contains the substring 'Baby', else 0."""
    return int('Baby' in x)

def is_PrevWeekHoliday(x):
    """Return True when week number `x` occurs in the global `tempHolidayWeek`.

    `tempHolidayWeek` is read from module scope and must support boolean
    indexing with a `.values` attribute on the result (e.g. a pandas Series).
    """
    matchingWeeks = tempHolidayWeek[tempHolidayWeek == x]
    return len(matchingWeeks.values) > 0

In [None]:
import time
def prepareDataFrame(wareHouse):
    """Build the weekly demand table for one warehouse.

    Reads the module-level `df_initial` frame (and, through isHoliday, the
    holiday calendar). Steps:

    1. keep the rows of `wareHouse` and derive the IsBundle / ProductGender /
       IsGrocery / IsBaby flags from the product and category text;
    2. sum 'Quantity' per SKU and day;
    3. add a zero-quantity row dated 2017-12-01 for every SKU whose first
       recorded day is later than that date;
    4. sum 'Quantity' and the daily holiday flag per SKU, calendar year and
       ISO week number.

    Args:
        wareHouse: value of the 'WareHouse' column to keep.

    Returns:
        DataFrame with one row per SKU / YEAR / WEEKNO (plus the descriptive
        columns) holding the summed 'Quantity' and 'IsHoliday'.
    """
    baseCols = ['SKU','DATE','WareHouse','Quantity','MedianPrice','PRODUCT_NAME','CatConcat','CATEGORY_LEVEL_1','CATEGORY_LEVEL_2','CATEGORY_LEVEL_3','CATEGORY_LEVEL_4','BRAND_NAME']
    # .copy(): the derived columns below are written to an independent frame
    # rather than to a filtered slice of df_initial.
    train_df = df_initial.loc[df_initial.WareHouse == wareHouse, baseCols].copy()

    train_df['IsBundle'] = train_df['PRODUCT_NAME'].map(is_bundle)
    train_df['ProductGender'] = train_df['CatConcat'].map(product_Gender)
    train_df['IsGrocery'] = train_df['CatConcat'].map(is_Grocery)
    train_df['IsBaby'] = train_df['CatConcat'].map(is_Baby)

    #train_df = train_df[(train_df.SKU == 'HP770OT03D0JKNAFAMZ') | (train_df.SKU == 'SH069FA039PJONAFAMZ')]
    dailyKeys = ['SKU','DATE','WareHouse','MedianPrice','IsBundle','IsGrocery','IsBaby','ProductGender','PRODUCT_NAME','CatConcat','CATEGORY_LEVEL_1','CATEGORY_LEVEL_2','CATEGORY_LEVEL_3','CATEGORY_LEVEL_4','BRAND_NAME']
    train_df = train_df.groupby(by=dailyKeys, as_index=False)['Quantity'].sum()
    train_df = train_df.sort_values('DATE', ascending=True)
    train_df['DATE'] = pd.to_datetime(train_df['DATE'])
    train_df = train_df.set_index('DATE')

    # Getting the SKUs which were not demanded on the start date
    # (rows are date-sorted, so first() is each SKU's earliest record).
    startDate = '2017-12-01'
    temp = train_df.reset_index().groupby('SKU').first()
    temp.drop(temp[temp.DATE == startDate].index, inplace=True)

    # Re-date those rows to the start date with zero demand.
    temp['DATE'] = pd.to_datetime(startDate)
    temp['Quantity'] = 0
    if temp.index.name == 'SKU':
        temp.reset_index(inplace = True)

    temp = temp.set_index('DATE')

    # pd.concat instead of DataFrame.append (removed in pandas 2.0).
    train_df = pd.concat([train_df, temp])
    train_df.reset_index(inplace=True)
    train_df['WEEKDAY'] = train_df['DATE'].apply(lambda x:calendar.day_name[x.weekday()])
    train_df['MONTH'] = train_df['DATE'].apply(lambda x:calendar.month_abbr[x.month])
    train_df['IsHoliday'] = [isHoliday(x) for x in train_df['DATE'].values]

    train_df['YEAR'] = train_df['DATE'].apply(lambda x:x.year)
    train_df['YEAR'] = train_df['YEAR'].apply(str)

    # ISO week number; Timestamp.isocalendar() replaces Series.dt.week
    # (removed in pandas 2.0) and is available on old and new versions alike.
    # NOTE(review): YEAR is the calendar year while WEEKNO is the ISO week, so
    # days around New Year can land in a (YEAR, WEEKNO) pair of the "other" year.
    train_df['WEEKNO'] = train_df['DATE'].apply(lambda x: x.isocalendar()[1])
    #train_df['WEEKNO'] = train_df['WEEKNO'].apply(str)

    weeklyKeys = ['SKU','YEAR','WEEKNO','WareHouse','MedianPrice','IsGrocery','IsBaby','IsBundle','ProductGender','PRODUCT_NAME','CatConcat','CATEGORY_LEVEL_1','CATEGORY_LEVEL_2','CATEGORY_LEVEL_3','CATEGORY_LEVEL_4','BRAND_NAME']
    # Columns are selected with a list: the tuple-style ['Quantity','IsHoliday']
    # selection on a groupby is no longer accepted by current pandas.
    train_df = train_df.groupby(by=weeklyKeys, as_index=False)[['Quantity','IsHoliday']].sum()

    return train_df

In [None]:
def _weeklyFrameFor(wareHouse):
    """Weekly demand frame for `wareHouse`, ordered by SKU, YEAR and WEEKNO,
    with WEEKNO as int and remaining NaNs replaced by 0."""
    weekly = prepareDataFrame(wareHouse).sort_values(by=['SKU','YEAR', 'WEEKNO'], ascending=True)
    weekly['WEEKNO'] = weekly['WEEKNO'].apply(int)
    return weekly.fillna(0)


tempKhi = _weeklyFrameFor('Karachi')
print(tempKhi.shape)
#train_df = temp.copy()

tempLhr = _weeklyFrameFor('Lahore')
print(tempLhr.shape)

In [None]:
# Lag features for Lahore. Rows are ordered by SKU, YEAR, WEEKNO, so within a
# SKU shift(k) looks k recorded weeks back; a lag is only kept when that
# earlier row really is the expected week number, otherwise it stays NaN.
# NOTE(review): the comparison is on WEEKNO only, so the step from the last
# week of a year to week 1 of the next year is never treated as consecutive.
lhrBySku = tempLhr.groupby(['SKU'])
lhrPrevIsLastWeek = tempLhr.WEEKNO == lhrBySku['WEEKNO'].shift(1) + 1
# Fix: the two-week lag must come from the row that is exactly two weeks
# earlier. Checking only that the previous row is the previous week let
# shift(2) pick up a value from three or more weeks back when a week was missing.
lhrPrev2IsTwoWeeksAgo = tempLhr.WEEKNO == lhrBySku['WEEKNO'].shift(2) + 2

tempLhr['Last_Week_Sales'] = lhrBySku['Quantity'].shift().where(lhrPrevIsLastWeek)
tempLhr['Last_Week_Diff'] = tempLhr.groupby(['SKU'])['Last_Week_Sales'].diff(1).where(lhrPrevIsLastWeek)

tempLhr['Last_2Week_Sales'] = lhrBySku['Quantity'].shift(2).where(lhrPrev2IsTwoWeeksAgo)
#tempLhr['Last_2Week_Diff'] = tempLhr.groupby(['SKU'])['Last_2Week_Sales'].diff()[(tempLhr.WEEKNO == tempLhr.WEEKNO.shift(2) + 1)]

#tempLhr[(tempLhr.SKU == '00301FA025DPKNAFAMZ')][['YEAR','WEEKNO','Quantity','Last_Week_Sales','Last_2Week_Sales']]

In [None]:
# Lag features for Karachi. Rows are ordered by SKU, YEAR, WEEKNO, so within a
# SKU shift(k) looks k recorded weeks back; a lag is only kept when that
# earlier row really is the expected week number, otherwise it stays NaN.
# NOTE(review): the comparison is on WEEKNO only, so the step from the last
# week of a year to week 1 of the next year is never treated as consecutive.
khiBySku = tempKhi.groupby(['SKU'])
khiPrevIsLastWeek = tempKhi.WEEKNO == khiBySku['WEEKNO'].shift(1) + 1
# Fix: the two-week lag must come from the row that is exactly two weeks
# earlier. Checking only that the previous row is the previous week let
# shift(2) pick up a value from three or more weeks back when a week was missing.
khiPrev2IsTwoWeeksAgo = tempKhi.WEEKNO == khiBySku['WEEKNO'].shift(2) + 2

tempKhi['Last_Week_Sales'] = khiBySku['Quantity'].shift().where(khiPrevIsLastWeek)
tempKhi['Last_Week_Diff'] = tempKhi.groupby(['SKU'])['Last_Week_Sales'].diff().where(khiPrevIsLastWeek)

tempKhi['Last_2Week_Sales'] = khiBySku['Quantity'].shift(2).where(khiPrev2IsTwoWeeksAgo)
#tempKhi['Last_2Week_Diff'] = tempKhi.groupby(['SKU'])['Last_2Week_Sales'].diff()[(tempKhi.WEEKNO == tempKhi.WEEKNO.shift(2) + 1)]

#tempKhi[(tempKhi.SKU == '00301FA025DPKNAFAMZ')]


In [None]:
temp = pd.concat([tempKhi, tempLhr])
# IsHoliday is a per-week SUM of daily holiday flags, so a holiday week is any
# week with a positive value (== 1 skipped weeks with two or more holiday days).
holidayWeeks = temp.loc[temp.IsHoliday > 0, 'WEEKNO'].unique()

# A week is flagged when the week before it contains a holiday.
# NOTE(review): plain +1 / -1 on the week number does not wrap at year boundaries.
tempHolidayWeek = pd.Series(holidayWeeks + 1)
temp['PrevWeekHoliday'] = temp.WEEKNO.isin(tempHolidayWeek)

# ... and when the week after it contains a holiday.
tempHolidayWeek = pd.Series(holidayWeeks - 1)
temp['NextWeekHoliday'] = temp.WEEKNO.isin(tempHolidayWeek)

In [None]:
# Replace every remaining NaN (e.g. lag features with no earlier week) by 0,
# in place, then preview the combined frame.
temp.fillna(0, inplace=True)
#temp[temp.SKU == '00301FA025DPKNAFAMZ']
temp.head()

In [None]:
#train_df[train_df['Quantity'] > 500].PRODUCT_NAME.value_counts()

### Visualization

In [None]:
# Total quantity per date, SKU and top-level category (input for the plot below).
test = df_initial.groupby(by=['DATE','SKU','CATEGORY_LEVEL_1'], as_index=False)['Quantity'].sum()

In [None]:
   
# One log-scaled histogram of Quantity per CATEGORY_LEVEL_1 value, restricted
# to rows with Quantity below 1000.
g = sns.FacetGrid(test[test.Quantity < 1000], col="CATEGORY_LEVEL_1")
g = g.map(plt.hist, "Quantity", log=True)

In [None]:
# Histogram of the target vector Y.
# NOTE(review): Y is only assigned further down in this notebook, so this cell
# works only after the train/test split cells have been executed.
fig, ax = plt.subplots(figsize=(13, 6))

#bins = np.arange(0,60,5) , use bins=bins in hist function below for smaller values
pd.DataFrame(Y).hist(ax=ax, bottom=0.1)

### Benchmark Model (Predict Demand as the Average of the Last N Days' Demand)

In [None]:
# Benchmark sample: rows dated 2018-05-01 or later with Quantity of at most 100.
# NOTE(review): expects a train_df that still has a DATE column; the train_df
# assigned later in this notebook is a copy of the weekly `temp` frame.
test_df = train_df[(train_df.DATE >= '2018-05-01') & (train_df.Quantity <= 100)]

In [None]:
# Benchmark forecast: mean of the five daily lags D1..D5, truncated to an integer.
# NOTE(review): D1..D5 are the columns compute_shift adds; no call to it is
# visible here, so they must already be present on test_df.
test_df['PredictedDemand'] = np.int64((test_df.D1+test_df.D2+test_df.D3+test_df.D4+test_df.D5)/5)
#test_df.loc[:,'PredictedDemand'] = test_df['PredictedDemand'].apply(lambda x : log_inf(x))
test_df.head()

In [None]:
import math
# Error of the benchmark forecast against the actual quantity.
print("MSE: ",mean_squared_error(test_df.Quantity, test_df.PredictedDemand),
      "RMSE: ",math.sqrt(mean_squared_error(test_df.Quantity, test_df.PredictedDemand))
     )


### ML Model Data Preparation

In [None]:
#train_df.drop(axis=1, columns=['Karachi','Lahore'], inplace = True)
# One-hot encode WareHouse and ProductGender and turn the holiday flags into
# 0/1 integers. The issubset guard skips the block when the dummy columns are
# already present, so re-running the cell does not add them twice.
if not {'MEN', 'NEUTRAL','Karachi','Lahore'}.issubset(temp.columns):
    dummyWareHouse = pd.get_dummies(temp['WareHouse']).astype(int)
    dummyProductGender = pd.get_dummies(temp['ProductGender']).astype(int)
    temp = pd.concat([temp,dummyWareHouse], axis = 1)
    temp = pd.concat([temp,dummyProductGender], axis = 1)
    temp.PrevWeekHoliday = temp.PrevWeekHoliday.astype(int)
    temp.NextWeekHoliday = temp.NextWeekHoliday.astype(int)


In [2]:
#temp.to_pickle('./DemandForecastData',compression='infer', protocol=4)
# Reload the prepared weekly frame from disk instead of rebuilding it with the
# cells above (the commented line is the matching save).
# NOTE: read_pickle can execute code embedded in the file; only load trusted files.
temp = pd.read_pickle('./DemandForecastData', compression='infer')
temp.head()

Unnamed: 0,SKU,YEAR,WEEKNO,WareHouse,MedianPrice,IsGrocery,IsBaby,IsBundle,ProductGender,PRODUCT_NAME,...,Last_Week_Sales,Last_Week_Diff,Last_2Week_Sales,PrevWeekHoliday,NextWeekHoliday,Karachi,Lahore,MEN,NEUTRAL,WOMAN
0,00301FA025DPKNAFAMZ,2017,48,Karachi,399.0,0,0,0,MEN,Unisex Style Baseball Cap - Black,...,0.0,0.0,0.0,0,0,1,0,1,0,0
1,00301FA025DPKNAFAMZ,2018,18,Karachi,399.0,0,0,0,MEN,Unisex Style Baseball Cap - Black,...,0.0,0.0,0.0,0,0,1,0,1,0,0
2,00301FA025DPKNAFAMZ,2018,19,Karachi,399.0,0,0,0,MEN,Unisex Style Baseball Cap - Black,...,1.0,0.0,0.0,1,0,1,0,1,0,0
3,00301FA025DPKNAFAMZ,2018,21,Karachi,399.0,0,0,0,MEN,Unisex Style Baseball Cap - Black,...,0.0,0.0,0.0,0,0,1,0,1,0,0
4,00301FA0QSN4YNAFAMZ,2017,48,Karachi,600.0,0,0,0,NEUTRAL,Blue Golden Tulip Brooch For Women,...,0.0,0.0,0.0,0,0,1,0,0,1,0


In [None]:
# Row count per brand, then the summed count of the brands above the 90th
# percentile of those counts.
# NOTE(review): relies on the value_counts() column being named BRAND_NAME;
# confirm against the installed pandas version.
brandTable = pd.DataFrame(temp.BRAND_NAME.value_counts())
brandTable[brandTable.BRAND_NAME > np.percentile(brandTable.BRAND_NAME,90)].sum()

#np.percentile(brandTable.BRAND_NAME,90)

In [4]:
# (rows, columns) of the prepared weekly frame.
temp.shape

(1835047, 28)

### Train Test Data Split

In [None]:
# Lookup table giving every distinct SKU a running integer id (ProdId).
prodIds = (
    pd.DataFrame(temp.SKU.unique())
    .reset_index()
    .rename(columns={'index': 'ProdId', 0: 'SKU'})
)

In [49]:
# Columns carried into the modelling frames (features plus the Quantity target
# and the SKU / BRAND_NAME identifiers that are dropped again before fitting).
colList = ['WEEKNO','SKU','Lahore','NEUTRAL','IsHoliday',
           
           'Last_Week_Sales','Last_2Week_Sales','MedianPrice',
           'BRAND_NAME','Quantity']
toExcludeCols = ['SKU','Quantity','CATEGORY_LEVEL_3','CATEGORY_LEVEL_4','BRAND_NAME']

# Weeks 19-22 form the hold-out set; weeks 48-52 are left out of training.
toExcludeWeeks = [48,49,50,51,52]
testWeeks = [19,20,21,22]
#train_df = temp[temp.Quantity <= np.percentile(train_df.Quantity,99)]

train_df = temp.copy()
#train_df = train_df[train_df.Lahore == 1]
#train_df.loc[train_df.IsGrocery > 0,'IsGrocery'] = 100
# Percentage change between the last two weekly sales figures (+1 on both
# sides keeps the denominator non-zero for zero sales).
train_df['%UpDown'] = (((train_df.Last_Week_Sales + 1) - (train_df.Last_2Week_Sales + 1)) / (train_df.Last_2Week_Sales + 1)) * 100
# Cap outliers: MedianPrice at its 95th percentile, Quantity at its 99.9th.
train_df.loc[train_df.MedianPrice > np.percentile(train_df.MedianPrice,95) ,'MedianPrice'] = np.percentile(train_df.MedianPrice,95)
train_df.loc[train_df.Quantity >= np.percentile(train_df.Quantity,99.9),'Quantity'] = np.percentile(train_df.Quantity,99.9)

# Training rows: everything outside the test weeks and the excluded weeks.
X = train_df[(~train_df.WEEKNO.isin(testWeeks) & ~train_df.WEEKNO.isin(toExcludeWeeks))][colList].copy()

#X.MedianPrice = X.MedianPrice.map(log_inf)
#Y_orig = train_df[~train_df.WEEKNO.isin(testWeeks)][['Quantity']]
#Y = Y_orig.Quantity.map(log_inf)

# LastDemand: the Quantity of the last training row of each (SKU, Lahore)
# group, in current row order, repeated on every row of that group.

X['LastDemand'] = X.groupby(by=['SKU','Lahore'])['Quantity'].transform('last')

#X['LastCat3Demand'] = X[['CATEGORY_LEVEL_3','CATEGORY_LEVEL_4','BRAND_NAME']].apply(lambda x: ' | '.join(x.str.strip()), axis=1)
#X['LastCat3Demand'] = X.groupby(by=['Lahore','LastCat3Demand'])['Quantity'].transform('mean')


# tt = X[['WEEKNO','SKU','Lahore','Quantity']]
# tt['RollAvg']  = tt.groupby(by=['SKU','Lahore'])['Quantity'].rolling(3).mean().reset_index()['Quantity'].values

# col_to_use = tt.columns.difference(X.columns)
# X = pd.merge(X,tt[col_to_use], how='inner', right_on=['SKU','WEEKNO','Lahore'], left_on=['SKU','WEEKNO','Lahore'])
#X = pd.merge(X,prodIds, how='inner', right_on=['SKU'], left_on=['SKU'])

#X[X.SKU == '00301FA025DPKNAFAMZ'].head(100)
#X[X.Lahore == 1].SKU.value_counts()


In [50]:
#Data from training sample to add average demand into test data with no data leakage
# One LastDemand value per (SKU, Lahore), taken from the training rows only.
tempAvgDemand = X[['SKU','Lahore','LastDemand']].drop_duplicates().copy()
#tempProdId = X[['SKU','ProdId']].drop_duplicates().copy()

X_test = train_df[train_df.WEEKNO.isin(testWeeks)][colList]

# Inner merge: test rows whose (SKU, Lahore) pair never occurs in the training
# rows are dropped from the test set.
X_test = pd.merge(X_test,tempAvgDemand, how='inner', right_on=['SKU','Lahore'], left_on=['SKU','Lahore'])

#X_test.MedianPrice = X_test.MedianPrice.map(log_inf)
#Y_test_orig = train_df[train_df.WEEKNO.isin(testWeeks)][['Quantity']]
#Y_test = Y_test_orig.Quantity.map(log_inf)

#X_test = pd.merge(X_test,tempProdId, how='inner', right_on=['SKU'], left_on=['SKU'])


In [None]:
#SCALING
# Columns we would like to min-max scale. Only those actually present in both
# frames are used: selecting a missing column ('LastCat3Demand' and '%UpDown'
# are not part of colList) raises KeyError, and the duplicated 'LastDemand'
# entry produced duplicate column labels.
scaleCandidates = ['MedianPrice','LastDemand','LastCat3Demand','Last_Week_Sales','Last_2Week_Sales','%UpDown']
scaleColList = [col for col in scaleCandidates if col in X.columns and col in X_test.columns]

# Both scalers are fitted on the training rows only and then applied to the
# test rows, so no test-set statistics leak into the transformation.
scalerX = preprocessing.MinMaxScaler().fit(X[scaleColList])
scalerY = preprocessing.MinMaxScaler().fit(X['Quantity'].values.reshape(-1, 1))
X[scaleColList] = scalerX.transform(X[scaleColList])
Y = scalerY.transform(X['Quantity'].values.reshape(-1, 1))
X_test[scaleColList] = scalerX.transform(X_test[scaleColList])
Y_test = scalerY.transform(X_test['Quantity'].values.reshape(-1, 1))

#X_test = X_test[X_test.columns.difference(['Quantity','SKU','CATEGORY_LEVEL_3','CATEGORY_LEVEL_4'])]
#X = X[X.columns.difference(['SKU','Quantity','CATEGORY_LEVEL_3','CATEGORY_LEVEL_4'])]

In [None]:
# Element-wise log1p (via log_inf, which maps non-positive values to 0) on the
# scaled columns of both frames; requires scaleColList from the scaling cell.
X[scaleColList] = X[scaleColList].applymap(log_inf)
X_test[scaleColList] = X_test[scaleColList].applymap(log_inf)

In [None]:
# X = tX
# X_test = tXs
#tX = X.copy()
#tXs = X_test.copy()

In [51]:
# X = X[X.Lahore == 0]
# X_test = X_test[X_test.Lahore == 0]


# Targets are the raw Quantity values (this overwrites any Y / Y_test produced
# by the scaling cell).
Y = X.loc[:,'Quantity']#.map(log_inf)
Y_test = X_test.loc[:,'Quantity']#.map(log_inf).copy()


#REMOVE Not Required Columns from both Train/Test Sets
# columns.difference() also returns the remaining columns in sorted order, so
# both frames end up with the same column order.

X_test = X_test[X_test.columns.difference(['Quantity','SKU','CATEGORY_LEVEL_3','CATEGORY_LEVEL_4','BRAND_NAME'])]
X = X[X.columns.difference(['SKU','Quantity','CATEGORY_LEVEL_3','CATEGORY_LEVEL_4','BRAND_NAME'])]

In [52]:
#temp = Y_test
#Y_test = temp


# Features left out of the final model; the same list is removed from the
# training and the test frame so both keep identical columns.
unusedFeatures = ['SKU','PrevWeekHoliday','NextWeekHoliday','LastDemand'
           , 'IsBaby', 'IsBundle','IsGrocery','%UpDown']
X = X[X.columns.difference(unusedFeatures)]
X_test = X_test[X_test.columns.difference(unusedFeatures)]

In [53]:
# Final feature columns and their dtypes.
X.dtypes

IsHoliday             int64
Lahore                int64
Last_2Week_Sales    float64
Last_Week_Sales     float64
MedianPrice         float64
NEUTRAL               int64
WEEKNO                int64
dtype: object

### Decision Tree Regression Model

In [54]:
from sklearn.tree import DecisionTreeRegressor
# Single regression tree, limited to depth 14 and 30 leaves, with a fixed seed.
# NOTE(review): criterion='mse', max_features='auto', min_impurity_split and
# presort are spelled as in older scikit-learn releases - confirm they are
# still accepted by the installed version.
regr = DecisionTreeRegressor(criterion='mse', max_depth=14, max_features='auto',
           max_leaf_nodes=30, min_impurity_decrease=0,
           min_impurity_split=None, min_samples_leaf=15,
           min_samples_split=12, min_weight_fraction_leaf=0,
           presort=False, random_state=1, splitter='best')

regr.fit(X, Y)
y_pred = regr.predict(X_test)

import math

# y_pred = scalerY.inverse_transform(y_pred.reshape(-1,1)).copy()
# Y_test = scalerY.inverse_transform(Y_test.reshape(-1,1)).copy()

# Hold-out error on the test weeks, in the units of Y_test.
print("MSE: ",mean_squared_error((Y_test), (y_pred)),
      "RMSE: ",math.sqrt(mean_squared_error((Y_test), (y_pred))),
      "R2:", r2_score(Y_test,y_pred)
     )
# print("MSE: ",mean_squared_error((np.expm1(Y_test)), np.expm1((y_pred))),
#       "RMSE: ",math.sqrt(mean_squared_error(np.expm1((Y_test)), np.expm1((y_pred)))),
#       "R2:", r2_score(np.exp(Y_test),np.expm1(y_pred))
#       )

MSE:  47.88585999495875 RMSE:  6.919960982184708 R2: 0.36449567918707826


In [55]:
# Persist the fitted tree. scikit-learn no longer ships joblib under
# sklearn.externals in recent releases, so prefer the standalone package and
# fall back to the bundled copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(regr, 'model.pkl')

['model.pkl']

In [44]:
# Side-by-side view of predictions and actuals on the test rows.
hold = X_test.copy()

# y_pred is a positional array; Y_test is aligned on the index it shares with X_test.
hold['Predicted'] = y_pred
hold['Actual'] = Y_test

print('Total Actual Demand : ',hold.Actual.sum(),
      '\nTotal Predicted Demand : ',hold.Predicted.sum())

#hold = tXs[['CATEGORY_LEVEL_3','CATEGORY_LEVEL_4','BRAND_NAME',
#            'Last_2Week_Sales','Last_Week_Sales']].join(hold[['WEEKNO','Actual','Predicted']])
hold[hold.Actual > 10].head(25)

Total Actual Demand :  403926.0 
Total Predicted Demand :  403457.9783083803


Unnamed: 0,IsHoliday,Lahore,Last_2Week_Sales,Last_Week_Sales,NEUTRAL,WEEKNO,Predicted,Actual
78,0,0,0.0,0.0,1,21,1.682478,66.0
216,0,0,6.0,9.0,1,21,8.177101,21.0
220,0,0,9.0,9.0,1,21,8.177101,11.0
230,0,0,5.0,9.0,1,20,8.177101,18.0
231,0,0,9.0,18.0,1,21,15.242228,44.0
232,0,0,18.0,44.0,1,22,27.888078,11.0
268,0,0,0.0,0.0,1,19,1.682478,11.0
366,0,0,2.0,6.0,1,19,3.394696,16.0
557,0,0,0.0,0.0,1,20,1.682478,35.0
558,0,0,1.0,35.0,1,21,27.888078,68.0


In [None]:
# Row(s) with the largest actual demand in the hold-out set.
hold[hold.Actual == hold.Actual.max()]

In [56]:
from sklearn.tree import export_graphviz
import os
import subprocess

def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Writes the tree to "dt.dot" in the working directory and runs the
    graphviz `dot` executable to render it to "dt.png".

    Args
    ----
    tree -- scikit-learn DecisionTree.
    feature_names -- list of feature names.

    Raises
    ------
    SystemExit -- when `dot` cannot be started or exits with an error.
    """
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except (OSError, subprocess.CalledProcessError) as err:
        # Only failures of the external command are handled (a bare except
        # also swallowed KeyboardInterrupt and programming errors). SystemExit
        # is raised directly because the `exit` helper is injected by the
        # site module / the interactive shell and differs between them.
        raise SystemExit("Could not run dot, ie graphviz, to "
                         "produce visualization") from err

In [57]:

# Render the fitted decision tree to dt.png, labelled with the training columns.
visualize_tree(regr, X.columns)

In [None]:

def Diff(li1, li2): 
    """Return the distinct items of `li1` that do not occur in `li2`.

    The result is a list in arbitrary (set) order.
    """
    return (list(set(li1) - set(li2))) 


# Diff is now defined before its first use; the call used to sit above the
# definition and failed with NameError on a fresh kernel.
# NOTE(review): needs X and X_test from before the SKU column is dropped.
main_list = Diff((X_test.SKU.unique()),X.SKU.unique())
print(len(main_list))
print("Train SKU",len(X.SKU.unique()),"Test SKU",len(X_test.SKU.unique()))

# Driver Code 
li1 = [10,25, 40, 35,100,200,300,400,500] 
li2 = [10, 15, 20, 25, 30, 35] 

print(Diff(li2, li1)) 

In [None]:
# Best hyper-parameters found by the randomized search.
# NOTE(review): xgbreg is created in the XGBOOST cell further down; run that first.
xgbreg.best_params_

### XGBOOST Model

In [None]:
from xgboost import XGBRegressor

from xgboost.sklearn import XGBRegressor  
import scipy.stats as st


# Discrete candidate values sampled by the randomized search below.
params = {  
    "n_estimators": [i for i in range(4,15)],
    "max_depth": [i for i in range(2,12)],
    "learning_rate": [i/20 for i in range(2,20)],
    "colsample_bytree": [i/10 for i in range(3,10)],
    "subsample": [i/10 for i in range(3,10)],
    "gamma": [i/10.0 for i in range(1,10)],
    'reg_alpha': [i/10 for i in range(1,10)],
    "min_child_weight": [i/10 for i in range(1,10)],
    "silent": [False,True]
}

# After this cell xgbreg is the RandomizedSearchCV wrapper, not a bare
# XGBRegressor: 5 sampled configurations, scored by R2, 4 parallel jobs.
xgbreg = XGBRegressor() 
xgbreg = RandomizedSearchCV(xgbreg, param_distributions=params,scoring='r2',n_iter=5,verbose=4,n_jobs=4)
xgbreg.fit(X, Y)

y_pred = xgbreg.best_estimator_.predict(X_test)

import math
# Hold-out error of the best configuration.
print("MSE: ",mean_squared_error((Y_test), (y_pred)),
      "RMSE: ",math.sqrt(mean_squared_error((Y_test), (y_pred))),
      "R2:", r2_score(Y_test,y_pred)
     )
# print("MSE: ",mean_squared_error((np.exp(Y_test)), np.exp((y_pred))),
#       "RMSE: ",math.sqrt(mean_squared_error(np.exp((Y_test)), np.exp((y_pred))))
#      )
#print (xgbreg)

In [None]:
%matplotlib inline
from xgboost import plot_tree
from matplotlib.pylab import rcParams

#set up the parameters
# xgbreg is the RandomizedSearchCV wrapper, which has no `_Booster` attribute;
# the fitted model selected by the search is `best_estimator_`, and plot_tree
# accepts that fitted estimator directly.
plot_tree(xgbreg.best_estimator_, num_trees=0, rankdir='LR')
rcParams['figure.figsize'] = 80,50
fig = plt.gcf()
# Enlarge the figure so the node labels stay readable in the saved image.
fig.set_size_inches(150, 100)
fig.savefig('tree.png')


### POLYNOMIAL REGRESSION

In [None]:
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression

# # create a Linear Regressor   
# lin_regressor = LinearRegression()

# # pass the order of your polynomial here  
# poly = PolynomialFeatures(2)

# # convert to be used further to linear regression
# X_transform = poly.fit_transform(X.Last_Week_Sales.values.reshape(-1, 1))

# # fit this to Linear Regressor
# lin_regressor.fit(X_transform,Y.values.reshape(-1,1)) 

# # get the predictions
# y_pred = lin_regressor.predict(X_test.Last_Week_Sales.values.reshape(-1, 1))

# import math
# print("MSE: ",mean_squared_error((Y_test), (y_pred)),
#       "RMSE: ",math.sqrt(mean_squared_error((Y_test), (y_pred)))
#      )


### GridSearch For Best Model 

In [None]:

# Search over a fresh XGBRegressor. xgbreg is itself a RandomizedSearchCV by
# now, and wrapping it again fails because the keys in `params` are
# XGBRegressor parameters, not parameters of the search object.
gs = RandomizedSearchCV(XGBRegressor(), params, n_jobs=1)  
gs.fit(X, Y)


In [None]:
#gs.best_estimator_

In [None]:
# Hold-out error of the best model found by the `gs` search.
y_pred = gs.best_estimator_.predict(X_test)

import math
print("MSE: ",mean_squared_error((Y_test), (y_pred)),
      "RMSE: ",math.sqrt(mean_squared_error((Y_test), (y_pred)))
     )
# print("MSE: ",mean_squared_error((np.exp(Y_test)), np.exp((y_pred))),
#       "RMSE: ",math.sqrt(mean_squared_error(np.exp((Y_test)), np.exp((y_pred))))
#      )

### LGBM MODEL

In [None]:
# LightGBM settings: L2 regression objective and metric, 4 threads.
# Note that this rebinds `params`, which the XGBoost search cells also use.
params = {
    'num_leaves': 50,
    'objective': 'regression_l2',
    'max_depth': 12,
    'min_data_in_leaf': 20,
    'learning_rate': 0.001,
    'feature_fraction': 0.77,
    'bagging_fraction': 0.77,
    'bagging_freq': 3,
    'metric': 'l2',
    'num_threads': 4
}
MAX_ROUNDS = 1000

lgb_train = lgb.Dataset(X, Y)
lgb_test = lgb.Dataset(X_test, Y_test, reference=lgb_train)

# NOTE(review): the test set doubles as the early-stopping validation set, so
# the metrics printed below are measured on data that influenced the number
# of boosting rounds.
gbm = lgb.train(
       params, lgb_train, num_boost_round=MAX_ROUNDS,
       valid_sets=lgb_test, early_stopping_rounds=50, verbose_eval=50
   )
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

import math
print("MSE: ",mean_squared_error((Y_test), (y_pred)),
      "RMSE: ",math.sqrt(mean_squared_error((Y_test), (y_pred)))
     )
# print("MSE: ",mean_squared_error((np.exp(Y_test)), np.exp((y_pred))),
#       "RMSE: ",math.sqrt(mean_squared_error(np.exp((Y_test)), np.exp((y_pred))))
#      )

### Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Small random forest: 10 trees, depth at most 8, at most 50 leaves, fixed seed.
# NOTE(review): criterion='mse', max_features='auto' and min_impurity_split are
# spelled as in older scikit-learn releases - confirm against the installed version.
forest = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features='auto', max_leaf_nodes=50,
           min_impurity_decrease=0.001, min_impurity_split=None,
           min_samples_leaf=20, min_samples_split=10,
           min_weight_fraction_leaf=0.001, n_estimators=10, n_jobs=1,
           oob_score=True, random_state=0, verbose=0, warm_start=False)

forest.fit(X, Y)

y_pred = forest.predict(X_test)

import math
# Hold-out error on the test weeks.
print("MSE: ",mean_squared_error((Y_test), (y_pred)),
      "RMSE: ",math.sqrt(mean_squared_error((Y_test), (y_pred))),
      "R2:", r2_score(Y_test,y_pred)
     )
# print("MSE: ",mean_squared_error((np.exp(Y_test)), np.exp((y_pred))),
#       "RMSE: ",math.sqrt(mean_squared_error(np.exp((Y_test)), np.exp((y_pred))))
#      )

In [None]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron with three hidden layers (4, 6 and 2 units).
# NOTE(review): Xt / Xt_test (features plus mean word embeddings) are built in
# the last cell of this notebook; run that cell first.
mlp = MLPRegressor(hidden_layer_sizes=(4,6,2),
                                       activation='relu',
                                       solver='adam',
                                       learning_rate='adaptive',
                                       max_iter=200,
                                       learning_rate_init=0.0001,
                                       alpha=0.02,
                                       batch_size = 400,
                                       verbose=True)
mlp.fit(Xt, Y)

y_pred = mlp.predict(Xt_test)
import math

# y_pred1 = scalerY.inverse_transform(y_pred.reshape(-1,1)).copy()
# Y_test1 = scalerY.inverse_transform(Y_test.reshape(-1,1)).copy()

# Hold-out error on the test weeks.
print("MSE: ",mean_squared_error((Y_test), (y_pred)),
      "RMSE: ",math.sqrt(mean_squared_error((Y_test), (y_pred))),
      "R2:", r2_score(Y_test,y_pred)
     )

In [None]:
# from sklearn import linear_model
# # sgd = linear_model.SGDRegressor(alpha=0.0001, average=False, epsilon=1, eta0=0.00001,
# #        fit_intercept=True, l1_ratio=0.001, learning_rate='invscaling',
# #        loss='huber', max_iter=None, n_iter=300, penalty='l2',
# #        power_t=0.000015, random_state=None, shuffle=True, tol=None,
# #        verbose=0, warm_start=True)
# # sgd.fit(X, Y)
# # y_pred = sgd.predict(X_test)
# params = {  
#     "alpha": [i/1000 for i in range(1,100)],
#     "epsilon": [i/1000 for i in range(1,100)],
#     "eta0": [i/1000 for i in range(1,100)],
#     "l1_ratio": [i/1000 for i in range(1,100)],
#     "learning_rate": ['invscaling','optimal'],
#     "loss": ['huber','squared_loss'],
#     'max_iter': [i for i in range(50,500)],
#     "power_t": [i/1000 for i in range(1,100)],
#     "random_state": [1001]
# }

# sgd = linear_model.SGDRegressor()
# sgd = RandomizedSearchCV(sgd, param_distributions=params,scoring='r2',n_iter=10000,verbose=100,n_jobs=4)
# sgd.fit(X, Y)

# y_pred = sgd.best_estimator_.predict(X_test)

# import math

# y_pred1 = scalerY.inverse_transform(y_pred.reshape(-1,1)).copy()
# Y_test1 = scalerY.inverse_transform(Y_test.reshape(-1,1)).copy()

# print("MSE: ",mean_squared_error((Y_test1), (y_pred1)),
#       "RMSE: ",math.sqrt(mean_squared_error((Y_test1), (y_pred1))),
#       "R2:", r2_score(Y_test1,y_pred1)
#      )

In [None]:
# Predictions vs. actuals for whichever model produced the current y_pred.
hold = X_test.copy()

hold['Predicted'] = y_pred
hold['Actual'] = Y_test

print('Total Actual Demand : ',hold.Actual.sum(),
      '\nTotal Predicted Demand : ',hold.Predicted.sum())

# Re-attach the two result columns to the feature columns via an index join.
hold = X_test[X_test.columns].join(hold[['Actual','Predicted']])
hold[hold.Actual > 6].head(25)

In [None]:
from sklearn import linear_model
# Ridge regression baseline (alpha=0.213, 'lsqr' solver).
# NOTE(review): the normalize=True argument is spelled as in older
# scikit-learn releases - confirm it is still accepted by the installed version.
reg = linear_model.Ridge (alpha=0.213, copy_X=True, fit_intercept=True, max_iter=500,
      normalize=True, random_state=1, solver='lsqr', tol=0.001)
reg.fit(X,Y)

y_pred = reg.predict(X_test)
import math

# y_pred1 = scalerY.inverse_transform(y_pred.reshape(-1,1)).copy()
# Y_test1 = scalerY.inverse_transform(Y_test.reshape(-1,1)).copy()
# Hold-out error on the test weeks.
print("MSE: ",mean_squared_error((Y_test), (y_pred)),
      "RMSE: ",math.sqrt(mean_squared_error((Y_test), (y_pred))),
      "R2:", r2_score(Y_test,y_pred)
     )

# print("MSE: ",mean_squared_error((np.expm1(Y_test)), np.expm1((y_pred))),
#       "RMSE: ",math.sqrt(mean_squared_error((np.expm1(Y_test)), np.expm1((y_pred)))),
#       "R2:", r2_score(np.expm1(Y_test),np.expm1(y_pred))
#      )

# Fitted coefficients (one per column of X) and intercept.
print(reg.coef_)
print(reg.intercept_)

In [11]:
import spacy
# spaCy pipeline used for lemmatisation.
# NOTE(review): 'en' is a shortcut model name; confirm the installed spaCy
# version still resolves it.
nlp = spacy.load('en')

# Small sample of the concatenated product/category text for experimentation.
docs = temp['CatConcat'].head().tolist()

def token_filter(token):
    """Return True when `token` should be kept for the product-text corpus.

    A token is dropped when its text is purely numeric, or when it is
    punctuation, whitespace, a stop word, or at most three characters long.

    The conditions are combined with `or`: the previous
    `a | b | c | len(text) <= 3` parsed as `(a | b | c | len(text)) <= 3`
    because `|` binds tighter than `<=`, so the punctuation / whitespace /
    stop-word flags were effectively ignored.
    """
    if str(token).isdigit():
        return False
    return not (
        token.is_punct
        or token.is_space
        or token.is_stop
        or len(token.text) <= 3
    )

# Lemmas of the tokens that pass token_filter, one list per sample document.
filtered_tokens = [
    [token.lemma_ for token in doc if token_filter(token)]
    for doc in nlp.pipe(docs)
]

In [4]:
import os
# Use the cached lemmatised product text when it exists; otherwise build it
# with spaCy and write the cache file.
if os.path.exists('./productCategoryStemmedDataFile'):
    productCategoryStemmed = pd.read_pickle('./productCategoryStemmedDataFile', compression='infer')
    productCategoryStemmed.columns = ['SKU','PRODUCT_STEMMED']
    # Keep only the part after the 'SKU-*-' prefix that was prepended before parsing.
    productCategoryStemmed['PRODUCT_STEMMED'] = pd.DataFrame(productCategoryStemmed)['PRODUCT_STEMMED'].apply(lambda x: x.split('-*-')[1])
    # No `on=` given: the merge uses the columns the two frames share.
    temp = pd.merge(temp, productCategoryStemmed, how='inner')
    print('File Found')
    
else:       
    #docs = temp['CatConcat'].head(1000).values
    def normalize(docs):
        """Return a list of (key, lemmatised text) tuples, one per document.

        Each document is expected to look like '<key>-*-<text>'; the key is
        the part before the first '-*-'.
        """
        filtered_tokens = []
        for doc in nlp.pipe(docs):
            key = doc.text.split('-*-')[0]
            #doc = doc.map(str).split('-*-')[1]
            tokens = [token.lemma_ for token in doc if token_filter(token)]
            filtered_tokens.append((key," ".join(tokens)))
        return filtered_tokens

    # NOTE(review): this branch only writes the cache; PRODUCT_STEMMED is not
    # merged into `temp` until the cell is run again and takes the first branch.
    %time test = temp['SKU'].map(str)+'-*-'+temp['CatConcat'].map(str)
    %time productCategoryStemmed = normalize(test.drop_duplicates().tolist())
    pd.DataFrame(productCategoryStemmed).to_pickle('./productCategoryStemmedDataFile',compression='infer', protocol=4)
   

File Found


In [None]:
#nlp.vocab['game'].similarity(nlp.vocab['game'])
from spacy.lang.en import English
# Five sample documents, a bare English pipeline and a demo sentence for the
# experiments below; filtered_tokens is the global list getTokens() appends to.
docs = temp['PRODUCT_STEMMED'].head(5).tolist()
parser = English()
text1 = "I like statements that are both true and absurd."
filtered_tokens = []
def getTokens(docs):
    """Generator that lemmatises `docs` and yields the accumulated token lists.

    Every document's lemma list is appended to the module-level
    `filtered_tokens`; once all documents are processed that same list is
    yielded exactly once.
    """
    for parsed in nlp.pipe(docs):
        filtered_tokens.append([tok.lemma_ for tok in parsed])
    yield filtered_tokens
        
#print ([data for data in getTokens(docs)])
# tokens = parser(text1)
# tokens = [token.orth_ for token in tokens if token_filter(token)]
# print(tokens)

In [77]:
def most_similar(word):
    """Return up to ten entries of `word.vocab` most similar to `word`.

    Candidates must have the same `is_lower` flag as `word` and a `prob` of
    at least -15; they are ranked by `word.similarity`, best first.
    """
    candidates = []
    for lexeme in word.vocab:
        if lexeme.is_lower == word.is_lower and lexeme.prob >= -15:
            candidates.append(lexeme)
    candidates.sort(key=lambda lexeme: word.similarity(lexeme), reverse=True)
    return candidates[:10]
 
# Lower-cased forms of the vocabulary entries closest to 'cricket'.
[w.lower_ for w in most_similar(nlp.vocab['cricket'])]

[]

In [89]:
import gensim
import nltk
# WORD2VEC
class MySentences(object):
    """Re-iterable stream of lemmatised documents.

    Every array passed to the constructor is run through the module-level
    spaCy pipeline `nlp`; iterating yields one list of lemmas per document.

    Args:
        arrays: one or more iterables of document strings.
    """
    def __init__(self, *arrays):
        self.arrays = arrays

    def __iter__(self):
        for batch in self.arrays:
            for parsed in nlp.pipe(batch):
                yield [tok.lemma_ for tok in parsed]
                
def get_word2vec(sentences, location):
    """Load the word2vec model stored at `location`, training it first if needed.

    Args:
        sentences: iterable of token lists, used only when a model has to be
            trained.
        location (str): path the model is loaded from or saved to.

    Returns:
        The loaded or newly trained gensim Word2Vec model.
    """
    import os

    if not os.path.exists(location):
        print('{} not found. training model'.format(location))
        trained = gensim.models.Word2Vec(sentences, size=400, window=4, min_count=5, workers=4)
        print('Model done training. Saving to disk')
        trained.save(location)
        return trained

    print('Found {}'.format(location))
    return gensim.models.Word2Vec.load(location)

# Train (or load from ./w2vmodel) word vectors on the distinct lemmatised product texts.
w2vec = get_word2vec(MySentences(temp['PRODUCT_STEMMED'].unique()), './w2vmodel')

./w2vmodel not found. training model
Model done training. Saving to disk


In [73]:
from spacy.tokenizer import Tokenizer
import spacy
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe(nlp.create_pipe('sentencizer'))
# Tokenizer-only pass over the first few stemmed product texts.
tokenizer = Tokenizer(nlp.vocab)
for doc in tokenizer.pipe(temp['PRODUCT_STEMMED'].head(), batch_size=50):
    # Fix: iterate over the tokens. `for token.lemma_ in doc` used an attribute
    # of a previously bound `token` as the loop target instead of binding each
    # token of the document.
    for token in doc:
        print(token.lemma_)

#ss = temp['PRODUCT_STEMMED'].head().apply(lambda x: nlp.tokenizer(x))

In [173]:
# Nearest neighbours of '18-k' in the embedding space. The query goes through
# the KeyedVectors object (`.wv`), as the other cells of this notebook do;
# calling most_similar on the model itself is the deprecated form.
w2vec.wv.most_similar(positive=['18-k'], topn=10)

  """Entry point for launching an IPython kernel.


[('24-k', 0.9723449945449829),
 ('1k', 0.9183204174041748),
 ('18k', 0.9137002229690552),
 ('boondi', 0.9135420918464661),
 ('14-k', 0.9078975319862366),
 ('samee', 0.8863838315010071),
 ('simulat', 0.8863443732261658),
 ('rscw-5500', 0.8844240307807922),
 ('beneta', 0.8781148195266724),
 ('milka', 0.8773046731948853)]

In [133]:
class MyTokenizer:
    """Stateless scikit-learn style transformer that splits documents into
    word tokens with NLTK (sentence split first, then word split)."""

    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self
    
    def transform(self, X):
        """Tokenize every document in `X`.

        Args:
            X: iterable of document strings.

        Returns:
            1-D object array with one numpy array of tokens per document.
        """
        tokenized_docs = []
        for document in X:
            tokenized_doc = []
            for sent in nltk.sent_tokenize(document):
                tokenized_doc += nltk.word_tokenize(sent)
            tokenized_docs.append(np.array(tokenized_doc))

        # Documents have different token counts. The container is filled
        # slot by slot as an explicit object array, because np.array() on
        # ragged input is rejected by recent NumPy releases (and silently
        # built a 2-D array whenever all documents had the same length).
        transformed_X = np.empty(len(tokenized_docs), dtype=object)
        for position, tokens in enumerate(tokenized_docs):
            transformed_X[position] = tokens
        return transformed_X
    
    def fit_transform(self, X, y=None):
        """Same as transform(); `y` is ignored."""
        return self.transform(X)

class MeanEmbeddingVectorizer(object):
    """Represent each document by the mean of its in-vocabulary word vectors."""

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Vector length, taken from the first stored vector; used to build the
        # all-zero vector returned for documents without any known word.
        self.dim = len(word2vec.wv.syn0[0])

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X):
        """Tokenize the documents in `X` with MyTokenizer and return an array
        with one mean word vector per document."""
        doc_vectors = []
        for words in MyTokenizer().fit_transform(X):
            known = [self.word2vec.wv[w] for w in words if w in self.word2vec.wv]
            if not known:
                known = [np.zeros(self.dim)]
            doc_vectors.append(np.mean(known, axis=0))
        return np.array(doc_vectors)

    def fit_transform(self, X, y=None):
        """Same as transform(); `y` is ignored."""
        return self.transform(X)

In [134]:
# Mean word-vector representation of the product text for train and test rows.
# NOTE(review): needs a PRODUCT_STEMMED column on X / X_test; the column
# selection in the split cells above does not keep one.
mean_embedding_vectorizer = MeanEmbeddingVectorizer(w2vec)
mean_embedded = mean_embedding_vectorizer.fit_transform(X['PRODUCT_STEMMED'])

mean_embedded_test = mean_embedding_vectorizer.transform(X_test['PRODUCT_STEMMED'])



In [177]:
# Vocabulary of the trained model, the matching vectors, and the vocabulary size.
words = w2vec.wv.index2word
wvs = w2vec.wv[words]
len(words)

15795

In [189]:
# Manual re-computation of the mean embedding for the first few documents.
Xx = MyTokenizer().fit_transform(X['PRODUCT_STEMMED'].head())
# Fix: there is no `self` at notebook level, so the zero-vector fallback for
# documents without any known word raised NameError; the vector length is
# taken from the trained model instead.
dd = np.array([np.mean([w2vec.wv[w] for w in words if w in w2vec.wv]
                    or [np.zeros(w2vec.wv.vector_size)], axis=0) for words in Xx]) 


In [222]:
def _withEmbeddings(features, embeddings):
    """Put the embedding columns next to the feature columns, row by row
    (both sides are re-indexed from 0 so they line up by position)."""
    featurePart = features.reset_index(drop=True)
    embeddingPart = pd.DataFrame(embeddings).reset_index(drop=True)
    return pd.concat([featurePart, embeddingPart], axis=1)


Xt = _withEmbeddings(X, mean_embedded)
Xt_test = _withEmbeddings(X_test, mean_embedded_test)


(903442,)