In [2]:
import warnings
import itertools
import numpy as np
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')
import pandas as pd
import statsmodels.api as sm
from pytrends.request import TrendReq
import pytrends as pt
import matplotlib
import json
matplotlib.rcParams['axes.labelsize'] = 14
matplotlib.rcParams['xtick.labelsize'] = 12
matplotlib.rcParams['ytick.labelsize'] = 12
matplotlib.rcParams['text.color'] = 'G'

#%matplotlib notebook

In [3]:
#This functions returns a list of 4 attributes which have the maximum trend score on google trends when queried 5 at a time.
def max4(cat):
    att = [0,1,2,3,4]
    tr_lst = []
    if(len(cat) < 5):
        return range(0,len(cat))
    elif(len(cat) == 5):
        i_stack = []
        tr_lst = tr_scr_mn(cat,att)
    else:
        i_stack = list(range(5,len(cat)))
    
    while(len(i_stack) > 0):
        tr_lst = tr_scr_mn(cat,att)
        min_i = np.argmin(tr_lst)
        att.remove(att[min_i])
        att.append(i_stack.pop())
    
    min_i = np.argmin(tr_lst)
    att.remove(att[min_i])
    return att

#This function returns the mean trend score of attributes of a category.
#att = list of indices of attributes.
#cat = list of attributes.
def tr_scr_mn(cat,att):
    trend = TrendReq(hl='en-US', tz=360)
    field = [cat[i] for i in att]
    kw_list = field
    trend.build_payload(kw_list, cat=search_domains['Apparel'], timeframe='today 5-y', geo='', gprop='')
    df = trend.interest_over_time()
    return list(df.mean())[:-1]

#This function returns a Dataframe() with Trend score data from Pytrends for 5 attributes of a category.
def tr_scr_df(cat,att):
    trend = TrendReq(hl='en-US', tz=360)
    field = [cat[i] for i in att]
    kw_list = field
    trend.build_payload(kw_list, cat=search_domains['Apparel'], timeframe='today 5-y', geo='', gprop='')
    df = trend.interest_over_time()
    return df


###This function returns a list of Dataframes containing trend scores of last 5 years of all the non max attributes with respect to max 4 attributes.###
def tr_scr_nmx_df(cat,max_4):
    fab_scr = []
    max4_ = set(max_4)
    all_c = set(range(0,len(cat)))
    nmx = all_c - max4_
    for i in nmx:
        trend = TrendReq(hl='en-US', tz=360)
        field = [cat[max_4[0]],cat[max_4[1]],cat[max_4[2]],cat[max_4[3]],cat[i]]
        kw_list = field
        trend.build_payload(kw_list, cat=search_domains['Apparel'], timeframe='today 5-y', geo='', gprop='')
        df = trend.interest_over_time()
        fab_scr.append(df.reset_index())
    return fab_scr

###This functions cleans the dataframe recieved from the above function and seperate out all the attributes and returns a list of series.###
def mk_ser_lst(scr_df):
    scrs = []

    for df in scr_df:
        y = pd.DataFrame()
        y['date'] = df['date']
        y[df.columns[-2]] = df[df.columns[-2]]
        y = y.set_index('date')
        scrs.append(y)

    for i in range(1,5):
        y = pd.DataFrame()
        y['date'] = scr_df[0]['date']
        y[scr_df[0].columns[i]] = scr_df[0][scr_df[0].columns[i]]
        y = y.set_index('date')
        scrs.append(y)
        
    return scrs
    
###Calculates and returns AIC scores for all the possible permutations of SARIMAX parameters in a form of a dictionary, where keys are the AIC scores.###

def all_AIC(scrs):
    aic = {}
    aics = []
    p = d = q = range(0, 2)
    pdq = list(itertools.product(p, d, q))
    seasonal_pdq = [(x[0], x[1], x[2], 25) for x in list(itertools.product(p, d, q))]
    for y in scrs:
        for param in pdq:
            for param_seasonal in seasonal_pdq:
                try:
                    mod = sm.tsa.statespace.SARIMAX(y,order=param,seasonal_order=param_seasonal,enforce_stationarity=False,enforce_invertibility=False)
                    results = mod.fit()
                    #print('ARIMA{}x{}12 - AIC:{}'.format(param,param_seasonal,results.aic))
                    aic[results.aic] = [param,param_seasonal]
                except: 
                    continue
        aics.append(aic)
    return aics

###Fits the model for each attribute using the suitable parameter, that is the parameter with least absolute AIC value and returns a list of results###
def mk_res(aics,scrs):
    results = []
    for i in range(0,len(aics)):
        
        x = list(aics[i][list(aics[i].keys())[np.argmin(np.abs(list(aics[i].keys())))]][1])
        x.remove(25)
        x.append(50)
        x = tuple(x)
        mod = sm.tsa.statespace.SARIMAX(scrs[i],
                                    order=aics[i][list(aics[i].keys())[np.argmin(np.abs(list(aics[i].keys())))]][0],
                                    seasonal_order=x,
                                    enforce_stationarity=False,
                                    enforce_invertibility=False)
        result = mod.fit()
        results.append(result)
    return results

###Computes and returns a list of Dataframe of forecasts for the next two months for each attribute###
def all_fore(results):
    pred_ci = []

    for result in results:
        pred_uc = result.get_forecast(steps=8)
        pred_ci.append(pred_uc.conf_int())
    
    return pred_ci

###Computes and returns a dictionary with keys as attributes and Mean of forecast for next two months scaled upto 100### 
def all_mean_fore(pred_ci):
    for d in pred_ci:
        d['mean'] = (d[d.columns[0]] +d[d.columns[1]])/2
    mfwf = {}
    for d in pred_ci:
        mfwf[d.columns[0].replace(d.columns[0].split()[0]+' ','')] = d['mean'].iloc[:4].mean()
    mxfw = max(mfwf.values())
    for k in mfwf.keys():
        mfwf[k] = mfwf[k]*100/mxfw
    return mfwf

In [4]:
#Dictionary of search domain codes to be used by pytrends.

search_domains = {
    'Fashion & Style': 185,
    'Fashion Designers & Collections': 98,
    'Fashion Modeling': 1155,
    'Shopping': 18,
    'Antiques & Collectibles': 64,
    'Apparel': 68,
    'Apparel Services': 1228,
    'Athletic Apparel': 983,
    'Casual Apparel': 984,
    'T-Shirts': 428,
    "Children's Clothing": 985,
    'Clothing Accessories': 124,
    'Gems & Jewelry': 350,
    'Handbags & Purses': 986,
    'Watches': 987,
    'Costumes': 988,
    'Eyewear': 989,
    'Eyeglasses & Contacts': 1224,
    'Footwear': 697,
    'Formal Wear': 990,
    'Headwear': 991,
    "Men's Clothing": 992,
    'Outerwear': 993,
    'Sleepwear': 994,
    'Swimwear': 995,
    'Undergarments': 530,
    'Uniforms & Workwear': 996,
    "Women's Clothing": 997
}

In [5]:
attr = 'sleeve_length.json'
df = pd.read_json(attr)
cat = df[df.columns[0]]
cat

0    Short sleeve
1    Elbow sleeve
2     Long sleeve
3      3/4 sleeve
4      Sleeveless
Name: Sleeve Length, dtype: object

In [6]:
#using max4 function.
max_4 = max4(cat)
max_4

[0, 2, 3, 4]

In [7]:
# Using tr_scr_nmx_df.
scr_df = tr_scr_nmx_df(cat,max_4)
scr_df


[          date  Short sleeve  Long sleeve  3/4 sleeve  Sleeveless  \
 0   2015-08-09            13           33           3          16   
 1   2015-08-16            13           33           3          14   
 2   2015-08-23            11           38           3          14   
 3   2015-08-30            11           40           3          14   
 4   2015-09-06            10           40           3          13   
 ..         ...           ...          ...         ...         ...   
 256 2020-07-05            22           53           2          20   
 257 2020-07-12            23           56           2          19   
 258 2020-07-19            23           56           2          19   
 259 2020-07-26            22           55           2          19   
 260 2020-08-02            21           54           2          18   
 
      Elbow sleeve isPartial  
 0               0     False  
 1               0     False  
 2               0     False  
 3               0     False  
 4 

In [8]:
#In this a List of Series() is created each containing trend score for the past 5 years for each attribute    
scrs = mk_ser_lst(scr_df)
scrs

[            Elbow sleeve
 date                    
 2015-08-09             0
 2015-08-16             0
 2015-08-23             0
 2015-08-30             0
 2015-09-06             0
 ...                  ...
 2020-07-05             0
 2020-07-12             0
 2020-07-19             0
 2020-07-26             0
 2020-08-02             0
 
 [261 rows x 1 columns],
             Short sleeve
 date                    
 2015-08-09            13
 2015-08-16            13
 2015-08-23            11
 2015-08-30            11
 2015-09-06            10
 ...                  ...
 2020-07-05            22
 2020-07-12            23
 2020-07-19            23
 2020-07-26            22
 2020-08-02            21
 
 [261 rows x 1 columns],
             Long sleeve
 date                   
 2015-08-09           33
 2015-08-16           33
 2015-08-23           38
 2015-08-30           40
 2015-09-06           40
 ...                 ...
 2020-07-05           53
 2020-07-12           56
 2020-07-19         

In [9]:
#This code block is for calculating AIC(Akaike's Information Criterion) for each of the permutation of parameters for each of the attribute. The permutation with the least AIC is deemed best for the model.            
aics = all_AIC(scrs)
aics

[{109.29930934412168: [(0, 0, 0), (0, 0, 0, 25)],
  124.73965565628446: [(0, 0, 0), (0, 0, 1, 25)],
  269.78054203578563: [(0, 0, 0), (0, 1, 0, 25)],
  111.986512458658: [(0, 0, 0), (0, 1, 1, 25)],
  124.25131664584073: [(0, 0, 0), (1, 0, 0, 25)],
  123.21476585542769: [(0, 0, 0), (1, 0, 1, 25)],
  123.59088714675934: [(0, 0, 0), (1, 1, 0, 25)],
  118.09593887750962: [(0, 0, 0), (1, 1, 1, 25)],
  89.03895192246097: [(0, 0, 1), (0, 0, 0, 25)],
  105.82997520649592: [(0, 0, 1), (0, 0, 1, 25)],
  247.23271928252078: [(0, 0, 1), (0, 1, 0, 25)],
  99.827386545871: [(0, 0, 1), (0, 1, 1, 25)],
  105.43719723639896: [(0, 0, 1), (1, 0, 0, 25)],
  105.74396178132764: [(0, 0, 1), (1, 0, 1, 25)],
  115.10759837439264: [(0, 0, 1), (1, 1, 0, 25)],
  108.54839586865968: [(0, 0, 1), (1, 1, 1, 25)],
  178.7018140712056: [(0, 1, 0), (0, 0, 0, 25)],
  187.39827995203336: [(0, 1, 0), (0, 0, 1, 25)],
  322.9403595405334: [(0, 1, 0), (0, 1, 0, 25)],
  182.0525956001946: [(0, 1, 0), (0, 1, 1, 25)],
  187.179

In [10]:
#This code block fits the SARIMAX model on the attribute time series trend score and stores result in the list "results".
results = mk_res(aics,scrs)
results

[<statsmodels.tsa.statespace.sarimax.SARIMAXResultsWrapper at 0x2871dd739c8>,
 <statsmodels.tsa.statespace.sarimax.SARIMAXResultsWrapper at 0x2871dd51988>,
 <statsmodels.tsa.statespace.sarimax.SARIMAXResultsWrapper at 0x2871dd93e08>,
 <statsmodels.tsa.statespace.sarimax.SARIMAXResultsWrapper at 0x2871d713d48>,
 <statsmodels.tsa.statespace.sarimax.SARIMAXResultsWrapper at 0x28718074548>]

In [86]:
#Storing the forecast in a list.     
pred_ci = all_fore(results)
pred_ci

[            lower Elbow sleeve  upper Elbow sleeve
 2020-08-09           -0.428282            0.513218
 2020-08-16           -0.444709            0.529644
 2020-08-23           -0.460599            0.545534
 2020-08-30           -0.476002            0.560938
 2020-09-06           -0.490961            0.575896
 2020-09-13           -0.505511            0.590447
 2020-09-20           -0.519685            0.604621
 2020-09-27           -0.533510            0.618446,
             lower Short sleeve  upper Short sleeve
 2020-08-09           18.538849           23.634085
 2020-08-16           17.636956           24.535978
 2020-08-23           16.926192           25.246742
 2020-08-30           16.320268           25.852666
 2020-09-06           15.783127           26.389808
 2020-09-13           15.295596           26.877338
 2020-09-20           14.846038           27.326896
 2020-09-27           14.426759           27.746175,
             lower Long sleeve  upper Long sleeve
 2020-08-09 

In [88]:
#Calculating the mean forecast for 2 months from now and normalizing it to a scale of 100.
all_mean_fore(pred_ci)

{'Elbow sleeve': 0.07562839443402884,
 'Short sleeve': 37.55156574591551,
 'Long sleeve': 100.0,
 '3/4 sleeve': 3.5857950520600426,
 'Sleeveless': 32.51127956586197}