In [1]:
import pandas as pd
import numpy as np
import datetime
import pickle

In [2]:
def unifDate(d):
    '''
    Convert a date string written as month/day/year (e.g. "9/3/2018")
    into the year-month-day form "YYYY-MM-DD" (e.g. "2018-09-03").
    '''
    parsed = datetime.datetime.strptime(d, "%m/%d/%Y")
    return parsed.strftime("%Y-%m-%d")

def datemod(data):
    '''
    Map every order to its adjusted business day.

    `data` must expose `orderDate` ("YYYY-MM-DD" strings) and `orderTime`
    ("HH:MM:SS" strings). Orders whose hour is 0, 1 or 2 are attributed to
    the previous calendar day; all other orders keep their own date. The
    very first row is always kept unchanged, whatever its hour.

    Returns a list of "YYYY-MM-DD" strings, one per row, in row order.
    '''
    one_day = datetime.timedelta(days=1)
    adjusted = []
    for position, (order_date, order_time) in enumerate(zip(data.orderDate, data.orderTime)):
        hour_text, _minute_text, _second_text = order_time.split(':')
        after_midnight = int(hour_text) <= 2
        if position > 0 and after_midnight:
            # Orders placed shortly after midnight count towards the previous day
            previous_day = datetime.datetime.strptime(order_date, "%Y-%m-%d") - one_day
            adjusted.append(previous_day.strftime("%Y-%m-%d"))
        else:
            adjusted.append(order_date)
    return adjusted

def week_n(data,fall_18=0,spring_19=0,spring_18=0):
    '''
    Compute the semester week number for every row of `data`.

    Parameters
    ----------
    data : DataFrame with a 'Datemod' column of "YYYY-MM-DD" strings.
    fall_18, spring_19, spring_18 : set exactly one of these to 1 to choose
        the semester. If several are set, the last one in the order
        fall_18 -> spring_19 -> spring_18 wins.

    Returns
    -------
    list of int
        ceil((date - reference_date) / 7 days) per row, where the reference
        date is 2018-09-02 (fall 18), 2019-01-13 (spring 19) or
        2018-01-14 (spring 18). A date equal to the reference date is week 0.

    Raises
    ------
    ValueError
        If no semester flag is set (previously this surfaced as an
        UnboundLocalError from inside the loop).
    '''
    start = None
    if fall_18==1:
        start = datetime.datetime.strptime('2018-09-02', "%Y-%m-%d")
    if spring_19==1:
        start = datetime.datetime.strptime('2019-01-13', "%Y-%m-%d")
    if spring_18 == 1:
        start = datetime.datetime.strptime('2018-01-14', "%Y-%m-%d")
    if start is None:
        raise ValueError("week_n requires one of fall_18, spring_19 or spring_18 to be set to 1")

    one_week = datetime.timedelta(days=7)
    return [
        int(np.ceil((datetime.datetime.strptime(d, "%Y-%m-%d") - start) / one_week))
        for d in data['Datemod']
    ]

def week_n_au(data,fall_18=0,spring_19=0,spring_18=0):
    '''
    Compute, for every row of a daily-active-users frame, the semester week
    number that the row's value refers to.

    Parameters
    ----------
    data : DataFrame with a 'formattedDate' column of "YYYY-MM-DD" strings.
    fall_18, spring_19, spring_18 : set exactly one of these to 1 to choose
        the semester. If several are set, the last one in the order
        fall_18 -> spring_19 -> spring_18 wins.

    Returns
    -------
    list of int
        ceil((date - reference_date) / 7 days) per row, where the reference
        date is 2018-08-19 (fall 18), 2018-12-30 (spring 19) or
        2017-12-31 (spring 18). These reference dates are two weeks earlier
        than the ones used by `week_n`.

    Raises
    ------
    ValueError
        If no semester flag is set (previously this surfaced as an
        UnboundLocalError from inside the loop).
    '''
    start = None
    if fall_18==1:
        start = datetime.datetime.strptime('2018-08-19', "%Y-%m-%d")
    if spring_19==1:
        start = datetime.datetime.strptime('2018-12-30', "%Y-%m-%d")
    if spring_18==1:
        start = datetime.datetime.strptime('2017-12-31', "%Y-%m-%d")
    if start is None:
        raise ValueError("week_n_au requires one of fall_18, spring_19 or spring_18 to be set to 1")

    one_week = datetime.timedelta(days=7)
    return [
        int(np.ceil((datetime.datetime.strptime(d, "%Y-%m-%d") - start) / one_week))
        for d in data['formattedDate']
    ]

In [3]:
def clean_item_18fall(filename, calendar_filename='Academic_Calander_Fall.csv', dau_filename='DAUs.csv'):
    """
    Build the per-cycle feature table for one top-10 item in Fall 2018.

    Parameters
    ----------
    filename : str
        CSV of order records for a single item. The columns read are
        itemName, orderDate (m/d/Y), orderTime (H:M:S), itemQuantity,
        itemPricePerUnit, itemPriceTotal, mealPlanBoolean, circuitName,
        dayOfWeek, discountPercent, discountPerUnit, discountTotal,
        discountType and deliveryDate.
    calendar_filename : str, optional
        Academic calendar CSV with deliveryDate (m/d/y), class, exam and
        sports columns. Defaults to the file the notebook always used.
    dau_filename : str, optional
        Daily-active-users CSV with formattedDate and f0_ columns.
        Defaults to the file the notebook always used.

    Returns
    -------
    pandas.DataFrame
        One row per (week, cycle) with columns week_n, sales (average daily
        sales in the cycle), last_2_week_sales, classes, exam, sports,
        Cycle__1, Cycle__2, Cycle__3, n_users, avg_over_last_3_cycles and
        item_name. Rows with any missing feature and the cycles around
        Thanksgiving are removed.

    Notes
    -----
    A week is split into three cycles: Mon-Wed (1), Thu-Fri (2), Sat-Sun (3).
    """
    data = pd.read_csv(filename, dtype=str, index_col=0)

    item_name = data.itemName.iloc[0]

    # .copy() so the column assignments below write to an independent frame
    data = data[['orderDate', 'orderTime','itemQuantity', 'itemPricePerUnit', 'itemPriceTotal','mealPlanBoolean','circuitName', 'dayOfWeek','discountPercent',
            'discountPerUnit','discountTotal', 'discountType','deliveryDate']].copy()

    data['orderDate'] = data.orderDate.apply(unifDate)
    data['mealPlanBoolean'] = data.mealPlanBoolean.apply(lambda x: 1 if x=='true' else 0)
    data['itemQuantity'] = data.itemQuantity.apply(int)
    data['itemPricePerUnit'] = data.itemPricePerUnit.apply(float)

    # Adjusted business day of every order (see datemod)
    data['Datemod'] = pd.Series(datemod(data),index=data.index)

    data = data[[ 'itemQuantity', 'itemPricePerUnit',  'circuitName', 'dayOfWeek', 'Datemod']].copy()
    data.set_index('Datemod', drop = False,inplace=True)

    #Monday:0 Sunday:6
    data['dayOfWeek'] = data.Datemod.apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d").weekday())
    data.sort_index(inplace=True)
    data = data[data['Datemod']>='2018-09-03'].copy()

    # Cycle 1 = Mon-Wed, cycle 2 = Thu-Fri, cycle 3 = Sat-Sun
    data['cycle_n'] = np.where(((data['dayOfWeek'] == 0)|(data['dayOfWeek'] == 1)|(data['dayOfWeek'] == 2)),1,
             np.where(((data['dayOfWeek'] == 3)|(data['dayOfWeek'] == 4)),2,3))

    data['week_n'] = pd.Series(week_n(data,fall_18=1),index=data.index)

    # Remove the circuitName column
    data1 = data[data.columns.difference(['circuitName'])].copy()

    data1['Fall/Spring'] = np.where(data1['Datemod'] <= '2019-01-01',1,0)

    # Get sales column
    sales = data1.groupby(by = ['week_n', 'cycle_n']).itemQuantity.sum()
    sales = sales.reset_index(level=['week_n', 'cycle_n'])

    df1 = pd.read_csv(calendar_filename)
    df1 = df1.dropna()
    df1['deliveryDate'] = df1.deliveryDate.apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%y").strftime("%Y-%m-%d"))
    df1 = df1.rename(columns = {'deliveryDate':'Datemod','class':'classes'})

    # 'Datemod' is both the index name and a column at this point; merging on
    # it is ambiguous (a warning in old pandas, an error in current pandas).
    # Dropping the index first keeps the merge on the column, which is what
    # the old "defaulting to column" behaviour did.
    data1 = pd.merge(data1.reset_index(drop=True), df1, on = 'Datemod', how='left')
    # Get exam days feature (computed on order-level rows, then binarised)
    df_exam = data1.groupby(['week_n','cycle_n']).exam.sum()
    df_exam = pd.DataFrame(df_exam,columns=['exam'])
    df_exam.exam = np.where(df_exam.exam>0,1,0)

    # From here on keep a single row per business day
    data1 = data1.drop_duplicates(subset=['Datemod'], keep='first').reset_index(drop=True)

    # Get class days feature
    df_class = data1.groupby(['week_n','cycle_n']).classes.sum()
    df_class = pd.DataFrame(df_class,columns=['classes'])
    # Get sports days feature
    df_sport = data1.groupby(['week_n','cycle_n']).sports.sum()
    df_sport = pd.DataFrame(df_sport,columns=['sports'])
    df_sport.sports = np.where(df_sport.sports>0,1,0)

    data1 = data1[['cycle_n', 'itemPricePerUnit','week_n','Fall/Spring']]

    # Get price feature, hasn't put into use yet
    price = data.groupby(by = ['week_n', 'cycle_n']).itemPricePerUnit.min()
    price = price.reset_index(level=['week_n', 'cycle_n'])

    # Prepare final data frame
    final = pd.merge(data1, sales, how = 'left', on = ['week_n', 'cycle_n'])
    final = final.drop_duplicates(subset=['week_n', 'cycle_n'], keep='first').reset_index(drop=True)
    final = final[['week_n','cycle_n','itemPricePerUnit','itemQuantity']]
    final = final.rename(columns = {'itemPricePerUnit':'price','itemQuantity':'sales'})

    # Get average daily sales for each cycle (cycle 1 spans 3 days, the others 2)
    final['sales'] = np.where((final['cycle_n']==1),final['sales']/3,final['sales']/2)
    # Get 2-week lag feature: 6 rows back, i.e. 3 cycles per week x 2 weeks
    # when every (week, cycle) pair is present
    final['last_2_week_sales'] = final['sales'].shift(6)

    final = pd.merge(final,df_class,on = ['week_n','cycle_n'], how='left')
    final = pd.merge(final,df_exam,on = ['week_n','cycle_n'], how='left')
    final = pd.merge(final,df_sport,on = ['week_n','cycle_n'], how='left')

    # dtype=int keeps the indicator columns as 0/1 integers on every pandas
    # version (pandas >= 2.0 would otherwise produce booleans)
    main_data = pd.get_dummies(final, prefix='Cycle_', columns=['cycle_n'], dtype=int)

    # Add weekly active users feature
    au = pd.read_csv(dau_filename)
    # Preprocessing especially for 18Fall daily active users data:
    # skip the first 5 rows, then keep the next 117
    au = au.iloc[5:].reset_index(drop=True).iloc[:117].copy()

    au['week_n'] = pd.Series(week_n_au(au,fall_18=1),index=au.index)

    au = au.rename(columns={'f0_':'n_users'})

    df_au = au.groupby('week_n').n_users.sum()

    df_au = df_au.reset_index(drop=False)

    # Single .loc assignment instead of chained indexing, which is not
    # guaranteed to write through to the frame
    df_au.loc[13, 'n_users'] = 0

    main_data = pd.merge(main_data,df_au,how='left',on='week_n')

    # Mean sales of week i becomes the "average over last 3 cycles" feature of week i+2
    weekly = [np.mean(main_data[main_data['week_n']==i].sales) for i in range(1,15)]

    avg_over_3_cycles = pd.DataFrame({'week_n':np.arange(1,15)+2,'avg_over_last_3_cycles':weekly})

    main_data = pd.merge(main_data,avg_over_3_cycles, on = 'week_n',how = 'left')

    main_data = main_data.dropna(axis=0)
    # Remove cycles near Thanksgiving week:
    # cycles 2 and 3 of weeks 11 and 13, and all of weeks 12 and 14
    week = main_data['week_n']
    late_cycle = (main_data['Cycle__2']==1) | (main_data['Cycle__3']==1)
    excluded = (week.isin([11, 13]) & late_cycle) | week.isin([12, 14])
    main_data = main_data[~excluded].copy()

    main_data['item_name'] = item_name

    main_data = main_data.drop('price',axis=1)

    return main_data

In [4]:
# Clean each of the ten Fall 2018 item files, stack them, and order the rows by week
filenames = [f'top{i}.csv' for i in range(1,11)]
dfs = list(map(clean_item_18fall, filenames))
main_data = pd.concat(dfs)
main_data1 = main_data.sort_values(by=['week_n']).reset_index(drop=True)

Defaulting to column, but this will raise an ambiguity error in a future version
  


Add an additional sellout feature and its interaction with the sales from two weeks ago

In [7]:
# Load the Fall 2018 sell-out table and align its item column name with main_data1.
# NOTE(review): absolute, machine-specific path; the pickle must come from a trusted source.
sellout = (
    pd.read_pickle('C:/Users/Baoyp/Documents/2019 Spring/GU project/sellout/sellout_df_fall18.pkl')
    .rename(columns = {'productName':'item_name'})
)

In [8]:
# Week 14 is not used for training; this should display no rows
main_data1.loc[main_data1['week_n'].eq(14)]

Unnamed: 0,week_n,sales,last_2_week_sales,classes,exam,sports,Cycle__1,Cycle__2,Cycle__3,n_users,avg_over_last_3_cycles,item_name


In [10]:
# Shift the sell-out week forward by two so that each row joins onto the week
# it is a lagged feature for; the shifted column ends up as the last column.
shifted_week = sellout.pop('week_n') + 2
sellout['week_n'] = shifted_week

In [13]:
# Rebuild the cycle number from the one-hot columns so it is available as a merge key
main_data1['cycle_n'] = np.select(
    [main_data1['Cycle__1']==1, main_data1['Cycle__2']==1],
    [1, 2],
    default=3,
)
# Left-join the sell-out table on every column the two frames share, then drop the helper key
main_data1 = pd.merge(main_data1,sellout,how='left').drop('cycle_n',axis=1)
# Keep only the model features plus the raw sell-out measure; this also discards
# any other sell-out columns before '75_percent' is renamed to 'sellout'
main_data1 = main_data1[['week_n', 'sales', 'last_2_week_sales', 'classes', 'exam', 'sports',
       'Cycle__1', 'Cycle__2', 'Cycle__3', 'n_users', 'avg_over_last_3_cycles','75_percent',
       'item_name']].rename(columns = {'75_percent':'sellout'})
# Interaction between the sell-out indicator and the two-week sales lag
main_data1['sellout*2_week_lag'] = main_data1['sellout']*main_data1['last_2_week_sales']
main_data1 = main_data1[['week_n', 'sales', 'last_2_week_sales', 'classes', 'exam', 'sports',
       'Cycle__1', 'Cycle__2', 'Cycle__3', 'n_users', 'avg_over_last_3_cycles','sellout','sellout*2_week_lag',
       'item_name']]

In [16]:
# Save the organized 18Fall data into a pickle.
# The context manager closes the file handle even if pickling fails.
with open('final_dataset_18fall.p', "wb") as pickle_file:
    pickle.dump(main_data1, pickle_file)

In [7]:
def clean_item_18spring(filename):
    """
    Build the per-cycle feature table for all ten top items of Spring 2018.

    Parameters
    ----------
    filename : str
        CSV filename of the Spring 2018 "mega data". The columns read are
        Datemod ("YYYY-MM-DD"), itemName, Sales, class, exam, basketball
        and lacrosse. The function also reads 'DAUs_Spring18.csv' from the
        working directory.

    Returns
    -------
    pandas.DataFrame
        Columns week_n, sales (average daily sales in the cycle),
        last_2_week_sales, classes, exam, sports, Cycle__1, Cycle__2,
        Cycle__3, n_users, avg_over_last_3_cycles and item_name.
        Week 9 (spring break) is removed.
    """
    df = pd.read_csv(filename,dtype = str,index_col=0)
    # Get data of top 10 items
    df = df[(df['itemName'] == 'Fries')|(df['itemName'] == 'Warm Chocolate Chunk Cookie')|(df['itemName'] == 'Bacon, Egg & Cheese on a Brioche Bun')
    |(df['itemName'] == "Sticky's Chicken Fingers")|(df['itemName'] == 'The Battle of Italy')|(df['itemName'] == 'Bacon, Egg & Cheese on an Ess-a-Bagel')
    |(df['itemName'] == 'Grilled Chicken Sandwich')|(df['itemName'] == 'Baked Penne Alla Vodka')|(df['itemName'] == 'Breakfast Burrito with Bacon, Avocado & Cheddar')
    |(df['itemName'] == "Joe's Pizza")]

    # Monday: 0 ... Sunday: 6
    df['dayOfWeek'] = df.Datemod.apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d").weekday())
    df.sort_index(inplace=True)
    # Keep only rows dated 2018-01-15 through 2018-05-11 (string comparison on ISO dates)
    df = df[df['Datemod']>='2018-01-15']
    df = df[df['Datemod']<='2018-05-11']
    # Cycle 1 = Mon-Wed, cycle 2 = Thu-Fri, cycle 3 = Sat-Sun
    df['cycle_n'] = np.where(((df['dayOfWeek'] == 0)|(df['dayOfWeek'] == 1)|(df['dayOfWeek'] == 2)),1,
             np.where(((df['dayOfWeek'] == 3)|(df['dayOfWeek'] == 4)),2,3))
    df['week_n'] = pd.Series(week_n(df,spring_18=1),index=df.index)

    df = df[['Datemod', 'itemName','Sales',  'class', 'exam', 
           'basketball', 'lacrosse','dayOfWeek', 'cycle_n', 'week_n']]

    # The CSV was read with dtype=str, so convert the numeric columns explicitly
    df.Sales=df.Sales.apply(float)
    df['class']=df['class'].apply(float)
    df.exam=df.exam.apply(float)
    df.basketball=df.basketball.apply(float)
    df.lacrosse=df.lacrosse.apply(float)
    # A row counts as a sports day when either basketball or lacrosse equals 1
    df['sports']=np.where(((df['basketball']==1)|(df['lacrosse']==1)),1,0)
    
    # Get sales amount for each item in each cycle
    sales = df.groupby(by = ['week_n', 'cycle_n','itemName']).Sales.sum()
    sales = sales.reset_index(level=['week_n', 'cycle_n','itemName'])
    # Get exam days feature
    df_exam = df.groupby(['week_n','cycle_n','itemName']).exam.sum()
    df_exam = pd.DataFrame(df_exam,columns=['exam'])
    df_exam.exam = np.where(df_exam.exam>0,1,0)
    # Get class days feature
    df_class = df.groupby(['week_n','cycle_n','itemName'])['class'].sum()/33 #33: total num of droppoints
    df_class = pd.DataFrame(df_class,columns=['class'])
    # Get sports days feature
    df_sport = df.groupby(['week_n','cycle_n','itemName']).sports.sum()
    df_sport = pd.DataFrame(df_sport,columns=['sports'])
    df_sport.sports = np.where(df_sport.sports>0,1,0)
    # NOTE(review): df_exam, df_class and df_sport are not merged back below; the
    # returned exam/class/sports values come from the first row kept per group.

    df = df.drop(['basketball','lacrosse','Datemod'],axis=1)
    # Keep a single row per (week, cycle, item)
    df.drop_duplicates(subset=['week_n', 'cycle_n','itemName'], keep='first', inplace = True)
    df = df.reset_index(drop=True)

    # Both frames carry a 'Sales' column, so the merge yields Sales_x (row-level)
    # and Sales_y (cycle total); only the cycle total is kept, renamed to 'sales'
    df = pd.merge(df, sales, how = 'left', on = ['week_n', 'cycle_n','itemName'])

    df = df.drop('Sales_x',axis=1)
    df = df.rename({'Sales_y':'sales'},axis=1)
    # Get average daily sales for each cycle
    df.sales = np.where((df['cycle_n']==1),df['sales']/3,df['sales']/2)
    # Remove week 9: spring break week
    df = df[~(df['week_n']==9)]
    # Get 2-week lag feature
    # NOTE(review): the lag is positional; 60 rows presumably corresponds to
    # 10 items x 3 cycles x 2 weeks and relies on the current row order — TODO confirm
    df['last_2_week_sales'] = df['sales']
    df['last_2_week_sales']=df['last_2_week_sales'].shift(60)

    df = pd.get_dummies(df, prefix='Cycle_', columns=['cycle_n'])
    # Add weekly active users feature
    au = pd.read_csv('DAUs_Spring18.csv')
    au['week_n'] = pd.Series(week_n_au(au,spring_18=1),index=au.index)
    au = au.rename(columns={'f0_':'n_users'})
    
    df_au = au.groupby('week_n').n_users.sum()
    df_au = df_au.reset_index(drop=False)
    df = pd.merge(df,df_au,how='left',on='week_n')
    # Get average sales over last three cycles, saved in 'test' dataframe. 'test' data frame is adjusted due to week9 is spring break and operation stops in that week.
    # NOTE(review): shifts are positional on the (week_n, itemName) series; 20 rows
    # presumably means 10 items x 2 weeks and 10 rows means 10 items x 1 week — TODO confirm
    df_weekly = df.groupby(by=['week_n','itemName']).sales.mean()
    test = pd.concat([df_weekly.shift(20).loc[:8],df_weekly.shift(10).loc[10:10],df_weekly.shift(20).loc[11:17]])
    test=test.reset_index()
    test=test.rename(columns={'sales':'avg_over_3_cycles'})

    df = pd.merge(df,test, on = ['week_n','itemName'],how = 'left')
    df= df.rename(columns={'class':'classes','itemName':'item_name','avg_over_3_cycles':'avg_over_last_3_cycles'})
    df=df[['week_n', 'sales', 'last_2_week_sales', 'classes', 'exam', 'sports',
           'Cycle__1', 'Cycle__2', 'Cycle__3', 'n_users', 'avg_over_last_3_cycles',
           'item_name']]

    return df

In [8]:
# Build the Spring 2018 feature table for all top-10 items from the combined CSV
df=clean_item_18spring('megadata_18spring.csv')

In [48]:
# Save the organized 18Spring data into a pickle.
# The context manager closes the file handle even if pickling fails.
with open('final_dataset_18spring.p', "wb") as pickle_file:
    pickle.dump(df, pickle_file)

In [31]:
# Build a long (Date, item_name) table of daily-deal items and save it as discount.p
discount = pd.read_csv('Daily Deals Schedule  - Daily Deal - Syracuse.csv')
# Only the first 49 rows of the schedule are used; copy so the assignments
# below write to an independent frame
discount = discount[:49].copy()
discount['Date'] = discount.Date.apply(unifDate)
#df [Date,items]
discount.index = discount.Date
# Stack the two deal columns into one Series (Series.append was removed in pandas 2.0)
df = pd.concat([discount['Item 1'], discount['Item 2']])
df = df.sort_index().dropna(axis=0)
df = df.reset_index()
df = df.rename(columns ={0:'item_name'})
# Use the item name under which this product appears in the order data
df = df.replace(to_replace='Grilled Chicken Tenders', value='Grilled Tenders & Dipping Sauce')
df.to_pickle('discount.p')

In [3]:
def clean_item_19spring(filename,enddate,pred_enddate,dau_filename,
                        premodel_path="C:/Users/Baoyp/Documents/2019 Spring/GU project/premodel/premodelDF.pkl",
                        discount_path="C:/Users/Baoyp/Documents/2019 Spring/GU project/discount.p",
                        sellout_path='C:/Users/Baoyp/Documents/2019 Spring/GU project/sellout/sellout_df_spring19.pkl',
                        calendar_filename='Academic_Calander_Spring_2019.csv'):
    """
    Build the per-cycle feature table for one top-10 item in Spring 2019,
    extended with empty rows up to the last prediction date.

    Parameters
    ----------
    filename : str
        CSV of order records for a single item (same columns as the Fall
        2018 item files).
    enddate : str
        Cutoff of the order records; used as the end label of a `.loc`
        slice on the CSV's first column.
    pred_enddate : str
        Last date ("YYYY-MM-DD") of the prediction horizon. The daily grid
        runs from 2019-01-14 to this date.
    dau_filename : str
        Daily-active-users CSV (columns formattedDate and f0_) up to the cutoff.
    premodel_path, discount_path, sellout_path : str, optional
        Pickles holding the weekday sales factors (columns name, weekday,
        factor), the daily-deal schedule (columns Date, item_name) and the
        sell-out table (columns productName, week_n, sellout, 75_percent, ...).
        Defaults are the paths the notebook always used. These are loaded
        with pickle and must come from a trusted source.
    calendar_filename : str, optional
        Academic calendar CSV with deliveryDate (m/d/y), class, exam and
        sports columns.

    Returns
    -------
    pandas.DataFrame
        Columns week_n, sales, last_2_week_sales, classes, exam, sports,
        Cycle__1, Cycle__2, Cycle__3, n_users, avg_over_last_3_cycles,
        sellout, sellout*2_week_lag, cycle_n and item_name.

    Notes
    -----
    A week is split into three cycles: Mon-Wed (1), Thu-Fri (2), Sat-Sun (3).
    """
    data = pd.read_csv(filename,dtype = str,index_col=0)
    item_name = data.itemName.iloc[0]
    # .copy() so the column assignments below write to an independent frame
    data = data[['orderDate', 'orderTime','itemQuantity', 'itemPricePerUnit', 'itemPriceTotal','mealPlanBoolean','circuitName', 'dayOfWeek','discountPercent',
            'discountPerUnit','discountTotal', 'discountType','deliveryDate']].copy()
    data['orderDate'] = data.orderDate.apply(unifDate)
    data['mealPlanBoolean'] = data.mealPlanBoolean.apply(lambda x: 1 if x=='true' else 0)
    data['itemQuantity'] = data.itemQuantity.apply(int)
    data['itemPricePerUnit'] = data.itemPricePerUnit.apply(float)
    # NOTE(review): label slice on the CSV's first column; presumably that column
    # holds sortable date strings — confirm against the input files
    data = data.loc[:enddate].copy()

    # Adjusted business day of every order (see datemod)
    data['Datemod'] = pd.Series(datemod(data),index=data.index)

    data = data[[ 'itemQuantity', 'itemPricePerUnit',  'circuitName', 'dayOfWeek', 'Datemod']].copy()

    # Group on the column directly. The original first made 'Datemod' the index
    # as well, which makes the group key ambiguous (a warning in old pandas, an
    # error in current pandas). The result is the same: an index named 'Datemod'.
    daily_sales = data.groupby(by = 'Datemod').itemQuantity.sum()

    # One row per calendar day from the semester start to the end of the prediction horizon
    new_index = pd.date_range(start='2019-01-14', end=pred_enddate, freq='D')
    new_index=new_index.astype(str)

    data = data.drop_duplicates(subset=['Datemod'], keep='first').reset_index(drop=True)
    # .values keeps the index unnamed so 'Datemod' only ever refers to the column
    data.index = data['Datemod'].values
    data = data.reindex(new_index)
    data['Datemod'] = data.index

    #Monday: 0 Sunday:6
    data['dayOfWeek'] = data.Datemod.apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d").weekday())
    data = data[data['Datemod']>='2019-01-14'].copy()

    # Add cycle_number feature
    data['cycle_n'] = np.where(((data['dayOfWeek'] == 0)|(data['dayOfWeek'] == 1)|(data['dayOfWeek'] == 2)),1,
             np.where(((data['dayOfWeek'] == 3)|(data['dayOfWeek'] == 4)),2,3))
    # Add actual 19spring week number
    data['week_n'] = pd.Series(week_n(data,spring_19=1),index=data.index)

    df_sales=pd.DataFrame(daily_sales)
    df_sales = df_sales.rename(columns={'itemQuantity':'sales'})

    data = pd.merge(data,df_sales,how='left',left_index=True,right_index=True)
    # Drop the date index so the column-based merges below are unambiguous
    data1 = data[data.columns.difference(['circuitName','itemQuantity'])].reset_index(drop=True)

    # Adjust daily sales according to discount situation
    perc = pd.read_pickle(premodel_path)
    discount = pd.read_pickle(discount_path)
    discount['discount'] = 1
    discount_item=discount[discount['item_name']==item_name][['Date','discount']]

    data1 = data1.merge(discount_item,how='left',left_on='Datemod',right_on='Date')
    del data1['Date']
    # Days without orders or without a deal get 0 in every missing column
    data1 = data1.fillna(0)
    perc_item=perc[perc['name']==item_name][['weekday','factor']]
    data1 = data1.merge(perc_item,how='left',left_on='dayOfWeek',right_on='weekday')
    del data1['weekday']
    weekly_sales=data1.groupby('week_n').sales.sum()
    data1['discount'] = data1.discount.apply(int)
    # Rows that fall on a daily-deal day for this item. The original
    # `data1.index[data1['discount']]==1` produced the same mask only because
    # the index is 0..n-1.
    is_discounted = data1['discount'] == 1
    if is_discounted.any():
        # Replace the sales of a deal day with factor * that week's total sales.
        # Positional (.values) multiplication is used for any number of deal
        # days: the two operands carry different indexes (row number vs. week
        # number), so a label-aligned product would be NaN.
        discounted_weeks = data1.loc[is_discounted, 'week_n'].values
        data1.loc[is_discounted,'sales'] = (
            data1.loc[is_discounted, 'factor'].values * weekly_sales.loc[discounted_weeks].values
        )

    # Get sales data
    sales = data1.groupby(by = ['week_n', 'cycle_n']).sales.sum()
    sales = sales.reset_index(level=['week_n', 'cycle_n'])
    df1 = pd.read_csv(calendar_filename)
    # Only the first 110 calendar rows are used
    df1 = df1.iloc[:110].copy()
    df1['deliveryDate'] = df1.deliveryDate.apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%y").strftime("%Y-%m-%d"))
    df1 = df1.rename(columns = {'deliveryDate':'Datemod','class':'classes'})
    data1 = pd.merge(data1,df1, on = 'Datemod', how='left')

    # Get exam days feature
    df_exam = data1.groupby(['week_n','cycle_n']).exam.sum()
    df_exam = pd.DataFrame(df_exam,columns=['exam'])
    df_exam.exam = np.where(df_exam.exam>0,1,0)

    # Get class days feature
    df_class = data1.groupby(['week_n','cycle_n']).classes.sum()
    df_class = pd.DataFrame(df_class,columns=['classes'])
    # Get sports days feature
    df_sport = data1.groupby(['week_n','cycle_n']).sports.sum()
    df_sport = pd.DataFrame(df_sport,columns=['sports'])
    df_sport.sports = np.where(df_sport.sports>0,1,0)

    data1 = data1[['cycle_n', 'itemPricePerUnit','week_n']]
    # Get price feature, it hasn't been used yet
    price = data.groupby(by = ['week_n', 'cycle_n']).itemPricePerUnit.min()
    price = price.reset_index(level=['week_n', 'cycle_n'])

    final = pd.merge(data1, sales, how = 'left', on = ['week_n', 'cycle_n'])
    final = final.drop_duplicates(subset=['week_n', 'cycle_n'], keep='first').reset_index(drop=True)
    final = final[['week_n','cycle_n','itemPricePerUnit','sales']]
    final = final.rename(columns = {'itemPricePerUnit':'price'})
    # Average daily sales per cycle (cycle 1 spans 3 days, the others 2)
    final['sales'] = np.where((final['cycle_n']==1),final['sales']/3,final['sales']/2)
    # Get 2-week lag feature: 6 rows back, i.e. 3 cycles per week x 2 weeks
    final['last_2_week_sales'] = final['sales'].shift(6)

    final = pd.merge(final,df_class,on = ['week_n','cycle_n'], how='left')
    final = pd.merge(final,df_exam,on = ['week_n','cycle_n'], how='left')
    final = pd.merge(final,df_sport,on = ['week_n','cycle_n'], how='left')

    # dtype=int keeps the indicator columns as 0/1 integers on every pandas
    # version (pandas >= 2.0 would otherwise produce booleans)
    main_data = pd.get_dummies(final, prefix='Cycle_', columns=['cycle_n'], dtype=int)
    # Get weekly active users feature
    au = pd.read_csv(dau_filename)
    # Skip the first 9 rows of the DAU export
    au = au.iloc[9:].reset_index(drop=True)
    au['week_n'] = pd.Series(week_n_au(au,spring_19=1),index=au.index)
    au = au.rename(columns={'f0_':'n_users'})
    df_au = au.groupby('week_n').n_users.sum()
    df_au = df_au.reset_index(drop=False)

    main_data = pd.merge(main_data,df_au,how='left',on='week_n')
    # Get average sales over last three cycles feature:
    # the weekly mean of two weeks earlier (positional shift on the weekly series)
    df_weekly = main_data.groupby(by=['week_n']).sales.mean()
    avg_over_3_cycles = df_weekly.shift(2)
    avg_over_3_cycles=avg_over_3_cycles.reset_index()
    avg_over_3_cycles=avg_over_3_cycles.rename(columns={'sales':'avg_over_last_3_cycles'})

    main_data = pd.merge(main_data,avg_over_3_cycles[['week_n','avg_over_last_3_cycles']], on = 'week_n',how = 'left')
    main_data = main_data.drop('price',axis=1)
    main_data['item_name'] = item_name
    main_data = main_data.dropna()

    # Add sellout feature
    sellout = pd.read_pickle(sellout_path)
    sellout = sellout[(sellout['week_n']!=0)&(sellout['productName']==item_name)].copy()
    # Shift the sell-out week forward by two so it joins as a two-week lag
    sellout['ref_week_n'] = sellout['week_n']+2
    del sellout['week_n']
    sellout = sellout.rename(columns={'ref_week_n':'week_n'})
    del sellout['sellout']
    sellout = sellout.rename(columns={'productName':'item_name'})
    # Rebuild the cycle number from the one-hot columns so it can act as a merge key
    main_data['cycle_n'] = np.where(main_data['Cycle__1']==1,1, np.where(main_data['Cycle__2']==1,2,3))
    # Joins on every column the two frames have in common
    # (presumably item_name, week_n and cycle_n — TODO confirm)
    main_data = pd.merge(main_data,sellout,how='left')
    main_data= main_data.rename(columns = {'75_percent':'sellout'})
    main_data['sellout*2_week_lag']=main_data['last_2_week_sales']*main_data['sellout']
    main_data = main_data[['week_n', 'sales', 'last_2_week_sales', 'classes', 'exam', 'sports',
           'Cycle__1', 'Cycle__2', 'Cycle__3', 'n_users', 'avg_over_last_3_cycles','sellout','sellout*2_week_lag','cycle_n','item_name']]

    return main_data

In [4]:
# Settings of the earlier run (order data through 2019-02-17), kept for reference:
# filename='C:/Users/Baoyp/Documents/2019 Spring/GU project/by2.17/top1.csv'
# enddate = '2019-02-17'
# dau_filename = 'DAU_for_week7.csv'
# Current run: order data through 2019-02-24, predictions through 2019-03-10.
# NOTE(review): the paths below are absolute and machine-specific.
enddate = '2019-02-24'
pred_enddate = '2019-03-10'
dau_filename = 'C:/Users/Baoyp/Documents/2019 Spring/GU project/by2.24/DAU_for_week8.csv'
filenames = [f'C:/Users/Baoyp/Documents/2019 Spring/GU project/by2.24/top{i}.csv' for i in range(1,11)]
# Clean each of the ten item files, stack them, and order the rows by week
dfs = [clean_item_19spring(filename,enddate,pred_enddate,dau_filename) for filename in filenames]
main_data = pd.concat(dfs)
main_data1 = main_data.sort_values(by=['week_n'])

Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version
Defaulting to column, but this will raise an ambiguity error in a future version


In [5]:
# Save the organized 19Spring data (through week 8) into a pickle.
# The context manager closes the file handle even if pickling fails.
with open('final_dataset_19spring_week8.p', "wb") as pickle_file:
    pickle.dump(main_data1, pickle_file)

In [6]:
# Reload the saved Spring 2019 dataset and display it as a sanity check
spring = pd.read_pickle('final_dataset_19spring_week8.p')
spring

Unnamed: 0,week_n,sales,last_2_week_sales,classes,exam,sports,Cycle__1,Cycle__2,Cycle__3,n_users,avg_over_last_3_cycles,sellout,sellout*2_week_lag,cycle_n,item_name
0,3,32.000000,21.640000,3.0,0,0,1,0,0,3931.0,24.213333,0,0.00,1,Penne alla Vodka
2,3,22.000000,5.000000,2.0,0,0,0,0,1,3931.0,7.932222,0,0.00,3,BBQ Pulled Pork Plate
1,3,11.000000,10.000000,2.0,0,0,0,1,0,3931.0,7.932222,0,0.00,2,BBQ Pulled Pork Plate
0,3,12.180000,8.796667,3.0,0,0,1,0,0,3931.0,7.932222,0,0.00,1,BBQ Pulled Pork Plate
1,3,9.000000,14.500000,2.0,0,0,0,1,0,3931.0,18.904444,0,0.00,2,Roasted Chicken Plate
2,3,21.500000,11.500000,2.0,0,0,0,0,1,3931.0,12.500000,0,0.00,3,Mexican Burrito Bowl
1,3,15.500000,11.000000,2.0,0,0,0,1,0,3931.0,12.500000,0,0.00,2,Mexican Burrito Bowl
0,3,14.666667,15.000000,3.0,0,0,1,0,0,3931.0,12.500000,0,0.00,1,Mexican Burrito Bowl
2,3,26.500000,25.500000,2.0,0,0,0,0,1,3931.0,17.666667,0,0.00,3,Mac and Cheese
1,3,5.000000,8.000000,2.0,0,0,0,1,0,3931.0,10.888889,0,0.00,2,Pad Thai with Chicken


In [53]:
# List the distinct item names present in the Spring 2019 dataset
spring.item_name.unique()

array(['Penne alla Vodka', 'BBQ Pulled Pork Plate',
       'Roasted Chicken Plate', 'Mexican Burrito Bowl', 'Mac and Cheese',
       'Pad Thai with Chicken', 'Grilled Tenders & Dipping Sauce',
       'Chicken Parm Pasta', 'Build-Your-Own Chicken Tacos',
       'Buffalo Chicken Dip'], dtype=object)