In [1]:
import pandas as pd
import numpy as np
from datetime import date
from datetime import datetime
from collections import Counter
from numpy import loadtxt

In [2]:

def _map_to_pandas(rdds):
    """
    Turn the rows of one partition into a single-element list holding a `pandas.DataFrame`.

    Needs to stay a module-level function due to pickling issues.
    :param rdds:    iterator over the rows of one partition
    :return:        list containing exactly one pandas.DataFrame
    """
    return [pd.DataFrame(list(rdds))]


def toPandas(df, n_partitions=None):
    """
    Returns the contents of `df` as a local `pandas.DataFrame` in a speedy fashion. The DataFrame is
    repartitioned if `n_partitions` is passed.

    The result carries a fresh 0..n-1 index (per-partition indices are discarded), and a `df`
    without rows yields an empty frame that still has the columns of `df`.
    :param df:              pyspark.sql.DataFrame
    :param n_partitions:    int or None
    :return:                pandas.DataFrame
    """
    if n_partitions is not None:
        df = df.repartition(n_partitions)
    partition_frames = df.rdd.mapPartitions(_map_to_pandas).collect()
    # Partitions without rows produce column-less frames; they add nothing to the result.
    partition_frames = [frame for frame in partition_frames if len(frame) > 0]
    if not partition_frames:
        # Nothing to concatenate: assigning column names to a column-less frame would raise.
        return pd.DataFrame(columns=df.columns)
    # ignore_index avoids duplicate index labels (every partition frame starts at 0).
    df_pand = pd.concat(partition_frames, ignore_index=True)
    df_pand.columns = df.columns
    return df_pand

In [3]:
# The "futuredata" widget carries the name of the global temp view to read
# (defaults to "allFutureFlights").
dbutils.widgets.text("futuredata", "allFutureFlights", "Future data")

futuredate = dbutils.widgets.get("futuredata")
print(futuredate)
global_temp_db = spark.conf.get("spark.sql.globalTempDatabase")

# Extract pax and flight data from the view named by the widget.
# The view is resolved through the catalog instead of formatting the widget value into a
# SQL statement, so the value can only ever be interpreted as a table identifier.
future_pax_data = spark.table("{0}.{1}".format(global_temp_db, futuredate))
# future_pax_data is a Spark DataFrame; spark_future_pax_data is its local pandas copy.
spark_future_pax_data = toPandas(future_pax_data, 5)

In [4]:

# Align the incoming column names with the names used by the rest of the notebook.
spark_future_pax_data = spark_future_pax_data.rename(
    columns={'board_point': 'flight_boarding_pt', 'menu_name': 'menuname'})

# Parse the boarding time (format %Y%m%d%H%M) in one vectorised call. Unlike a row-wise
# datetime.strptime, this maps missing values to NaT instead of raising a TypeError;
# malformed values still raise.
spark_future_pax_data['flight_boarding_time'] = pd.to_datetime(
    spark_future_pax_data['flight_boarding_time'], format='%Y%m%d%H%M')

In [5]:
# Manual correction: 'Cajun chicken' is filed under the 'Poultry' sub-category.
# .loc writes into the frame itself; the chained form `df.col[mask] = ...` may write to a
# temporary copy and is a silent no-op under pandas copy-on-write.
spark_future_pax_data.loc[
    spark_future_pax_data['menucardname'] == 'Cajun chicken', 'dishsubcategory'] = 'Poultry'

# Keep only the main courses of the 'Hot Meal' service.
# (An earlier, now disabled, variant of this filter also included the 'Hot Breakfast'
# service of flight 306.)
# .copy() makes M1 an independent frame so later column assignments do not target a slice.
M1 = spark_future_pax_data.loc[
    (spark_future_pax_data['meal_service_name'] == 'Hot Meal')
    & (spark_future_pax_data['dishcategory'] == 'Main Course')
].copy()


In [6]:
# Dish -> cuisine lookup table, read from a CSV file with a header row.
cuisine1 = sqlContext.read.format('csv').options(header='true', inferSchema='true', dateFormat='yyyyMMddHHmm').load('/FileStore/tables/Dish_Cuisine_Sandra.csv')
cuisine = toPandas(cuisine1, 5)

# Remove literal question marks from the cuisine label, then lower-case and trim it.
# The pattern is a raw string: '\?' in a normal string literal is an invalid escape sequence.
cuisine['Cuisine'] = cuisine['Cuisine'].replace(r'\?', '', regex=True).str.lower().str.strip()

# Lower-case and trim the dish name; it is the join key against the menu card names.
cuisine['itemname'] = cuisine['itemname'].str.lower().str.strip()

In [7]:
# Normalise the dish names so they match the lower-cased, trimmed `itemname` key of `cuisine`.
# `assign` returns a new frame, so this never writes into a slice of another frame.
M1 = M1.assign(menucardname=M1['menucardname'].str.lower().str.strip())

M1 = pd.merge(M1, cuisine, left_on='menucardname', right_on='itemname', how='left')

# Menu cycle, Destination
# Rewrite the irregular menu names so that the first token holds the route and the last
# token holds the cycle. Whole-value replacement is used instead of chained assignment
# (`M1[col][mask] = ...`), which may modify a temporary copy.
M1['menuname'] = M1['menuname'].replace({
    'F DXBAUS HM J Q A': 'DXBAUS HM J Q A',
    'FEST2017 DXBEUR HMJ': 'DXBEUR HMJ FEST2017',
    'FEST2017 DXBGER HMJ': 'DXBGER HMJ FEST2017',
    'HO 2017 DXBCDG HM JB': 'DXBCDG HM JB',
    'TR DXBMEL HM J T2': 'DXBMEL HM J T',
})

# Menu cycle = last whitespace-separated token of the menu name, with a few aliases unified.
M1['menu_cycle'] = (
    M1['menuname'].str.split().str[-1]
    .replace({'JA': 'A', 'JB': 'B', 'FEST17': 'FEST2017'})
)

# Destination = first token of the menu name without its first three characters
# (e.g. 'DXBAUS' -> 'AUS').
M1['destination'] = M1['menuname'].str.split().str[0].str[3:]

# Age group
M1['date_of_birth'] = pd.to_datetime(M1['date_of_birth'])

# Age is measured against the day the notebook runs.
M1['today_date'] = date.today()
M1['today_date'] = pd.to_datetime(M1['today_date'])

# Age in years, using a year of 365.2425 days. Dividing by np.timedelta64(1, 'Y') is
# rejected by pandas >= 2.0, which no longer accepts the ambiguous 'Y' unit.
M1['age'] = (M1['today_date'] - M1['date_of_birth']) / pd.Timedelta(days=365.2425)

# Right-closed age brackets: (0, 12], (12, 19], (19, 40], (40, 60], (60, 100].
bins = [0, 12, 19, 40, 60, 100]
M1['age_groups'] = pd.cut(M1['age'], bins)

# Integer code of the bracket; -1 when the age is missing or outside every bracket.
M1['age_group'] = M1['age_groups'].cat.codes

# Readable bracket label; codes without a label (-1) become NaN.
M1['age_group_1'] = M1['age_group'].map({
    0: 'Children',
    1: 'Teenagers',
    2: 'Adults',
    3: 'Middle Aged',
    4: 'Elders',
})

In [8]:
# Read the country-code lookup CSV (with header row) and expose it to Spark SQL.
country_codes = (
    sqlContext.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .option('dateFormat', 'yyyyMMddHHmm')
    .load('/FileStore/tables/country_codes_cuisine.csv')
)
country_codes.createOrReplaceTempView("country_codes")

# Keep only the two-letter code (renamed, since `alpha-2` is awkward as an identifier)
# and the region.
# Original author's note: no nulls in columns, only empty strings; "249"
# (presumably the row count -- not verifiable from this notebook).
country_codes = spark.sql("select `alpha-2` as alpha_2,country_region as country_region from country_codes")

# Local pandas copy used for the join with the passenger data.
country_regions = toPandas(country_codes, n_partitions=5)

In [9]:
# Look up the region for each row's nationality; rows without a matching code are kept
# (left join) and get NaN in the added columns.
M1 = M1.merge(country_regions, how='left', left_on='nationality', right_on='alpha_2')

# Constant column that the pivot tables further down sum up.
M1['values'] = 1

In [10]:
# One row per distinct (flight, dish sub-category, cuisine) combination, with the
# sub-category and the cuisine expanded into indicator columns.
temp1_dishsub_cuisine = pd.get_dummies(
    M1[['flight_number', 'flight_boarding_pt', 'flight_boarding_time',
        'dishsubcategory', 'Cuisine']].drop_duplicates(),
    columns=['dishsubcategory', 'Cuisine'])

# demographics - flight level aggregations

# Keys that identify one row of the flight-level demographics table.
pl = ['flight_number', 'flight_boarding_pt', 'flight_boarding_time', 'destination',
      'menu_cycle', 'service_category_code']

# Further demographic attributes that get pivoted into columns in the following cell.
columns_list = ['gender', 'country_region']

# Age-group columns per flight: the constant `values` column is summed over the
# de-duplicated passenger rows. The aggregation is named as a string because passing the
# callable np.sum is deprecated in recent pandas.
temp1_demographics = pd.pivot_table(
    M1[['flight_number', 'flight_boarding_pt', 'flight_boarding_time', 'pax_id',
        'menu_cycle', 'destination', 'service_category_code',
        'age_group_1', 'gender', 'country_region', 'values']].drop_duplicates(),
    index=pl,
    columns='age_group_1',
    values='values',
    aggfunc='sum').reset_index()

In [11]:
# De-duplicated passenger rows; identical for every pivot, so they are built once.
pax_demographics = M1[['flight_number', 'flight_boarding_pt', 'flight_boarding_time',
                       'pax_id', 'menu_cycle', 'destination', 'service_category_code',
                       'age_group_1', 'gender', 'country_region', 'values']].drop_duplicates()

for demographic_col in columns_list:
    # Per-flight sums of `values`, one column per distinct value of the attribute.
    temp1_demo = pd.pivot_table(pax_demographics,
                                index=pl,
                                columns=demographic_col,
                                values='values',
                                aggfunc='sum').reset_index()
    # Join on the flight keys instead of gluing the tables together side by side: a flight
    # whose passengers all lack the pivoted attribute is absent from that pivot, which
    # would shift every following row of a positional concat onto the wrong flight.
    temp1_demographics = pd.merge(temp1_demographics, temp1_demo, on=pl, how='outer')

# Attach the dish sub-category / cuisine indicators of each flight (inner join).
temp1 = pd.merge(temp1_demographics, temp1_dishsub_cuisine,
                 on=['flight_number', 'flight_boarding_pt', 'flight_boarding_time'])


In [12]:
# extract basic features from date
boarding_time = temp1['flight_boarding_time'].dt
temp1['year'] = boarding_time.year
temp1['month'] = boarding_time.month
temp1['quarter'] = boarding_time.quarter
if hasattr(boarding_time, 'isocalendar'):
    # `.dt.week` was removed in pandas 2.0; isocalendar().week is its replacement.
    # It returns a nullable integer dtype, so cast back to a plain numpy dtype
    # (float when missing values are present, int otherwise).
    iso_week = boarding_time.isocalendar().week
    temp1['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
else:
    # Older pandas without isocalendar(): keep the original accessor.
    temp1['week'] = boarding_time.week
temp1['day'] = boarding_time.day
temp1['dayofweek'] = boarding_time.dayofweek

# Strip the prefixes that get_dummies put in front of the indicator columns.
temp1.columns = [w.replace('dishsubcategory_', '').replace('Cuisine_', '') for w in temp1.columns]

# Mark the dish sub-category indicators with a 'Meal_' prefix so they can be told apart
# from the other columns when the table is reshaped from wide to long.
# NOTE(review): a column named 'Others' would not get the prefix here, although
# 'Meal_Others' is listed in the melt step -- confirm this is intended.
temp1 = temp1.rename(columns={
    'Poultry': 'Meal_Poultry',
    'Red Meat': 'Meal_Red Meat',
    'Seafood': 'Meal_Seafood',
    'Pasta or Vegetarian': 'Meal_Pasta or Vegetarian',
})

In [13]:
# wide to long: every column that is not one of the Meal_* indicators stays an identifier.
list_melt = [
    column
    for column in temp1.columns.tolist()
    if column not in ('Meal_Others', 'Meal_Poultry', 'Meal_Red Meat',
                      'Meal_Seafood', 'Meal_Pasta or Vegetarian')
]

# The remaining columns are stacked: their name goes to `variable`, their value to `Meal`.
df1 = temp1.melt(id_vars=list_melt, value_name='Meal')

In [14]:


# Split the melted column name at '_' into its prefix (`tmp`) and the category (`cat`).
df1[['tmp', 'cat']] = df1['variable'].str.split('_', expand=True)

# The original name and the prefix are no longer needed; order the rows by flight.
df1 = (
    df1.drop(columns=['variable', 'tmp'])
       .sort_values(by=['flight_number', 'flight_boarding_pt', 'flight_boarding_time'])
)

# Keep only the rows whose indicator is set.
df1 = df1.loc[df1['Meal'] == 1]

# Number of distinct passengers per flight.
pax_count = (
    M1.groupby(['flight_number', 'flight_boarding_pt', 'flight_boarding_time'])['pax_id']
      .nunique()
      .reset_index(name='pax_count')
)

# Attach the passenger count to every row of its flight.
df1 = df1.merge(pax_count, on=['flight_number', 'flight_boarding_pt', 'flight_boarding_time'])

df1 = df1.rename(columns={'cat': 'dishsubcategory', 'service_category_code': 'itemcategory'})

# Spell out the single-letter service codes; other values are left unchanged.
df1['itemcategory'] = df1['itemcategory'].replace({'L': 'Lunch', 'D': 'Dinner'})

In [15]:
# View name = fixed prefix + current Unix time in whole seconds.
tableName = "meals_future_{0}".format(int(datetime.now().timestamp()))

# Convert the pandas result to a Spark DataFrame and register it as a global temp view.
spark_df = sqlContext.createDataFrame(df1)
spark_df.createOrReplaceGlobalTempView(tableName)


In [16]:
# Exit the notebook, passing the name of the global temp view as the exit value.
dbutils.notebook.exit(tableName)

In [17]:
# Preview the first rows only instead of rendering the whole frame.
# NOTE(review): this cell comes after dbutils.notebook.exit, so presumably it is only
# reached when run by hand -- confirm it is still needed.
df1.head()