In [1]:
## Importing libraries

import pandas as pd
import numpy as np
from datetime import date
from collections import Counter
from numpy import loadtxt
from pyspark.sql import SparkSession 

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
550,application_1532233567143_0320,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
## Create Spark Session

spark = SparkSession.builder.appName("First SparkSession").getOrCreate()


In [3]:
## Extract pax and flight data from db table: spark_pax_flight_future_final

spark_future_pax_data = spark.sql("select * from spark_pax_flight_future_final_for_pyspark_test ")   
#print(type(spark_future_pax_data)): spark_future_pax_data is a dataframe


In [4]:
# Rename Columns in spark_future_pax_data

spark_future_pax_data = spark_future_pax_data.withColumnRenamed("board_point", "flight_boarding_pt")
spark_future_pax_data = spark_future_pax_data.withColumnRenamed("menu_name", "menuname")

#print(spark_future_pax_data.describe)


In [5]:
# dishsubcategory column has empty strings, replace those values with "Poultry"

from pyspark.sql.functions import *

spark_future_pax_data = spark_future_pax_data.withColumn("dishsubcategory", \
              when(spark_future_pax_data["menucardname"] == "Cajun chicken", "Poultry").otherwise(spark_future_pax_data["dishsubcategory"]))

#spark_future_pax_data.select('dishsubcategory').distinct().show()

In [6]:
## Selecting all Hot Meals from the Main Course into M1 dataframe

M1 = spark_future_pax_data.where((col('meal_service_name') == 'Hot Meal') & (col('dishcategory') == 'Main Course')) 
#M1.count()
#M1.show()

#M1.select('dishsubcategory').distinct().show()

In [7]:
# Extract Cuisine data

cuisine = spark.sql("select trim(lower(itemname)) as itemname,trim(lower(cuisine)) as cuisine from  dish_cuisine_sandra_from_banus_local_for_pyspark_test ")  
cuisine = cuisine.drop_duplicates()
cuisine = cuisine.withColumn("Cuisine", regexp_replace('cuisine', '\?', ''))
#cuisine = cuisine.withColumn("itemname", regexp_replace('itemname', '\?', ''))
#cuisine.show()

In [8]:
print("cuisine count")
cuisine.count()

cuisine count
241

In [9]:
columns = ['itemname', 'cuisine']
vals = [
    ('beef ragoût','continental / european'),
    ('boiled beef with apple and horseradish purée','german'),
    ('grilled beef fillet with béarnaise sauce','continental / european'),
    ('lamb ragoût','continental / european'),
    ('seafood with chive velouté','continental / european'),
    ('seafood á l\'armoricaine','continental / european'),
    ('seafood à l\'armoricaine','continental / european'),
]

# create DataFrame
cuisine_specialCharacters = spark.createDataFrame(vals, columns)
cuisine_wspecialCharacters = cuisine.union(cuisine_specialCharacters)

In [10]:
M1 = M1.withColumn("menucardname", lower(col("menucardname")))
M1 = M1.withColumn("menucardname", trim(M1.menucardname))

In [11]:
print("M1 before joining with cuisine")
M1.count()

M1 before joining with cuisine
49728

In [12]:
## Join M1 and Cuisine dataframes 

M1 = M1.join(cuisine_wspecialCharacters, M1.menucardname==cuisine.itemname, how='left')
#M1.select('menuname').distinct().show()


In [13]:
print("M1 after joining with cuisine")
M1.count() #-- 49728
#cuisine.count() -239

M1 after joining with cuisine
49728

In [14]:

M1 = M1.withColumn("menuname", \
                    when(M1["menuname"] == "F DXBAUS HM J Q A", "DXBAUS HM J Q A").otherwise(M1.menuname))

M1 = M1.withColumn("menuname", \
                   when(M1["menuname"] == "FEST2017 DXBEUR HMJ","DXBEUR HMJ FEST2017").otherwise(M1.menuname))

M1 = M1.withColumn("menuname", \
                   when(M1["menuname"] == "FEST2017 DXBGER HMJ","DXBGER HMJ FEST2017").otherwise(M1.menuname))

M1 = M1.withColumn("menuname", \
                   when(M1["menuname"] == "HO 2017 DXBCDG HM JB","DXBCDG HM JB").otherwise(M1.menuname))

M1 = M1.withColumn("menuname", \
                   when(M1["menuname"] == "TR DXBMEL HM J T2", "DXBMEL HM J T").otherwise(M1.menuname))

#M1.select('menuname').distinct().show()

In [15]:

M1 = M1.withColumn('menu_cycle', split(reverse(M1.menuname), ' ')[0])

#M1.select('menu_cycle').distinct().show()

In [16]:
M1.select('dishsubcategory').distinct().show()

+-------------------+
|    dishsubcategory|
+-------------------+
|            Poultry|
|           Red Meat|
|            Seafood|
|Pasta or Vegetarian|
+-------------------+

In [17]:


M1 = M1.withColumn("menu_cycle", \
                   when(M1["menu_cycle"] == "JA", "A").otherwise(M1.menu_cycle))
M1 = M1.withColumn("menu_cycle", \
                   when(M1["menu_cycle"] == "JB", "B").otherwise(M1.menu_cycle))
M1 = M1.withColumn("menu_cycle", \
                   when(M1["menu_cycle"] == "FEST17", "FEST2017").otherwise(M1.menu_cycle))

M1 = M1.withColumn('destination', substring(split(M1.menuname, ' ')[0],4,3))

#M1.select('destination','menuname').distinct().show()

In [18]:
# Pax ages to be 'bin'ed

from pyspark.sql.types import *
from pyspark.ml.feature import Bucketizer

M1 = M1.withColumn('date_of_birth',to_date(M1.date_of_birth))
M1 = M1.withColumn('today_date',current_date())
#M1 = M1.withColumn('age1',floor(datediff(M1.today_date,M1.date_of_birth)/365).cast(DoubleType()))
#M1 = M1.withColumn('age',(datediff(M1.today_date,M1.date_of_birth)/365).cast(DoubleType()))
M1 = M1.withColumn('age',(datediff(M1.today_date,M1.date_of_birth)/365.2425).cast(DoubleType()))

#M1.select('age').distinct().show()

M1=M1.withColumn('age',when(M1.age.isNull(),-1).otherwise(M1.age))

bucketizer = Bucketizer(splits=[float("-inf"),0, 12, 19, 40, 60,100, float("inf")],inputCol="age", outputCol="age_group")
M1 = bucketizer.setHandleInvalid("keep").transform(M1)




In [19]:
t = {0.0:"Unknown",
     1.0:"Children", 
     2.0:"Teenagers", 
     3.0:"Adults", 
     4.0:"Middle Aged", 
     5.0:"Elders",
     6.0:"Unknown_agegroup"}
bucket_name_ = udf(lambda x: t[x], StringType())
M1 = M1.withColumn("age_group_1", bucket_name_("age_group"))
M1.groupby([M1.age_group,M1.age_group_1]).count().show()


+---------+-----------+-----+
|age_group|age_group_1|count|
+---------+-----------+-----+
|      5.0|     Elders| 7941|
|      3.0|     Adults|12379|
|      1.0|   Children| 2307|
|      4.0|Middle Aged|21057|
|      2.0|  Teenagers| 1690|
|      0.0|    Unknown| 4354|
+---------+-----------+-----+

In [20]:
# Extract Country data
##!!!!!!!!!!!!!!!!!!!!!!!!! disable
M1=M1.withColumn('nationality',psf.when(M1.nationality == 'NA',"").otherwise(M1.nationality))
country_codes = spark.sql("select `alpha-2` as alpha_2,country_region as country_region from country_codes_cuisine")   #no nulls in columns, only empty strings

M1 = M1.join(country_codes, M1.nationality==country_codes.alpha_2, how='left')


In [21]:
print("M1 after joining with country codes")
M1.count()

M1 after joining with country codes
49728

In [22]:
# Perform flight level aggregations - dishsubcategory

#drop dups
temp1_dishsub_cuisine = M1.select('flight_number', 'flight_boarding_pt','flight_boarding_time','dishsubcategory','Cuisine').drop_duplicates()

#the only column with NaNs is Cuisine. Replace those with 'Unknown' as the one-hot code in pyspark doesn't allow usage of columns with Nan
#temp1_dishsub_cuisine.where(col("Cuisine").isNull()).count() -- 530
temp1_dishsub_cuisine=temp1_dishsub_cuisine.na.fill({'Cuisine':'Unknown Cuisine'})
 
# below function to create as many values in the list as the distinct values in 'Cuisine' and 'dishsubcategory'
cuisine_categories = temp1_dishsub_cuisine.select('Cuisine').distinct().rdd.flatMap(lambda x : x).collect()
dishsubcatgories = temp1_dishsub_cuisine.select('dishsubcategory').distinct().rdd.flatMap(lambda x : x).collect()
cuisine_categories.sort()
dishsubcatgories.sort()

# one-hot encoding 
for category in dishsubcatgories:
    function = udf(lambda item: 1 if item == category else 0, IntegerType())
    new_column_name = 'dishsubcategory'+'_'+category
    temp1_dishsub_cuisine = temp1_dishsub_cuisine.withColumn(new_column_name, function(col('dishsubcategory')))

for category in cuisine_categories:
    function = udf(lambda item: 1 if item == category else 0, IntegerType())
    new_column_name = 'Cuisine'+'_'+category
    temp1_dishsub_cuisine = temp1_dishsub_cuisine.withColumn(new_column_name, function(col('Cuisine')))


In [23]:
dishsubcatgories

['Pasta or Vegetarian', 'Poultry', 'Red Meat', 'Seafood']

In [24]:
print("temp1_dishsub_cuisine after one hot for dishsubcategory and cuisine")
temp1_dishsub_cuisine.count()

temp1_dishsub_cuisine after one hot for dishsubcategory and cuisine
678

In [25]:
# dropping 'Cuisine', 'dishsubcategory' columns
drop_list = ['Cuisine', 'dishsubcategory']

temp1_dishsub_cuisine=temp1_dishsub_cuisine.select([column for column in temp1_dishsub_cuisine.columns if column not in drop_list])
temp1_dishsub_cuisine.columns

['flight_number', 'flight_boarding_pt', 'flight_boarding_time', 'dishsubcategory_Pasta or Vegetarian', 'dishsubcategory_Poultry', 'dishsubcategory_Red Meat', 'dishsubcategory_Seafood', 'Cuisine_Unknown Cuisine', 'Cuisine_asian', 'Cuisine_asian / japanese', 'Cuisine_continental / european', 'Cuisine_indian', 'Cuisine_middle eastern', 'Cuisine_middle eastern / gulf']

In [26]:
# Perform Flight level aggregations - demographics

# columns to be grouped
group_by_cols = ['flight_number','flight_boarding_pt','flight_boarding_time','destination','menu_cycle','service_category_code']

# columns to be pivoted
pivot_cols = ['age_group_1','gender','country_region']

# create a static/constant col to perform sum
M1 = M1.withColumn("values", lit(1))

# drop dups and store in temp1 dataframe
temp1 = M1[['flight_number','flight_boarding_pt','flight_boarding_time','pax_id','menu_cycle','destination','service_category_code',
            'age_group_1','gender', 'country_region', 'values']].drop_duplicates()

#temp1.count() -- 16223


In [27]:
# pivot on 'age_group_1' column
temp1_demographics  = temp1.groupby(group_by_cols).pivot(pivot_cols[0]).agg(sum(temp1.values))

#pivot on 'gender' column
tempdf2 = temp1.groupby(group_by_cols).pivot(pivot_cols[1]).agg(sum(temp1.values))

#pivot on 'country_region' column , need to drop NaNs or replace them, else group by doesn't work
temp1 = temp1.fillna({'country_region':'Unknown country region'})

tempdf3 = temp1.groupby(group_by_cols).pivot('country_region').agg(sum(temp1.values))

In [28]:
# Merge all 3 pivot tables

tempdf2 = tempdf2.join(tempdf3, on=group_by_cols, how='inner')

temp1_demographics=temp1_demographics.join(tempdf2, on = group_by_cols, how='inner')

#temp1_demographics.show()

In [29]:
print("temp1_demographics after one hot for dishsubcategory and cuisine")
temp1_demographics.count()

temp1_demographics after one hot for dishsubcategory and cuisine
224

In [30]:
# merge demographics and dishsubcuisine dfs

temp1 = temp1_demographics.join(temp1_dishsub_cuisine,  on = ['flight_number', 'flight_boarding_pt', 
                               'flight_boarding_time'] , how='inner')


In [31]:
print("temp1 after merging temp1_demographics and temp_dishsubcuisine")
temp1.count()

temp1 after merging temp1_demographics and temp_dishsubcuisine
678

In [32]:
# flight_boarding_time is a string, convert to timestamp and extract date components

temp1 = temp1.withColumn('flight_boarding_time_conv',
                         unix_timestamp(temp1['flight_boarding_time'],'yyyyMMddHHmm').cast("timestamp"))

temp1 = temp1.withColumn("year",year("flight_boarding_time_conv"))

temp1 = temp1.withColumn("month",month("flight_boarding_time_conv"))
                         
temp1 = temp1.withColumn("quarter",quarter("flight_boarding_time_conv"))
                         
temp1 = temp1.withColumn("week",weekofyear("flight_boarding_time_conv"))

temp1 = temp1.withColumn("day",dayofmonth("flight_boarding_time_conv"))

temp1 = temp1.withColumn('dayofweek',(date_format(temp1.flight_boarding_time_conv, 'u')-1))


In [33]:
#date_format(temp1.flight_boarding_time, 'u') 

#temp1.select(dayofweek('day').alias('dayweek')).collect()
#df.select(dayofweek('dt').alias('day')).collect()
#temp1.select(date_trunc('year', temp1.flight_boarding_time).alias('year')).collect()
#temp1.select(date_trunc('day', temp1.flight_boarding_time).alias('dow')).collect()

#temp1.select(dayofmonth('flight_boarding_time').alias('day')).collect()

In [34]:
#temp1.select('flight_boarding_time','year','month','quarter','week','day').show()

In [35]:
# Modify column names
 
new_list1= list(map(lambda x: x.replace("Cuisine_", ""), temp1.columns))

temp1 = temp1.toDF(*new_list1)


In [36]:
# Modify column names

new_list2= list(map(lambda x: x.replace("dishsubcategory_", ""), temp1.columns))

temp1 = temp1.toDF(*new_list2)

#temp1.columns

In [37]:
# Rename certain Columns

temp1 = temp1.withColumnRenamed("Poultry", "Meal_Poultry")
temp1 = temp1.withColumnRenamed("Red Meat", "Meal_Red Meat")  
temp1 = temp1.withColumnRenamed("Seafood", "Meal_Seafood")
temp1 = temp1.withColumnRenamed("Pasta or Vegetarian", "Meal_Pasta or Vegetarian")

temp1.columns

['flight_number', 'flight_boarding_pt', 'flight_boarding_time', 'destination', 'menu_cycle', 'service_category_code', 'Adults', 'Children', 'Elders', 'Middle Aged', 'Teenagers', 'Unknown', 'C', 'F', 'M', '', 'Africa', 'Americas', 'Asia', 'China', 'Europe', 'Germany', 'Gulf', 'India', 'Indonesia', 'Iran', 'Italy', 'Japan', 'Mauritus', 'Northern Africa', 'Oceania', 'Thailand', 'Unknown country region', 'Meal_Pasta or Vegetarian', 'Meal_Poultry', 'Meal_Red Meat', 'Meal_Seafood', 'Unknown Cuisine', 'asian', 'asian / japanese', 'continental / european', 'indian', 'middle eastern', 'middle eastern / gulf', 'flight_boarding_time_conv', 'year', 'month', 'quarter', 'week', 'day', 'dayofweek']

In [38]:
temp1.count()

678

In [39]:
temp1.select('flight_number','flight_boarding_pt', 'flight_boarding_time','Meal_Poultry','Meal_Red Meat','Meal_Seafood','Meal_Pasta or Vegetarian').show(9)

+-------------+------------------+--------------------+------------+-------------+------------+------------------------+
|flight_number|flight_boarding_pt|flight_boarding_time|Meal_Poultry|Meal_Red Meat|Meal_Seafood|Meal_Pasta or Vegetarian|
+-------------+------------------+--------------------+------------+-------------+------------+------------------------+
|         0073|               DXB|        201806260820|           0|            0|           1|                       0|
|         0073|               DXB|        201806260820|           1|            0|           0|                       0|
|         0073|               DXB|        201806260820|           0|            1|           0|                       0|
|         0031|               DXB|        201806241125|           1|            0|           0|                       0|
|         0031|               DXB|        201806241125|           0|            0|           1|                       0|
|         0031|               DX

In [40]:
#temp1.where(col("Cuisine").isNull()).count() -- 530

In [41]:
list_melt = temp1.columns
list_melt = [e for e in list_melt if e not in ('Meal_Poultry', 'Meal_Red Meat', 'Meal_Seafood','Meal_Pasta or Vegetarian')]
#list_melt

In [42]:
melt_cols= temp1.columns
melt_cols = [e for e in melt_cols if e in ('Meal_Poultry', 'Meal_Red Meat', 'Meal_Seafood','Meal_Pasta or Vegetarian')]
melt_cols

['Meal_Pasta or Vegetarian', 'Meal_Poultry', 'Meal_Red Meat', 'Meal_Seafood']

In [43]:
from pyspark.sql import DataFrame
from typing import Iterable

def melt_df(
        df: DataFrame,
        id_vars: Iterable[str],# columns to remain as is
        value_vars: Iterable[str], # columns to convert to rows
        var_name: str="variable", value_name: str="value") -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format."""

    # Create array<struct<variable: str, value: ...>>
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))

    # Add to the DataFrame and explode
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))

    cols = id_vars + [
            col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]
    return _tmp.select(*cols)




In [44]:
df1 = melt_df(temp1, list_melt, melt_cols, 'variable', 'Meal')

In [45]:
print("df1 after melting temp1")
df1.count()

df1 after melting temp1
2712

In [46]:
df1.select('flight_number','variable','Meal').show(25)

+-------------+--------------------+----+
|flight_number|            variable|Meal|
+-------------+--------------------+----+
|         0073|Meal_Pasta or Veg...|   0|
|         0073|        Meal_Poultry|   0|
|         0073|       Meal_Red Meat|   0|
|         0073|        Meal_Seafood|   1|
|         0073|Meal_Pasta or Veg...|   0|
|         0073|        Meal_Poultry|   1|
|         0073|       Meal_Red Meat|   0|
|         0073|        Meal_Seafood|   0|
|         0073|Meal_Pasta or Veg...|   0|
|         0073|        Meal_Poultry|   0|
|         0073|       Meal_Red Meat|   1|
|         0073|        Meal_Seafood|   0|
|         0031|Meal_Pasta or Veg...|   0|
|         0031|        Meal_Poultry|   0|
|         0031|       Meal_Red Meat|   0|
|         0031|        Meal_Seafood|   1|
|         0031|Meal_Pasta or Veg...|   0|
|         0031|        Meal_Poultry|   0|
|         0031|       Meal_Red Meat|   1|
|         0031|        Meal_Seafood|   0|
|         0031|Meal_Pasta or Veg..

In [47]:
split_col = split(df1['variable'], '_')
df1 = df1.withColumn('tmp', split_col.getItem(0))
df1 = df1.withColumn('cat', split_col.getItem(1))

In [48]:
drop_list = ['tmp', 'variable']

df1=df1.select([column for column in df1.columns if column not in drop_list])

df1=df1.orderBy('flight_number', 'flight_boarding_pt', 'flight_boarding_time')

df1 = df1[df1.Meal == 1]

In [49]:
#M1.select('flight_number','flight_boarding_pt', 'flight_boarding_time','pax_id').show(5)

In [50]:
pax_count = M1[['flight_number','flight_boarding_pt', 'flight_boarding_time','pax_id']].groupby(['flight_number', 'flight_boarding_pt', 
                           'flight_boarding_time']).agg(countDistinct('pax_id').alias("pax_count"))


#pax_count.count() -- 224

In [51]:
print("pax_count")
pax_count.count()

pax_count
224

In [52]:

df1 = df1.join(pax_count,  on = ['flight_number', 'flight_boarding_pt', 
                               'flight_boarding_time'],how='left')


df1 = df1.withColumnRenamed("cat", "dishsubcategory")
df1 = df1.withColumnRenamed("service_category_code", "itemcategory")  
 

In [53]:
print("df1 count after joining df1 with pax_count")
df1.count()

df1 count after joining df1 with pax_count
678

In [54]:
df1 = df1.withColumn("itemcategory", \
                   when(df1["itemcategory"] == "L", "Lunch").otherwise(df1["itemcategory"]))

In [55]:
df1 = df1.withColumn("itemcategory", \
                   when(df1["itemcategory"] == "D", "Dinner").otherwise(df1["itemcategory"]))

In [56]:
df1 = df1.withColumnRenamed("", "Unknown column")
#df1.select('Unknown column').distinct().show()

#M1.select('dishsubcategory').distinct().show()

In [57]:
df1 = df1.withColumn('flight_boarding_time',unix_timestamp(temp1['flight_boarding_time'],'yyyyMMddHHmm').cast("timestamp"))


In [58]:
drop_list=['flight_boarding_time_conv','Unknown','Unknown column','Unknown country region','Unknown Cuisine']
df1=df1.select([column for column in df1.columns if column not in drop_list])
#df1.columns

In [59]:
df1.createOrReplaceTempView("mytempTable") 

In [60]:
sqlContext.sql("create table ek_meals_ops_future_test_pyspark_test as select * from mytempTable");

'`default`.`ek_meals_ops_future_test_pyspark_test` already exists.;'
Traceback (most recent call last):
  File "/usr/hdp/current/spark2-client/python/pyspark/sql/context.py", line 384, in sql
    return self.sparkSession.sql(sqlQuery)
  File "/usr/hdp/current/spark2-client/python/pyspark/sql/session.py", line 545, in sql
    return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
  File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/hdp/current/spark2-client/python/pyspark/sql/utils.py", line 69, in deco
    raise AnalysisException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.AnalysisException: '`default`.`ek_meals_ops_future_test_pyspark_test` already exists.;'



In [61]:
#df1.select('pax_count').show()

In [62]:
df1.count()

678

SELECT MY.FLIGHT_NUMBER, MY.FLIGHT_BOARDING_PT,MY.FLIGHT_BOARDING_TIME,MY.ITEMCATEGORY,MY.DISHSUBCATEGORY,MY.MENU_CYCLE,MY.DESTINATION,
my.`flight_number` - p.`flight_number` as `flight_number_diff`,
my.`flight_boarding_pt` - p.`flight_boarding_pt` as `flight_boarding_pt_diff`,
my.`flight_boarding_time` - p.`flight_boarding_time` as `flight_boarding_time_diff`,
my.`destination` - p.`destination` as `destination_diff`,
my.`menu_cycle` - p.`menu_cycle` as `menu_cycle_diff`,
my.`itemcategory` - p.`itemcategory` as `itemcategory_diff`,
my.`adults` - p.`adults` as `adults_diff`,
my.`children` - p.`children` as `children_diff`,
my.`elders` - p.`elders` as `elders_diff`,
my.`middle aged` - p.`middle aged` as `middle aged_diff`,
my.`teenagers` - p.`teenagers` as `teenagers_diff`,
my.`c` - p.`c` as `c_diff`,
my.`f` - p.`f` as `f_diff`,
my.`m` - p.`m` as `m_diff`,
my.`africa` - p.`africa` as `africa_diff`,
my.`americas` - p.`americas` as `americas_diff`,
my.`asia` - p.`asia` as `asia_diff`,
my.`china` - p.`china` as `china_diff`,
my.`europe` - p.`europe` as `europe_diff`,
my.`germany` - p.`germany` as `germany_diff`,
my.`gulf` - p.`gulf` as `gulf_diff`,
my.`india` - p.`india` as `india_diff`,
my.`indonesia` - p.`indonesia` as `indonesia_diff`,
my.`iran` - p.`iran` as `iran_diff`,
my.`italy` - p.`italy` as `italy_diff`,
my.`japan` - p.`japan` as `japan_diff`,
my.`mauritus` - p.`mauritus` as `mauritus_diff`,
my.`northern africa` - p.`northern africa` as `northern africa_diff`,
my.`oceania` - p.`oceania` as `oceania_diff`,
my.`thailand` - p.`thailand` as `thailand_diff`,
my.`asian` - p.`asian` as `asian_diff`,
my.`asian / japanese` - p.`asian / japanese` as `asian / japanese_diff`,
my.`continental / european` - p.`continental / european` as `continental / european_diff`,
my.`indian` - p.`indian` as `indian_diff`,
my.`middle eastern` - p.`middle eastern` as `middle eastern_diff`,
my.`middle eastern / gulf` - p.`middle eastern / gulf` as `middle eastern / gulf_diff`,
my.`year` - p.`year` as `year_diff`,
my.`month` - p.`month` as `month_diff`,
my.`quarter` - p.`quarter` as `quarter_diff`,
my.`week` - p.`week` as `week_diff`,
my.`day` - p.`day` as `day_diff`,
my.`dayofweek` - p.`dayofweek` as `dayofweek_diff`,
my.`meal` - p.`meal` as `meal_diff`,
my.`dishsubcategory` - p.`dishsubcategory` as `dishsubcategory_diff`,
my.`pax_count` - p.`pax_count` as `pax_count_diff`
 FROM ek_meals_ops_future_test_pyspark_test MY, ek_meals_ops_future_test_pyspark_test_prudvi P 
 WHERE (MY.FLIGHT_NUMBER=P.FLIGHT_NUMBER 
		AND MY.FLIGHT_BOARDING_TIME=P.FLIGHT_BOARDING_TIME 
		AND MY.FLIGHT_BOARDING_PT=P.FLIGHT_BOARDING_PT 
		AND MY.ITEMCATEGORY=P.ITEMCATEGORY  
		AND MY.DISHSUBCATEGORY=P.DISHSUBCATEGORY  
		AND MY.MENU_CYCLE=P.MENU_CYCLE  
		AND MY.DESTINATION=P.DESTINATION )