# Load the data

In [1]:
import pandas as pd
import numpy as np

# Column names for the events log; read_csv assigns them positionally.
features = [
    "ts",
    "event_type",
    "user_id",
    "date_from",
    "date_to",
    "origin",
    "destination",
    "num_adults",
    "num_children",
]

# The first line of the file is skipped (presumably its own header row) and
# the names above are used instead.
df = pd.read_csv("../task/events_1_1.csv", names=features, skiprows=1)

# Cleanup

In [2]:
#
# Only a small number of rows are affected, so replacing the value with 1
# (e.g. `replace(0, 1)`) would be safe. However, rows with 0 adults turned out
# to be corrupted in general (other fields are missing as well), so it is
# better to remove them. After that Facets shows 0% missing values.
#

# Collect the index labels of the zero-adult rows, then drop them in place.
zero_adult_rows = df.index[(df['num_adults'] == 0).values]
df.drop(index=zero_adult_rows, inplace=True)

# Deriving features

In [3]:
#
# Now let's calculate distances (in miles), although a great-circle distance
# is not a measure of an actual flight itinerary.
#

def haversine(lon1, lat1, lon2, lat2, radius=3956):
    """
    Calculate the great circle distance between two points.

    Coordinates are given in decimal degrees. Scalars and array-likes
    (NumPy arrays, pandas Series) are both accepted; array-likes are
    processed element-wise.

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point, in degrees.
    lon2, lat2 : longitude and latitude of the second point, in degrees.
    radius : radius of the sphere. The result is expressed in the same unit.
        Defaults to 3956 (miles); pass 6367 for kilometres.

    Returns
    -------
    The distance along the great circle, with the same shape/type as the
    inputs after NumPy arithmetic (a NumPy float for scalar inputs).
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2

    # Floating-point rounding can push `a` marginally above 1 for (nearly)
    # antipodal points, which would turn arcsin(sqrt(a)) into NaN.
    a = np.minimum(a, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))

    return radius * c

# Read airport coordinates and keep only the first row of every duplicated code
df_iata = pd.read_csv(
    "../task/iata_1_1.csv",
    skiprows=1,
    names=['iata_code', 'lat', 'lon'],
    index_col='iata_code',
)
df_iata = df_iata[~df_iata.index.duplicated(keep='first')]

# Fail early and name every airport code that has no coordinates, instead of
# failing somewhere inside the lookup below.
route_codes = pd.concat([df['origin'], df['destination']])
unknown_codes = route_codes[~route_codes.isin(df_iata.index)].unique()
if len(unknown_codes) > 0:
    raise KeyError(
        "airport codes without coordinates in iata_1_1.csv: {}".format(
            sorted(str(code) for code in unknown_codes)
        )
    )

# Map origin / destination codes to coordinates. `Series.map` looks each code
# up in the (deduplicated) index of df_iata, one column at a time.
for endpoint in ('origin', 'destination'):
    for coord in ('lon', 'lat'):
        df[endpoint + '_' + coord] = df[endpoint].map(df_iata[coord])

# Calculate the distance; the cast to int32 truncates to whole miles
df['distance'] = haversine(
    df['origin_lon'], df['origin_lat'],
    df['destination_lon'], df['destination_lat'],
).astype('int32')

In [4]:
#
# Travel dates encode several meaningful metrics, such as:
#    duration, advance purchase period, seasonality,
#    weekend usage, search moment (morning, afternoon, evening, night),
#    query frequency.
#
# Let's derive some of them.
#

# Convert date strings to datetime.
# `infer_datetime_format=True` is intentionally not passed: the argument is
# deprecated since pandas 2.0 and only triggers a warning there.
for date_col in ('ts', 'date_from', 'date_to'):
    df[date_col] = pd.to_datetime(df[date_col])

# Event timestamp with the time of day removed (midnight of the same date)
df['ts_date'] = pd.to_datetime(df['ts'].dt.date)

# Days of week for From and To dates (Monday=0, Sunday=6), kept as strings
df['date_from_dow'] = df['date_from'].dt.dayofweek.astype(str)
df['date_to_dow'] = df['date_to'].dt.dayofweek.astype(str)

# Convert timedeltas into integer days:
#   adv_purchase - days between the event date and the departure date
#   duration     - days between date_from and date_to
one_day = np.timedelta64(1, 'D')
df['adv_purchase'] = ((df['date_from'] - df['ts_date']) / one_day).astype(int)
df['duration'] = ((df['date_to'] - df['date_from']) / one_day).astype(int)

### Visual appearance of the derived features

![Derived features](files/transform_derived.png "Derived features")

# From events log to sessions

Training on a pure transactional log seems too narrow. Some sort of sequence-aware algorithm (recurrent, attention) could be useful to include timeline context in the training, but it is always better to start **simple**. Still, some aggregations over the transactional log can help extend the context. Switching to search sessions will help with that.

Search sessions are built using the following rules (as in the previous notebook); a new session starts when:

 * time diff since the last query is > 120 min *(120 min is both enough and conforms with the dataset)*
 * booking event happens *(after booking we start a new session)*

After session_ids are assigned, we can compute useful aggregations and include them in training. It is worth mentioning that this doesn't break inference in production, where you track sessions and refine them as users interact with the system.

For categorical features, aggregations are inconvenient without special treatment. Multi-hot encoding or embeddings could help resolve this, but here I simplify the case and take the last origin/destination of a session (while also counting the distinct origins/destinations seen within the session).

In [5]:
# Order the events chronologically within each user
df = df.sort_values(by=['user_id', 'ts'])

# Boolean masks along the sorted dataset; True marks a row that opens a new session
time_diff = df['ts'].diff()
diff_user = df['user_id'].ne(df['user_id'].shift())
gt_120min = time_diff.gt(pd.Timedelta(minutes=120))
diff_event = df['event_type'].shift().eq('book')

# Every True opens a new session, so the cumulative sum of the combined
# mask gives a running session number
session_start = diff_user | gt_120min | diff_event
df['session_id'] = session_start.cumsum()

In [7]:
# Record the time step (in whole minutes) between consecutive events of a
# session; the first event of a session has no predecessor and gets 0.
df['time_diff_min'] = (
    (df.groupby(['session_id'])['ts'].diff() / np.timedelta64(1, 'm'))
    .fillna(0)
    .astype(int)
)

df_group = df.groupby(['session_id'])

# Aggregations are passed by name ('min', 'max', 'nunique', ...): handing the
# Python builtins min/max to `agg` is deprecated in recent pandas, and the
# named forms avoid calling a Python function once per session.
sdf = pd.DataFrame({})
sdf['user_id'] = df_group['user_id'].agg('min')
sdf['start'] = df_group['ts'].agg('min')
sdf['date'] = sdf['start'].dt.date
sdf['date_dow'] = sdf['start'].dt.dayofweek.astype(str)
sdf['end'] = df_group['ts'].agg('max')
sdf['sess_time'] = ((sdf['end'] - sdf['start']) / np.timedelta64(1, 'm')).astype(int)
sdf['query_freq_mean'] = df_group['time_diff_min'].agg('mean')

# Number of search / book events per session, counted on a 0/1 indicator
for out_col, event_name in (('searches', 'search'), ('booked', 'book')):
    is_event = df['event_type'].eq(event_name).astype(int)
    sdf[out_col] = is_event.groupby(df['session_id']).sum()

# (output column, source column, aggregation), in output column order.
# 'last' keeps the final value seen within a session, 'nunique' counts the
# distinct values, 'mean' averages over the session's events.
session_aggs = [
    ('origin', 'origin', 'last'),
    ('destination', 'destination', 'last'),
    ('origin_nuniq', 'origin', 'nunique'),
    ('destination_nuniq', 'destination', 'nunique'),
    ('num_adults', 'num_adults', 'last'),
    ('num_children', 'num_children', 'last'),
    ('num_adults_nuniq', 'num_adults', 'nunique'),
    ('date_from', 'date_from', 'last'),
    ('date_to', 'date_to', 'last'),
    ('date_from_nuniq', 'date_from', 'nunique'),
    ('date_to_nuniq', 'date_to', 'nunique'),
    ('date_from_dow', 'date_from_dow', 'last'),
    ('date_to_dow', 'date_to_dow', 'last'),
    ('distance', 'distance', 'last'),
    ('distance_mean', 'distance', 'mean'),
    ('adv_purchase', 'adv_purchase', 'last'),
    ('adv_purchase_mean', 'adv_purchase', 'mean'),
    ('duration', 'duration', 'last'),
    ('duration_mean', 'duration', 'mean'),
]
for out_col, src_col, how in session_aggs:
    sdf[out_col] = df_group[src_col].agg(how)



In [8]:
# Persist the per-session table for the training step
sessions_csv = './files/search_sessions.csv'
sdf.to_csv(sessions_csv)

# Sessions dataset visualization

Data preparation is finished; let's observe the data with the `Facets` tool. In the next step this data will feed a few models.

[**Linear model**](model.ipynb)

[**DNN model**](model.ipynb)

In [9]:
# Display the facets overview visualization for this data
import base64

# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

# Session-level features to visualise.
# (Event-level alternative: select 'adv_purchase', 'duration', 'distance',
#  'date_from_dow', 'date_to_dow' from `df` and split on event_type == 'book'.)
derived_features = [
    'date', 'date_dow', 'sess_time', 'query_freq_mean', 'searches',
    'booked', 'origin_nuniq', 'destination_nuniq',
    'num_adults', 'num_adults_nuniq', 'date_from_nuniq',
    'date_to_nuniq', 'date_from_dow', 'date_to_dow',
    'distance', 'distance_mean', 'adv_purchase', 'adv_purchase_mean',
    'duration', 'duration_mean'
]

# Split sessions by outcome. `> 0` (rather than `== 1`) guarantees that the
# two tables together always cover every session.
has_booking = sdf['booked'] > 0
draw_df = sdf.loc[~has_booking, derived_features]
draw_df_book = sdf.loc[has_booking, derived_features]


# Prepare data for Facets: build the statistics proto for both tables and
# base64-encode it so it can be embedded in the HTML below
gfsg = GenericFeatureStatisticsGenerator()
proto = gfsg.ProtoFromDataFrames([{'name': 'notbook', 'table': draw_df}, {'name': 'book', 'table': draw_df_book}])
protostr = base64.b64encode(proto.SerializeToString()).decode("utf-8")

HTML_TEMPLATE = """
        <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
        <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
        <facets-overview id="elem"></facets-overview>
        <script>
          document.querySelector("#elem").protoInput = "{protostr}";
        </script>"""
html = HTML_TEMPLATE.format(protostr=protostr)
display(HTML(html))