In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
dateparser = lambda x: pd.Timestamp(x).to_pydatetime().weekday()

df = pd.read_csv('../datasets/agoda_cancellation_train.csv')
week1_df = pd.read_csv('test_set_week_1.csv')

In [3]:
def pre_process(df, flag=False):
    no_use_now = ['h_booking_id', 'language', 'hotel_chain_code', 'hotel_area_code', 'hotel_brand_code',
                 'hotel_city_code', 'origin_country_code', 'h_customer_id', 'guest_nationality_country_name',     
                 'hotel_id', 'customer_nationality', 'hotel_country_code']

    one_hot_cols = ['charge_option', 'accommadation_type_name',
                    'original_payment_method', 'original_payment_type', 'original_payment_currency']

    dates = ['checkin_date', 'checkout_date', 'booking_datetime', 'cancellation_policy_code']

    req = ['request_highfloor', 'request_nonesmoke', 'request_latecheckin', 'request_largebed', 
           'request_twinbeds', 'request_airport', 'request_earlycheckin']

    response = ['cancellation_datetime']

    to_remove = no_use_now + one_hot_cols + dates + response + req 
    
    
    df['checkin_date_month'] = pd.to_datetime(df.checkin_date).apply(lambda d: d.month).astype(int)
    df['checkin_date_weekday'] = pd.to_datetime(df.checkin_date).apply(dateparser).astype(int)

    df['checkout_date_month'] = pd.to_datetime(df.checkout_date).apply(lambda d: d.month).astype(int)
    df['checkout_date_weekday'] = pd.to_datetime(df.checkout_date).apply(dateparser).astype(int)


    df['booking_datetime_month'] = pd.to_datetime(df.booking_datetime).apply(lambda d: d.month).astype(int)
    df['booking_datetime_weekday'] = pd.to_datetime(df.booking_datetime).apply(dateparser).astype(int)

    df['hotel_live_date'] = pd.to_datetime(df.hotel_live_date).apply(lambda d: d.year - 2000).astype(int)


    df["special_request"] = np.where((df.request_highfloor == 1.0)  | (df.request_nonesmoke == 1.0) | 
                                     (df.request_latecheckin == 1.0) | (df.request_largebed == 1.0) | 
                                     (df.request_twinbeds == 1.0) | (df.request_airport == 1.0) | 
                                     (df.request_earlycheckin == 1.0), 1.0, 0.0)

    for col in one_hot_cols:
        df = pd.concat([df, pd.get_dummies(df[col])], axis=1)


    df.is_user_logged_in = df.is_user_logged_in.astype(int)
    df.is_first_booking = df.is_first_booking.astype(int)


    df['first_policy_days'] = df.cancellation_policy_code.apply(lambda s: ((s.split('_')[0]).split('D')[0]))
    df['first_policy'] = df.cancellation_policy_code.apply(lambda s: (s.split('_')[0]).split('D')[1] if (len((s.split('_')[0]).split('D')) == 2) else np.nan)

    df = df[(df.first_policy.apply(str) != 'nan')]
    df = df[(df.first_policy_days != 'UNKNOWN')]

    df['first_policy_days'] = df.first_policy_days.astype(int)

    df['first_prec'] = np.where(
                        df.first_policy.astype(str).apply(lambda s: (str(s)[-1]) == 'N'),
                        df.first_policy_days.astype(int) / ((pd.to_datetime(df.checkout_date) - pd.to_datetime(df.checkin_date)).apply(lambda d: d.days).astype(int)) * 100,
                        df.first_policy.apply(lambda p: int(str(p)[:-1])))


    df['first_policy'] = df.first_policy.astype(str).apply(lambda s: (str(s)[-1]) == 'N').astype(int)
    
    if flag:
        y = (df.cancellation_datetime.fillna(0) != 0).astype(int)
        df.drop(to_remove, axis=1, inplace=True)
        return df, y
    to_remove.remove('cancellation_datetime')
    df.drop(to_remove, axis=1, inplace=True)
    return df

In [4]:
# pre process:
df, y = pre_process(df, True)
week1_df = pre_process(week1_df)
df = df.loc[:, week1_df.columns]
df = df[[col for col in df.columns if col != 'UNKNOWN']]
week1_df = week1_df[[col for col in week1_df.columns if col != 'UNKNOWN']]

In [5]:
# split data
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.33, random_state=1)

In [6]:
# train model
lr = LogisticRegression(max_iter=12000)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=12000)

In [7]:
# test score
lr.score(X_test, y_test)

0.7821709874289291

In [8]:
# predict:
week1_y = lr.predict(week1_df)

In [10]:
# results:
week1_df['predicted_values'] = week1_y
week1_df.to_csv('207042714_315317255_207902537.csv')
week1_df

Unnamed: 0,hotel_live_date,hotel_star_rating,guest_is_not_the_customer,no_of_adults,no_of_children,no_of_extra_bed,no_of_room,original_selling_amount,is_user_logged_in,is_first_booking,...,SGD,THB,TWD,USD,VND,ZAR,first_policy_days,first_policy,first_prec,predicted_values
0,17,3.0,0,4,0,0,2,89.32,0,1,...,0,0,0,0,0,0,2,0,100.0,0
1,14,3.0,0,2,0,0,1,135.36,1,1,...,0,1,0,0,0,0,7,1,350.0,0
2,12,3.0,0,4,0,0,2,215.04,1,0,...,0,0,0,0,0,0,3,0,100.0,0
3,17,5.0,0,2,1,0,1,930.67,0,1,...,0,0,0,0,0,0,27,0,100.0,0
4,18,5.0,0,2,0,0,1,233.10,0,0,...,0,0,1,0,0,0,3,0,50.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,15,3.0,0,4,0,0,2,148.84,0,0,...,0,0,0,0,0,0,1,1,100.0,0
696,12,4.5,0,4,1,0,3,217.08,1,0,...,0,0,0,0,0,0,1,1,100.0,0
697,18,0.0,0,4,0,0,1,97.01,0,1,...,0,0,1,0,0,0,14,0,100.0,1
698,15,3.0,0,2,0,0,1,124.80,0,0,...,0,0,0,0,0,0,1,1,20.0,0
