In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Load the training bookings.
df = pd.read_csv("agoda_cancellation_train.csv")

# Target: 1 when the booking carries a cancellation timestamp, 0 when it does not.
y = df['cancellation_datetime'].notna().astype('int')

# Keep only the rows where both country codes are present, in the labels and
# in the frame alike so the two stay index-aligned.
has_both_countries = df['hotel_country_code'].notna() & df['origin_country_code'].notna()
y = y[has_both_countries]
df = df[has_both_countries]

# Every remaining missing value becomes 0, except in 'cancellation_datetime',
# which is left untouched.
zero_fill = {name: 0 for name in df.columns if name != 'cancellation_datetime'}
df = df.fillna(zero_fill)

# Feature frame for the first MVP:
#   * identifier, date, location and payment-detail columns are discarded;
#   * accommadation_type_name, charge_option and original_payment_type are
#     one-hot encoded;
#   * is_first_booking and is_user_logged_in become 1/0.
# (original_payment_currency and the country columns might be added later.)
discarded_columns = ['h_booking_id', 'hotel_live_date', 'h_customer_id', 'booking_datetime', 'checkin_date', 'checkout_date', 'hotel_id',
                     'hotel_country_code', 'customer_nationality', 'guest_nationality_country_name', 'origin_country_code', 'language',
                     'cancellation_datetime', 'hotel_area_code', 'hotel_brand_code', 'hotel_chain_code', 'hotel_city_code', 'original_payment_method',
                     'original_payment_currency', 'cancellation_policy_code']
First_MVP = df.drop(columns=discarded_columns)

# One-hot encode the categorical columns: the indicator columns are appended
# after the remaining features, in the order listed here, and the source
# columns themselves are removed.
categorical_columns = ['accommadation_type_name', 'charge_option', 'original_payment_type']
indicator_frames = [pd.get_dummies(First_MVP[name]) for name in categorical_columns]
First_MVP = pd.concat([First_MVP.drop(columns=categorical_columns)] + indicator_frames, axis=1)

# Boolean flags: 1 where the value equals True, 0 for anything else.
for flag in ('is_first_booking', 'is_user_logged_in'):
    First_MVP[flag] = First_MVP[flag].eq(True).astype('int')

In [None]:
# TEST HANDLING:
# Prepare the weekly test set with the same steps used for the training data,
# then align its columns with the training feature frame (First_MVP, built in
# the training cell) so the fitted model can consume it.

# read the data
df_test = pd.read_csv("test_set_week_1.csv")

# delete nan values
# NOTE(review): rows without a country code are removed, so the prediction
# output has fewer rows than the test file - confirm this is intended.
df_test = df_test.dropna(subset=['hotel_country_code', 'origin_country_code'])

# Every remaining missing value becomes 0 ('cancellation_datetime', if the
# test file has such a column, is left untouched).
zero_fill = {name: 0 for name in df_test.columns if name != 'cancellation_datetime'}
df_test = df_test.fillna(zero_fill)

# New dataframe with only relevant columns:
#   * accommadation_type_name, charge_option, original_payment_type to dummies
#   * is_first_booking and is_user_logged_in to 1/0
# (might add original_payment_currency and general countries)
test_data = df_test.drop(['h_booking_id', 'hotel_live_date', 'h_customer_id', 'booking_datetime', 'checkin_date', 'checkout_date', 'hotel_id',
                          'hotel_country_code', 'customer_nationality', 'guest_nationality_country_name', 'origin_country_code', 'language',
                          'hotel_area_code', 'hotel_brand_code', 'hotel_chain_code', 'hotel_city_code', 'original_payment_method',
                          'original_payment_currency', 'cancellation_policy_code'], axis=1)

# change to dummies: indicator columns are appended after the remaining
# features and the categorical source columns are removed.
categorical_columns = ['accommadation_type_name', 'charge_option', 'original_payment_type']
indicator_frames = [pd.get_dummies(test_data[name]) for name in categorical_columns]
test_data = pd.concat([test_data.drop(columns=categorical_columns)] + indicator_frames, axis=1)

# Change True/False to 1/0.
# The comparison must be made on the test frame's own values: a mask taken
# from First_MVP is aligned by row label and would overwrite the test flags
# with whatever the training row of the same label contains.
for flag in ('is_first_booking', 'is_user_logged_in'):
    test_data[flag] = test_data[flag].eq(True).astype('int')

# Give the test matrix exactly the training columns, in the training order:
#   * columns seen in training but absent here are added, filled with 0;
#   * columns for categories never seen in training are dropped, since the
#     model has no coefficient for them.
# (reindex requires the column labels to be unique - this assumes the three
# categorical columns do not share a category name.)
test_data = test_data.reindex(columns=First_MVP.columns, fill_value=0)

In [None]:
# Fit a logistic-regression classifier, with scikit-learn's default settings,
# on the training feature frame (First_MVP) and the 0/1 cancellation labels (y)
# prepared in the training cell.
logisticRegr = LogisticRegression()
# NOTE(review): the features are passed unscaled and the default iteration
# limit is used; check the cell output for a convergence warning.
logisticRegr.fit(First_MVP, y)

In [None]:
# Predict a class label for every row of the prepared test matrix and export
# the predictions as a CSV file.
result = logisticRegr.predict(test_data)
result_df = pd.DataFrame(result)
# index=False keeps the row index out of the file, so the CSV holds only the
# predictions column and no column has to be deleted by hand afterwards.
# NOTE(review): "path/to/file.csv" is a placeholder - point it at the real
# output location (the directory must exist) before running.
result_df.to_csv("path/to/file.csv", index=False)