In [None]:
import calendar
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [None]:
# Show every column whenever a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load the raw bookings table and preview its first rows.
df_bookings = pd.read_csv("data/hotel_bookings.csv")
df_bookings.head()

In [None]:
# Attributes treated as categorical. The list also contains the prediction
# target `is_canceled`.
cat_columns = [
    'hotel',
    'is_canceled',
    'arrival_date_month',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'is_repeated_guest',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'agent',
    'company',
    'customer_type',
    'arrival_date_year',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
]

# Categorical attributes that are one-hot encoded later on. `company`, `agent`
# and the target `is_canceled` are left out; the order of `cat_columns` is kept.
encoded_cat_columns = [
    name for name in cat_columns
    if name not in ('company', 'agent', 'is_canceled')
]

# Attributes treated as numerical (normalised and checked for outliers).
num_columns = [
    'lead_time',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'booking_changes',
    'days_in_waiting_list',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
]


## Rozdelenie datasetu

Dataset rozdelime na trenovaciu a testovaciu cast v pomere 70:30.

In [None]:
# Hold out 30 % of the bookings for testing. The fixed seed keeps the split,
# and everything fitted on it, identical across "Restart & Run All".
df_train, df_test = train_test_split(df_bookings, test_size=0.3, random_state=42)

# Renumber the rows from 0 in both splits; the one-hot encoding step later
# joins a freshly built frame onto these by index.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

## Odstranenie reservation_status

Atribut reservation_status vyjadruje status rezervacie, a to bud "Check-Out", "Canceled" alebo "No-Show".
Na zaklade jeho hodnoty vieme s urcitostou povedat, ci bola rezervacia zrusena, preto je potrebne ho z datasetu odstranit.
Predikovat budeme hodnotu atributu "is_canceled", ktory binarne vyjadruje zrusenie rezervacie.

In [None]:
def drop_reservation_status(df):
    """Return a new frame without the ``reservation_status`` column.

    The column directly reveals whether a booking was cancelled, so it must
    not be available when predicting ``is_canceled``. The input frame is left
    untouched.
    """
    leaking_column = 'reservation_status'
    return df.drop(leaking_column, axis=1)

## Nahradenie chybajucich hodnot

Na zaklade analyzy sme identifikovali 4 stlpce s chybajucimi hodnotami, a to “children”, “country”, “company” a “agent”.
V prípade stĺpca “children” nahradime chýbajúce hodnoty najčastejšou hodnotou (0).
V prípade stlpca “country” nahradime chýbajúce hodnoty hodnotou “other”.
V prípade stlpcov “company” a "agent" nahradime chýbajúce hodnoty hodnotou 0, nakolko sa v tychto stlpcoch nachadzaju
id spolocnosti/agentov a hodnota 0 sa v nich nevyskytuje, preto bude toto cislo reprezentovat nezname spolocnosti/agentov.

In [None]:
def get_missing_value_replacer(df):
    """Build the column -> fill-value mapping used for missing values.

    ``children`` is filled with its most frequent value in ``df``,
    ``country`` with the label ``'other'``, and the id columns ``company``
    and ``agent`` with ``0``.

    Returns:
        dict mapping column name to the value that replaces missing entries.
    """
    children_counts = df['children'].value_counts()

    replacer = {'children': children_counts.idxmax()}
    replacer['country'] = 'other'
    replacer['company'] = 0
    replacer['agent'] = 0
    return replacer

def replace_missing_values(df, replacer):
    """Fill missing values of ``df`` in place.

    Args:
        df: frame to modify; every key of ``replacer`` must be a column of it.
        replacer: mapping of column name to the value used for missing entries.

    Returns:
        None; ``df`` itself is updated.
    """
    for col in replacer:
        filled = df[col].fillna(replacer[col])
        df[col] = filled

## Normalizacia numerickych atributov

Numericke atributy normalizujeme do intervalu <0, 1> vydelenim maximalnou hodnotou daneho atributu v trenovacej sade.

In [None]:
def get_numerical_normalizer(df, numerical_columns):
    """Collect the maximum of every numerical column of ``df``.

    Args:
        df: frame the maxima are taken from.
        numerical_columns: names of the columns to inspect.

    Returns:
        dict mapping column name to that column's maximum value.
    """
    normalizer = {}
    for col in numerical_columns:
        normalizer[col] = df[col].max()
    return normalizer

def normalize_numerical_columns(df, normalizers):
    """Divide columns of ``df`` in place by their normalisation constant.

    Args:
        df: frame to modify.
        normalizers: mapping of column name to the divisor for that column.

    Returns:
        None; ``df`` itself is updated.
    """
    for col in normalizers:
        df[col] = df[col].div(normalizers[col])

## Osetrenie vychylenych hodnot

V numerickych stlpcoch osetrime vychylene hodnoty tak, ze prilis nizke hodnoty (mensie ako 0,5-nasobok 25. percentilu) nahradime 5. percentilom a
prilis vysoke hodnoty (vacsie ako 1,5-nasobok 75. percentilu) nahradime 95. percentilom. Nahradne hodnoty zaokruhlime na cele cisla, nakolko vo vacsine pripadov ide
o celociselne hodnoty, ako napriklad pocet ludi.

In [None]:
def get_outlier_replacer(df, numerical_columns):
    """Derive outlier thresholds and replacement values for each column.

    For every column the result holds:

    * ``upper_outlayer``: 1.5 x the 75th percentile; values above it count as
      outliers.
    * ``upper_quantile``: the 95th percentile rounded to a whole number; the
      replacement for upper outliers.
    * ``lower_outlayer``: 0.5 x the 25th percentile; values below it count as
      outliers.
    * ``lower_quantile``: the 5th percentile rounded to a whole number; the
      replacement for lower outliers.

    Missing values are ignored when computing the percentiles. With plain
    ``np.percentile`` a single NaN in a column turns all four numbers into
    NaN, which silently switches outlier handling off for that column.

    Args:
        df: frame the percentiles are computed on.
        numerical_columns: names of the columns to analyse.

    Returns:
        dict mapping column name to a dict with the four keys above.
    """
    replacer = {}
    for col in numerical_columns:
        # One pass over the column for all four percentiles.
        p5, p25, p75, p95 = np.nanpercentile(df[col], [5, 25, 75, 95])
        # The key spelling ("outlayer") is kept: `replace_outliers` reads
        # exactly these keys.
        replacer[col] = {
            'upper_quantile': np.around(p95, decimals=0),
            'upper_outlayer': 1.5 * p75,
            'lower_quantile': np.around(p5, decimals=0),
            'lower_outlayer': .5 * p25
        }
    return replacer

def replace_outliers(df, replacer):
    """Overwrite outliers of ``df`` in place.

    For each column in ``replacer``, values above ``upper_outlayer`` become
    ``upper_quantile``; afterwards values below ``lower_outlayer`` become
    ``lower_quantile``.

    Args:
        df: frame to modify.
        replacer: mapping of column name to a dict with the keys
            ``upper_outlayer``, ``upper_quantile``, ``lower_outlayer`` and
            ``lower_quantile``.

    Returns:
        None; ``df`` itself is updated.
    """
    for col in replacer:
        bounds = replacer[col]

        too_high = df[col] > bounds['upper_outlayer']
        df.loc[too_high, col] = bounds['upper_quantile']

        # Evaluated after the upper replacement, on the already updated column.
        too_low = df[col] < bounds['lower_outlayer']
        df.loc[too_low, col] = bounds['lower_quantile']

## Pridanie atributu zachytavajuceho casovu postupnost

Vytvorenie stĺpca, obsahujuceho kompletný dátum vo formáte rok-mesiac-deň.

In [None]:
def make_arrival_date_column(df):
    """Add an ``arrival_datetime`` column to ``df`` in place.

    The date is assembled from ``arrival_date_year``, ``arrival_date_month``
    (a month name as listed in ``calendar.month_name``) and
    ``arrival_date_day_of_month``.

    Returns:
        None; ``df`` itself gains the new column.
    """
    # Month name -> month number (January -> 1, ..., December -> 12).
    month_number = {name: number for number, name in enumerate(calendar.month_name)}

    # Encode the date as the integer YYYYMMDD, then parse its string form.
    yyyymmdd = (
        df.arrival_date_year * 10000
        + df.arrival_date_month.map(month_number) * 100
        + df.arrival_date_day_of_month
    )
    df['arrival_datetime'] = pd.to_datetime(yyyymmdd.apply(str), format='%Y%m%d')

## Ziskanie dat na predspracovanie datasetu

Z trenovacej sady si vypocitame data potrebne na predspracovanie.

In [None]:
# Fit every preprocessing parameter on the training split only, so that no
# information from the test split leaks into the preprocessing.
outlier_replacer = get_outlier_replacer(df_train, num_columns)
numerical_normalizer = get_numerical_normalizer(df_train, num_columns)
missing_value_replacer = get_missing_value_replacer(df_train)

## Aplikovanie predspracovania datasetu

Data ziskane z trenovacej sady pouzijeme na predspracovanie, ktore vykoname na trenovacej aj testovacej sade

In [None]:
def preprocess(df, missing_value_replacer, numerical_normalizer, outlier_replacer):
    """Apply the full preprocessing pipeline and return the processed frame.

    Steps, in order:

    1. drop the leaking ``reservation_status`` column (this creates a new
       frame, so the caller's frame is not modified),
    2. fill missing values,
    3. replace outliers,
    4. scale the numerical columns,
    5. add the ``arrival_datetime`` column.

    Outliers are replaced before scaling on purpose. The outlier thresholds
    are derived from the raw, unscaled training values, so they are only
    meaningful on raw values. Applying them after dividing by the column
    maximum would put nearly every scaled value below the lower threshold and
    collapse the column to a constant.

    Args:
        df: raw bookings frame (train or test split).
        missing_value_replacer: column -> fill value mapping.
        numerical_normalizer: column -> divisor mapping.
        outlier_replacer: column -> thresholds/replacements mapping.

    Returns:
        The preprocessed frame.
    """
    df = drop_reservation_status(df)
    replace_missing_values(df, missing_value_replacer)
    replace_outliers(df, outlier_replacer)
    normalize_numerical_columns(df, numerical_normalizer)
    make_arrival_date_column(df)
    return df

In [None]:
# Apply the parameters fitted on the training split to both splits.
# Note: the names are rebound, so re-running this cell on already
# preprocessed frames fails (`reservation_status` is gone by then).
df_train = preprocess(
    df_train,
    missing_value_replacer=missing_value_replacer,
    numerical_normalizer=numerical_normalizer,
    outlier_replacer=outlier_replacer,
)
df_test = preprocess(
    df_test,
    missing_value_replacer=missing_value_replacer,
    numerical_normalizer=numerical_normalizer,
    outlier_replacer=outlier_replacer,
)

## Zakodovanie kategorickych atributov

Na zakodovanie kategorickych atributov pouzijeme one-hot encoding. Ten nafitujeme len na trenovacich datach
a nasledne pouzijeme na trenovacej aj testovacej sade.

In [None]:
def get_one_hot_encoder(df, encoded_categorical_columns):
    """Fit a one-hot encoder on the given categorical columns of ``df``.

    The encoder is created with ``handle_unknown='ignore'``.

    Args:
        df: frame to fit on (the training split).
        encoded_categorical_columns: names of the columns to encode.

    Returns:
        The fitted ``OneHotEncoder``.
    """
    categorical_part = df[encoded_categorical_columns]
    # `fit` returns the encoder itself.
    return OneHotEncoder(handle_unknown='ignore').fit(categorical_part)

def one_hot_encode(df, encoder, columns):
    """Replace categorical columns of ``df`` with their one-hot encoding.

    Args:
        df: frame holding the columns to encode; it is not modified.
        encoder: fitted one-hot encoder. It must provide ``transform``
            (returning a sparse matrix) and ``categories_``, and must have
            been fitted on ``columns`` in the same order.
        columns: names of the categorical columns to encode and drop.

    Returns:
        A new frame with ``columns`` removed and one indicator column per
        category appended, named ``<column>_<category>``.
    """
    # Encode the columns passed in, not a notebook-level global.
    dense = encoder.transform(df[columns]).toarray()

    # String names keep all column labels of one type; a mix of str and int
    # labels is rejected by recent scikit-learn estimators.
    names = [
        f'{col}_{category}'
        for col, categories in zip(columns, encoder.categories_)
        for category in categories
    ]

    # Reuse df's index so the join aligns row by row even when df is not
    # indexed 0..n-1.
    encoded = pd.DataFrame(dense, columns=names, index=df.index)
    return df.drop(columns=columns).join(encoded)

In [None]:
# Fit the encoder on the training split only, then encode both splits with it.
one_hot_encoder = get_one_hot_encoder(
    df=df_train,
    encoded_categorical_columns=encoded_cat_columns,
)
df_train = one_hot_encode(df=df_train, encoder=one_hot_encoder, columns=encoded_cat_columns)
df_test = one_hot_encode(df=df_test, encoder=one_hot_encoder, columns=encoded_cat_columns)

In [None]:
# Target: the binary cancellation flag.
y_train = df_train['is_canceled']
y_test = df_test['is_canceled']

# Features: everything except the target and the two date columns.
X_train = df_train.drop(['is_canceled', 'reservation_status_date', 'arrival_datetime'], axis=1)
X_test = df_test.drop(['is_canceled', 'reservation_status_date', 'arrival_datetime'], axis=1)

In [None]:
# Feature matrices: written with a header row and without the index column.
X_train.to_csv('data/X_train.csv', index=False)
X_test.to_csv('data/X_test.csv', index=False)

# Targets: written without a header row; the index column is kept.
y_train.to_csv('data/y_train.csv', header=False)
y_test.to_csv('data/y_test.csv', header=False)

## Feature selection

Výber zaujímavých atribútov vykonáme pomocou metódy SelectFromModel z knižnice scikit learn.
Ten natrénujeme na trénovacej sade s použitím rozhodovacieho stromu.
Natrénovaný selector aplikujeme na trénovacie aj testovacie dáta.

In [None]:
# Rank features with a decision tree fitted on the training split and keep the
# ones SelectFromModel selects. The fixed seed makes the fitted tree, and thus
# the selected feature set, reproducible between runs.
selector = SelectFromModel(estimator=DecisionTreeClassifier(random_state=42)).fit(X_train, y_train)

# Apply the same selection to both splits.
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)

## Predikcia

### a) Pomocou rozhodovacieho stromu.

In [None]:
# Hyper-parameter grid for the decision tree.
# NOTE(review): `random_state` is searched like a hyper-parameter, which
# quadruples the number of fits and selects a seed rather than a model
# property -- consider fixing it on the classifier instead.
param_grid = {"max_depth": [None, 1, 2, 3],
              "max_leaf_nodes": [None, 3, 5, 10],
              "min_samples_leaf": [1, 2, 5, 10],
              "min_samples_split": [2, 3, 5],
              "random_state": [20, 30, 40, 50],
              "criterion": ["gini", "entropy"],
              "splitter": ["best", "random"],
              "class_weight" : ["balanced", None]
              }

# All four metrics are recorded; the best model is chosen and refitted by F1.
scoring = {'accuracy': make_scorer(accuracy_score),
          'f1': make_scorer(f1_score),
          'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score)}

classifier = DecisionTreeClassifier()
grid_search = GridSearchCV(classifier,
                           param_grid=param_grid,
                           scoring=scoring,
                           refit='f1',
                           n_jobs=-1,
                           verbose=3,
                           cv=7,
                           return_train_score=True)

grid_search.fit(X_train, y_train)
predictions = grid_search.predict(X_test)

# Persist the search and the test predictions. The `with` blocks make sure the
# files are flushed and closed even if pickling fails.
with open("dtree_grid_search.p", "wb") as grid_search_file:
    pickle.dump(grid_search, grid_search_file)
with open("dtree_predictions.p", "wb") as predictions_file:
    pickle.dump(predictions, predictions_file)

### b) Pomocou knn

In [None]:
knn = KNeighborsClassifier()

# Only the number of neighbours is tuned.
parameters = {'n_neighbors':[2,3,4,5,10]}

# All four metrics are recorded; the best model is chosen and refitted by F1.
scoring = {'accuracy': make_scorer(accuracy_score),
          'f1': make_scorer(f1_score),
          'precision': make_scorer(precision_score),
          'recall': make_scorer(recall_score)}

# NOTE: `grid_search` and `predictions` are rebound here, replacing the
# decision-tree results held under the same names.
grid_search = GridSearchCV(knn,
                           param_grid=parameters,
                           scoring=scoring,
                           refit='f1',
                           n_jobs=-1,
                           verbose=5,
                           cv=7,
                           return_train_score=True)

grid_search.fit(X_train, y_train)
predictions = grid_search.predict(X_test)

# Persist the search and the test predictions. The `with` blocks make sure the
# files are flushed and closed even if pickling fails.
with open("knn_grid_search.p", "wb") as grid_search_file:
    pickle.dump(grid_search, grid_search_file)
with open("knn_predictions.p", "wb") as predictions_file:
    pickle.dump(predictions, predictions_file)