In [None]:
# Obtención de los datos
import pandas as pd

if __name__ == '__main__':
    hotel_df = pd.read_csv('files/hotel_bookings.csv')
    x, y = hotel_df.shape
    print(x, y)

In [None]:
hotel_df.head()

In [None]:
hotel_df.info()

In [None]:
hotel_df.describe()

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
hotel_df.hist(bins=50, figsize=(20,15))
plt.show()


In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
hotel_df.boxplot(figsize=(38, 10))
plt.show()

In [None]:
hotel_df['is_canceled'].hist(bins=2)
print(hotel_df['is_canceled'].value_counts())
print("proporcion de clases: %s %%" % (round(class_1 / class_2, 2)))

In [None]:
# Creating a data set for train
# 90/10
import numpy as np
from zlib import crc32

def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]


if __name__ == '__main__':
    np.random.seed(1024)
    train_set, test_set = split_train_test(hotel_df, 0.1)
    train_set.to_csv('files/train.csv', index=False)
    test_set.to_csv('files/test.csv', index=False)
    print(len(train_set), len(test_set))

In [None]:
#EDA
#Experimenting with atributes combiaitions
#New atributes
train_set = pd.read_csv('files/train.csv')
train_set['total_guests'] = train_set['adults'] + train_set['children'] + train_set['babies']
train_set['total_days'] = train_set['stays_in_week_nights'] + train_set['stays_in_weekend_nights']
train_set.shape

In [None]:
country_data = pd.DataFrame(train_set.loc[train_set["is_canceled"] == 0]["country"].value_counts())
#country_data.index.name = "country"
country_data.rename(columns={"country": "Number of Guests"}, inplace=True)
total_guests = country_data["Number of Guests"].sum()
country_data["Guests in %"] = round(country_data["Number of Guests"] / total_guests * 100, 2)
country_data["country"] = country_data.index
country_data.loc[country_data["Guests in %"] < 4, "country"] = "Other"

# pie plot
fig = px.pie(country_data,
             values="Number of Guests",
             names="country",
             title="De que paises vienen la mayoria de los huespedes que si cancelaron",
             template="seaborn")
fig.update_traces(textposition="inside", textinfo="value+percent+label")
fig.show()

In [None]:
import plotly.express as px
train_set = pd.read_csv('files/train.csv')

train_set = train_set[train_set['country'] != 'PRT']

country_data_cancelados = train_set.groupby(['country'])['is_canceled'].sum().reset_index().sort_values(by='is_canceled',ascending=False)
country_data_cancelados['% de cancelados'] = round(country_data_cancelados['is_canceled'] / max(country_data_cancelados['is_canceled']) * 100, 2)


guest_map = px.choropleth(country_data_cancelados,
                    locations=country_data_cancelados['country'],
                    color=country_data_cancelados["% de cancelados"], 
                    hover_name=country_data_cancelados['country'], 
                    color_continuous_scale=px.colors.sequential.Reds,
                    title="Países donde existen mas cancelaciones (se excluye a Portugal)")
guest_map.show()

In [None]:
#Por mes
meses_ordenados = ['January', "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
train_set['Mes'] = pd.Categorical(train_set['arrival_date_month'], categories=meses_ordenados, ordered=True)

country_data_cancelados = train_set.groupby(['country','Mes'])['total_guests','is_canceled'].sum().reset_index().sort_values(by='Mes')
country_data_cancelados['% de cancelados'] = round(country_data_cancelados['is_canceled']/ country_data_cancelados['total_guests'] * 100, 2)
country_data_cancelados.dropna(inplace=True)
country_data_cancelados = country_data_cancelados.reset_index()
country_data_cancelados = country_data_cancelados.drop('index',axis=1)

guest_map = px.choropleth(country_data_cancelados,
                    locations=country_data_cancelados['country'],
                    color=country_data_cancelados["total_guests"], 
                    hover_name=country_data_cancelados['country'],
                    animation_frame="Mes",
                    color_continuous_scale=px.colors.sequential.Reds,
                    title="Origen por fecha de paises donde existen mas cancelaciones")
guest_map.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000

guest_map.show()

In [None]:
# Counting adults and children as paying guests only, not babies.
import numpy as np
train_set = pd.read_csv('files/train.csv')
rhotel = train_set[train_set['hotel'] == 'Resort Hotel'].copy()
chotel = train_set[train_set['hotel'] == 'City Hotel'].copy()

rhotel.fillna(1, inplace=True)
rhotel.replace(np.inf, 1, inplace=True)
rhotel["adr_pp"] = rhotel["adr"] / (rhotel["adults"] + rhotel["children"])
chotel.fillna(0, inplace=True)
chotel.replace(np.inf, 0, inplace=True)
chotel["adr_pp"] = chotel["adr"] / (chotel["adults"] + chotel["children"])

room_prices_mothly = train_set[["hotel", "arrival_date_month", "adr_pp"]].sort_values("arrival_date_month")

# order by month:
ordered_months = ["January", "February", "March", "April", "May", "June", 
          "July", "August", "September", "October", "November", "December"]
room_prices_mothly["arrival_date_month"] = pd.Categorical(room_prices_mothly["arrival_date_month"], categories=ordered_months, ordered=True)

# barplot with standard deviation:
plt.figure(figsize=(12, 8))
sns.lineplot(x = "arrival_date_month", y="adr_pp", hue="hotel", data=room_prices_mothly, 
            hue_order = ["City Hotel", "Resort Hotel"], ci="sd", size="hotel", sizes=(2.5, 2.5))
plt.title("Room price per night and person over the year", fontsize=16)
plt.xlabel("Month", fontsize=16)
plt.xticks(rotation=45)
plt.ylabel("Price [EUR]", fontsize=16)
plt.show()

In [None]:
corr_matrix = train_set.corr()
corr_matrix['is_canceled'].sort_values(ascending=False)

In [None]:
atributes = ['lead_time', 'previous_cancellations', 'total_of_special_requests', 'required_car_parking_spaces', 'booking_changes']
pd.plotting.scatter_matrix(train_set[atributes], figsize=(12, 8))

In [None]:
train_set.isnull().sum() / len(train_set) * 100

In [None]:
#Preparación de los datos
#X_train = pd.read_csv('files/train.csv')
X_train = train_set.drop('is_canceled', axis=1)
y_train = train_set['is_canceled'].copy()
print(X_train.shape, y_train.shape)

In [None]:
X_train_num = X_train.select_dtypes(exclude=['object'])
print(X_train_num.columns)
#X_train_num[''].unique()
X_train_num.describe()

In [None]:
X_train_cat = X_train.select_dtypes(include=['object', 'category'])
atrs_cat = X_train_cat.columns
atrs_cat

In [None]:
X_train_cat.describe()

In [30]:
#Missing values
from sklearn.impute import SimpleImputer
#For standarization
from sklearn.preprocessing import StandardScaler
#For work encode categorical atrubuts
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
#For do a best a work flow
from sklearn.pipeline import Pipeline
#Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
import pandas as pd

if __name__ == '__main__': 
    ########################################################################
    #Load our data sets
    train_set = pd.read_csv('files/train.csv')
    test_set = pd.read_csv('files/test.csv')
    
    #New atributes
    
    #Train
    train_set['total_guests'] = train_set['adults'] + train_set['children'] + train_set['babies']
    train_set['total_days'] = train_set['stays_in_week_nights'] + train_set['stays_in_weekend_nights']
    #Test
    test_set['total_guests'] = test_set['adults'] + test_set['children'] + test_set['babies']
    test_set['total_days'] = test_set['stays_in_week_nights'] + test_set['stays_in_weekend_nights']
    
    #Preparing data for our model
    
    #Train
    X_train = train_set.drop('is_canceled', axis=1)
    y_train = train_set['is_canceled'].copy()
    #Train
    X_test = test_set.drop('is_canceled', axis=1)
    y_test = test_set['is_canceled'].copy()
    
    #Cleaning data
    
    #Numerical atributs droped
    atrs_n = ['arrival_date_year', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'company', 'agent', 'total_guests']
    #Categorical atributs droped 
    atrs_cat = ['reservation_status', 'reservation_status_date', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type']
    atrs = atrs_cat + atrs_n
    #Train
    X_train = X_train.drop(atrs, axis=1)
    #Test
    X_test = X_test.drop(atrs, axis=1)
    
    #Encoding category type data
    
    #########
    #Train
    X_train_num = X_train.select_dtypes(exclude=['object', 'category']).columns
    X_train_cat = X_train.select_dtypes(include=['object', 'category']).columns
    num_pipeline = Pipeline([
    #('imputer', SimpleImputer(strategy="constant")),
    #('attribs_adder', CombinedAttributesAdder()), #Experimenting with atributes combinations
    ('std', StandardScaler()),#std_scaler#Standarization
    ])

    num_attribs = X_train_num#For get numeric data
    cat_attribs = X_train_cat#For category data
    full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
    ]) 
    X_train = full_pipeline.fit_transform(X_train)
    #########
    #Test
    X_test_num = X_test.select_dtypes(exclude=['object', 'category']).columns
    X_test_cat = X_test.select_dtypes(include=['object', 'category']).columns
    num_pipeline = Pipeline([
    #('imputer', SimpleImputer(strategy="constant")),
    #('attribs_adder', CombinedAttributesAdder()), #Experimenting with atributes combinations
    ('std', StandardScaler()),#std_scaler#Standarization
    ])

    num_attribs = X_test_num#For get numeric data
    cat_attribs = X_test_cat#For category data
    full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
    ]) 
    
    X_test = full_pipeline.fit_transform(X_test)
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    
    ########################################################################
    #Loading ours model 
    
    #Logit Regression
    log_reg = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='multinomial', n_jobs=-1, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
    log_reg.fit(X_train, y_train)
    y_predict = log_reg.predict(X_test)
    #acc = 0.7884244911634141
    """
    #Decission TeeeClassifer
    tree_clas = DecisionTreeClassifier(ccp_alpha=0.0, criterion='gini',
                       max_depth=30, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')
    tree_clas.fit(X_train, y_train)
    y_predict = tree_clas.predict(X_test)   
    #acc = 0.7987268615461931
    
    #Decission RandomForestClassifier

    rand_clas = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=32, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=285,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=1,
                       warm_start=False)
    rand_clas.fit(X_train, y_train)
    y_predict = rand_clas.predict(X_test)    
    
    0.8546035024693207

    #Decission SVM
    #svm_clas = svm.SVC()
    #svm_clas.fit(X_train, y_train)
    #y_predict = svm_clas.predict(X_test)    
    #acc = 0.8045736304238565
    """


(107451, 37) (107451,) (11939, 37) (11939,)


In [31]:
print(accuracy_score(y_predict, y_test))

0.7884244911634141


In [42]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score

# define models to test:
base_models = [("DT_model", DecisionTreeClassifier(random_state=42)),
               ("RF_model", RandomForestClassifier(random_state=42,n_jobs=-1)),
               ("LR_model", LogisticRegression(random_state=42,n_jobs=-1)),
               #("XGB_model", XGBClassifier(random_state=42, n_jobs=-1))
              ]

param_grid = {'n_estimators': [200, 500],
              'max_features': ['auto', 'sqrt', 'log2'],
              'max_depth' : [4,5,7],
              'criterion' :['gini', 'entropy']
             }
forest_reg = RandomForestClassifier(random_state=1024, n_jobs=-1)
grid_search = GridSearchCV(estimator=forest_reg, param_grid=param_grid, cv=5, verbose=True)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [7]:
from sklearn.metrics import classification_report
final_model = grid_search.best_estimator_
y_predict = final_model.predict(X_test)
print(classification_report(y_test, y_predict))

NameError: name 'grid_search' is not defined