In [2]:
import pandas as pd
import numpy as np
import folium
from folium.plugins import MarkerCluster
#import geocoder
from joblib import dump, load
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from pandas_profiling import ProfileReport

In [None]:
# Hotel-level table; later cells read its 'latitude', 'longitude' and
# 'our_hotel_id' columns.
data_info = pd.read_csv('hotels_information.csv')
print("data_info = ", data_info.shape)

# Pricing table; later joined to data_info on 'our_hotel_id'.
data_pricing = pd.read_csv('pricing_data.csv')
print("data_pricing = ", data_pricing.shape)

In [None]:
"""
Info. table
"""
def info_table(df):
    """Build a one-row-per-column summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. Column labels are expected to be unique.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column labels of ``df`` and sorted by dtype, with the
        columns ``Type``, ``Min``, ``Max``, ``Mean``, ``STD``, ``Unique``,
        ``Zero Values``, ``Missing Values``, ``% Missing Values`` and ``mode``.
        ``Min``/``Max``/``Mean``/``STD`` are rounded to 3 decimals for numeric
        columns and hold the string ``'na'`` for every other dtype (including
        bool). ``mode`` is the first value returned by ``Series.mode`` or NaN
        when the column has no non-missing value.
    """

    def _rounded(value):
        """Round a scalar statistic to 3 decimals; pass missing values through."""
        return value if pd.isna(value) else round(value, 3)

    tb = pd.DataFrame(df.dtypes)
    tb.columns = ['Type']

    num_unique = []
    max_value = []
    min_value = []
    mean_value = []
    std_value = []
    mode_value = []
    for name in tb.index:
        col = df[name]
        # unique() keeps NaN as a value, so a missing entry counts once here.
        num_unique.append(len(col.unique()))

        # Any numeric dtype qualifies (int32, float32, unsigned, nullable ...),
        # not just int64/float64. Bool is excluded so it keeps reporting 'na'.
        is_numeric = (pd.api.types.is_numeric_dtype(col)
                      and not pd.api.types.is_bool_dtype(col))
        if is_numeric:
            max_value.append(_rounded(col.max()))
            min_value.append(_rounded(col.min()))
            mean_value.append(_rounded(col.mean()))
            std_value.append(_rounded(col.std()))
        else:
            max_value.append('na')
            min_value.append('na')
            mean_value.append('na')
            std_value.append('na')

        # mode() is empty for an all-missing (or zero-row) column; indexing it
        # with [0] used to raise, so fall back to NaN instead.
        modes = col.mode(dropna=True)
        mode_value.append(modes.iloc[0] if len(modes) else np.nan)

    tb['Min'] = min_value
    tb['Max'] = max_value
    tb['Mean'] = mean_value
    tb['STD'] = std_value
    tb['Unique'] = num_unique
    # Number of cells that compare equal to 0 in each column.
    tb['Zero Values'] = (df == 0.00).astype(int).sum(axis=0)
    tb['Missing Values'] = df.isnull().sum()
    tb['% Missing Values'] = round(100 * df.isnull().sum() / len(df), 3)
    tb['mode'] = mode_value

    try:
        tb = tb.sort_values('Type')
    except TypeError:
        # Some dtype objects (e.g. pandas extension dtypes) cannot be ordered
        # against NumPy dtypes; group by the dtype's name in that case.
        tb = tb.sort_values('Type', key=lambda s: s.astype(str))
    return tb

"""
Country label
"""
def country(lat,long):
    """Reverse-geocode a coordinate pair and return its country code.

    Parameters
    ----------
    lat, long
        Latitude and longitude, passed to the geocoder as ``[lat, long]``.

    Returns
    -------
    The ``country_code`` attribute of the ``geocoder.osm`` reverse lookup
    result. NOTE(review): presumably ``None`` when the lookup fails - confirm
    against the geocoder documentation.

    Raises
    ------
    ImportError
        If the optional ``geocoder`` package is not installed.
    """
    # Imported lazily: the module-level import is commented out so the
    # notebook runs without geocoder, which previously made any call to this
    # function fail with NameError.
    import geocoder

    g = geocoder.osm([lat, long], method='reverse', short_name=False)
    return g.country_code

# (latitude, longitude) pair for every row of data_info.
locations = list(zip(data_info['latitude'], data_info['longitude']))

# One-off regeneration of the cached country list (one geocoding request per
# row, so it is kept disabled):
# country_list = [country(lat, lon) for lat, lon in locations]
# dump(country_list, 'country_list.sav')

# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code; only load a 'country_list.sav' that you produced yourself.
country_list = load('country_list.sav')

# assign() overwrites an existing 'country' column, so re-running this cell
# no longer appends a duplicate column, and it raises if the cached list does
# not have exactly one entry per row instead of silently padding with NaN.
data_info = data_info.assign(country=country_list)

"""
Remove missing/transform observations 
"""
# data_info_no_na = data_info.dropna()
# data_pricing_no_na = data_pricing.dropna()
# data_info_no_na["stars"] = data_info_no_na["stars"].astype("int")
# data_info_no_na["stars"] = data_info_no_na["stars"].astype("object")
# data_info = data_info.fillna(0)
# print("data_info_no_na = ",data_info_no_na.shape)
# print("data_pricing_no_na = ",data_pricing_no_na.shape)

"""
DF
"""
# Join the two tables on 'our_hotel_id'; the inner join keeps only ids that
# occur in both.
df = pd.merge(data_info, data_pricing, on='our_hotel_id', how='inner')

# Encode the sold-out flag as 1/0 (anything that is not True becomes 0).
df['is_sold_out'] = (df['is_sold_out'] == True).astype(int)

# Parse 'YYYY/MM/DD' strings and reset the time component to midnight.
df['arrival_date'] = pd.to_datetime(
    df['arrival_date'],
    format='%Y/%m/%d',
).dt.normalize()

print('df = ', df.shape)

In [None]:
"""
Simple 
"""
# ax = sns.boxplot(x="country", y="room_count", data=df)
# ax = sns.barplot(x="country", y="price_value_ref", data=df)


In [None]:
"""
Complex
"""
# ax = sns.boxplot(x="stars", y="room_count",hue="country", data=data_info_no_na)
# ax = sns.barplot(x="meal_type_included", y="is_sold_out",hue="country", data=df)
# sns.scatterplot(data=df, x="lead_time", y="is_sold_out", hue="country");

In [None]:
"""
Correlations
"""
# corr = data_pricing_no_na[['max_persons','price_value_ref','price_value_non_ref']].corr()# plot the heatmap
# ax = sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True))
# ax.set_yticklabels(
#     ax.get_yticklabels(),
#     rotation=0
# )

# ax.set_xticklabels(
#     ax.get_xticklabels(),
#     rotation=45,
#     horizontalalignment='right'
# );

In [None]:

# Initial map centre (NOTE(review): appears to be around Brussels - confirm).
latitude = 50.855711
longitude = 4.359679

# JavaScript callback handed to the cluster plugin: draws each cluster as a
# div icon showing its child count in bold.
icon_create_function = """\
function(cluster) {
    return L.divIcon({
    html: '<b>' + cluster.getChildCount() + '</b>',
    className: 'marker-cluster marker-cluster-large',
    iconSize: new L.Point(20, 20)
    });
}"""

map_eu = folium.Map(location=[latitude, longitude], zoom_start=7, tiles='cartodbpositron')

# folium refuses NaN coordinates, so rows with a missing latitude or
# longitude are left out of the map instead of aborting the whole cell.
valid_coords = data_info[['latitude', 'longitude']].dropna()
locations = list(zip(valid_coords['latitude'], valid_coords['longitude']))
popups = ["lon:{}<br>lat:{}".format(lon, lat) for (lat, lon) in locations]

# NOTE(review): the layer name below looks like a leftover from an example;
# consider a label that describes this data.
marker_cluster = MarkerCluster(
    locations=locations,
    popups=popups,
    name="1000 clustered icons",
    overlay=True,
    control=True,
    icon_create_function=icon_create_function,
)

marker_cluster.add_to(map_eu)

folium.LayerControl().add_to(map_eu)

# Make the map the cell's last expression so the notebook renders it rather
# than the repr of the LayerControl returned by add_to().
map_eu
