In [None]:
# import modules/libraries
import warnings 
warnings.simplefilter(action='ignore')
import osmnx as ox
import pandas as pd
import numpy as np
import geopandas as gpd
import time
from scipy import stats
import itertools
import os
import pickle
import geojson
from sqlalchemy import create_engine
import re
import sqlite3
from pathlib import Path
from shapely.geometry.polygon import Polygon
from shapely.geometry.multipolygon import MultiPolygon
import chardet
from scipy import spatial
from scipy.spatial import KDTree
# Resolved current working directory; data and model paths below are built
# relative to its parent directory.
cwd = Path().resolve()

# visualisation
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import matplotlib as mpl 
%matplotlib inline 
import matplotlib.pyplot as plt

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Load the Airbnb listings and keep only the columns used in this notebook.
listings_path = os.path.join(Path(cwd).parent, 'data', 'listings.csv.gz')
unused_columns = [
    'listing_url', 'host_picture_url', 'host_verifications', 'host_thumbnail_url',
    'host_about', 'neighborhood_overview', 'picture_url', 'scrape_id',
    'neighbourhood_group_cleansed', 'calculated_host_listings_count_shared_rooms',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_entire_homes',
]
kept_columns = [
    'id', 'name', 'description', 'host_name', 'host_since', 'host_response_time',
    'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_listings_count', 'host_total_listings_count', 'host_has_profile_pic',
    'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'latitude',
    'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms_text',
    'bedrooms', 'beds', 'amenities', 'price',
]

df = pd.read_csv(listings_path, encoding='utf-8')
df = df.drop(unused_columns, axis=1)
df = df[kept_columns]

# The cleansed neighbourhood name replaces the free-text one.
df['neighbourhood'] = df['neighbourhood_cleansed']
df = df.drop(['neighbourhood_cleansed'], axis=1)

In [None]:
def aggregate_data(df, group='', agge='', rename=''):
    """Group ``df`` by one column, aggregate, and flatten/rename the result.

    Args:
        df: input DataFrame.
        group: name of the column to group by.
        agge: aggregation spec passed to ``DataFrameGroupBy.agg``; it must
            yield multi-level columns (e.g. ``{'col': ['median']}``) because
            the top column level is dropped.
        rename: new column names, one per aggregated column, in order.

    Returns:
        The aggregated DataFrame with the group key discarded and a fresh
        0..n-1 index.
    """
    grouped = df.groupby([group]).agg(agge)
    # Drop the source-column level, then apply the caller's names.
    grouped.columns = grouped.columns.droplevel(0)
    grouped.columns = rename
    return grouped.reset_index(drop=True)


def tukey_rule(data_frame, column_name):
    """Drop outlier rows of ``column_name`` using Tukey's 1.5 * IQR fences.

    Rows are kept only when the value lies strictly between
    ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``; rows with NaN in the column
    fail both comparisons and are therefore dropped as well.
    """
    values = data_frame[column_name]
    lower_quartile, upper_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    upper_fence = upper_quartile + 1.5 * spread
    lower_fence = lower_quartile - 1.5 * spread

    inside_fences = (values < upper_fence) & (values > lower_fence)
    return data_frame[inside_fences]

    
def get_price(price_string):
    """Parse a price string such as ``'$1,234.00'`` into a float.

    Spaces are removed, the first number in the string is extracted and any
    ``,`` characters in it are stripped before conversion.

    The leading digit run is unbounded (``\\d+``) so that numbers written
    without thousands separators (``'$1234.00'``) are read in full instead of
    being cut off after three digits.

    Args:
        price_string: the raw price text; non-string values (e.g. NaN) are
            tolerated.

    Returns:
        The price as ``float``, or ``None`` when the input is not a string,
        contains no number, or cannot be converted.
    """
    pattern = re.compile(r'\d+(?:[.,]\d{3})*(?:[.,]\d{2})?')
    try:
        compact = price_string.replace(' ', '')
        return float(pattern.findall(compact)[0].replace(',', ''))
    except (AttributeError, TypeError, IndexError, ValueError):
        # AttributeError/TypeError: not a str; IndexError: no number found;
        # ValueError: matched text is not a valid float.
        return None

def remove_pct(rate):
    """Convert a percentage string such as ``'95%'`` to an integer.

    Args:
        rate: percentage text; non-string values (e.g. NaN, None) are tolerated.

    Returns:
        The integer percentage, or ``np.nan`` when ``rate`` is not a string
        or the remaining text is not an integer.
    """
    try:
        return int(rate.replace('%', ''))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a str; ValueError: not an integer.
        return np.nan
        
        
def convert_dtype(df, cols_cur):
    """Convert currency columns from text (``'$1,234.50'``) to float, in place.

    ``$`` and ``,`` are removed as literal characters (``regex=False``); as a
    regular expression ``$`` would be an end-of-string anchor, not the
    currency sign.

    Args:
        df: DataFrame to modify in place.
        cols_cur: iterable of names of the string columns to convert.

    Returns:
        None. The listed columns of ``df`` are replaced by float columns.
    """
    for col in cols_cur:
        stripped = df[col].str.replace('$', '', regex=False)
        df[col] = stripped.str.replace(',', '', regex=False).astype(float)


# Percentage strings -> numbers (NaN where missing or unparseable).
# The conversions are applied per column and the pd.to_numeric result is
# assigned (the original called pd.to_numeric and discarded the result).
df['host_response_rate'] = pd.to_numeric(df['host_response_rate'].map(remove_pct))
df['host_acceptance_rate'] = pd.to_numeric(df['host_acceptance_rate'].map(remove_pct))
df['id'] = df['id'].astype('category')
# Price strings -> float; get_price yields None for unparseable values,
# which pd.to_numeric turns into NaN.
df['price'] = pd.to_numeric(df['price'].map(get_price))

# Repair district names whose special characters were garbled in the source data.
df['neighbourhood'] = df['neighbourhood'].str.replace('Landstra§e', 'Landstraße')
df['neighbourhood'] = df['neighbourhood'].str.replace('Rudolfsheim-Fnfhaus', 'Rudolfsheim-Fünfhaus')
df['neighbourhood'] = df['neighbourhood'].str.replace('Dbling', 'Döbling')
df['neighbourhood'] = df['neighbourhood'].str.replace('Whring', 'Währing')


# set data types
df['host_since'] = pd.to_datetime(df['host_since'])
# Host tenure in days relative to a fixed reference date.
# NOTE(review): 2022-11-05 is presumably the scrape date of the listings file - confirm.
df['host_for'] = (pd.to_datetime('2022-11-05') - df['host_since']).dt.days

# Keep an unfiltered copy, then drop price outliers (rows with NaN price are dropped too).
df_orig = df.copy()
for column in ['price']:
    df = tukey_rule(df, column)

In [None]:
def get_geo_data():
    """Read ``data/geojson/vienna.geojson`` (next to the notebook's parent
    directory) and return the parsed GeoJSON object."""
    geojson_path = Path(cwd).parent / 'data' / 'geojson' / 'vienna.geojson'
    with open(geojson_path, encoding='utf-8') as handle:
        return geojson.load(handle)

In [None]:

def bar_airbnb(df):
    """Show a bar chart of each neighbourhood's share of all listings.

    Args:
        df: listings DataFrame with ``neighbourhood`` and ``id`` columns.

    Listings are counted per neighbourhood, converted to a percentage of the
    total and plotted in descending order of count. The figure is shown
    directly; nothing is returned.
    """
    counts = (
        df.groupby('neighbourhood')
        .agg(nr_listings=('id', 'count'))
        .reset_index()
        .sort_values('nr_listings', ascending=False)
    )
    counts['ratio'] = 100 * counts['nr_listings'] / counts['nr_listings'].sum()

    # Axis styling shared by both axes.
    axis_style = dict(
        tickfont=dict(family='Helvetica', color='#9c9c9c'),
        title_font_color='#9c9c9c', mirror=True,
        ticks='outside', showline=True, gridwidth=1, gridcolor='#4c4c4c',
    )

    fig = px.bar(x=counts['neighbourhood'].tolist(), y=counts['ratio'])
    fig.update_traces(marker_line_color='#9c9c9c', marker_line_width=1, opacity=0.7)
    fig.update_layout(
        xaxis={'visible': True, 'showticklabels': True},
        yaxis={'visible': True, 'showticklabels': True},
    )
    fig.update_yaxes(title='Listings in %', **axis_style)
    fig.update_xaxes(**axis_style)
    fig.update_layout(
        font=dict(family="Helvetica"),
        xaxis_title=None,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        autosize=False, width=800, height=400,
    )
    fig.show()

In [None]:
def heatmap_airbnb2_save(title=''):
    """Show a district map coloured by median price with all listings overlaid.

    Reads the module-level ``df`` (listings) and ``districts`` (GeoJSON).
    The median price is cast to a category so that every distinct value gets
    its own discrete colour from the Prism palette.

    Args:
        title: figure title.
    """
    price_by_district = aggregate_data(
        df,
        'neighbourhood',
        {'neighbourhood': ['first'], 'price': ['median'], 'host_is_superhost': ['first']},
        rename=['district', 'median', 'host_is_superhost'],
    )
    price_by_district['median'] = price_by_district['median'].astype('category')
    price_by_district = price_by_district.sort_values(by='median', ascending=False)

    fig = px.choropleth_mapbox(
        price_by_district,
        geojson=districts,
        locations=price_by_district['district'],
        featureidkey="properties.name",
        color=price_by_district['median'],
        title=title,
        color_discrete_sequence=px.colors.qualitative.Prism,
        labels={'median': 'price per night'},
        mapbox_style="open-street-map",
        zoom=10,
        center={"lat": 48.210033, "lon": 16.363449},
        opacity=0.60,
    )

    # One marker per listing, drawn on top of the district polygons.
    fig.add_scattermapbox(
        lat=df['latitude'].tolist(),
        lon=df['longitude'].tolist(),
        mode='markers',
        showlegend=False,
        marker_size=5,
        marker_color='#F3B5B6',
        opacity=0.5,
        hoverinfo='skip',
    )
    # Background on discrete choropleths:
    # https://nbviewer.org/github/etpinard/plotly-misc-nbs/blob/master/geo-maps/discrete-choropleths.ipynb#Suggestion-2:-split-data-into-several-choropleth-traces
    fig.update_layout(
        font=dict(family="Helvetica"),
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        autosize=False, width=700, height=500,
    )
    fig.show()

In [None]:
#bar_airbnb(df)

In [None]:
# Inspect the colours of plotly's cyclical IceFire palette (used for the maps below).
print(px.colors.cyclical.IceFire)

In [None]:
# District boundaries as GeoJSON; the heatmap functions read this module-level name.
districts = get_geo_data()

In [None]:
def heatmap_airbnb(title=''):
    """Show a district map coloured by the number of listings per district.

    Reads the module-level ``df`` (listings) and ``districts`` (GeoJSON).
    Colour scale reference: https://plotly.com/python/builtin-colorscales/

    Args:
        title: figure title.
    """
    listing_counts = (
        df.groupby('neighbourhood')
        .agg(nr_listings=('id', 'count'))
        .reset_index()
        .sort_values('nr_listings', ascending=False)
    )
    listing_counts['ratio'] = 100 * listing_counts['nr_listings'] / listing_counts['nr_listings'].sum()
    # Categorical counts make plotly use discrete colours instead of a continuous scale.
    listing_counts['nr_listings'] = listing_counts['nr_listings'].astype('category')
    listing_counts = listing_counts.sort_values(by='nr_listings', ascending=False)

    fig = px.choropleth_mapbox(
        listing_counts,
        geojson=districts,
        locations=listing_counts['neighbourhood'],
        featureidkey="properties.name",
        color_discrete_sequence=px.colors.cyclical.IceFire,
        color=listing_counts['nr_listings'],
        title=title,
        labels={'nr_listings': 'Nr. of listings'},
        mapbox_style="open-street-map",
        zoom=10,
        center={"lat": 48.210033, "lon": 16.363449},
        opacity=0.40,
    )

    # Legend entries read "<count> / <district>".
    # NOTE(review): this pairs the i-th trace with the i-th row, i.e. it assumes
    # one trace per district in row order; districts with identical counts
    # would presumably share a trace and shift the labels - confirm.
    district_names = listing_counts['neighbourhood'].tolist()
    district_counts = listing_counts['nr_listings'].tolist()
    for position, trace in enumerate(fig.data):
        trace.update(name=f'{district_counts[position]} / {district_names[position]}')

    fig.update_layout(
        font=dict(family="Helvetica"),
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        autosize=False, width=700, height=500,
    )
    fig.show()

#heatmap_airbnb()

In [None]:
# Median nightly price per district, highest first.
k = aggregate_data(
    df,
    'neighbourhood',
    {'neighbourhood': ['first'], 'price': ['median'], 'host_is_superhost': ['first']},
    rename=['district', 'median', 'host_is_superhost'],
)
k['median'] = k['median'].astype('category')
k = k.sort_values(by='median', ascending=False)
k.head()

In [None]:
# Rebuild the per-district median price table (unsorted this time).
k = aggregate_data(df, 'neighbourhood', {'neighbourhood':['first'], 'price':['median'], 'host_is_superhost': ['first']},\
                   rename=['district', 'median', 'host_is_superhost'])
k['median'] = k['median'].astype('category')

# NOTE: the .head() calls in the middle of this cell produce no output; only
# the last expression of a cell is displayed.
k.head(12)
j=k.groupby(by=["median"], dropna=False).sum()
j.head(25)

# One row per median-price category, sorted from the most expensive down;
# `dfz` ends up as a plain list of those prices and is read by later cells.
dfz = k.groupby(['median']).agg({'median':['first']})
dfz.columns = dfz.columns.droplevel(0)
dfz.columns = ['price']
dfz.reset_index(drop=True, inplace=True)
dfz.sort_values(by='price', ascending = False, inplace=True)
dfz.head(25)  
dfz = dfz['price'].tolist()

In [None]:
# One palette colour per distinct median price (at most the palette length).
farbe = px.colors.cyclical.IceFire[:len(dfz)]  # px.colors.diverging.RdBu[0:len(dfz)]
for shown in (px.colors.cyclical.IceFire, farbe, dfz):
    print(shown)

In [None]:
# Legend label for every distinct median price, e.g. 80.0 -> '80.0 $'.
# Built from `dfz` alone: zipping with the colour list `farbe` (never used in
# the loop body) silently dropped every price beyond the palette length,
# leaving those districts without a label when mapped through this dict.
dicts = {price: f'{price} $' for price in dfz}
print(dicts)

In [None]:
def heatmap_airbnb2(title=''):
    """Show a district map coloured by median price, with '<price> $' legend labels.

    Reads the module-level ``df``, ``districts``, ``dfz`` (distinct median
    prices) and ``dicts`` (price -> legend label).

    Related reading:
    https://stackoverflow.com/questions/71104827/plotly-express-choropleth-map-custom-color-continuous-scale

    Args:
        title: figure title.
    """
    price_by_district = aggregate_data(
        df,
        'neighbourhood',
        {'neighbourhood': ['first'], 'price': ['median']},
        rename=['district', 'median'],
    )
    price_by_district['median'] = price_by_district['median'].astype('category')
    price_by_district = price_by_district.sort_values(by='median', ascending=False)

    # One palette colour per distinct price; labels come from the global lookup.
    palette = px.colors.cyclical.IceFire[0:len(dfz)]
    price_labels = price_by_district['median'].map(dicts)

    fig = px.choropleth_mapbox(
        price_by_district,
        geojson=districts,
        locations=price_by_district['district'],
        featureidkey="properties.name",
        color=price_labels,
        title=title,
        color_discrete_sequence=palette,
        labels={'median': 'price per night'},
        mapbox_style="open-street-map",
        zoom=10,
        center={"lat": 48.210033, "lon": 16.363449},
        opacity=0.60,
    )
    # Background on discrete choropleths:
    # https://nbviewer.org/github/etpinard/plotly-misc-nbs/blob/master/geo-maps/discrete-choropleths.ipynb#Suggestion-2:-split-data-into-several-choropleth-traces
    fig.update_layout(
        legend={"title": "price per night"},
        font=dict(family="Helvetica"),
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        autosize=False, width=700, height=500,
    )
    fig.show()
#heatmap_airbnb2()

In [None]:
# Scratch cell: builds a demo lower-triangular "correlation" heatmap from
# random numbers. The figure is only constructed here; the lines that would
# render it are commented out below.
import plotly.graph_objects as go
import plotly.io as pio
# Sets plotly's default template to "none" for the rest of the session.
pio.templates.default = "none"


# NOTE: this cell rebinds the module-level names N, corr, labels, X, title and layout.
N = 15
# Random values below the diagonal, None elsewhere. No seed is set, so the
# values differ on every run.
corr = [[-0.3+ 1.15 * np.random.random() if i>j else None for j in range(N)] for i in range (N)]

labels = 'ABCDEFGHIJKLMNO' 
X = [labels[k] for k in range(N)]
# Hover text only for the populated (below-diagonal) cells.
hovertext = [[f'corr({X[i]}, {X[j]})= {corr[i][j]:.2f}' if i>j else '' for j in range(N)] for i in range(N)]
# Diverging colour scale as [position, colour] pairs from 0.0 to 1.0.
sns_colorscale = [[0.0, '#3f7f93'], #cmap = sns.diverging_palette(220, 10, as_cmap = True)
 [0.071, '#5890a1'],
 [0.143, '#72a1b0'],
 [0.214, '#8cb3bf'],
 [0.286, '#a7c5cf'],
 [0.357, '#c0d6dd'],
 [0.429, '#dae8ec'],
 [0.5, '#f2f2f2'],
 [0.571, '#f7d7d9'],
 [0.643, '#f2bcc0'],
 [0.714, '#eda3a9'],
 [0.786, '#e8888f'],
 [0.857, '#e36e76'],
 [0.929, '#de535e'],
 [1.0, '#d93a46']]

heat = go.Heatmap(z=corr,
                  x=X,
                  y=X,
                  xgap=1, ygap=1,
                  colorscale=sns_colorscale,
                  colorbar_thickness=20,
                  colorbar_ticklen=3,
                  hovertext =hovertext,
                  hoverinfo='text'
                   )


title = 'Correlation Matrix'               

# Square layout without grid lines; the y axis is reversed so the first label is at the top.
layout = go.Layout(title_text=title, title_x=0.5, 
                   width=600, height=600,
                   xaxis_showgrid=False,
                   yaxis_showgrid=False,
                   yaxis_autorange='reversed')
   
#fig=go.Figure(data=[heat], layout=layout)        
#fig.show() 

##### Which are the cheapest and the most expensive districts in Vienna?

In [None]:
# Listings count and median price per district, most expensive first.
df_p = aggregate_data(
    df.copy(),
    'neighbourhood',
    {'neighbourhood': ['first', 'count'], 'price': ['median']},
    rename=['district', 'listings', 'median'],
)
df_p = df_p.round({'median': 1})
df_p = df_p.sort_values(by=['median'], na_position='first', ascending=False)
df_p.head(5)

In [None]:
# District boundaries as a GeoDataFrame, without the bookkeeping columns,
# and their union as a single geometry.
boundary_path = os.path.join(Path(cwd).parent, 'data', 'geojson', 'vienna.geojson')
boundary_geojson = gpd.read_file(boundary_path).drop(columns=['cartodb_id', 'created_at', 'updated_at'])
region = boundary_geojson.geometry.unary_union

def get_local_crs(y, x):
    """Return the CRS osmnx reports for a UTM-projected box around a point.

    Args:
        y: first coordinate of the point passed to osmnx (latitude in the
            call that sets ``local_utm_crs``).
        x: second coordinate of the point (longitude in that call).

    Returns:
        The last element of the tuple returned by
        ``ox.utils_geo.bbox_from_point(..., return_crs=True)``.
    """
    projected_bbox = ox.utils_geo.bbox_from_point(
        (y, x), dist=500, project_utm=True, return_crs=True)
    return projected_bbox[-1]
  
# Reference point in Vienna. NOTE: despite the `lon_` prefix, `lon_latitude`
# holds the latitude and `lon_longitude` the longitude.
lon_latitude = 48.210033
lon_longitude = 16.363449

# get_local_crs takes (y, x), so the latitude is passed first.
local_utm_crs = get_local_crs(lon_latitude, lon_longitude)
print(f"boundary data type: {type(boundary_geojson)}, region data type: {type(region)}")

In [None]:
# Preview the first rows of the district boundary table.
boundary_geojson.head()

In [None]:
#from shapely.geometry import Polygon
#polygon = Polygon([region])
#polygon.head()

In [None]:
# Scratch example: a GeoSeries of two square polygons labelled 'foo' and 'bar'.
polys = gpd.GeoSeries({
    'foo': Polygon([(5, 5), (5, 13), (13, 13), (13, 5)]),
    'bar': Polygon([(10, 10), (10, 15), (15, 15), (15, 10)]),
})
polys.head()

In [None]:
# The city outline as a one-row GeoDataFrame. `region` is built from
# `boundary_geojson`, so it carries that frame's CRS; the previous code
# labelled it with a UTM CRS obtained from get_local_crs called with the
# coordinates in (lon, lat) order although it expects (lat, lon).
poly = gpd.GeoDataFrame(index=[0], geometry=[region], crs=boundary_geojson.crs)
poly.head()

In [None]:
# The city outline as a one-element GeoSeries keyed 'within'; the key is used
# as the column name of the point-in-polygon result in a later cell.
polys2 = gpd.GeoSeries({'within': region})
polys2.head()

In [None]:
# Print each geometry as WKT text. A plain loop is used instead of a list
# comprehension so the cell does not also display a list of None values.
for _, geom in polys2.items():
    print(geom)

In [None]:
#  lat =48.210033
#    long = 16.363449
#            df = pd.DataFrame({'Location':[text]})
 #           gdf = gpd.GeoDataFrame(df, geometry=gpd.GeoSeries.from_xy([lon], [lat], crs=self.get_local_utm_crs())
# create point
from shapely.geometry import Point, Polygon

# Test point given as (longitude, latitude).
_pnts = [Point(16.363449, 48.210033)]
# The point uses the same coordinates as the boundary file, so it gets that
# CRS. The previous code labelled it with a UTM CRS obtained from
# get_local_crs called with (lon, lat) instead of (lat, lon).
pnts = gpd.GeoDataFrame(geometry=_pnts, index=['Location'], crs=boundary_geojson.crs)
# One boolean column per polygon in polys2: is the point inside it?
pnts = pnts.assign(**{key: pnts.within(geom) for key, geom in polys2.items()})
pnts.head()

In [None]:

def check_if_coord_in_poly(_pnts):
    """Check whether a single point lies within the module-level ``region``.

    source: https://stackoverflow.com/questions/48097742/geopandas-point-in-polygon

    Args:
        _pnts: list containing exactly one shapely ``Point`` given as
            (longitude, latitude), i.e. in the coordinates of
            ``boundary_geojson``.

    Returns:
        True if the point is within ``region``, otherwise False.
    """
    # The point shares the boundary's coordinates, so it gets the boundary's
    # CRS. The previous code attached a UTM CRS derived from swapped
    # (lon, lat) arguments to get_local_crs.
    pnts = gpd.GeoDataFrame(geometry=_pnts, index=['Point to check'], crs=boundary_geojson.crs)
    return pnts.within(region).item()


lat = 48.210033
long = 16.363449
# shapely points are (x, y) = (longitude, latitude).
v = check_if_coord_in_poly([Point(long, lat)])

In [None]:
# pt2 = gpd.GeoDataFrame(df, geometry=gpd.GeoSeries.from_xy([lon], [lat], crs=self.get_local_utm_crs()))

In [None]:
from shapely import wkt

def import_csv_to_gpd(name):
    """Load ``data/osm/<name>.csv`` as a GeoDataFrame.

    The ``geometry`` column is parsed from WKT text and the result is tagged
    with CRS ``epsg:4326``.

    Args:
        name: file name without the ``.csv`` extension.
    """
    csv_path = os.path.join(Path(cwd).parent, 'data', 'osm', f'{name}.csv')
    frame = pd.read_csv(csv_path, sep=",")
    frame['geometry'] = frame['geometry'].map(wkt.loads)
    return gpd.GeoDataFrame(frame, crs='epsg:4326')

# One GeoDataFrame per OpenStreetMap amenity category.
restaurant = import_csv_to_gpd('restaurant')
cafe = import_csv_to_gpd('cafe')
# Previously loaded from 'attraction' (copy-paste slip), which made the
# 'station' layer a duplicate of the attractions.
# NOTE(review): assumes data/osm/station.csv exists - confirm.
station = import_csv_to_gpd('station')
bar = import_csv_to_gpd('bar')
biergarten = import_csv_to_gpd('biergarten')
fast_food = import_csv_to_gpd('fast_food')
pub = import_csv_to_gpd('pub')
nightclub = import_csv_to_gpd('nightclub')
theatre = import_csv_to_gpd('theatre')
university = import_csv_to_gpd('university')
attraction = import_csv_to_gpd('attraction')

In [None]:
def get_lat_long(point):
    """Return the coordinates of a point geometry as ``[x, y]``.

    Args:
        point: object exposing ``x`` and ``y`` attributes (e.g. a shapely Point).

    Returns:
        ``pd.Series([point.x, point.y])`` - note the order is x (longitude)
        first, then y (latitude). If the coordinates cannot be read, a
        two-element Series of NaN is returned so that the result always has
        the same shape (returning None here broke two-column assignments).
    """
    try:
        return pd.Series([point.x, point.y])
    except Exception:
        # Best effort: geometries without x/y yield missing coordinates.
        return pd.Series([np.nan, np.nan])

def geo_coordinates(df):
    """Reduce a layer to point geometries and project it to the local UTM CRS.

    source: https://stackoverflow.com/questions/61122875/geopandas-how-to-read-a-csv-and-convert-to-a-geopandas-dataframe-with-polygons

    Steps:
      1. Polygons and MultiPolygons are replaced by their centroid.
      2. Rows whose geometry is still not a Point are dropped.
      3. ``long`` / ``lat`` columns are filled from the point coordinates
         (taken before reprojection).
      4. The frame is reprojected to the module-level ``local_utm_crs``.

    Args:
        df: GeoDataFrame with a ``geometry`` column. It is not modified.

    Returns:
        A new GeoDataFrame containing only Point rows, in ``local_utm_crs``.
    """
    gdf = df.copy()
    gdf['geometry'] = gdf['geometry'].apply(
        lambda geom: geom.centroid if isinstance(geom, (Polygon, MultiPolygon)) else geom)
    # Filter first so coordinates are only read from actual points.
    is_point = gdf['geometry'].apply(lambda geom: geom.geom_type == 'Point')
    gdf = gdf[is_point].copy()
    gdf['long'] = gdf.geometry.x
    gdf['lat'] = gdf.geometry.y
    return gdf.to_crs(local_utm_crs)

In [None]:
# Reduce every amenity layer to points and project it to the local UTM CRS.
restaurant = geo_coordinates(restaurant)
cafe = geo_coordinates(cafe)
bar = geo_coordinates(bar)
station = geo_coordinates(station)
biergarten = geo_coordinates(biergarten)
fast_food = geo_coordinates(fast_food)
pub = geo_coordinates(pub)
nightclub = geo_coordinates(nightclub)
theatre = geo_coordinates(theatre)
university = geo_coordinates(university)
attraction = geo_coordinates(attraction)

In [None]:
def get_tree(df):
    """Build a KD-tree over the point geometries of ``df``.

    Coordinates are stored in (y, x) order, so queries must use the same order.

    Returns:
        A ``scipy.spatial.KDTree``, or ``None`` if building it failed (the
        error is printed).
    """
    try:
        # (y, x) pairs, one per geometry
        yx_pairs = [(point.y, point.x) for point in df.geometry]
        return spatial.KDTree(yx_pairs)
    except Exception as error:
        print(error)
        return None

In [None]:
def find_points_closeby(tree, lat_lon, k = 500, max_distance = 500 ):
    """Count the tree points closer than ``max_distance`` to ``lat_lon``.

    Args:
        tree: KD-tree to query.
        lat_lon: query coordinates, in the same order the tree was built with.
        k: maximum number of neighbours requested, so the count never exceeds ``k``.
        max_distance: distance bound, in the units of the tree coordinates.

    Returns:
        Number of neighbours found within the bound (0..k).
    """
    distances, _indices = tree.query((lat_lon), k = k, distance_upper_bound= max_distance)
    # Missing neighbours are reported with an infinite distance.
    return sum(1 for distance in distances if distance != np.inf)

# Count, for every listing, the amenities of each category in its vicinity
# and store the counts as one new column per category.
t0 = time.time()
air_gdf = df.copy()

# Parallel lists: the i-th layer is stored under the i-th column name.
parameters = [restaurant, cafe , bar, station, biergarten, fast_food, pub, nightclub,theatre,university,attraction]
names = ['restaurant', 'cafe', 'bar', 'station', 'biergarten', 'fast_food', 'pub', 'nightclub','theatre','university','attraction']

# Listings as points (lon/lat, EPSG:4326), reprojected to the same CRS as the amenity layers.
air_gdf = gpd.GeoDataFrame(air_gdf, geometry = gpd.points_from_xy(air_gdf.longitude, air_gdf.latitude), crs = 4326)
air_gdf = air_gdf.to_crs(local_utm_crs)

for name, i in zip(names, parameters):
    tree = get_tree(i)
    # The tree stores (y, x), so the query point is passed as (y, x) too.
    # find_points_closeby's default distance bound (500) is in CRS units -
    # presumably metres for the UTM projection; confirm.
    air_gdf[name] = air_gdf.apply(lambda row: find_points_closeby(tree, (row.geometry.y, row.geometry.x)) , axis = 1)

print (f"Completed in {round(time.time() - t0)} s")

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, make_scorer

def display_results(cv, y_test, y_pred):
    """Print labels, confusion matrix, accuracy and best search parameters.

    Args:
        cv: fitted search object exposing ``best_params_``.
        y_test: true labels.
        y_pred: predicted labels; the label set is taken from these.
    """
    # confusion_matrix was used here without ever being imported (NameError).
    from sklearn.metrics import confusion_matrix

    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)
    print("\nBest Parameters:", cv.best_params_)

def evaluate_model(model, X_test, y_test, category_names):
    """Print a classification report for ``model`` on the test set.

    Args:
        model: fitted estimator with a ``predict`` method.
        X_test: test features.
        y_test: true labels.
        category_names: display names passed as ``target_names``.
    """
    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions, target_names=category_names))

def save_model(model, model_filepath):
    """Pickle ``model`` to ``model_filepath`` (overwriting any existing file)."""
    with open(model_filepath, 'wb') as handle:
        pickle.dump(model, handle)


# Features: number of nearby amenities per category; target: nightly price.
X, y = air_gdf[['restaurant','cafe', 'bar', 'station','biergarten','fast_food','pub','nightclub',
                'theatre','university']], air_gdf['price']

# DMatrix is XGBoost's own data container; it is used for cross-validation below.
data_dmatrix = xgb.DMatrix(data=X, label=y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=123)
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

# Create the target directory if needed so that saving cannot fail on a fresh checkout.
model_path = os.path.join(Path(cwd).parent, 'model', 'xboost.pkl')
os.makedirs(os.path.dirname(model_path), exist_ok=True)
save_model(model, model_path)

# feature importance
print(model.feature_importances_)
preds = model.predict(X_test)

# Root mean squared error on the hold-out set.
rmse = np.sqrt(mean_squared_error(y_test, preds))

# 3-fold cross-validation with XGBoost's default parameters.
params = {}
cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50, early_stopping_rounds=10, metrics="rmse", as_pandas=True, seed=123)

print("RMSE: %f" % (rmse))
print((cv_results["test-rmse-mean"]).tail(1))

# The figure size must be set before the plot is created to take effect.
plt.rcParams['figure.figsize'] = [6, 6]
xgb.plot_importance(model)
plt.show()

In [None]:
# Look at the training features with features as rows. Only the last
# expression of the cell is displayed; the first head() produces no output.
X_train.head()
df1_transposed = X_train.T
df1_transposed.head()

In [None]:
# predict benchmark price

# Hand-made example: one row of amenity counts, with the keys in the same
# order as the training feature columns.
d = {'restaurant': 3, 'cafe': 100, 'bar':5, 'station':3, 'biergarten':1, 'fast_food':15, 'pub':3,
    'nightclub':1,'theatre':0,'university':0}
X_pred = pd.DataFrame(data=d, index=[0])
preds = model.predict(X_pred)
print(preds)
# Transposed view (features as rows) used to explore the chart input below.
df1_transposed = X_pred.T
#df1_transposed.reset_index(drop=True, inplace=True)
print(list(df1_transposed.index.values))
print(df1_transposed.columns.tolist())
print(df1_transposed[0].tolist())
df1_transposed.rename({0: 'test'}, inplace=True, axis=1)
df1_transposed['index1'] = df1_transposed.index
df1_transposed.head()

In [None]:
def get_main_chart(df):
    """Show a horizontal bar chart of the feature values in a one-row frame.

    source: https://stackoverflow.com/questions/61693014/how-to-hide-plotly-yaxis-title-in-python

    Args:
        df: DataFrame with a single row labelled ``0`` and one column per
            feature. It is not modified.
    """
    # Use the argument; the previous version ignored it and always read the
    # global X_pred.
    features = df.T.rename({0: 'value'}, axis=1)
    features['feature'] = features.index

    fig = px.bar(features, x='value', y='feature', orientation='h')
    fig.update_layout(hovermode=False)
    fig.update_yaxes(title='', visible=True, showticklabels=True)
    fig.update_xaxes(title='', visible=True, showticklabels=True)
    fig.update_yaxes(tickfont=dict(family='Helvetica', color='#9c9c9c'),
                     title_font_color='#9c9c9c',
                     ticks='outside', showline=True, gridwidth=1, gridcolor='#4c4c4c')
    fig.update_layout(font_family="Helvetica", paper_bgcolor='rgba(0,0,0,0)')
    fig.show()

get_main_chart(X_pred)

In [None]:
# Row labels of the transposed frame, i.e. the feature names.
list(df1_transposed.index.values)

In [None]:
# Empty placeholder figure: no data, no template, and no grid, tick labels or zero lines.
fig = go.Figure(go.Scatter(x=[], y=[]))
fig.update_layout(template=None)
fig.update_xaxes(showgrid=False, showticklabels=False, zeroline=False)
fig.update_yaxes(showgrid=False, showticklabels=False, zeroline=False)
fig.show()

In [None]:
# Define color sets of paintings
night_colors = ['rgb(56, 75, 126)', 'rgb(18, 36, 37)', 'rgb(34, 53, 101)']

def get_main_chart(dfx):
    """Generate the horizontal bar chart with the amenity categories.

    NOTE: this definition replaces the earlier ``get_main_chart`` in this
    notebook; unlike that one it returns the figure instead of showing it.

    Args:
        dfx: DataFrame with a single row labelled ``0`` and one column per
            feature, or ``None`` to draw an all-zero placeholder chart with a
            fixed x range of 0..10. It is not modified.

    Returns:
        The plotly figure.
    """
    if dfx is None:
        # Placeholder: every category at zero.
        d = {'restaurant': 0, 'cafe': 0, 'bar': 0, 'station': 0, 'biergarten': 0, 'fast_food': 0, 'pub': 0,
             'nightclub': 0, 'theatre': 0, 'university': 0}
        df1_transposed = pd.DataFrame(data=d, index=[0]).T
    else:
        df1_transposed = dfx.T
    df1_transposed = df1_transposed.rename({0: 'value'}, axis=1)
    df1_transposed['feature'] = df1_transposed.index

    fig = px.bar(df1_transposed, x='value', y='feature', orientation='h')
    fig.update_layout(hovermode=False)
    fig.update_yaxes(title='', visible=True, showticklabels=True)
    fig.update_xaxes(title='', visible=True, showticklabels=True)
    fig.update_yaxes(tickfont=dict(size= 10, family='Helvetica', color='#9c9c9c'),
                     title_font_color='#9c9c9c',
                     ticks='outside', showline=True, gridwidth=1, gridcolor='#4c4c4c')
    fig.update_layout(font_family="Helvetica")
    fig.update_traces(marker_color=night_colors[0], marker_line_color='#9c9c9c', marker_line_width=1, opacity=0.7)

    if dfx is None:
        # Without data the axis would collapse; show a fixed range instead.
        fig.update_xaxes(range=[0, 10])
    fig.update_layout(
        paper_bgcolor='rgba(0,0,0,0)',
        margin_pad=2,
        autosize=True,
        height=300,
    )
    return fig
get_main_chart(X_pred)

In [None]:
# Display the benchmark input row.
X_pred.head()