In [None]:
!pip install pandas
!pip install geopandas
!pip install matplotlib
!pip install plotly
!pip install sklearn

In [1]:
import pandas as pd
import geopandas
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import ipywidgets as widgets
from ipywidgets import interact

In [3]:
import plotly.express as px

In [4]:
from IPython.display import HTML
   
# Render a submit button that shows/hides every `div.input` element through jQuery (`$`).
# `code_toggle` is also registered on document-ready, so the inputs are hidden as soon as
# the page has loaded and the button brings them back.
HTML('''<script>
code_show=true;
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')


## First, import the dataframe with all the features
- Simplify the columns
- Impute the NaNs

In [5]:
# Read the table holding every feature into `df`.
features_csv_path = './data/rent_airbnb_all_features.csv'
df = pd.read_csv(features_csv_path)

#### Simplify the columns

In [6]:
def sum_columns_and_drop(df, columns, new_column, position):
    """Collapse several columns into one column holding their row-wise sum.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is left untouched; all work happens on a copy.
    columns : list of str
        Names of the columns to add together and then remove.
    new_column : str
        Name of the column that receives the row-wise sum.
    position : int
        Integer location at which ``new_column`` is inserted. The location is
        counted *before* ``columns`` are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame that contains ``new_column`` and no longer contains ``columns``.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``df``.
    ValueError
        If ``new_column`` already exists in ``df``.
    """
    columns = list(columns)
    # Work on a copy: DataFrame.insert is an in-place operation and would
    # otherwise leave the caller's frame with the extra column.
    result = df.copy()
    # Label-based selection replaces the manual name -> integer position lookup.
    result.insert(position, new_column, result[columns].sum(axis=1))
    return result.drop(columns=columns)

In [7]:
# Each entry: (columns to add up, name of the combined column, insert position).
# A position refers to the frame as it stands when its entry is processed,
# so the entries must be applied in this order.
column_merges = [
    (['2 Spanish', '3 Spanish', '4 espanyols o més'], 'more than 2 Spanish', 13),
    (['2 foreigners', '3 foreigners', 'more than 4 foreigners'], 'more than 2 foreigners', 13),
    (['Up to 6 months', 'From 6 months to 12 motnhs'], 'short-term unemployment', 9),
    (['man_without education', 'woman_without education', 'woman_unknown', 'man_unknown'],
     'no education', 14),
    (['man_elementary_school', 'man_junior_high_school', 'man_senior_high_school',
      'woman_elementary_school', 'woman_junior_high_school', 'woman_senior_high_school'],
     'medium education', 15),
    (['man_bachelors_degree', 'woman_bachelors_degree'], 'bachelors degree', 16),
]
for source_columns, combined_name, insert_at in column_merges:
    df = sum_columns_and_drop(df, source_columns, combined_name, insert_at)

# A rename keeps the column in place and touches none of the summed columns,
# so it can safely follow the merges.
df = df.rename(columns={'Over 12 months': 'long-term unemployment'})
df.columns

Index(['year', 'quarter', 'district_code', 'district_name',
       'neighbourhood_code', 'neighbourhood_name', 'price_month(€)',
       'price_m2_month(€)', 'short-term unemployment',
       'long-term unemployment', '1 Spanish', '1 foreigner',
       'more than 2 foreigners', 'more than 2 Spanish', 'no education',
       'medium education', 'bachelors degree', 'airbnb_price(€)',
       'n_airbnb_listings', 'availability_365', 'airbnb_ratio_business',
       'airbnb_professional_ratio', 'airbnb_mega_host_ratio'],
      dtype='object')

## Import the population dataset

In [8]:
# Read the cleaned population table and show its size and first rows.
population_csv_path = './data/cleaned/population/population.csv'
pop_df = pd.read_csv(population_csv_path)
print(pop_df.shape)
pop_df.head()

(439, 6)


Unnamed: 0,year,district_code,district_name,neighbourhood_code,neighbourhood_name,population
0,2015,9,Sant Andreu,58,Baró de Viver,2482
1,2015,7,Horta-Guinardó,34,Can Baró,8938
2,2015,8,Nou Barris,47,Can Peguera,2267
3,2015,8,Nou Barris,49,Canyelles,6946
4,2015,8,Nou Barris,55,Ciutat Meridiana,10156


#### Merge with the original dataset

In [9]:
# Attach the population matching each (neighbourhood, year) pair.
# A left join is used so that rows of `df` without a match are kept.
population_lookup = pop_df[['year', 'neighbourhood_name', 'population']]
df = df.merge(population_lookup, how='left', on=('neighbourhood_name', 'year'))
df.head()

Unnamed: 0,year,quarter,district_code,district_name,neighbourhood_code,neighbourhood_name,price_month(€),price_m2_month(€),short-term unemployment,long-term unemployment,...,no education,medium education,bachelors degree,airbnb_price(€),n_airbnb_listings,availability_365,airbnb_ratio_business,airbnb_professional_ratio,airbnb_mega_host_ratio,population
0,2015.0,2015-01-01,1.0,Ciutat Vella,4.0,"Sant Pere, Santa Caterina i la Ribera",697.16,11.51,11574.0,8425.0,...,667.0,11846.0,7482.0,,,,,,,22305
1,2015.0,2015-04-01,1.0,Ciutat Vella,4.0,"Sant Pere, Santa Caterina i la Ribera",708.03,12.02,11574.0,8425.0,...,667.0,11846.0,7482.0,73.3433,903.0,262.921373,0.631229,0.128461,0.016611,22305
2,2015.0,2015-07-01,1.0,Ciutat Vella,4.0,"Sant Pere, Santa Caterina i la Ribera",729.0,14.0,11574.0,8425.0,...,667.0,11846.0,7482.0,75.124776,1114.0,259.641831,0.60772,0.106822,0.002693,22305
3,2015.0,2015-10-01,1.0,Ciutat Vella,4.0,"Sant Pere, Santa Caterina i la Ribera",751.6,13.43,11574.0,8425.0,...,667.0,11846.0,7482.0,72.410441,1111.0,271.629163,0.580558,0.067507,0.0018,22305
4,2016.0,2016-01-01,1.0,Ciutat Vella,4.0,"Sant Pere, Santa Caterina i la Ribera",759.16,12.94,10754.0,7550.0,...,752.0,11759.0,7573.0,72.739919,992.0,260.639113,0.561492,0.073589,0.0,22380


## Some transformations for the map:
- Normalize by the population of the neighbourhood
- Impute the NaNs

#### Normalize by the population of the neighbourhood

In [10]:
df_map = df.copy()

# Columns that are divided by the neighbourhood population below.
population_features = ['short-term unemployment', 'long-term unemployment','1 Spanish', '1 foreigner',
                       'more than 2 foreigners', 'more than 2 Spanish', 'no education', 
                       'medium education', 'bachelors degree']

def norm_pop(row):
    """Divide the ``population_features`` values by the population.

    Parameters
    ----------
    row : pandas.Series or pandas.DataFrame
        Either a single record or a whole frame. It must provide every column
        listed in ``population_features`` plus a ``population`` column.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        The ``population_features`` values divided by ``population``, with the
        same kind of container as the input.
    """
    # axis=0 aligns a per-row population Series with the frame's index; for a
    # single record the population is a scalar and the argument has no effect.
    return row[population_features].div(row['population'], axis=0)

# Vectorised: one column-wise division instead of a Python-level call per row.
df_map[population_features] = norm_pop(df)
df_map.head()

Unnamed: 0,year,quarter,district_code,district_name,neighbourhood_code,neighbourhood_name,price_month(€),price_m2_month(€),short-term unemployment,long-term unemployment,...,no education,medium education,bachelors degree,airbnb_price(€),n_airbnb_listings,availability_365,airbnb_ratio_business,airbnb_professional_ratio,airbnb_mega_host_ratio,population
0,2015.0,2015-01-01,1.0,Ciutat Vella,4.0,"Sant Pere, Santa Caterina i la Ribera",697.16,11.51,0.518897,0.377718,...,0.029904,0.531092,0.33544,,,,,,,22305
1,2015.0,2015-04-01,1.0,Ciutat Vella,4.0,"Sant Pere, Santa Caterina i la Ribera",708.03,12.02,0.518897,0.377718,...,0.029904,0.531092,0.33544,73.3433,903.0,262.921373,0.631229,0.128461,0.016611,22305
2,2015.0,2015-07-01,1.0,Ciutat Vella,4.0,"Sant Pere, Santa Caterina i la Ribera",729.0,14.0,0.518897,0.377718,...,0.029904,0.531092,0.33544,75.124776,1114.0,259.641831,0.60772,0.106822,0.002693,22305
3,2015.0,2015-10-01,1.0,Ciutat Vella,4.0,"Sant Pere, Santa Caterina i la Ribera",751.6,13.43,0.518897,0.377718,...,0.029904,0.531092,0.33544,72.410441,1111.0,271.629163,0.580558,0.067507,0.0018,22305
4,2016.0,2016-01-01,1.0,Ciutat Vella,4.0,"Sant Pere, Santa Caterina i la Ribera",759.16,12.94,0.480518,0.337355,...,0.033601,0.525424,0.338382,72.739919,992.0,260.639113,0.561492,0.073589,0.0,22380


#### Impute the NaNs for the map

In [11]:
from sklearn.impute import KNNImputer

# NOTE: an alternative based on scikit-learn's experimental IterativeImputer
# (random_state=42, same fit/transform flow) was previously sketched here as
# commented-out code; the KNN imputer below is the one in use.

# Columns left out of the imputation and re-attached unchanged afterwards.
text_columns = ['quarter', 'district_name', 'neighbourhood_name']
X = df_map.drop(columns=text_columns)

# Fit on the rows that have no missing value at all.
imputer = KNNImputer(n_neighbors=3)
imputer.fit(X.dropna())

# Rebuild a frame from the transformed values, keeping X's index and column names.
imputed_values = imputer.transform(X)
df_map = pd.DataFrame(imputed_values, index=X.index, columns=X.columns)
df_map = df_map.assign(**{name: df[name].values for name in text_columns})


## Then, import the geojson for Barcelona

In [12]:
# load the geojson for Barcelona
barcelona = geopandas.read_file("./data/geojson/neighborhoods.geojson")
print(barcelona.shape)
print(barcelona.columns)
barcelona.drop(barcelona.columns.difference(['NOM', 'geometry']), axis=1, inplace=True)
barcelona.head()

(73, 47)
Index(['ID_ANNEX', 'ANNEXDESCR', 'ID_TEMA', 'TEMA_DESCR', 'ID_CONJUNT',
       'CONJ_DESCR', 'ID_SUBCONJ', 'SCONJ_DESC', 'ID_ELEMENT', 'ELEM_DESCR',
       'NIVELL', 'NDESCR_CA', 'NDESCR_ES', 'NDESCR_EN', 'TERME', 'DISTRICTE',
       'BARRI', 'AEB', 'SEC_CENS', 'GRANBARRI', 'ZUA', 'AREA_I', 'LITERAL',
       'PERIMETRE', 'AREA', 'ORD_REPRES', 'CODI_UA', 'TIPUS_UA', 'NOM', 'WEB1',
       'WEB2', 'WEB3', 'DOCUMENTA', 'RANGESCALA', 'TIPUS_POL', 'GRUIX_ID',
       'GRUIXDIMEN', 'ESTIL_ID', 'ESTIL_QGIS', 'VALOR1QGIS', 'VALOR2QGIS',
       'COL_FARCIT', 'FCOL_DESCR', 'FHEX_COLOR', 'COL_DESCR', 'HEX_COLOR7',
       'geometry'],
      dtype='object')


Unnamed: 0,NOM,geometry
0,el Raval,"POLYGON ((2.16471 41.38593, 2.16936 41.38554, ..."
1,el Barri Gòtic,"POLYGON ((2.17701 41.38525, 2.17873 41.38396, ..."
2,la Barceloneta,"POLYGON ((2.19623 41.38745, 2.19631 41.38745, ..."
3,la Dreta de l'Eixample,"POLYGON ((2.17091 41.40182, 2.17221 41.40083, ..."
4,l'Antiga Esquerra de l'Eixample,"POLYGON ((2.15736 41.39331, 2.15847 41.39245, ..."


#### A neighbourhood in the geojson (el Poble-sec) has a different name in the dataframe (el Poble Sec):

In [13]:
# One row carrying the spelling used in the dataframe.
is_poble_sec = df['neighbourhood_name'].eq('el Poble Sec')
df.loc[is_poble_sec].head(1)

Unnamed: 0,year,quarter,district_code,district_name,neighbourhood_code,neighbourhood_name,price_month(€),price_m2_month(€),short-term unemployment,long-term unemployment,...,no education,medium education,bachelors degree,airbnb_price(€),n_airbnb_listings,availability_365,airbnb_ratio_business,airbnb_professional_ratio,airbnb_mega_host_ratio,population
1488,2015.0,2015-01-01,3.0,Sants-Montjuïc,11.0,el Poble Sec,578.76,10.27,21260.0,14091.0,...,1477.0,25493.0,8273.0,,,,,,,40217


In [14]:
# The geodataframe row carrying the hyphenated spelling.
barcelona.loc[barcelona['NOM'].eq('el Poble-sec')]

Unnamed: 0,NOM,geometry
7,el Poble-sec,"POLYGON ((2.17624 41.37286, 2.17619 41.37280, ..."


Change the name in the barcelona geodataframe to do the merge

In [15]:
# Use the dataframe's spelling in the geodataframe so the names match in the merge.
barcelona = barcelona.replace(to_replace='el Poble-sec', value='el Poble Sec')

## Merge the dataframe and the geodataframe

In [16]:
# Right join: every row of `df_map` is kept and receives the geometry whose NOM
# equals its neighbourhood name. NOM and year are dropped from the result.
merged = (
    barcelona
    .merge(df_map, how='right', left_on='NOM', right_on='neighbourhood_name')
    .drop(columns=['NOM', 'year'])
)
merged.head(2)

Unnamed: 0,geometry,district_code,neighbourhood_code,price_month(€),price_m2_month(€),short-term unemployment,long-term unemployment,1 Spanish,1 foreigner,more than 2 foreigners,...,airbnb_price(€),n_airbnb_listings,availability_365,airbnb_ratio_business,airbnb_professional_ratio,airbnb_mega_host_ratio,population,quarter,district_name,neighbourhood_name
0,"POLYGON ((2.18345 41.39061, 2.18459 41.38976, ...",1.0,4.0,697.16,11.51,0.518897,0.377718,0.170903,0.094508,0.091504,...,73.626172,1042.666667,264.730789,0.606502,0.10093,0.007035,22305.0,2015-01-01,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera"
1,"POLYGON ((2.18345 41.39061, 2.18459 41.38976, ...",1.0,4.0,708.03,12.02,0.518897,0.377718,0.170903,0.094508,0.091504,...,73.3433,903.0,262.921373,0.631229,0.128461,0.016611,22305.0,2015-04-01,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera"


## Plot the variables for every neighbourhood

In [17]:
# Columns excluded from the variable dropdown; everything else can be plotted.
id_columns = ['year', 'quarter', 'district_code', 'district_name',
              'neighbourhood_code', 'neighbourhood_name']
districts = df['district_name'].unique().tolist()
variables = [column for column in df.columns if column not in id_columns]

@interact
def my_plot(distr=districts, var=variables):
    """Plot `var` against the quarter for the district `distr`, one line per neighbourhood."""
    district_rows = df[df['district_name'] == distr]
    fig = px.line(
        district_rows,
        x='quarter',
        y=var,
        color='neighbourhood_name',
        template='simple_white',
    )
    fig.show()


interactive(children=(Dropdown(description='distr', options=('Ciutat Vella', 'Eixample', 'Gràcia', 'Horta-Guin…

## Map of the behaviour of variables 

In [18]:
neighbourhoods = df_map['neighbourhood_name'].unique().tolist()
features = df_map.columns.drop(['year', 'quarter', 'district_code', 'district_name', 'neighbourhood_code', 'neighbourhood_name'])
quarters = merged['quarter'].sort_values().unique().tolist()

# Multiplier applied to the inter-quartile range when clipping the colour scale.
# Features that are not listed use DEFAULT_IQR_FACTOR.
DEFAULT_IQR_FACTOR = 0.3
IQR_FACTORS = {
    'n_airbnb_listings': 5,
    'price_month(€)': 0.2,
    # The column is named 'airbnb_mega_host_ratio'. The former comparison with
    # 'mega_host_ratio' could never match, so this factor was never applied.
    'airbnb_mega_host_ratio': 2,
    'more than 2 foreigners': 3,
    'more than 2 Spanish': 3,
    '1 foreigner': 3,
    '1 Spanish': 3,
}


def _color_limits(feature):
    """Return the (vmin, vmax) colour range of `feature`, computed over all of `df_map`.

    The range is the feature's min/max, clipped to
    [Q1 - factor * IQR, Q3 + factor * IQR] so extreme values do not flatten the scale.
    """
    values = df_map[feature]
    q1, q3 = values.quantile([0.25, 0.75])
    margin = IQR_FACTORS.get(feature, DEFAULT_IQR_FACTOR) * (q3 - q1)
    vmin = max(values.min(), q1 - margin)
    vmax = min(values.max(), q3 + margin)
    return vmin, vmax


@interact(quarter=widgets.SelectionSlider(options=quarters,
                                          value=quarters[0],
                                          layout=widgets.Layout(width='600px')))
def select_features_and_quarter(quarter, feature=features, show_neighb=neighbourhoods):
    """Draw the map of `feature` for one quarter and label one neighbourhood.

    Parameters
    ----------
    quarter : str
        One of the values in `quarters`; selects the rows of `merged` to draw.
    feature : str
        Column of `merged` used to colour the polygons.
    show_neighb : str
        Name of the neighbourhood whose label is written on the map.
    """
    quarter_df = merged[merged['quarter'] == quarter]

    # The colour range comes from the whole of `df_map`, not from the selected
    # quarter, so the scale stays the same while the slider moves.
    vmin, vmax = _color_limits(feature)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_aspect('equal')
    quarter_df.plot(column=feature, legend=True, ax=ax, vmin=vmin, vmax=vmax, cmap='coolwarm')
    # Address `ax` explicitly: the legend adds a second axes to the figure, and
    # the pyplot "current axes" is not guaranteed to still be the map.
    ax.set_title(f'{feature} in Barcelona ({quarter})')

    # label the chosen neighbourhood
    chosen = quarter_df[quarter_df['neighbourhood_name'] == show_neighb]
    for _, row in chosen.iterrows():
        # representative_point() is used as the anchor of the label
        label_xy = row['geometry'].representative_point().coords[0]
        ax.annotate(text=row['neighbourhood_name'], xy=label_xy, color='k',
                    fontsize='large', fontweight='normal',
                    horizontalalignment='center')

interactive(children=(SelectionSlider(description='quarter', layout=Layout(width='600px'), options=('2015-01-0…