# Bayesian Network: Airbnb houses in Bologna
## Importing libraries

In [73]:
import pandas as pd
import numpy as np
import pgmpy

print('Libraries imported')

Libraries imported


## Loading the Airbnb dataset

In [74]:
# Read the listings export; the path is resolved relative to the notebook's working directory.
listings_path = "listings.csv"
listings = pd.read_csv(listings_path)
listings.head(3)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,42196,https://www.airbnb.com/rooms/42196,20211217003510,2021-12-17,50 sm Studio in the historic centre,<b>The space</b><br />Really cozy and typical ...,,https://a0.muscache.com/pictures/4775726/68044...,184487,https://www.airbnb.com/users/show/184487,...,4.85,4.85,4.7,,t,1,1,0,0,1.32
1,46352,https://www.airbnb.com/rooms/46352,20211217003510,2021-12-17,A room in Pasolini's house,"Simple, cozy and silent room in a lived house ...",In the very nearby you have Via Saragozza whic...,https://a0.muscache.com/pictures/9f94acc3-98fc...,467810,https://www.airbnb.com/users/show/467810,...,4.79,4.8,4.59,,f,2,0,2,0,2.2
2,59697,https://www.airbnb.com/rooms/59697,20211217003510,2021-12-17,COZY LARGE BEDROOM in the city center,"Cozy, spacious and bright double bedroom, in a...",,https://a0.muscache.com/pictures/ac0528c4-b26f...,286688,https://www.airbnb.com/users/show/286688,...,4.79,4.82,4.66,,f,2,0,2,0,2.18


## Preprocessing
1. Encoding the `Kitchen` and `Wifi` amenities as boolean columns

In [75]:
# Add boolean `kitchen` / `wifi` columns derived from the raw `amenities` text.
#
# The previous version split the text on '", "' and then picked the 'Kitchen' and 'Wifi'
# dummies. That split leaves the list delimiters attached to the first and last tokens
# (e.g. '["Wifi' or 'Kitchen"]'), so an amenity in first or last position was silently
# recorded as False. Matching the quoted token anywhere in the text avoids this while
# still requiring an exact item name (the surrounding quotes exclude e.g. "Kitchenette").
# NOTE(review): assumes `amenities` is a JSON-style list such as ["Wifi", "Kitchen"] —
# confirm against the CSV.
#
# `assign` overwrites the columns if they already exist, so re-running this cell does not
# append duplicate `kitchen` / `wifi` columns the way `pd.concat(..., axis=1)` did.
amenities_text = listings['amenities']
listings = listings.assign(
    kitchen=amenities_text.str.contains('"Kitchen"', regex=False, na=False),
    wifi=amenities_text.str.contains('"Wifi"', regex=False, na=False),
)
listings.head(3)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,kitchen,wifi
0,42196,https://www.airbnb.com/rooms/42196,20211217003510,2021-12-17,50 sm Studio in the historic centre,<b>The space</b><br />Really cozy and typical ...,,https://a0.muscache.com/pictures/4775726/68044...,184487,https://www.airbnb.com/users/show/184487,...,4.7,,t,1,1,0,0,1.32,True,False
1,46352,https://www.airbnb.com/rooms/46352,20211217003510,2021-12-17,A room in Pasolini's house,"Simple, cozy and silent room in a lived house ...",In the very nearby you have Via Saragozza whic...,https://a0.muscache.com/pictures/9f94acc3-98fc...,467810,https://www.airbnb.com/users/show/467810,...,4.59,,f,2,0,2,0,2.2,True,False
2,59697,https://www.airbnb.com/rooms/59697,20211217003510,2021-12-17,COZY LARGE BEDROOM in the city center,"Cozy, spacious and bright double bedroom, in a...",,https://a0.muscache.com/pictures/ac0528c4-b26f...,286688,https://www.airbnb.com/users/show/286688,...,4.66,,f,2,0,2,0,2.18,True,False


2. Selecting relevant columns for the Bayesian Network

In [76]:
# Columns kept for the Bayesian Network (plus `id` to identify each listing).
columns = [
    'id',
    'neighbourhood_cleansed',
    'room_type',
    'accommodates',
    'bathrooms_text',
    'price',
    'kitchen',
    'wifi',
]

# Label-based selection of every row and only the columns listed above.
listings_selection = listings.loc[:, columns]
listings_selection.head(3)

Unnamed: 0,id,neighbourhood_cleansed,room_type,accommodates,bathrooms_text,price,kitchen,wifi
0,42196,Santo Stefano,Entire home/apt,2,1 bath,$68.00,True,False
1,46352,Porto - Saragozza,Private room,2,1 shared bath,$29.00,True,False
2,59697,Santo Stefano,Private room,2,1 shared bath,$50.00,True,False


3. Parsing price into float

In [77]:
# Convert the raw price text (e.g. "$68.00") to float.
#
# * The character class is '[$,]': only the currency symbol and the thousands separator
#   are stripped (the previous '[$|,]' also listed a literal '|' by mistake).
# * The value is parsed from the untouched `listings` frame and aligned on the index, so
#   the cell can be re-run; parsing `listings_selection['price']` a second time would fail
#   because the `.str` accessor is not available on a numeric column.
# * `assign` replaces the whole column, so the result has a float dtype. Writing through
#   `.loc[:, 'price'] = ...` sets values into the existing column and, on recent pandas
#   versions, can leave it with its original object dtype.
price_as_float = (
    listings['price']
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)
listings_selection = listings_selection.assign(price=price_as_float)
listings_selection.head(3)

Unnamed: 0,id,neighbourhood_cleansed,room_type,accommodates,bathrooms_text,price,kitchen,wifi
0,42196,Santo Stefano,Entire home/apt,2,1 bath,68.0,True,False
1,46352,Porto - Saragozza,Private room,2,1 shared bath,29.0,True,False
2,59697,Santo Stefano,Private room,2,1 shared bath,50.0,True,False


5. Encoding shared bathroom into boolean

In [78]:
# Turn the free-text bathroom description into a boolean `shared_bathroom` flag.
#
# * `case=False`: the match must not depend on capitalisation, otherwise values that start
#   with a capital letter are missed.
#   NOTE(review): Airbnb exports usually contain "Shared half-bath" — confirm with
#   listings['bathrooms_text'].value_counts().
# * Missing descriptions stay NaN on purpose, so those rows are still removed by the
#   later `dropna()` step instead of being counted as "not shared".
# * The flag is computed from the untouched `listings` frame (aligned on the index), and
#   `rename` ignores a column that is already gone, so the cell can be re-run safely.
#   The column keeps its original position in the frame.
is_shared = listings['bathrooms_text'].str.contains('shared', case=False)
listings_selection = (
    listings_selection
    .rename(columns={'bathrooms_text': 'shared_bathroom'})
    .assign(shared_bathroom=is_shared)
)
listings_selection.head(3)

Unnamed: 0,id,neighbourhood_cleansed,room_type,accommodates,shared_bathroom,price,kitchen,wifi
0,42196,Santo Stefano,Entire home/apt,2,False,68.0,True,False
1,46352,Porto - Saragozza,Private room,2,True,29.0,True,False
2,59697,Santo Stefano,Private room,2,True,50.0,True,False


6. Discretizing price into 4 categories: '0-50', '50-150', '150-500', '>500'

In [79]:
# Discretize the numeric price into four ordered bands.
#
# * The last edge is infinity so that '>500' is really open-ended; with the previous
#   hard-coded 10000 cap any higher price became NaN and was later dropped by `dropna()`.
# * `include_lowest=True` puts a price of exactly 0 into '0-50' instead of NaN
#   (intervals are otherwise right-closed: (0, 50], (50, 150], ...).
# * `bins` (the edges actually used) is still returned via `retbins=True`, as before.
PRICE_EDGES = [0, 50, 150, 500, np.inf]
PRICE_LABELS = ['0-50', '50-150', '150-500', '>500']

price_band, bins = pd.cut(
    listings_selection['price'],
    bins=PRICE_EDGES,
    labels=PRICE_LABELS,
    include_lowest=True,
    retbins=True,
)
listings_selection = listings_selection.assign(price=price_band)
listings_selection.head(10)


Unnamed: 0,id,neighbourhood_cleansed,room_type,accommodates,shared_bathroom,price,kitchen,wifi
0,42196,Santo Stefano,Entire home/apt,2,False,50-150,True,False
1,46352,Porto - Saragozza,Private room,2,True,0-50,True,False
2,59697,Santo Stefano,Private room,2,True,0-50,True,False
3,85368,Santo Stefano,Entire home/apt,2,False,50-150,True,True
4,145779,Porto - Saragozza,Private room,1,True,0-50,False,False
5,209692,Navile,Private room,1,True,0-50,True,False
6,219878,Santo Stefano,Entire home/apt,2,False,50-150,True,True
7,229114,Navile,Entire home/apt,4,False,50-150,True,False
8,233922,Porto - Saragozza,Entire home/apt,5,False,50-150,True,True
9,246747,Porto - Saragozza,Private room,2,True,50-150,False,True


In [80]:
# Non-null count of every column within each price band. A count lower than the `id`
# count in the same row means that column has missing values in that band.
# `observed=False` is stated explicitly: `price` is categorical, and this keeps every
# declared band in the table (even an empty one) regardless of the pandas default.
listings_selection.groupby(by='price', observed=False).count()

Unnamed: 0_level_0,id,neighbourhood_cleansed,room_type,accommodates,shared_bathroom,kitchen,wifi
price,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0-50,813,813,813,813,813,813,813
50-150,2242,2242,2242,2242,2241,2242,2242
150-500,318,318,318,318,314,318,318
>500,80,80,80,80,80,80,80


7. Removing rows with NaN values and reporting the row count before and after

In [103]:
# Drop every row that has a missing value in any column, reporting the effect.
rows_before = listings_selection.shape[0]
print(f'Number of rows before cleaning: {rows_before}')

listings_clean = listings_selection.dropna()

rows_after = listings_clean.shape[0]
print(f'Number of rows after cleaning: {rows_after}')

Number of rows before cleaning: 3453
Number of rows after cleaning: 3448


8. Storing unique values and cardinalities

In [118]:
def _report_states(values, cardinality_label, values_label=None):
    """Print the cardinality and the values of one variable.

    Parameters
    ----------
    values : sequence
        The distinct values (states) of the variable.
    cardinality_label : str
        Text printed in front of "cardinality:".
    values_label : str, optional
        Text printed in front of "values:"; defaults to `cardinality_label`.

    Returns
    -------
    tuple
        `(values, len(values))`, so both can be bound in a single statement.
    """
    if values_label is None:
        values_label = cardinality_label
    cardinality = len(values)
    print(f'{cardinality_label} cardinality: {cardinality}\n{values_label} values: {values}\n')
    return values, cardinality


# Variables whose states are read from the cleaned data (order of first appearance).
neighbourhood_v, neighbourhood_c = _report_states(
    listings_clean['neighbourhood_cleansed'].unique(), 'Neighbourhood')
room_type_v, room_type_c = _report_states(
    listings_clean['room_type'].unique(), 'Room type', 'Room Type')
accommodates_v, accommodates_c = _report_states(
    listings_clean['accommodates'].unique(), 'Accommodates')

# Boolean variables: the two states are listed explicitly.
shared_bathroom_v, shared_bathroom_c = _report_states([True, False], 'Shared bathroom')
kitchen_v, kitchen_c = _report_states([True, False], 'Kitchen')
wifi_v, wifi_c = _report_states([True, False], 'Wifi')

# Price bands, listed explicitly in increasing order.
price_v, price_c = _report_states(['0-50', '50-150', '150-500', '>500'], 'Price')

Neighbourhood cardinality: 6
Neighbourhood values: ['Santo Stefano' 'Porto - Saragozza' 'Navile' 'San Donato - San Vitale'
 'Savena' 'Borgo Panigale - Reno']

Room type cardinality: 4
Room Type values: ['Entire home/apt' 'Private room' 'Hotel room' 'Shared room']

Accommodates cardinality: 16
Accommodates values: [ 2  1  4  5  3  6  9  8  7 10 13 16 12 11 15 14]

Shared bathroom cardinality: 2
Shared bathroom values: [True, False]

Kitchen cardinality: 2
Kitchen values: [True, False]

Wifi cardinality: 2
Wifi values: [True, False]

Price cardinality: 4
Price values: ['0-50', '50-150', '150-500', '>500']



## Probabilities calculation
1. Splitting the dataset into training and test sets