In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Read the detailed Barcelona listings CSV into `df`, then print a plain-text
# preview of its first five rows.
df = pd.read_csv(filepath_or_buffer="data/Barcelona/listings-detailed-bcn.csv")
print(df.head(n=5))

       id                          listing_url       scrape_id last_scraped  \
0  269467  https://www.airbnb.com/rooms/269467  20240615011618   2024-06-15   
1  270197  https://www.airbnb.com/rooms/270197  20240615011618   2024-06-15   
2   18674   https://www.airbnb.com/rooms/18674  20240615011618   2024-06-15   
3  272282  https://www.airbnb.com/rooms/272282  20240615011618   2024-06-15   
4   23197   https://www.airbnb.com/rooms/23197  20240615011618   2024-06-15   

            source                                               name  \
0      city scrape       Sunny and quiet Loft next to Sagrada Familia   
1      city scrape    *monthly stays* 4 Bedroom Apartment near Rambla   
2      city scrape    Huge flat for 8 people close to Sagrada Familia   
3  previous scrape             CHARMING, VERY SPACIOUS & BEST LOCATED   
4      city scrape  Forum CCIB DeLuxe, Spacious, Large Balcony, relax   

                                         description  \
0  The apartment is a cosy ope

In [4]:
# Show the full list of column labels in the loaded listings frame.
df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [17]:
# Show the dtype pandas assigned to each column when the CSV was read.
df.dtypes

id                                                int64
listing_url                                      object
scrape_id                                         int64
last_scraped                                     object
source                                           object
                                                 ...   
calculated_host_listings_count                    int64
calculated_host_listings_count_entire_homes       int64
calculated_host_listings_count_private_rooms      int64
calculated_host_listings_count_shared_rooms       int64
reviews_per_month                               float64
Length: 75, dtype: object

In [18]:
# Number of non-null host_id entries; a host is counted once per row it appears in
df["host_id"].count()

18925

In [19]:
# Number of distinct host_id values (each host counted once)
df["host_id"].nunique()

7161

In [20]:
# List the distinct categories present in the room_type column
df["room_type"].unique()

array(['Entire home/apt', 'Private room', 'Hotel room', 'Shared room'],
      dtype=object)

In [21]:
# Tally how many rows fall under each room_type value
# (value_counts sorts by frequency, most common first, by default)
df["room_type"].value_counts()

room_type
Entire home/apt    11217
Private room        7436
Shared room          151
Hotel room           121
Name: count, dtype: int64

In [22]:
# Same room_type tally, expressed as a percentage share of all counted rows
df["room_type"].value_counts(normalize=True).mul(100)

room_type
Entire home/apt    59.270806
Private room       39.291942
Shared room         0.797886
Hotel room          0.639366
Name: proportion, dtype: float64

In [23]:
# List the distinct raw values of the free-text neighbourhood column
# (unique() keeps NaN as one of the returned values)
df["neighbourhood"].unique()

array([nan, 'Barcelona, CT, Spain', 'Barcelona, Catalunya, Spain',
       'Sant Adria de Besos, Barcelona, Spain',
       'Barcelona, Catalonia, Spain', 'Barcelona, Ca, Spain',
       'Barcelona, Spain', 'Barcelona, Cataluña, Catalonia, Spain',
       'Barcelona , Catalonia, Spain', 'Barcelona, Cataluna, Spain',
       'Barcelona, BARCELONA, Spain',
       "L'Hospitalet de Llobregat, Catalunya, Spain",
       'Hospitalet de Llobregat, CT, Spain',
       'Sagrada Familia, Barcelona, Catalonia, Spain',
       'El Masnou, Catalonia, Spain', 'Barcelona, barcelona, Spain',
       'Barcelona, Katalonien, Spain', 'BARCELONA, city, Spain',
       'barcelona, Barcelona, Spain', 'Барселона, Barcelona, Spain',
       'Барселона, Каталония, Spain', 'Spain',
       'Barcelona , Catalunya, Spain', '3-1, Barcelona, Spain',
       'Sant Adrià de Besòs, Catalonia, Spain', 'Bcn, Spain',
       'Barcelona , Barcelona, Spain',
       'Barcelona, Barcelona (España), Spain',
       'Barcelona El RAVAL , Cat

In [24]:
# Tally how many rows carry each neighbourhood value
# (value_counts drops NaN and sorts by frequency by default)
df["neighbourhood"].value_counts()

neighbourhood
Barcelona, Catalunya, Spain                     8086
Barcelona, Catalonia, Spain                     1167
Barcelona, Spain                                 479
Barcelona, CT, Spain                             182
BARCELONA, city, Spain                            67
Barcelona, Cataluña, Catalonia, Spain             50
Sant Adrià de Besòs, Catalunya, Spain             12
BARCELONA, Spain                                  10
L'Hospitalet de Llobregat, Catalunya, Spain       10
Barcelona, Ca, Spain                               8
Barcelona, ., Spain                                8
Barcelona , Catalunya, Spain                       7
Barcelona , Barcelona, Spain                       6
., Barcelona, Spain                                6
Barcelona, barcelona, Spain                        5
BARCELONA, SPAIN, Spain                            4
Spain                                              3
Barcelona, CATALUNYA, Spain                        3
Eixample, Barcelona, Spain      

In [25]:
# Create a new dataset with fewer columns

# Columns carried over unchanged from the raw listings frame
selected_columns = [
    'host_id', 'host_name', 'calculated_host_listings_count',
    'host_is_superhost', 'listing_url', 'id', 'name',
    'neighbourhood_group_cleansed', 'latitude', 'longitude',
    'room_type', 'bedrooms', 'price', 'number_of_reviews',
    'review_scores_rating', 'license',
]
listings_prepared_bcn = df[selected_columns].copy()

# 0/1 amenity flags: new column name -> case-insensitive regex searched in the
# 'amenities' text. Rows with missing amenities get 0 (na=False).
amenity_patterns = {
    'kitchen': 'kitchen',
    'patio or balcony': 'patio|balcony',
    'elevator': 'elevator',
    'air conditioning': 'air conditioning',
}
for flag_name, pattern in amenity_patterns.items():
    listings_prepared_bcn[flag_name] = (
        df['amenities'].str.contains(pattern, case=False, na=False).astype(int)
    )

# 0/1 stay-length flags derived from the minimum/maximum night limits.
# NOTE(review): a listing with maximum_nights == 31 is flagged as both
# 'short_term' and 'possible_long_term' -- confirm the boundary is intended.
min_nights = df['minimum_nights']
max_nights = df['maximum_nights']
listings_prepared_bcn['long_term'] = min_nights.gt(31).astype(int)
listings_prepared_bcn['short_term'] = max_nights.le(31).astype(int)
listings_prepared_bcn['possible_long_term'] = (
    min_nights.ge(1) & max_nights.ge(31)
).astype(int)

# Preview the first rows of the prepared frame
print(listings_prepared_bcn.head())


   host_id       host_name  calculated_host_listings_count host_is_superhost  \
0  1411775        Jeremies                               3                 f   
1  1414702          Ferran                              27                 f   
2    71615   Mireia  Maria                              27                 f   
3  1425070             Mar                               1                 t   
4    90417  Etain (Marnie)                               1                 f   

                           listing_url      id  \
0  https://www.airbnb.com/rooms/269467  269467   
1  https://www.airbnb.com/rooms/270197  270197   
2   https://www.airbnb.com/rooms/18674   18674   
3  https://www.airbnb.com/rooms/272282  272282   
4   https://www.airbnb.com/rooms/23197   23197   

                                                name  \
0       Sunny and quiet Loft next to Sagrada Familia   
1    *monthly stays* 4 Bedroom Apartment near Rambla   
2    Huge flat for 8 people close to Sagrada Famil

In [26]:
# Write the prepared Barcelona listings to an Excel workbook in the working
# directory, leaving out the DataFrame index column.
listings_prepared_bcn.to_excel(excel_writer='listings_prepared_bcn.xlsx', index=False)

In [27]:
# Reorder the columns of the prepared Barcelona listings and export them to CSV.
# Listing identity comes first, then host details, location, property
# attributes, amenity and stay-length flags, and finally the review metrics.
ordered_columns = [
    'id', 'listing_url', 'name', 'host_id', 'host_name', 'calculated_host_listings_count', 
    'host_is_superhost', 'license', 'neighbourhood_group_cleansed', 'latitude', 
    'longitude', 'room_type', 'bedrooms', 'price', 'kitchen', 'patio or balcony', 
    'elevator', 'air conditioning', 'long_term', 'short_term', 'possible_long_term', 
    'number_of_reviews', 'review_scores_rating'
]

# Apply the new order to the Barcelona frame.
# This previously indexed `listings_prepared_mad`, a name not defined in any
# cell shown before this one (apparently a leftover from a Madrid notebook).
# On a fresh kernel that raises NameError; with a stale kernel it could export
# another city's data under the Barcelona file name.
listings_prepared_bcn = listings_prepared_bcn[ordered_columns]

# Display the first few rows of the reordered DataFrame
print(listings_prepared_bcn.head())

# Export the reordered DataFrame to a CSV file (without the index column)
listings_prepared_bcn.to_csv('listings_prepared_bcn.csv', index=False)

       id                          listing_url  \
0  269467  https://www.airbnb.com/rooms/269467   
1  270197  https://www.airbnb.com/rooms/270197   
2   18674   https://www.airbnb.com/rooms/18674   
3  272282  https://www.airbnb.com/rooms/272282   
4   23197   https://www.airbnb.com/rooms/23197   

                                                name  host_id       host_name  \
0       Sunny and quiet Loft next to Sagrada Familia  1411775        Jeremies   
1    *monthly stays* 4 Bedroom Apartment near Rambla  1414702          Ferran   
2    Huge flat for 8 people close to Sagrada Familia    71615   Mireia  Maria   
3             CHARMING, VERY SPACIOUS & BEST LOCATED  1425070             Mar   
4  Forum CCIB DeLuxe, Spacious, Large Balcony, relax    90417  Etain (Marnie)   

   calculated_host_listings_count host_is_superhost  \
0                               3                 f   
1                              27                 f   
2                              27              