In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the Madrid geographic-distribution CSV (path is relative to the
# notebook's working directory) and preview its first five rows as text.
geo_distribution_csv = "Datasets/Dataset_Originales/geographic_distribution_mad.csv"
df = pd.read_csv(geo_distribution_csv)
print(df.head())

            latitude            longitude
0         40.4069796           -3.6750154
1            40.3921             -3.67003
2           40.39511             -3.67143
3  40.39555483922258  -3.7164371179819233
4           40.43045              -3.6721


In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['latitude', 'longitude'], dtype='object')

In [4]:
# Show the dtype pandas inferred for each column.
# NOTE(review): the displayed output reports both coordinate columns as
# `object` rather than a float dtype — presumably some values are not plain
# numbers; confirm and convert before using them numerically.
df.dtypes

latitude     object
longitude    object
dtype: object

In [None]:
# Count the distinct values in the host_id column of df.
# NOTE(review): the df previewed earlier in this notebook exposes only the
# 'latitude' and 'longitude' columns, so this lookup needs a frame that also
# has 'host_id' — confirm which dataset df holds when this cell runs.
df['host_id'].nunique()

12834

In [None]:
# List the distinct values that appear in the room_type column.
df['room_type'].unique()

array(['Private room', 'Shared room', 'Entire home/apt', 'Hotel room'],
      dtype=object)

In [None]:
# Tally how many rows carry each room_type value, most frequent first.
df["room_type"].value_counts()

room_type
Entire home/apt    17346
Private room        9103
Shared room          381
Hotel room            94
Name: count, dtype: int64

In [None]:
# Express each room_type's share of the rows as a percentage:
# normalize=True yields fractions, which are then scaled by 100.
(
    df["room_type"]
    .value_counts(normalize=True)
    .mul(100)
)

room_type
Entire home/apt    64.425791
Private room       33.809984
Shared room         1.415094
Hotel room          0.349131
Name: proportion, dtype: float64

In [None]:
# List the distinct values that appear in the neighbourhood column.
# The displayed values include NaN and many inconsistent free-text spellings
# of the same place (casing, spacing, language), so this column is not a
# clean grouping key as-is.
df['neighbourhood'].unique()

array([nan, 'Madrid, Comunidad de Madrid, Spain',
       'Madrid, Community of Madrid, Spain', 'Madrid, Spain',
       'Chueca, Madrid, Spain', 'Madrid , Comunidad de Madrid, Spain',
       'MADRID, Spain', 'Madrid, Carabanchel alto, Spain',
       'Madrid, Co, Spain', 'Madrid , Madrid, Spain',
       'Madrid Center, Madrid, Spain',
       'Pozuelo de Alarcón, Comunidad de Madrid, Spain',
       'Madrid, SPAIN  Madrid Madrid, Spain',
       'Madrid, Madrid, Barajas, Spain',
       'Madrid, Comunidad de Madrid, ES, Madrid, Spain',
       'ROSES (ROSAS ) ESPAGNE, ROSES, Spain',
       'Madrid, Palos de Moguer , Spain',
       'Madrid, Comunidad de Madrid, España, Spain', 'Spain',
       'Madrid, Centro Madrid, Spain',
       'Madrid, Spain Madrid Madrid, Spain', 'Madrid, MADRID, Spain',
       'Madrid , Comunidad de Madrid, España, Spain',
       'Madrid, Comunidad de  Madrid, Spain', 'Salamanca, Madrid, Spain',
       'Madrid, Comunidad de Madrid , Atocha, Spain',
       'Comunidad de M

In [None]:
# Tally how many rows carry each neighbourhood value, most frequent first
# (value_counts leaves missing values out of the tally by default).
df["neighbourhood"].value_counts()

neighbourhood
Madrid, Comunidad de Madrid, Spain                           12419
Madrid, Community of Madrid, Spain                             465
Madrid, Spain                                                  335
Madrid, Co, Spain                                               22
Madrid Center, Madrid, Spain                                    10
MADRID, Spain                                                    9
Madrid , Madrid, Spain                                           7
Chueca, Madrid, Spain                                            5
Madrid , Comunidad de Madrid, Spain                              3
Pozuelo de Alarcón, Comunidad de Madrid, Spain                   3
Madrid, Comunidad de Madrid , Atocha, Spain                      2
Madrid, Comunidad de  Madrid, Spain                              2
Spain                                                            2
Madrid, Comunidad de Madrid, España, Spain                       2
Madrid, Comunidad de Madrid, ES, Madrid, Spain  

In [None]:
# Create a new dataset with fewer columns.

# Columns carried over unchanged from the source frame.
base_columns = [
    'host_id', 'host_name', 'calculated_host_listings_count',
    'host_is_superhost', 'listing_url', 'id', 'name',
    'neighbourhood_group_cleansed', 'latitude', 'longitude',
    'room_type', 'bedrooms', 'price', 'number_of_reviews',
    'review_scores_rating', 'license',
]
listings_prepared_mad = df[base_columns].copy()

# Amenity flags: new column name -> regex searched (case-insensitively) in the
# 'amenities' text. A row gets 1 when the pattern is found, 0 otherwise;
# rows with missing amenities count as "not found".
amenity_patterns = {
    'kitchen': 'kitchen',
    'patio or balcony': 'patio|balcony',
    'elevator': 'elevator',
    'air conditioning': 'air conditioning',
}
for column_name, pattern in amenity_patterns.items():
    has_amenity = df['amenities'].str.contains(pattern, case=False, na=False)
    listings_prepared_mad[column_name] = has_amenity.astype(int)

# Stay-length flags (1/0) derived from the night limits:
#   long_term          -> minimum_nights is greater than 31
#   short_term         -> maximum_nights is at most 31
#   possible_long_term -> minimum_nights is at least 1 and maximum_nights is at least 31
# NOTE(review): with maximum_nights == 31 (and minimum_nights >= 1) a row is
# flagged both short_term and possible_long_term — confirm that is intended.
min_nights = df['minimum_nights']
max_nights = df['maximum_nights']
listings_prepared_mad['long_term'] = min_nights.gt(31).astype(int)
listings_prepared_mad['short_term'] = max_nights.le(31).astype(int)
listings_prepared_mad['possible_long_term'] = (
    min_nights.ge(1) & max_nights.ge(31)
).astype(int)

# Preview the first rows of the prepared frame.
print(listings_prepared_mad.head())


     host_id        host_name  calculated_host_listings_count  \
0  565307927         Carolina                               1   
1  442944608   Jonathan Jesus                               2   
2  252336445            Norma                               2   
3  545435841       Gean Piero                               1   
4    3882456  Gabriela CHÁVEZ                               1   

  host_is_superhost                                       listing_url  \
0                 f  https://www.airbnb.com/rooms/1104967621421098322   
1                 f   https://www.airbnb.com/rooms/930941073152690361   
2                 f             https://www.airbnb.com/rooms/51647382   
3                 f  https://www.airbnb.com/rooms/1026072412626070839   
4                 f   https://www.airbnb.com/rooms/766423030884644417   

                    id                              name  \
0  1104967621421098322  Habitación de lujo, baño privado   
1   930941073152690361            habitación para 

In [None]:
# Export the DataFrame to an Excel file (no index column).
# Listing ids can be 19 digits long (e.g. 1104967621421098322), which is more
# than the 15 significant digits Excel keeps for numeric cells, so Excel would
# round them when the workbook is opened. Writing the id column as text keeps
# every digit. Only the exported copy is converted; listings_prepared_mad
# itself is left unchanged.
listings_prepared_mad.astype({'id': str}).to_excel('listings_prepared_mad.xlsx', index=False)

In [None]:
# Desired column layout for the exported dataset, one column per line.
ordered_columns = [
    'id',
    'listing_url',
    'name',
    'host_id',
    'host_name',
    'calculated_host_listings_count',
    'host_is_superhost',
    'license',
    'neighbourhood_group_cleansed',
    'latitude',
    'longitude',
    'room_type',
    'bedrooms',
    'price',
    'kitchen',
    'patio or balcony',
    'elevator',
    'air conditioning',
    'long_term',
    'short_term',
    'possible_long_term',
    'number_of_reviews',
    'review_scores_rating',
]

# Keep every row and select the columns in the layout above; a label missing
# from the frame raises KeyError.
listings_prepared_mad = listings_prepared_mad.loc[:, ordered_columns]

# Preview the first rows of the reordered frame.
print(listings_prepared_mad.head())

# Write the reordered frame to CSV without the index column.
listings_prepared_mad.to_csv('listings_prepared_mad.csv', index=False)

                    id                                       listing_url  \
0  1104967621421098322  https://www.airbnb.com/rooms/1104967621421098322   
1   930941073152690361   https://www.airbnb.com/rooms/930941073152690361   
2             51647382             https://www.airbnb.com/rooms/51647382   
3  1026072412626070839  https://www.airbnb.com/rooms/1026072412626070839   
4   766423030884644417   https://www.airbnb.com/rooms/766423030884644417   

                               name    host_id        host_name  \
0  Habitación de lujo, baño privado  565307927         Carolina   
1            habitación para pareja  442944608   Jonathan Jesus   
2            Buenos Aires en Madrid  252336445            Norma   
3            Habitación Madrid Rio!  545435841       Gean Piero   
4           habitación para turista    3882456  Gabriela CHÁVEZ   

   calculated_host_listings_count host_is_superhost license  \
0                               1                 f     NaN   
1             