In [83]:
#Importamos librerías
import numpy as np # data arrays
import pandas as pd # data structure and data analysis
import matplotlib as plt # data visualization
import datetime as dt # date time

In [84]:
#Abrimos archivo 
df_airbnb = pd.read_csv("airbnb-listings.csv", sep=";")

In [85]:
#Contamos cuántas columnas y filas tiene el dataset
print('Columnas : ', len(df_airbnb.columns))
print('Filas    : ', len(df_airbnb.index))

Columnas :  89
Filas    :  14780


In [86]:
#Imprimimos todas las columnas en formato lista
print(df_airbnb.columns.tolist())

['ID', 'Listing Url', 'Scrape ID', 'Last Scraped', 'Name', 'Summary', 'Space', 'Description', 'Experiences Offered', 'Neighborhood Overview', 'Notes', 'Transit', 'Access', 'Interaction', 'House Rules', 'Thumbnail Url', 'Medium Url', 'Picture Url', 'XL Picture Url', 'Host ID', 'Host URL', 'Host Name', 'Host Since', 'Host Location', 'Host About', 'Host Response Time', 'Host Response Rate', 'Host Acceptance Rate', 'Host Thumbnail Url', 'Host Picture Url', 'Host Neighbourhood', 'Host Listings Count', 'Host Total Listings Count', 'Host Verifications', 'Street', 'Neighbourhood', 'Neighbourhood Cleansed', 'Neighbourhood Group Cleansed', 'City', 'State', 'Zipcode', 'Market', 'Smart Location', 'Country Code', 'Country', 'Latitude', 'Longitude', 'Property Type', 'Room Type', 'Accommodates', 'Bathrooms', 'Bedrooms', 'Beds', 'Bed Type', 'Amenities', 'Square Feet', 'Price', 'Weekly Price', 'Monthly Price', 'Security Deposit', 'Cleaning Fee', 'Guests Included', 'Extra People', 'Minimum Nights', 'Max

In [87]:
#Sumamos los valores nulos que tienen cada una de las columnas
missing_values_count = df_airbnb.isnull().sum()
missing_values_count

ID                                   0
Listing Url                          0
Scrape ID                            0
Last Scraped                         0
Name                                 1
                                  ... 
Cancellation Policy                  0
Calculated host listings count       4
Reviews per Month                 3162
Geolocation                          0
Features                             1
Length: 89, dtype: int64

In [88]:
#Ordenamos de mayor a menor las columnas con más valores nulos
missing_values_count.sort_values(ascending=False)

Has Availability        14768
Host Acceptance Rate    14741
Jurisdiction Names      14553
License                 14431
Square Feet             14182
                        ...  
Minimum Nights              0
Extra People                0
Guests Included             0
Room Type                   0
ID                          0
Length: 89, dtype: int64

In [89]:
#Devuelve True si la suma de valores nulos es mayor a 1000
missing_values_count>1000

ID                                False
Listing Url                       False
Scrape ID                         False
Last Scraped                      False
Name                              False
                                  ...  
Cancellation Policy               False
Calculated host listings count    False
Reviews per Month                  True
Geolocation                       False
Features                          False
Length: 89, dtype: bool

In [90]:
#Obtenemos las columnas cuya suma de valores nulos es mayor a 1000
nulldata = df_airbnb.isnull().sum()
print(nulldata[nulldata > 1000].sort_values(ascending=False))

Has Availability                14768
Host Acceptance Rate            14741
Jurisdiction Names              14553
License                         14431
Square Feet                     14182
Monthly Price                   11219
Weekly Price                    11190
Notes                            9136
Security Deposit                 8524
Interaction                      6552
Access                           6462
Cleaning Fee                     6093
Transit                          5714
Neighborhood Overview            5646
Host About                       5241
Neighbourhood                    5229
House Rules                      5161
Space                            3892
Host Neighbourhood               3876
Review Scores Value              3341
Review Scores Location           3340
Review Scores Checkin            3337
Review Scores Accuracy           3326
Review Scores Cleanliness        3320
Review Scores Communication      3320
Review Scores Rating             3304
Last Review 

In [91]:
#A pesar de que el dataset es de Madrid podemos observar como hay otros paises y ciudades 
print(pd.unique(df_airbnb["Country"]))

print(pd.unique(df_airbnb["City"]))

['United Kingdom' 'Spain' 'United States' 'Canada' 'Switzerland'
 'Hong Kong' 'Cuba' 'Germany' 'Belgium' 'Netherlands' 'Italy' 'Australia'
 'Austria' 'France' 'Denmark' 'Ireland' 'Greece' nan]
['London' 'Madrid' 'Denver' 'Palma' 'Palma de Mallorca' 'Selva'
 'Balearic Islands' 'Sant Joan' 'Montréal' 'Genève' 'Tsim Sha Tsui'
 'Los Angeles' 'Cala Pi' 'Sa Pobla' 'Muro' 'Alcúdia' 'Illetes' '马德里'
 'La Habana' 'Madrid, Comunidad de Madrid, ES' 'Berlin' 'Schaerbeek'
 'Amsterdam' 'Barcelona' 'Austin' 'Antwerpen' 'Chicago' 'Roma'
 'Bondi Beach' 'Seattle' 'Woolloomooloo' 'Vienna' 'North Sydney' 'Paris'
 'Frederiksberg' 'Dublin' 'Edimburgo' 'Bruxelles' 'Copenhagen' 'Sutton'
 'Santa Maria del Camí' 'Pollença' 'Nashville' 'Bunyola'
 'Pozuelo de Alarcón' 'Provensals' 'Deià' 'Costitx' 'Campos'
 'Mile End / Bow' 'Peguera, Calvià' 'Torrenova' "Port d'Andratx"
 'Llucmajor' 'Sant Llorenç des Cardassar' 'Puerto ALCUDIA' 'Sóller'
 'Valldemossa' 'Pollensa' 'Caimari' 'Inca' '馬德里' 'Montreal' 'Brooklyn'
 'Washi

In [92]:
#Filtramos el dataset para quedarnos solamente con aquellas filas donde City sea Madrid y Country Spain
df_airbnb = df_airbnb[(df_airbnb['City'] == "Madrid") & (df_airbnb['Country'] == 'Spain')]

print(pd.unique(df_airbnb["Country"]))
print(pd.unique(df_airbnb["City"]))


['Spain']
['Madrid']


In [93]:
#Creamos un nuevo df donde seleccionamos las columnas cuyos valores sean compatibles con lo que queremos estudiar
df = df_airbnb[['ID','Host ID', 'Host Since', 'Host Name', 'Street', 'Neighbourhood Cleansed', 'City', 'State', 'Zipcode', 
                'Smart Location', 'Country Code', 'Country', 'Latitude', 'Longitude', 'Property Type', 'Room Type', 'Bathrooms', 'Bedrooms', 'Beds', 'Bed Type',
                'Price','Number of Reviews', 'Cancellation Policy', 'Geolocation']]


In [94]:
#Contamos nuevamente cuántas columnas y filas tiene el nuevo df
print('Columnas : ', len(df.columns))
print('Filas    : ', len(df.index))

Columnas :  24
Filas    :  13207


In [95]:
#Observamos que en State hay muchos nombres diferentes para la Comunidad de Madrid
pd.unique(df["State"])


array(['Community of Madrid', 'Comunidad de Madrid', 'Madrid', nan,
       'Communauté de Madrid', 'Spain', 'MADRID', 'España',
       'Autonome Gemeinschaft Madrid', 'Madrid capital', 'madrid', '28',
       'Madrid \r\nMadrid', 'Espańa', 'Usera', 'España,Madrid'],
      dtype=object)

In [96]:
#Cambiamos todos los valores de la columna State para que sean todos Comunidad de Madrid
df['State']='Comunidad de Madrid'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['State']='Comunidad de Madrid'


In [97]:
pd.unique(df["State"])

array(['Comunidad de Madrid'], dtype=object)

Revisión de columnas

In [98]:
missing_values_count_2 = df.isnull().sum()
missing_values_count_2

ID                          0
Host ID                     0
Host Since                  3
Host Name                   3
Street                      0
Neighbourhood Cleansed      0
City                        0
State                       0
Zipcode                   439
Smart Location              0
Country Code                0
Country                     0
Latitude                    0
Longitude                   0
Property Type               0
Room Type                   0
Bathrooms                  49
Bedrooms                   23
Beds                       49
Bed Type                    0
Price                       9
Number of Reviews           0
Cancellation Policy         0
Geolocation                 0
dtype: int64

In [99]:
#Creamos nuevo dataframe con los valores NaN eliminados
airbnb = df.dropna()


In [100]:
#Calculamos columnas y filas
print('Columnas : ', len(airbnb.columns))
print('Filas    : ', len(airbnb.index))

Columnas :  24
Filas    :  12685


In [101]:
#Verificamos que la suma de nulls en todas las columnas sea 0
missing_values = airbnb.isnull().sum()
missing_values

ID                        0
Host ID                   0
Host Since                0
Host Name                 0
Street                    0
Neighbourhood Cleansed    0
City                      0
State                     0
Zipcode                   0
Smart Location            0
Country Code              0
Country                   0
Latitude                  0
Longitude                 0
Property Type             0
Room Type                 0
Bathrooms                 0
Bedrooms                  0
Beds                      0
Bed Type                  0
Price                     0
Number of Reviews         0
Cancellation Policy       0
Geolocation               0
dtype: int64

In [102]:
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12685 entries, 3 to 14759
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      12685 non-null  int64  
 1   Host ID                 12685 non-null  int64  
 2   Host Since              12685 non-null  object 
 3   Host Name               12685 non-null  object 
 4   Street                  12685 non-null  object 
 5   Neighbourhood Cleansed  12685 non-null  object 
 6   City                    12685 non-null  object 
 7   State                   12685 non-null  object 
 8   Zipcode                 12685 non-null  object 
 9   Smart Location          12685 non-null  object 
 10  Country Code            12685 non-null  object 
 11  Country                 12685 non-null  object 
 12  Latitude                12685 non-null  float64
 13  Longitude               12685 non-null  float64
 14  Property Type           12685 non-null

In [103]:
airbnb["Host Since"]

3        2014-02-03
4        2011-07-12
7        2013-10-31
8        2014-05-08
9        2014-05-08
            ...    
14755    2015-06-24
14756    2016-03-01
14757    2012-09-11
14758    2012-09-14
14759    2013-01-03
Name: Host Since, Length: 12685, dtype: object

In [104]:
#Convertimos Host Since a dtype datatime64  
airbnb["Host Since"] = pd.to_datetime(airbnb["Host Since"], format="%Y/%m/%d")

# print the first 5 rows of "ActivityDate" to confirm
airbnb["Host Since"].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airbnb["Host Since"] = pd.to_datetime(airbnb["Host Since"], format="%Y/%m/%d")


3   2014-02-03
4   2011-07-12
7   2013-10-31
8   2014-05-08
9   2014-05-08
Name: Host Since, dtype: datetime64[ns]

In [105]:
#Hay Zipcodes con numeros faltantes, de más o mal escritos
pd.unique(airbnb['Zipcode'])


array(['28005', '28013', '28012', '28014', '28004', '28015', '28008',
       '28045', '28017', '28027', '28043', '28021', '28003', '28010',
       '28002', '28016', '28036', '28039', '28020', '28046', '28029',
       '28028', '28006', '28001', '28009', '28007', '28035', '28011',
       '28047', '28024', '28044', '28019', '28025', '28038', '28053',
       '28026', '28018', '28030', '28031', '28051', '28052', '28032',
       '28037', '28022', '28042', '28054', '28041', '28058', '28034',
       '28050', '28049', '28033', '28055', '27013', '28023', '28850',
       '28040', '20126', '28056', '28060', '2015', '27004', '28', '2805',
       '25008', '20013', '28051\r\n28051', '280013', '28094',
       '28002\r\n28002', '2804', '2815', 'Madrid 28004', '28105', '28048'],
      dtype=object)

In [106]:
airbnb.drop(airbnb[airbnb['Zipcode']=='28'].index, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  airbnb.drop(airbnb[airbnb['Zipcode']=='28'].index, inplace = True)


In [107]:
pd.unique(airbnb['Zipcode'])

array(['28005', '28013', '28012', '28014', '28004', '28015', '28008',
       '28045', '28017', '28027', '28043', '28021', '28003', '28010',
       '28002', '28016', '28036', '28039', '28020', '28046', '28029',
       '28028', '28006', '28001', '28009', '28007', '28035', '28011',
       '28047', '28024', '28044', '28019', '28025', '28038', '28053',
       '28026', '28018', '28030', '28031', '28051', '28052', '28032',
       '28037', '28022', '28042', '28054', '28041', '28058', '28034',
       '28050', '28049', '28033', '28055', '27013', '28023', '28850',
       '28040', '20126', '28056', '28060', '2015', '27004', '2805',
       '25008', '20013', '28051\r\n28051', '280013', '28094',
       '28002\r\n28002', '2804', '2815', 'Madrid 28004', '28105', '28048'],
      dtype=object)

In [None]:
#Creamos un nuevo csv con el dataframe limpio
#airbnb.to_csv('airbnb_clean.csv', index=False)