# Airbnb 2023 Madrid

In [1]:
import pandas as pd
import numpy as np
import math
from datetime import datetime
pd.set_option('display.max_columns', None)

In [2]:
# Función para el análisis inicial de los datos facilitados

def analisis_inicial(file_path):
    """Load a CSV file into a DataFrame and print a first-look report.

    The report written to stdout contains the column labels, the number of
    rows flagged by ``DataFrame.duplicated()`` and the ``DataFrame.info()``
    summary.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.
        The first column of the file is used as the DataFrame index.

    Returns
    -------
    pandas.DataFrame
        The loaded data, unmodified.
    """
    # The first CSV column becomes the index instead of a data column
    df = pd.read_csv(file_path, index_col=0)

    # Column labels
    print("Título de las columnas:")
    print(df.columns)

    # duplicated() marks rows whose column values repeat an earlier row
    num_duplicates = df.duplicated().sum()
    print(f"\nNúmero de duplicados en el DataFrame: {num_duplicates}")

    # info() writes its summary to stdout itself and returns None, so it is
    # called directly; wrapping it in print() would append a stray "None" line
    df.info()

    return df

In [3]:
# Load the raw listings file and print its first-look report
df_raw = analisis_inicial(file_path="2023_Airbnb_Madrid.csv")

Título de las columnas:
Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'review_score',
       'host_listings_count', 'availability_365', 'quarter'],
      dtype='object')

Número de duplicados en el DataFrame: 4
<class 'pandas.core.frame.DataFrame'>
Index: 95134 entries, 0 to 95133
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   95134 non-null  float64
 1   name                 95130 non-null  object 
 2   host_id              95134 non-null  int64  
 3   host_name            95114 non-null  object 
 4   neighbourhood_group  95134 non-null  object 
 5   neighbourhood        95134 non-null  object 
 6   latitude             95134 non-null  object 
 7   longitude            95134 non-null  object 
 8   room_type            95134 non-null  obje

In [4]:
# Rename column titles (currently disabled: the renames below are commented out)
#df_raw = df_raw.rename(columns={'id': 'listing_id'})
#df_raw = df_raw.rename(columns={'name': 'listing_name'})

In [5]:
# Re-type both identifier columns as generic Python objects (dtype object)
# NOTE(review): the initial info() output lists 'id' as float64, so the stored
# objects are floats — confirm no precision was lost on large ids at read time
for key_col in ('id', 'host_id'):
    df_raw[key_col] = df_raw[key_col].astype('object')

# Original note: this field has no decimals.
# Remove commas, right-pad with '0' to a minimum width of one character, then
# cast to the nullable Int64 dtype.
# NOTE(review): with width=1 the padding only changes empty strings (to '0');
# confirm a wider padding was not intended
df_raw['review_score'] = (
    df_raw['review_score']
    .str.replace(',', '')
    .str.pad(width=1, side='right', fillchar='0')
    .astype('Int64')
)

### latitud y longitud

In [6]:
# Remove every '.' character from both coordinate columns.
# regex=False states explicitly that '.' is a literal dot: as a regular
# expression it would match any character, and the default value of `regex`
# differs between pandas versions.
for coord_col in ('latitude', 'longitude'):
    df_raw[coord_col] = df_raw[coord_col].str.replace('.', '', regex=False)

In [7]:
# Right-pad both coordinate columns with '0' so every value is at least
# 6 characters long (longer values are left untouched)
for coord_col in ('latitude', 'longitude'):
    df_raw[coord_col] = df_raw[coord_col].str.ljust(6, fillchar='0')

In [8]:
# Keep only the first 6 characters of each coordinate string.
# NOTE(review): a leading '-' counts as one of those 6 characters, so negative
# values keep one digit fewer than positive ones — confirm this is intended
for coord_col in ('latitude', 'longitude'):
    df_raw[coord_col] = df_raw[coord_col].str[:6]

In [9]:
# Cast the cleaned coordinate strings to float.
# NOTE(review): the '.' removed in an earlier cell is not reinserted, so the
# resulting numbers stay scaled — confirm downstream consumers expect that
for coord_col in ('latitude', 'longitude'):
    df_raw[coord_col] = df_raw[coord_col].astype(float)

In [10]:
# Re-inspect dtypes and non-null counts after the type conversions above
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95134 entries, 0 to 95133
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   95134 non-null  object 
 1   name                 95130 non-null  object 
 2   host_id              95134 non-null  object 
 3   host_name            95114 non-null  object 
 4   neighbourhood_group  95134 non-null  object 
 5   neighbourhood        95134 non-null  object 
 6   latitude             95134 non-null  float64
 7   longitude            95134 non-null  float64
 8   room_type            95134 non-null  object 
 9   price                91325 non-null  float64
 10  minimum_nights       95134 non-null  int64  
 11  number_of_reviews    95134 non-null  int64  
 12  review_score         95134 non-null  Int64  
 13  host_listings_count  95134 non-null  int64  
 14  availability_365     95134 non-null  int64  
 15  quarter              95134 non-null  obje

In [11]:
# Save the reworked frame as CSV; index=False leaves the row index out of the file
df_raw.to_csv(path_or_buf='2023_Madrid_Airbnb_reworked.csv', index=False)