### **Fase 3 - SQL**
En esta fase, se diseñó e implementó una base de datos en MySQL con el objetivo de optimizar la gestión de datos en el proyecto.
La migración de los datos a MySQL proporciona una gestión más eficiente, permitiendo realizar operaciones de lectura y escritura de manera más rápida y segura. Además, el uso de SQL para consultas complejas hace que el acceso a los datos sea más flexible, permitiendo extraer información específica de forma eficiente y en tiempo real.

La base de datos no solo facilita la gestión de grandes volúmenes de datos, sino que también garantiza la integridad y consistencia de la información, ofreciendo una infraestructura robusta para las siguientes etapas del proyecto.

In [1]:
import numpy as np
import pandas as pd

import mysql.connector

In [2]:
df = pd.read_csv("data/df_recomendador.csv")
df.head(3)
df.shape

(2526, 22)

In [3]:
df_service = pd.read_csv("data/df_servicios_final_cleaned.csv")
df_service.head(3)

Unnamed: 0,category,urls,services
0,baño,https://www.airbnb.es/rooms/126311759129279497...,secador de pelo
1,baño,https://www.airbnb.es/rooms/126311759129279497...,productos de limpieza
2,baño,https://www.airbnb.es/rooms/126311759129279497...,champú


In [4]:
df3 = pd.merge(left = df, right = df_service, on = "urls")[["urls", "record_id", "services", "category"]]

df3.head(3)

Unnamed: 0,urls,record_id,services,category
0,https://www.airbnb.es/rooms/126311759129279497...,1263117591292794971,secador de pelo,baño
1,https://www.airbnb.es/rooms/126311759129279497...,1263117591292794971,productos de limpieza,baño
2,https://www.airbnb.es/rooms/126311759129279497...,1263117591292794971,champú,baño


In [5]:
tabla_service = pd.DataFrame(data = [[num, v] for num, v in enumerate(df3["services"].unique(), start = 1)],
                             columns = ["service_id", "service"])
tabla_service.head(3)

Unnamed: 0,service_id,service
0,1,secador de pelo
1,2,productos de limpieza
2,3,champú


In [6]:
tabla_category = pd.DataFrame(data = [[num, v] for num, v in enumerate(df3["category"].unique(), start = 1)],
                             columns = ["category_id", "category"])
tabla_category.head(3)

Unnamed: 0,category_id,category
0,1,baño
1,2,dormitorio y lavandería
2,3,entretenimiento


In [7]:
map_category = {k : v for v, k in tabla_category.values} 
map_service = {k : v for v, k in tabla_service.values} 

In [8]:
tabla_category_service = df3[["services", "category"]]

tabla_category_service["category"] = tabla_category_service["category"].map(map_category)
tabla_category_service["services"] = tabla_category_service["services"].map(map_service)

tabla_category_service

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tabla_category_service["category"] = tabla_category_service["category"].map(map_category)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tabla_category_service["services"] = tabla_category_service["services"].map(map_service)


Unnamed: 0,services,category
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
...,...,...
70598,28,10
70599,71,10
70600,31,11
70601,72,11


In [9]:
tabla_service_record = df3[["record_id", "services"]]

tabla_service_record["services"] = tabla_service_record["services"].map(map_service)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tabla_service_record["services"] = tabla_service_record["services"].map(map_service)


## SQL

In [11]:
db = mysql.connector.connect(
     host="localhost",
     user="root",
     password="12345"#Aqui teneis que poner vuestra contraeña.
)

cursor = db.cursor()

# Crear la base de datos si no existe
cursor.execute('CREATE DATABASE IF NOT EXISTS AIRBNB')

# Conectar a la base de datos recién creada
db.database = 'AIRBNB'

# Crear la tabla Hosting
cursor.execute('''CREATE TABLE IF NOT EXISTS Hosting (
        record_id BIGINT,
        titles TEXT,
        property_types TEXT,
        host_name TEXT,
        PRIMARY KEY (record_id));''')

# Crear la tabla Description_
cursor.execute('''CREATE TABLE IF NOT EXISTS Description_ (
        id BIGINT,
        prices_per_night INT,
        check_in_hour TIME,
        check_out_hour TIME,
        total_hours_checkin INT,
        cleaning_fee INT,
        maximum_guests INT,
        camas INT,
        baños INT,
        dormitorios INT,
        FOREIGN KEY (id) REFERENCES Hosting(record_id));''')

# Crear la tabla Ratings
cursor.execute('''CREATE TABLE IF NOT EXISTS Ratings (
        record_id BIGINT,
        ratings FLOAT,
        num_reviews FLOAT,
        polaridad_media FLOAT,
        subjetividad_media FLOAT,
        palabras_mas_usadas TEXT,
        sentimiento TEXT,
        FOREIGN KEY (record_id) REFERENCES Hosting(record_id));''')

#Creamos la tabla de Services_
cursor.execute('''CREATE TABLE IF NOT EXISTS Services_ (
        service_id INT,
        service TEXT,
        PRIMARY KEY (service_id));''')
#Creamos la tabla de Service_ids
cursor.execute('''CREATE TABLE IF NOT EXISTS Services_Hosting (
        service_id INT,
        record_id BIGINT,
        FOREIGN KEY (service_id) REFERENCES Services_(service_id),
        FOREIGN KEY (record_id) REFERENCES Hosting (record_id));''')   
#Creamos la tabla de Category
cursor.execute('''CREATE TABLE IF NOT EXISTS Category (
        category_id INT,
        category TEXT,
        PRIMARY KEY (category_id));''')

#Creamos la tabla de Category_ids
cursor.execute('''CREATE TABLE IF NOT EXISTS Category_Services (
        service_id INT,
        category_id INT,
        FOREIGN KEY (service_id) REFERENCES Services_(service_id),
        FOREIGN KEY (category_id) REFERENCES Category(category_id));''')   

# Cerrar el cursor y la conexión
cursor.close()
db.close()

In [12]:
def insert_to_table(df, database, table_name):

    db = mysql.connector.connect(host="localhost",
                                 user="root",
                                 password="12345",#Aqui teneis que poner vuestra contraeña.
                                 database=database,
                                 consume_results = True)
    
    cursor = db.cursor()
    
    # Seleccionamos las columnas de la tabla, omitiendo la Primary Key
    cursor.execute(f"SELECT * FROM {table_name} LIMIT 0;")
    column_names = cursor.column_names
    
    # Aseguramos que todas las filas del DataFrame tengan el mismo número de columnas que en la tabla
    if len(df.columns) != len(column_names):
        raise ValueError("El número de columnas en el DataFrame no coincide con el número de columnas en la tabla de la base de datos.")
    
    # Preparar la consulta de inserción
    insert_query = f"INSERT INTO {table_name} ({', '.join(column_names)}) VALUES ({', '.join(['%s' for _ in column_names])})"
    
    # Convertir las filas del DataFrame a tuplas
    values = [tuple(row) for row in df[list(column_names)].values]
    
    # Ejecutar la consulta de inserción
    cursor.executemany(insert_query, values)
    
    # Confirmar los cambios en la base de datos
    db.commit()
    
    print(f"Añadidas: {cursor.rowcount} filas")

    # Vaciamos el cursor
    cursor.close()
    db.close()

In [13]:
# Tabla hosting
tabla_hosting = df[["record_id", "titles", "property_types", "host_name"]].drop_duplicates(subset = ["record_id"])
tabla_hosting = tabla_hosting.replace([np.nan], [None])

In [14]:
database = "AIRBNB"

insert_to_table(df = tabla_hosting, database = database, table_name = "Hosting")
insert_to_table(df = tabla_service, database = database, table_name = "services_")
insert_to_table(df = tabla_category, database = database, table_name = "category")

Añadidas: 1421 filas
Añadidas: 117 filas
Añadidas: 13 filas


In [15]:
tabla_service_record["services"] = tabla_service_record["services"].astype("str")
tabla_service_record.columns = ["record_id", "service_id"]
insert_to_table(df = tabla_service_record, database = database, table_name = "services_hosting")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tabla_service_record["services"] = tabla_service_record["services"].astype("str")


Añadidas: 70603 filas


In [16]:
tabla_category_service.columns = ["service_id", "category_id"]
tabla_category_service["service_id"] = tabla_category_service["service_id"].astype("str")
insert_to_table(df = tabla_category_service, database = database, table_name = "category_services")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tabla_category_service["service_id"] = tabla_category_service["service_id"].astype("str")


Añadidas: 70603 filas


In [17]:
# Tabla ratings

# # USAR ESTE CÓDIGO SI USAN EL CSV final_cleaned_sentiment.csv
# tabla_ratings = df[["record_id", "ratings", "num_reviews", "sentiment"]]
# insert_to_table(df = tabla_ratings, database = database, table_name = "ratings")

# # USAR ESTE CÓDIGO SI USAN df_final_cleaned.csv
tabla_ratings = df[["record_id", "ratings", "num_reviews", "polaridad_media", "subjetividad_media", "palabras_mas_usadas", "sentimiento"]]

tabla_ratings = tabla_ratings.replace([np.nan], [None])

insert_to_table(df = tabla_ratings, database = database, table_name = "ratings")


Añadidas: 2526 filas


In [18]:
# Tabla description_
tabla_description_ = df[["record_id", "prices_per_night", "check_in_hour", "check_out_hour", "total_hours_checkin",
                    "cleaning_fee", "maximum_guests", "camas", "baños", "dormitorios"]]
tabla_description_.columns = ["id", "prices_per_night", "check_in_hour", "check_out_hour", "total_hours_checkin",
                    "cleaning_fee", "maximum_guests", "camas", "baños", "dormitorios"]

tabla_description_ = tabla_description_.replace([np.nan], [None])

insert_to_table(df = tabla_description_, database = database, table_name = "description_")

Añadidas: 2526 filas
