CODE NEIGHBORS

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
import pandas as pd
import numpy as np
import pickle
import joblib

In [2]:
df = pd.read_csv('data/df_processed_to_ML.csv')
df_cleaned = pd.read_csv('data/df_final_cleaned.csv')

In [3]:
df.shape

(2497, 26)

In [4]:
# Agregamos columna url
df['url'] = df_cleaned['urls']

In [5]:
# Ponemos url en la primera columna
df = df[['url'] + [col for col in df.columns if col != 'url']]

In [6]:
df = df.drop(columns = 'prices_per_night')

In [8]:
df.columns

Index(['url', 'ratings', 'cleaning_fee', 'dormitorios', 'camas', 'baños',
       'maximum_guests', 'check_in_hour', 'check_out_hour',
       'total_hours_checkin', 'log_num_reviews',
       'aparcamiento e instalaciones', 'baño', 'calefacción y refrigeración',
       'características de la ubicación', 'cocina y comedor',
       'dormitorio y lavandería', 'entretenimiento', 'exterior',
       'internet y oficina', 'para familias', 'privacidad y seguridad',
       'seguridad en el hogar', 'servicios', 'habitacion',
       'alojamiento entero'],
      dtype='object')

In [15]:
# Establecemos las columnas en una lista
columns = df.columns.to_list()

# Columnas que queremos eliminar
delete_column = 'url'

# Eliminamos columna
if delete_column in columns:
    columns.remove(delete_column)

In [17]:
features =  columns

In [19]:
features

['ratings',
 'cleaning_fee',
 'dormitorios',
 'camas',
 'baños',
 'maximum_guests',
 'check_in_hour',
 'check_out_hour',
 'total_hours_checkin',
 'log_num_reviews',
 'aparcamiento e instalaciones',
 'baño',
 'calefacción y refrigeración',
 'características de la ubicación',
 'cocina y comedor',
 'dormitorio y lavandería',
 'entretenimiento',
 'exterior',
 'internet y oficina',
 'para familias',
 'privacidad y seguridad',
 'seguridad en el hogar',
 'servicios',
 'habitacion',
 'alojamiento entero']

In [21]:
len(features)

25

## Entrenamos el modelo

In [25]:
model = NearestNeighbors(n_neighbors=3, metric='euclidean')
model.fit(df[features])

In [27]:
with open('data/NearestNeighbors.pkl', 'wb') as file: 
    pickle.dump(model, file)

In [30]:
with open('data/NearestNeighbors.pkl', 'rb') as file:
    modelo_cargado = pickle.load(file)

In [27]:
modelo_cargado

## Probamos el modelo con un ejemplo

In [29]:
scaled_features[0].reshape(1,-1)

array([[ 0.85197416, -2.59285049, -0.833383  , -0.60405602, -0.64994499,
         0.1718003 , -0.53889146,  0.18909753,  1.20890787, -0.18909753,
        -1.23718906, -0.11803319,  0.72075652,  0.30637193,  1.93028684,
        -0.39997518,  0.46070669, -0.04672206,  1.02446558, -0.88618797,
        -0.59884741, -0.60134884, -0.01027986,  1.41163695, -0.9774273 ,
        -0.07774004]])

In [31]:
example_input = scaled_features[0].reshape(1, -1)
distances, indices = modelo_cargado.kneighbors(example_input)

In [33]:
recommended_properties = df.iloc[indices[0]]
recommended_properties

Unnamed: 0,url,prices_per_night,ratings,cleaning_fee,dormitorios,camas,baños,maximum_guests,check_in_hour,check_out_hour,...,dormitorio y lavandería,entretenimiento,exterior,internet y oficina,para familias,privacidad y seguridad,seguridad en el hogar,servicios,habitacion,alojamiento entero
0,https://www.airbnb.es/rooms/126311759129279497...,115.0,0.0,0.0,1.0,1.0,1.0,2.0,900.0,720.0,...,7.0,1.0,2.0,1.0,0.0,0.0,1.0,4.0,0.0,0.0
586,https://www.airbnb.es/rooms/124268045187759302...,115.0,0.0,30.0,1.0,1.0,1.0,3.0,900.0,720.0,...,7.0,1.0,2.0,1.0,0.0,0.0,1.0,4.0,0.0,0.0
40,https://www.airbnb.es/rooms/126441709022765022...,90.0,0.0,40.0,1.0,1.0,1.0,2.0,900.0,660.0,...,6.0,2.0,0.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0


In [37]:
df.shape

(2497, 27)

## Creacion dataframe para el recomendador

In [25]:
df2 = pd.read_csv('data/df_nlp.csv')

In [26]:
df2.head(2)

Unnamed: 0,url,rating_promedio,cantidad_comentarios,comentarios,resultados,polaridad_media,subjetividad_media,palabras_mas_usadas,sentimiento
0,https://www.airbnb.es/rooms/100044781045691589...,4.645161,31,['apartamento bien situado para visitar barcel...,{'resultados_individuales': [{'comentario_orig...,0.353185,0.575261,"[('apartamento', 19), ('est', 16), ('limpio', ...",Positivo
1,https://www.airbnb.es/rooms/100051458827470719...,4.318182,44,['el departamento es bueno para el precio y la...,{'resultados_individuales': [{'comentario_orig...,0.290322,0.552236,"[('bien', 17), ('ubicacin', 16), ('apartamento...",Positivo


In [27]:
df2.shape

(2456, 9)

In [28]:
# Quitamos elementos que modifican el link
df2['url'] = df2['url'].str.replace('/reviews', '', regex=False)

In [29]:
df2.columns

Index(['url', 'rating_promedio', 'cantidad_comentarios', 'comentarios',
       'resultados', 'polaridad_media', 'subjetividad_media',
       'palabras_mas_usadas', 'sentimiento'],
      dtype='object')

In [30]:
df2 = df2.drop(columns = ['rating_promedio', 'comentarios', 'resultados'])

In [31]:
df_rec_st = pd.merge(left = df, right = df2, on = 'url', how = 'left')

In [32]:
df_rec_st

Unnamed: 0,url,prices_per_night,ratings,cleaning_fee,dormitorios,camas,baños,maximum_guests,check_in_hour,check_out_hour,...,privacidad y seguridad,seguridad en el hogar,servicios,habitacion,alojamiento entero,cantidad_comentarios,polaridad_media,subjetividad_media,palabras_mas_usadas,sentimiento
0,https://www.airbnb.es/rooms/126311759129279497...,115.0,0.00,0.0,1.0,1.0,1.0,2.0,900.0,720.0,...,0.0,1.0,4.0,0.0,0.0,,,,,
1,https://www.airbnb.es/rooms/127904085557632410...,46.0,0.00,15.0,1.0,1.0,0.5,1.0,1020.0,660.0,...,2.0,0.0,1.0,1.0,0.0,,,,,
2,https://www.airbnb.es/rooms/913187918206344111...,47.0,4.66,0.0,1.0,1.0,0.5,1.0,900.0,720.0,...,1.0,0.0,5.0,1.0,0.0,,,,,
3,https://www.airbnb.es/rooms/126566083301808951...,100.0,4.89,35.0,1.0,1.0,1.0,1.0,960.0,720.0,...,0.0,5.0,4.0,0.0,0.0,10.0,0.312544,0.443554,"[('nuevo', 4), ('excelente', 4), ('limpio', 3)...",Positivo
4,https://www.airbnb.es/rooms/31977850?adults=1&...,33.0,4.40,0.0,1.0,1.0,0.5,1.0,900.0,660.0,...,0.0,3.0,1.0,1.0,0.0,992.0,0.416332,0.562447,"[('buen', 279), ('ubicacin', 257), ('lugar', 2...",Positivo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2492,https://www.airbnb.es/rooms/875950395836707541...,55.0,4.74,10.0,1.0,1.0,0.5,3.0,900.0,660.0,...,3.0,0.0,1.0,1.0,0.0,97.0,0.441081,0.622422,"[('apartamento', 77), ('oscar', 57), ('metro',...",Positivo
2493,https://www.airbnb.es/rooms/40578133?adults=1&...,60.0,4.78,0.0,1.0,1.0,0.5,2.0,900.0,600.0,...,1.0,0.0,1.0,1.0,0.0,146.0,0.362983,0.560971,"[('apartamento', 114), ('limpio', 58), ('metro...",Positivo
2494,https://www.airbnb.es/rooms/22767740?adults=1&...,104.0,4.96,0.0,2.0,3.0,2.0,4.0,900.0,720.0,...,0.0,2.0,3.0,0.0,0.0,203.0,0.437329,0.611136,"[('apartamento', 141), ('playa', 105), ('est',...",Positivo
2495,https://www.airbnb.es/rooms/15809690?adults=1&...,120.0,4.83,50.0,1.0,1.0,1.0,2.0,900.0,660.0,...,0.0,2.0,1.0,0.0,0.0,368.0,0.230538,0.370519,"[('metro', 168), ('apartamento', 135), ('guill...",Positivo


In [33]:
df_rec_st['polaridad_media'] = df_rec_st['polaridad_media'].fillna(0)
df_rec_st['subjetividad_media'] = df_rec_st['subjetividad_media'].fillna(0)
df_rec_st['cantidad_comentarios'] = df_rec_st['cantidad_comentarios'].fillna(0)

In [34]:
df_rec_st.to_csv('data/df_rec_st.csv', index = False)

In [35]:
df_rec_st.columns

Index(['url', 'prices_per_night', 'ratings', 'cleaning_fee', 'dormitorios',
       'camas', 'baños', 'maximum_guests', 'check_in_hour', 'check_out_hour',
       'total_hours_checkin', 'log_num_reviews',
       'aparcamiento e instalaciones', 'baño', 'calefacción y refrigeración',
       'características de la ubicación', 'cocina y comedor',
       'dormitorio y lavandería', 'entretenimiento', 'exterior',
       'internet y oficina', 'para familias', 'privacidad y seguridad',
       'seguridad en el hogar', 'servicios', 'habitacion',
       'alojamiento entero', 'cantidad_comentarios', 'polaridad_media',
       'subjetividad_media', 'palabras_mas_usadas', 'sentimiento'],
      dtype='object')

In [55]:
df_rec_st['prices_per_night'].max()

450.0