<!DOCTYPE html>
<html>
<head>
  <title>Portada del Proyecto</title>
  <style>
    body {
      font-family: Arial, sans-serif;
      margin: 0;
      padding: 0;
    }
    .container {
      text-align: center;
      padding: 50px;
    }
    .logo {
      float: left;
    }
    .title {
      font-size: 24px;
      font-weight: bold;
      margin-bottom: 20px;
    }
    .subtitle {
      font-size: 18px;
      margin-bottom: 10px;
    }
  </style>
</head>
<body>
  <div class="container">
    <img class="logo" src="https://upload.wikimedia.org/wikipedia/commons/thumb/c/ca/Escudo-UNAM-escalable.svg/1024px-Escudo-UNAM-escalable.svg.png" alt="Logo UNAM" width="100" height="100">
          <img class="logo" src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSbYMZtDu3nUCwGimNRUJ0D3cj1BCj4gMUVfDW_nus&s" alt="Logo FES Acatlán" width="100" height="100">
    <div class="title">Título del Proyecto</div>
    <div class="subtitle">Nombre del Alumno</div>
    <div class="subtitle">Nombre de la Materia</div>

  </div>
</body>
</html>


In [14]:
import pandas as pd
import numpy as np
import threading
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\franc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Procesamiento de datos

In [28]:
# Columns the recommender needs, listed in the order later cells rely on:
# find_best_match reads the frame by position (column 0 = listing_url,
# column 1 = description).
LISTING_COLUMNS = [
    "listing_url", "description", "property_type", "room_type", "amenities",
    "neighborhood_overview", "neighbourhood_cleansed",
]

# Parse only the needed columns instead of loading every column and slicing
# afterwards. `usecols` keeps the file's column order, so reindex restores the
# explicit order above.
df_listings = pd.read_csv("./listings.csv", usecols=LISTING_COLUMNS).reindex(columns=LISTING_COLUMNS)

In [29]:
# Replace missing free-text values with empty strings in both text columns at
# once, so the regex cleanup and string concatenation below never see NaN.
df_listings[["description", "neighborhood_overview"]] = (
    df_listings[["description", "neighborhood_overview"]].fillna("")
)

In [30]:
# Drop quote and square-bracket characters from the serialized amenities list.
df_listings["amenities"] = df_listings["amenities"].apply(
    lambda raw: raw.translate(str.maketrans("", "", "'\"[]"))
)
# Remove HTML tags (non-greedy `<...>` match) from the listing description.
df_listings["description"] = df_listings["description"].apply(
    lambda text: re.compile(r"<.*?>").sub("", text)
)

In [31]:
# Build one labelled text blob per listing by concatenating every descriptive
# field; a NaN in any field makes the whole blob NaN for that row.
df_listings["full_description"] = (
    "Descripcion: " + df_listings["description"]
    + " Tipo de propiedad: " + df_listings["property_type"]
    + " Tipo de habitación: " + df_listings["room_type"]
    + " Amenidades: " + df_listings["amenities"]
    + " Descripción vecindario: " + df_listings["neighborhood_overview"]
    + " Vecindario: " + df_listings["neighbourhood_cleansed"]
)

In [32]:
# Reduced view: only the listing URL and the combined full_description text.
df_listings_re = df_listings[['listing_url','full_description']]

In [33]:
# Bare last expression: the notebook renders the reduced frame as the cell output.
df_listings_re

Unnamed: 0,listing_url,full_description
0,https://www.airbnb.com/rooms/35797,Descripcion: Dentro de Villa un estudio de art...
1,https://www.airbnb.com/rooms/783080,Descripcion: The spaceLocated in a private str...
2,https://www.airbnb.com/rooms/44616,Descripcion: A new concept of hosting in mexic...
3,https://www.airbnb.com/rooms/56074,Descripcion: This great apartment is located i...
4,https://www.airbnb.com/rooms/783480,Descripcion: The spaceTwo beautifully furnishe...
...,...,...
23009,https://www.airbnb.com/rooms/791614634401456441,Descripcion: The 2BR apartment is located near...
23010,https://www.airbnb.com/rooms/791635242537214248,Descripcion: The 2BR apartment is located in t...
23011,https://www.airbnb.com/rooms/791890581723840464,Descripcion: The 1BR apartment is located in t...
23012,https://www.airbnb.com/rooms/791934034051102306,Descripcion: Capitalia - Magna ResidencialResi...


### Cálculo de similitud

In [None]:
def split_dataframe(df, num_parts):
    """Split ``df`` row-wise into ``num_parts`` consecutive DataFrames.

    Part sizes follow ``np.array_split``: the first ``len(df) % num_parts``
    parts receive one extra row, and parts may be empty when ``num_parts``
    exceeds the number of rows. The split uses positional slicing instead of
    ``np.array_split`` because the latter goes through the deprecated
    ``DataFrame.swapaxes`` on recent pandas versions.

    Args:
        df: DataFrame to partition; row order and index labels are preserved.
        num_parts: Number of partitions to produce (must be > 0).

    Returns:
        list of DataFrame slices, in original row order.

    Raises:
        ValueError: if ``num_parts`` is not positive.
    """
    num_parts = int(num_parts)
    if num_parts <= 0:
        raise ValueError("number sections must be larger than 0.")

    base_size, remainder = divmod(len(df), num_parts)
    parts = []
    start = 0
    for part_no in range(num_parts):
        # The first `remainder` parts absorb the rows left over by the division.
        stop = start + base_size + (1 if part_no < remainder else 0)
        parts.append(df.iloc[start:stop])
        start = stop
    return parts

In [None]:
def compute_tfidf(sub_df, vectorizer, tfidf_mtx, start_idx):
    """Vectorise one partition and store its rows in the shared matrix.

    Args:
        sub_df: Partition of the listings frame; its ``'description'`` column
            is what gets vectorised.
        vectorizer: Fitted object exposing ``transform(iterable_of_text)``.
        tfidf_mtx: Pre-allocated 2-D array that receives the result in place.
        start_idx: Row of ``tfidf_mtx`` where this partition begins; rows
            ``start_idx : start_idx + len(sub_df)`` are overwritten.

    Returns:
        None. ``tfidf_mtx`` is modified in place.
    """
    sub_tfidf_mtx = vectorizer.transform(sub_df['description'])

    # A SciPy sparse result cannot be assigned into a dense NumPy slice, so
    # densify it first. The failed assignment used to kill the worker thread
    # and leave the buffer's rows unwritten.
    if hasattr(sub_tfidf_mtx, "toarray"):
        sub_tfidf_mtx = sub_tfidf_mtx.toarray()

    # No lock here. The previous `with threading.Lock():` created a new lock on
    # every call, so it never excluded another thread. Callers are expected to
    # give each partition its own non-overlapping row range.
    tfidf_mtx[start_idx:start_idx + len(sub_df)] = sub_tfidf_mtx

In [87]:
import pandas as pd
from collections import defaultdict
import threading

def create_inverted_index_partial(sub_df, column, shared_inverted_index, lock):
    """Index one partition and merge it into the shared inverted index.

    Each distinct lower-cased, whitespace-separated token of ``sub_df[column]``
    is mapped to the set of row labels (``sub_df.index`` values) containing it.

    Args:
        sub_df: Partition of the DataFrame to index.
        column: Name of the text column to tokenise.
        shared_inverted_index: Mapping ``token -> set of row labels`` shared by
            all workers; it must create an empty set for unseen keys (e.g.
            ``defaultdict(set)``). Updated in place.
        lock: Lock guarding ``shared_inverted_index``.

    Returns:
        None.
    """
    # Build the partition's index without holding the lock, so workers do not
    # contend on it once per word.
    local_index = defaultdict(set)
    for idx, text in sub_df[column].items():
        if not isinstance(text, str):
            # Missing or non-text cells have no tokens. Skipping them avoids an
            # exception that would silently end this worker thread.
            continue
        for word in set(text.lower().split()):
            local_index[word].add(idx)

    # Single critical section: merge everything in one lock acquisition.
    with lock:
        for word, row_labels in local_index.items():
            shared_inverted_index[word].update(row_labels)

def create_inverted_index_multithreaded(df, column, num_threads=4):
    """Build a ``token -> set of row labels`` index of ``df[column]`` using threads.

    The text is lower-cased and Spanish stopwords are removed before indexing.
    The frame is then split into ``num_threads`` partitions, each indexed by
    ``create_inverted_index_partial`` in its own thread.

    Args:
        df: DataFrame holding the text to index. It is not modified.
        column: Name of the text column; every value must be a string.
        num_threads: Number of partitions / worker threads.

    Returns:
        defaultdict(set) mapping each remaining token to the index labels of
        the rows that contain it.
    """
    # Fetch the stopword list once and keep it as a set. The previous code
    # called stopwords.words('spanish') again for every single word.
    spanish_stopwords = set(stopwords.words('spanish'))

    def _strip_stopwords(text):
        lowered = (token.lower() for token in text.split())
        return ' '.join(token for token in lowered if token not in spanish_stopwords)

    # Work on a one-column copy so the caller's DataFrame is left untouched.
    cleaned = df[[column]].copy()
    cleaned[column] = cleaned[column].apply(_strip_stopwords)

    shared_inverted_index = defaultdict(set)
    lock = threading.Lock()

    threads = [
        threading.Thread(
            target=create_inverted_index_partial,
            args=(sub_df, column, shared_inverted_index, lock),
        )
        for sub_df in split_dataframe(cleaned, num_threads)
    ]

    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    return shared_inverted_index

# Example usage
column = 'description'
num_threads = 4
# Fixed seed: the sampled listings, and therefore the index contents, are the
# same on every run of the notebook.
inverted_index = create_inverted_index_multithreaded(
    df_listings.sample(1000, random_state=42).dropna(), column, num_threads
)


In [94]:
# Sanity check: is the token 'cerca' one of the keys of the inverted index?
'cerca' in inverted_index

True

In [97]:
def find_best_match(record, inverted_index, df, column, threshold):
    """Find the indexed row that shares the most words with ``record``.

    ``record`` is lower-cased and split on whitespace. Every indexed row gets
    one point per distinct query word it contains, and the row with the most
    points wins. Its score is ``points / number of distinct query words``.

    Args:
        record: Free-text query.
        inverted_index: Mapping ``word -> iterable of row labels of df``.
        df: DataFrame the labels refer to. The result is read by column
            position: column 1 first, then column 0.
        column: Accepted for interface compatibility; not used by the lookup.
        threshold: Minimum score (0..1) required to accept the best row.

    Returns:
        ``(df value in column 1, df value in column 0)`` for the best row when
        its score reaches ``threshold``; ``("NE", None)`` when it does not; the
        bare string ``"No se encontro Similar"`` when no query word is indexed.
        NOTE(review): the last case is a string rather than a 2-tuple, so
        callers that unpack the result must check for it first.
    """
    words = set(record.lower().split())
    matched_records = defaultdict(int)

    # Iterate the words in sorted order so that ties between rows no longer
    # depend on string hash randomisation (max() keeps the first best key).
    for word in sorted(words):
        if word in inverted_index:
            for idx in inverted_index[word]:
                matched_records[idx] += 1

    if not matched_records:
        return "No se encontro Similar"

    best_match = max(matched_records, key=matched_records.get)
    match_score = matched_records[best_match] / len(words)

    if match_score < threshold:
        return "NE", None

    # The index stores row *labels*, so translate the label to a position
    # before using iloc. Using the label directly as a position is only correct
    # for a default RangeIndex.
    row_pos = df.index.get_loc(best_match)
    return df.iloc[row_pos, 1], df.iloc[row_pos, 0]

# Example usage
column = 'description'
# Minimum fraction of the query's distinct words that the best listing must contain.
threshold = 0.1
record =  "Acogedor apartamento cerca del centro con WiFi gratuito y cocina completa."

# The full df_listings frame is passed so the matched row can be looked up in it.
result = find_best_match(record, inverted_index, df_listings, column, threshold)
print(result)


defaultdict(<class 'int'>, {7172: 1, 7176: 1, 16912: 2, 15889: 1, 15383: 2, 6687: 2, 8230: 3, 5158: 1, 9770: 2, 5166: 3, 563: 2, 7734: 1, 11832: 3, 570: 1, 21563: 4, 9276: 2, 6209: 1, 16453: 1, 17989: 1, 3657: 1, 3658: 1, 12875: 2, 2637: 3, 5709: 3, 11856: 2, 2642: 2, 15958: 1, 6231: 2, 3161: 2, 1630: 3, 21598: 3, 22117: 1, 8816: 1, 17524: 2, 629: 1, 13945: 2, 4220: 2, 19581: 2, 11904: 1, 22661: 3, 14478: 1, 6802: 2, 11925: 2, 15512: 2, 18079: 4, 2211: 4, 2723: 3, 5289: 1, 15023: 2, 10927: 2, 5811: 2, 6332: 2, 702: 1, 17093: 2, 5831: 1, 22217: 1, 18635: 2, 8399: 3, 22225: 2, 16595: 1, 7893: 2, 3286: 3, 16089: 1, 3290: 1, 17115: 3, 21730: 2, 5860: 1, 16620: 1, 8429: 2, 15608: 2, 12539: 2, 17662: 1, 1282: 2, 6918: 2, 15114: 1, 13066: 2, 20751: 1, 6417: 2, 5907: 2, 10231: 1, 22811: 1, 8993: 3, 8482: 2, 3361: 2, 16676: 1, 2346: 2, 10046: 2, 1855: 2, 20290: 3, 10564: 3, 8008: 1, 3914: 2, 16205: 2, 11087: 2, 7512: 3, 3417: 2, 7000: 2, 859: 1, 4957: 2, 22880: 3, 13667: 2, 5987: 1, 12645: 1, 1

In [85]:
def recomendador_lugares(df_listings, desc_client, num_threads, top_n=10):
    """Recommend the listings whose description is most similar to a query.

    A TF-IDF vectorizer is fitted on ``df_listings['description']``. The frame
    is split into ``num_threads`` partitions that are vectorised in worker
    threads, and the listings are ranked by cosine similarity to
    ``desc_client``.

    Args:
        df_listings: DataFrame with a text ``'description'`` column.
        desc_client: Free-text description of what the client is looking for.
        num_threads: Number of partitions / worker threads.
        top_n: How many listings to return (default 10, the previous fixed value).

    Returns:
        DataFrame with the ``top_n`` most similar rows of ``df_listings``,
        ordered from most to least similar.
    """
    # Local imports: only this function needs them.
    from concurrent.futures import ThreadPoolExecutor
    from scipy.sparse import vstack

    # Fit on the whole frame so every partition shares the same vocabulary.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(df_listings['description'])

    # Partitions can be empty when there are fewer rows than threads; skip them.
    df_parts = [part for part in split_dataframe(df_listings, num_threads) if len(part)]

    def _vectorize(part):
        return vectorizer.transform(part['description'])

    # Each worker returns its own matrix, so no shared dense buffer of
    # len(df) x vocabulary uninitialised floats is needed. pool.map keeps the
    # partition order and re-raises any worker exception here instead of
    # letting it vanish inside a thread.
    with ThreadPoolExecutor(max_workers=max(1, len(df_parts))) as pool:
        part_matrices = list(pool.map(_vectorize, df_parts))

    tfidf_mtx = vstack(part_matrices, format="csr")

    # Cosine similarity between the query and every listing.
    query_tfidf = vectorizer.transform([desc_client])
    similarities = cosine_similarity(query_tfidf, tfidf_mtx)

    # Highest similarity first, then keep the first top_n positions.
    top_indices = np.argsort(similarities[0])[::-1][:max(top_n, 0)]
    return df_listings.iloc[top_indices]

In [83]:
# NOTE(review): this cell originally left `desc_client` without a value, which
# is a syntax error. The query below is a placeholder example; replace it with
# the real client description.
desc_client = "Acogedor apartamento cerca del centro con WiFi gratuito y cocina completa."
num_threads = 4
recomendaciones = recomendador_lugares(df_listings.sample(1000).dropna(), desc_client,num_threads)
recomendaciones

Unnamed: 0,listing_url,description
9350,https://www.airbnb.com/rooms/42177681,"available for any kind of stays (short, medium..."
5611,https://www.airbnb.com/rooms/30697115,"ambiente cómodo, tranquilo chalet mejor zona m..."
8635,https://www.airbnb.com/rooms/40267669,"habitación cama queen 1 litera 2 4 personas, c..."
3960,https://www.airbnb.com/rooms/23753503,"linda recámara servicios (luz, agua, internet)..."
22312,https://www.airbnb.com/rooms/776140661950463677,"relax in this cozy space in condesa, ideal for..."
13797,https://www.airbnb.com/rooms/51900789,"disfruta pequeño, práctico totalmente equipado..."
21264,https://www.airbnb.com/rooms/746676268708326985,quedarte departamento permitirá vivir verdader...
5895,https://www.airbnb.com/rooms/31786783,beautiful and comfortable apartment 2 blocks f...
15894,https://www.airbnb.com/rooms/552963521405241437,departamento 2 recamaras colonia roma<br />ser...
9115,https://www.airbnb.com/rooms/41496693,beautiful spacious apartment home in gated com...
