In [79]:
import os
import nltk
import heapq
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from collections import defaultdict, Counter
from math import log, sqrt

# Descargar los recursos necesarios de NLTK
nltk.download('punkt')
nltk.download('stopwords')

# Inicializar el stemmer y las stopwords
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words("english"))

def tokenize(text):
    """ Tokeniza, elimina stopwords y aplica stemming a las palabras """
    tokens = word_tokenize(text.lower())
    filtered_tokens = [stemmer.stem(token) for token in tokens if token.isalpha() and token not in stop_words]
    return filtered_tokens

def compute_tf(tokens):
    """ Calcula la frecuencia de término (TF) """
    tf = Counter(tokens)
    total_terms = len(tokens)
    for term in tf:
        tf[term] /= total_terms
    return tf

def compute_idf(documents):
    """ Calcula la frecuencia inversa de documentos (IDF) """
    N = len(documents)
    df = defaultdict(int)
    for doc in documents:
        unique_terms = set(doc)
        for term in unique_terms:
            df[term] += 1
    idf = {term: log(N / df[term]) for term in df}
    return idf

def compute_norm(tf_idf):
    """ Calcula la norma de un documento """
    return sqrt(sum(weight**2 for weight in tf_idf.values()))

def spimi_invert(documents, block_size):
    """ Construye el índice invertido usando SPIMI """
    block_id = 0
    term_dict = defaultdict(lambda: defaultdict(float))
    doc_norms = {}
    idf = compute_idf(documents)
    
    for doc_id, tokens in enumerate(documents):
        tf = compute_tf(tokens)
        tf_idf = {term: tf[term] * idf[term] for term in tf}
        doc_norms[doc_id] = compute_norm(tf_idf)
        
        for term, weight in tf_idf.items():
            term_dict[term][doc_id] = weight
        
        # Si el bloque excede el tamaño permitido, escribir a disco y limpiar
        if len(term_dict) >= block_size:
            write_block_to_disk(term_dict, block_id)
            block_id += 1
            term_dict.clear()
    
    if term_dict:  # Escribir el último bloque si queda algo
        write_block_to_disk(term_dict, block_id)
        block_id += 1
    
    return doc_norms, block_id

def write_block_to_disk(term_dict, block_id):
    """ Escribe un bloque de índice invertido a disco """
    filename = f'block_{block_id}.txt'
    with open(filename, 'w', encoding='utf-8') as f:
        for term, postings in sorted(term_dict.items()):
            postings_list = ' '.join(f"{doc_id}:{weight}" for doc_id, weight in postings.items())
            f.write(f"{term}: {postings_list}\n")
    print(f"Escrito {filename}")

def merge_blocks(block_count, output_file):
    """ Fusiona bloques de índices invertidos en un solo archivo """
    heap = []
    file_pointers = [open(f'block_{i}.txt', 'r', encoding='utf-8') for i in range(block_count) if os.path.exists(f'block_{i}.txt')]
    for i, fp in enumerate(file_pointers):
        term, postings = read_next_line(fp)
        if term:
            heapq.heappush(heap, (term, postings, i))
    
    with open(output_file, 'w', encoding='utf-8') as f:
        while heap:
            term, postings, i = heapq.heappop(heap)
            f.write(f"{term}: {postings}\n")
            next_line = file_pointers[i].readline().strip()
            if next_line:
                next_term, next_postings = next_line.split(': ', 1)
                heapq.heappush(heap, (next_term, next_postings, i))
    
    for fp in file_pointers:
        fp.close()
    print(f"Índice invertido final escrito en {output_file}")

def read_next_line(file_pointer):
    """ Lee la siguiente línea de un archivo y retorna el término y sus postings """
    line = file_pointer.readline().strip()
    if line:
        term, postings = line.split(': ', 1)
        return term, postings
    return None, None

def cosine_similarity(query_vector, doc_vector, doc_norm):
    """ Calcula la similitud de coseno entre dos vectores """
    dot_product = sum(query_vector[term] * doc_vector.get(term, 0) for term in query_vector)
    return dot_product / doc_norm if doc_norm else 0.0

def process_query(query, idf, doc_norms, inverted_index, data, top_k=15):
    """ Procesa una consulta y retorna los top-k documentos relevantes con metadata """
    tokens = tokenize(query)
    tf_query = compute_tf(tokens)
    tf_idf_query = {term: tf_query[term] * idf.get(term, 0) for term in tf_query}
    query_norm = compute_norm(tf_idf_query)
    
    scores = defaultdict(float)
    
    for term, query_weight in tf_idf_query.items():
        if term in inverted_index:
            for doc_id, doc_weight in inverted_index[term].items():
                scores[doc_id] += query_weight * doc_weight
    
    ranked_scores = sorted(((score / (doc_norms[doc_id] * query_norm), doc_id) 
                            for doc_id, score in scores.items()), reverse=True)
    
    results = []
    for score, doc_id in ranked_scores[:top_k]:
        song_info = {
            'score': score,
            'track_name': data.iloc[doc_id]['track_name'],
            'track_artist': data.iloc[doc_id]['track_artist'],
            'track_album_name': data.iloc[doc_id]['track_album_name']
        }
        results.append(song_info)
    
    return results

# Ejemplo de uso
# Leer dataset desde un archivo CSV
file_path = 'spotify_songs.csv'  # Reemplaza con la ruta de tu archivo CSV
data = pd.read_csv(file_path)

# Concatenar los campos textuales
data['text'] = data['lyrics']

# Preprocesar los documentos
documents = [tokenize(text) for text in data['text'].dropna()]
block_size = 50000  # Ajusta el tamaño del bloque según la memoria disponible

# Construcción del índice invertido
doc_norms, total_blocks = spimi_invert(documents, block_size)
merge_blocks(total_blocks, 'final_inverted_index.txt')

# Crear un índice invertido en memoria secundaria para la consulta
inverted_index = defaultdict(dict)
with open('final_inverted_index.txt', 'r', encoding='utf-8') as f:
    for line in f:
        term, postings = line.split(': ', 1)
        for posting in postings.split():
            doc_id, weight = posting.split(':')
            inverted_index[term][int(doc_id)] = float(weight)

# Proceso de consulta
query = "Oh, thinkin' about our younger years There was only you and me We were young and wild and free Now nothin' can take you away from me We've been down that road before But that's over now You keep me comin' back for more Baby, you're all that I want When you're lyin' here in my arms I'm findin' it hard to believe We're in Heaven And love is all that I need And I found it there in your heart It isn't too hard to see We're in Heaven Oh, once in your life you find someone Who will turn your world around Bring you up when you're feelin' down Yeah, nothin' could change what you mean to me Oh, there's lots that I could say But just hold me now Cause our love will light the way And baby you're all that I want When you're lyin' here in my arms I'm finding it hard to believe We're in Heaven Yeah, love is all that I need And I found it there in your heart It isn't too hard to see We're in Heaven, yeah I've been waitin' for so long For somethin' to arrive For love to come along Now our dreams are comin' true Through the good times and the bad Yeah, I'll be standin' there by you And baby you're all that I want When you're lyin' here in my arms I'm findin' it hard to believe We're in Heaven And love is all that I need And I found it there in your heart It isn't too hard to see We're in Heaven, Heaven, woah You're all that I want You're all that I need"
idf = compute_idf(documents)
top_k_results = process_query(query, idf, doc_norms, inverted_index, data)

print("Top-k documentos relevantes:")
for result in top_k_results:
    print(f"Score: {result['score']}, Track Name: {result['track_name']}, Artist: {result['track_artist']}, Album: {result['track_album_name']}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vilch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vilch\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Escrito block_0.txt
Escrito block_1.txt
Escrito block_2.txt
Índice invertido final escrito en final_inverted_index.txt
Top-k documentos relevantes:
Score: 1.0, Track Name: Lost In The Rhythm - Original Mix, Artist: Jamie Berry, Album: Lost In The Rhythm
Score: 1.0, Track Name: I'll Whip Ya Head Boy, Artist: 50 Cent, Album: Window Shopper (International Version)
Score: 0.6655957411034531, Track Name: Cannonball, Artist: Damien Rice, Album: O
Score: 0.533389160947312, Track Name: Parking Lot, Artist: Anderson .Paak, Album: Malibu
Score: 0.516429629599365, Track Name: Rolling Stone, Artist: Ecko, Album: Rolling Stone
Score: 0.454531579210013, Track Name: Sunshine Of Your Love, Artist: Cream, Album: Disraeli Gears (Deluxe Edition)
Score: 0.4173687240152495, Track Name: Schmeckt (feat. LX), Artist: Hemso, Album: Schmeckt (feat. LX)
Score: 0.4074020475499432, Track Name: Glad You Came, Artist: The Wanted, Album: Battleground (Deluxe Edition)
Score: 0.4026404291827133, Track Name: Big Girls D

Documentos tokenizados:
Documento 0: ['minsan', 'pa', 'nang', 'ako', 'napalingon', 'hindi', 'ko', 'alam', 'na', 'ika', 'tutugon', 'sa', 'mga', 'tanong', 'na', 'ake', 'nabitawan', 'hindi', 'ko', 'alam', 'kung', 'ito', 'totoo', 'pangarap', 'ka', 'sa', 'bawat', 'sandali', 'langit', 'man', 'ang', 'tingin', 'ko', 'sayo', 'sana', 'marat', 'hanggang', 'dito', 'na', 'lang', 'yata', 'ang', 'kaya', 'kong', 'gawin', 'mangarap', 'na', 'lang', 'bumulong', 'sa', 'hangin', 'kailan', 'kaya', 'darat', 'ulit', 'ang', 'isang', 'sandali', 'na', 'ako', 'lilingon', 'muli', 'pangarap', 'ka', 'tinig', 'mong', 'kay', 'lamig', 'ang', 'iyong', 'mga', 'ngiti', 'na', 'sa', 'akin', 'ay', 'nakapagbigay', 'pansin', 'ikaw', 'ba', 'ay', 'isang', 'pangarap', 'lang', 'pangarap', 'ka', 'tinig', 'mong', 'kay', 'lamig', 'ang', 'iyong', 'mga', 'ngiti', 'na', 'sa', 'akin', 'ay', 'nakapagbigay', 'pangarap', 'ka', 'tinig', 'mong', 'kay', 'lamig', 'ang', 'iyong', 'mga', 'ngiti', 'na', 'sa', 'akin', 'ay', 'nakapagbigay', 'pangara

NameError: name 'tf_idf_query' is not defined