<a href="https://colab.research.google.com/github/DevDope/Abracadabra/blob/main/NEMO_curator_and_JSON_with_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nemo_toolkit[all]
!pip install pynini
!pip install git+https://github.com/NVIDIA/NeMo-text-processing.git

In [None]:
!pip install llama-index
!pip install llama-index-readers-web
!pip install llama-index-readers-file
!pip install llama-index-llms-huggingface
!pip install llama-index-llms-huggingface-api
!pip install llama-index-embeddings-huggingface
!pip uninstall transformers -y
!pip uninstall sentence-transformers -y
!pip install transformers==4.40.0
!pip install sentence-transformers
!pip install nemo_text_processing





In [None]:
!pip show llama-index

In [None]:
import json


# Mount Google Drive at /content/drive; the dataset paths used by the later
# cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import os
from tqdm import tqdm
from nemo_text_processing.text_normalization.normalize import Normalizer
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer


# Input: JSON-lines dataset to curate (one record per line).
file_path = '/content/drive/MyDrive/final_milliondataset_BERT_500K_cleaned.json'

# Output: curated JSON-lines file written by the curation loop.
curated_file_path = '/content/drive/MyDrive/final_milliondataset_curated2.json'

# Number of records read and processed per chunk.
batch_size = 10000

# Count the input lines up front so the progress bar has a total.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
with open(file_path, 'r', encoding='utf-8') as _line_count_file:
    total_lines = sum(1 for _ in _line_count_file)


# Shared normalizer instances. They are built on first use and reused for every
# batch, instead of being reconstructed on each call.
_NEMO_NORMALIZER_CACHE = {}


def _get_nemo_normalizers():
    """Return the shared ``(Normalizer, InverseNormalizer)`` pair, creating it on first use."""
    if 'pair' not in _NEMO_NORMALIZER_CACHE:
        _NEMO_NORMALIZER_CACHE['pair'] = (
            Normalizer(input_case='lower_cased'),
            InverseNormalizer(input_case='lower_cased'),
        )
    return _NEMO_NORMALIZER_CACHE['pair']


def _text_or_empty(value):
    """Coerce a cell value to the string handed to the normalizers.

    Strings pass through unchanged. ``None``, NaN/NA and other falsy values
    become ``''``. NaN needs the explicit check because it is truthy, so a
    plain ``value or ''`` would forward a float to the normalizer. Any other
    value is converted with ``str``.
    """
    if isinstance(value, str):
        return value
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ''
    return str(value) if value else ''


def curar_datos_con_nemo(batch):
    """Normalize the text fields of one batch of records.

    Args:
        batch: ``pandas.DataFrame`` of records. The ``'song'``, ``'Artist(s)'``
            and ``'Release Date'`` columns are read; a missing column is
            treated as empty text and added to the result.

    Returns:
        A new ``pandas.DataFrame`` with the same rows, where ``'song'`` and
        ``'Artist(s)'`` have been passed through ``Normalizer.normalize`` and
        ``'Release Date'`` through ``InverseNormalizer.inverse_normalize``.
        The input frame is not modified.
    """
    normalizer, inverse_normalizer = _get_nemo_normalizers()

    curated_batch = []
    for _, row in batch.iterrows():
        # Work on an explicit copy so the caller's frame is never written to.
        row = row.copy()

        row['song'] = normalizer.normalize(_text_or_empty(row.get('song', '')))
        row['Artist(s)'] = normalizer.normalize(_text_or_empty(row.get('Artist(s)', '')))
        row['Release Date'] = inverse_normalizer.inverse_normalize(
            _text_or_empty(row.get('Release Date', '')), verbose=False
        )

        curated_batch.append(row)

    return pd.DataFrame(curated_batch)


# Ceiling division: a trailing partial batch still counts as a batch, so the
# progress bar total and the per-batch message match what is actually processed.
total_batches = -(-total_lines // batch_size)

# Stream the input in chunks, curate each chunk, and append it to the output
# file as JSON lines so the whole dataset never has to sit in memory.
with pd.read_json(file_path, lines=True, chunksize=batch_size) as reader:
    with open(curated_file_path, 'w', encoding='utf-8') as f_out:
        for batch_num, batch in enumerate(tqdm(reader, total=total_batches, desc="Procesando dataset")):
            batch_curated = curar_datos_con_nemo(batch)

            payload = batch_curated.to_json(orient='records', lines=True, force_ascii=False)
            # Make sure consecutive batches are separated by a newline even if
            # the installed pandas does not terminate line-delimited output.
            if payload and not payload.endswith('\n'):
                payload += '\n'
            f_out.write(payload)

            # tqdm.write prints the message without corrupting the progress bar.
            tqdm.write(f"Lote {batch_num + 1} de {total_batches} procesado y guardado.")

print(f"Curación completa. Los datos han sido guardados en {curated_file_path}")



In [None]:
import os
import json
from tqdm import tqdm
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


# Locations on the mounted Drive used by the embedding step.
# NOTE(review): base_dir is not referenced by any of the visible cells; confirm
# whether it is still needed.
base_dir = "/content/drive/MyDrive/llama_index2"
# NOTE(review): the curation cell writes 'final_milliondataset_curated2.json',
# while this step reads 'final_milliondataset_curated.json'. Confirm that this
# is the intended input, otherwise a fresh run embeds an older file.
document_path = "/content/drive/MyDrive/final_milliondataset_curated.json"
# JSON-lines output: each input record plus an 'embedding' field.
output_json_path = "/content/drive/MyDrive/final_milliondataset_with_embeddings2.json"


# Hugging Face embedding model used to embed every document.
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2")

# Read one JSON-lines record at a time, embed it, and write it straight to the
# output file. Streaming keeps memory use flat (no list holding every record
# together with its embedding) and means a crash partway through leaves the
# records processed so far on disk. Downstream code should read
# output_json_path rather than expect an in-memory list.
with open(document_path, 'r', encoding='utf-8') as f, \
        open(output_json_path, 'w', encoding='utf-8') as f_out:
    for line in tqdm(f, desc="Generando embeddings", unit=" documento"):
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
        if not line.strip():
            continue

        doc_data = json.loads(line)

        song = doc_data.get('song', '')
        artist = doc_data.get('Artist(s)', '')
        # `or ''` covers records whose 'text' is present but null; only the
        # first 500 characters of the lyrics go into the embedded text.
        text = (doc_data.get('text') or '')[:500]
        emotion = doc_data.get('emotion', '')

        # Single text describing the record; this is what gets embedded.
        doc_text = f"Song: {song} by {artist}. Lyrics: {text}. Emotion: {emotion}."

        doc_data['embedding'] = embed_model.get_text_embedding(doc_text)

        f_out.write(json.dumps(doc_data) + '\n')

print(f"Embeddings generados y guardados en {output_json_path}")

