In [1]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
import re
from openai import OpenAI
import ast

# Cargar variables de entorno desde un archivo .env
load_dotenv()

# Importar funciones de gestión
from utils.chat_gpt_manager import (
    get_instruments_chat,
    get_chords_chat
)
from utils.spotify_manager import (
    get_similar_songs,
    get_track_id_spotify,
    get_audio_analysis,
    get_audio_features
)
from utils.functions import (
    add_info_list_df,
    convert_chords_with_minor,
    standardize_chord_symbols,
    transpose_chords_to_key,
    check_track_csv,
    clean_metadata,
    filter_existing_vector_id,
    expand_list_columns,
    to_embbed_text_column, 
    expand_and_remove_original_columns,
    drop_columns_with_many_nulls

)
from utils.lyrics_manager import insert_lyrics_db

from pinecone import Pinecone, ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone

In [2]:
# Pinecone credentials come from the environment (filled in by load_dotenv()).
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_ENVIRONMENT = os.environ.get('PINECONE_ENVIRONMENT')

# Settings of the vector index used by the cells below.
index_name = "song-starter-index"
dimension = 1536
metric = "cosine"

# Pinecone client shared by the rest of the notebook.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [3]:
# Check whether the index already exists, using the list of index names
# exposed by the client instead of dict-style access on the response object.
index_exists = index_name in pc.list_indexes().names()

if not index_exists:
    # Create the serverless index only when it is missing, so this cell can be
    # re-run without failing on an already existing index.
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",  # target region for the serverless index
        ),
        deletion_protection="disabled",
    )
    print(f"Index '{index_name}' created successfully.")
else:
    print(f"Index '{index_name}' already exists, skipping creation.")

Index 'song-starter-index' already exists, skipping creation.


In [4]:
# Handle to the target index; the upsert at the end of the notebook goes through it.
index = pc.Index(index_name)

In [5]:
# Read the OpenAI key from the environment (filled in by load_dotenv()).
# The key is passed explicitly to the OpenAI client constructor; the v1 client
# does not read a class-level `OpenAI.api_key`, so nothing is assigned there.
api_key = os.getenv("OPENAI_API_KEY")

In [6]:
# OpenAI client; the embedding requests further down are issued through it.
client = OpenAI(api_key=api_key)

In [7]:
# Load the song table from the temporary CSV file (path is relative to the notebook's working directory).
df_csv= pd.read_csv('mi_archivo_temporal.csv')

In [8]:
# Columns handed to expand_and_remove_original_columns in the next cell.
columnas_listas = [
    'instruments',
    'chords_1',
    'chords_2',
    'chords_1_numeric',
    'chords_2_numeric',
    'normal_chords_1_numeric',
    'normal_chords_2_numeric',
    'diff_chords_1_numeric',
    'diff_chords_2_numeric',
]

In [9]:
# Run the project helper over the columns named in columnas_listas. Judging by the
# output below, each of them is replaced by numbered columns (chords_1_1, chords_1_2, ...).
df_csv=expand_and_remove_original_columns(df_csv,columnas_listas )

In [10]:
# Display the frame after the list columns were expanded.
df_csv

Unnamed: 0,id,name,artists,speechiness,instrumentalness,key,mode,tempo,loudness,theme,...,diff_chords_1_numeric_1,diff_chords_1_numeric_2,diff_chords_1_numeric_3,diff_chords_2_numeric_1,diff_chords_2_numeric_2,diff_chords_2_numeric_3,diff_chords_2_numeric_4,diff_chords_2_numeric_5,diff_chords_2_numeric_6,diff_chords_2_numeric_7
0,2ISUiwj8xXqHeUFcen0AIU,Grow A Pear,Kesha,0.0508,0.000341,0,1,120.011,-3.891,Last night I had enough of you I put down the ...,...,7,14,-16,2.0,-7.0,21.0,,,,
1,0lUSd7TCG8srh6HpIAEIWL,Power,Little Mix,0.2110,0.000157,7,1,172.966,-4.016,"Hold up, no you didn't bow, bow I ain't the ch...",...,7,2,-4,7.0,2.0,-4.0,,,,
2,1YaVmBh7EAeR54FIjuFcb5,Naturally,Selena Gomez & The Scene,0.0511,0.000001,10,0,132.612,-5.406,"The theme of ""Naturally"" by Selena Gomez & The...",...,7,14,-16,7.0,-2.0,2.0,,,,
3,00ko8Vr9Hbw0hJC9laqBOj,Hannah Montana,flowerovlove,0.0809,0.000000,0,1,114.073,-7.922,"The song ""Hannah Montana"" by flowerovlove expl...",...,7,14,-16,2.0,9.0,-16.0,,,,
4,5iV4zM0LZKiiRoUj2zVWSD,Very Online Guy,Alvvays,0.0670,0.000002,7,1,143.039,-3.766,He's a very online guy He likes to hit reply H...,...,7,14,-16,7.0,-2.0,-5.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656,2fH6AFQ5oId9AW0b4YsZtN,Barakaldo,Lisasinson,0.0312,0.002650,2,0,187.074,-8.196,"The song ""Barakaldo"" by Lisasinson explores th...",...,7,14,-16,7.0,14.0,-16.0,,,,
657,4OTdFcuAndBtoPJTsBIyQi,Lorelai,Fleet Foxes,0.0351,0.029700,2,1,144.023,-7.700,So guess I got old I was like trash on the sid...,...,7,14,-16,7.0,-2.0,-5.0,,,,
658,1iPeDwic3VEt0T75NyVoaG,The Dream Synopsis,The Last Shadow Puppets,0.0374,0.000000,4,1,129.906,-8.611,Well we were kissing It was secret We'd had to...,...,7,14,-16,-4.0,9.0,-19.0,,,,
659,0LxeKwg9t7HOnyfv4bTALT,Suck It and See,Arctic Monkeys,0.0570,0.114000,4,1,128.291,-5.017,Your love is like a studded leather headlock Y...,...,7,14,-16,7.0,14.0,-16.0,,,,


In [11]:
# Drop the columns in which every single value is null.
df_csv = df_csv.dropna(axis='columns', how='all')

In [12]:
# Display the frame after the all-null columns were removed.
df_csv

Unnamed: 0,id,name,artists,speechiness,instrumentalness,key,mode,tempo,loudness,theme,...,diff_chords_1_numeric_1,diff_chords_1_numeric_2,diff_chords_1_numeric_3,diff_chords_2_numeric_1,diff_chords_2_numeric_2,diff_chords_2_numeric_3,diff_chords_2_numeric_4,diff_chords_2_numeric_5,diff_chords_2_numeric_6,diff_chords_2_numeric_7
0,2ISUiwj8xXqHeUFcen0AIU,Grow A Pear,Kesha,0.0508,0.000341,0,1,120.011,-3.891,Last night I had enough of you I put down the ...,...,7,14,-16,2.0,-7.0,21.0,,,,
1,0lUSd7TCG8srh6HpIAEIWL,Power,Little Mix,0.2110,0.000157,7,1,172.966,-4.016,"Hold up, no you didn't bow, bow I ain't the ch...",...,7,2,-4,7.0,2.0,-4.0,,,,
2,1YaVmBh7EAeR54FIjuFcb5,Naturally,Selena Gomez & The Scene,0.0511,0.000001,10,0,132.612,-5.406,"The theme of ""Naturally"" by Selena Gomez & The...",...,7,14,-16,7.0,-2.0,2.0,,,,
3,00ko8Vr9Hbw0hJC9laqBOj,Hannah Montana,flowerovlove,0.0809,0.000000,0,1,114.073,-7.922,"The song ""Hannah Montana"" by flowerovlove expl...",...,7,14,-16,2.0,9.0,-16.0,,,,
4,5iV4zM0LZKiiRoUj2zVWSD,Very Online Guy,Alvvays,0.0670,0.000002,7,1,143.039,-3.766,He's a very online guy He likes to hit reply H...,...,7,14,-16,7.0,-2.0,-5.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656,2fH6AFQ5oId9AW0b4YsZtN,Barakaldo,Lisasinson,0.0312,0.002650,2,0,187.074,-8.196,"The song ""Barakaldo"" by Lisasinson explores th...",...,7,14,-16,7.0,14.0,-16.0,,,,
657,4OTdFcuAndBtoPJTsBIyQi,Lorelai,Fleet Foxes,0.0351,0.029700,2,1,144.023,-7.700,So guess I got old I was like trash on the sid...,...,7,14,-16,7.0,-2.0,-5.0,,,,
658,1iPeDwic3VEt0T75NyVoaG,The Dream Synopsis,The Last Shadow Puppets,0.0374,0.000000,4,1,129.906,-8.611,Well we were kissing It was secret We'd had to...,...,7,14,-16,-4.0,9.0,-19.0,,,,
659,0LxeKwg9t7HOnyfv4bTALT,Suck It and See,Arctic Monkeys,0.0570,0.114000,4,1,128.291,-5.017,Your love is like a studded leather headlock Y...,...,7,14,-16,7.0,14.0,-16.0,,,,


In [13]:
# Number of null values in each column.
df_csv.isna().sum()

id                             0
name                           0
artists                        0
speechiness                    0
instrumentalness               0
key                            0
mode                           0
tempo                          0
loudness                       0
theme                          0
instruments_1                  0
instruments_2                  0
chords_1_1                     0
chords_1_2                     0
chords_1_3                     0
chords_1_4                     0
chords_2_1                     4
chords_2_2                     4
chords_2_3                     4
chords_2_4                     4
chords_2_5                   659
chords_2_6                   659
chords_2_7                   659
chords_2_8                   659
chords_1_numeric_1             0
chords_1_numeric_2             0
chords_1_numeric_3             0
chords_1_numeric_4             0
chords_2_numeric_1             4
chords_2_numeric_2             4
chords_2_n

In [14]:

# Drop the columns with more than 600 null values (the exact threshold
# comparison lives in the project helper), then recount the nulls per column.
df_csv = drop_columns_with_many_nulls(df_csv, 600)

df_csv.isnull().sum()

id                           0
name                         0
artists                      0
speechiness                  0
instrumentalness             0
key                          0
mode                         0
tempo                        0
loudness                     0
theme                        0
instruments_1                0
instruments_2                0
chords_1_1                   0
chords_1_2                   0
chords_1_3                   0
chords_1_4                   0
chords_2_1                   4
chords_2_2                   4
chords_2_3                   4
chords_2_4                   4
chords_1_numeric_1           0
chords_1_numeric_2           0
chords_1_numeric_3           0
chords_1_numeric_4           0
chords_2_numeric_1           4
chords_2_numeric_2           4
chords_2_numeric_3           4
chords_2_numeric_4           4
normal_chords_1_numeric_1    0
normal_chords_1_numeric_2    0
normal_chords_1_numeric_3    0
normal_chords_1_numeric_4    0
normal_c

In [15]:
# Drop every row that still contains at least one null value.
df_csv = df_csv.dropna(axis=0, how='any')

In [16]:
# Recount the nulls per column; every count is expected to be zero now.
df_csv.isna().sum()

id                           0
name                         0
artists                      0
speechiness                  0
instrumentalness             0
key                          0
mode                         0
tempo                        0
loudness                     0
theme                        0
instruments_1                0
instruments_2                0
chords_1_1                   0
chords_1_2                   0
chords_1_3                   0
chords_1_4                   0
chords_2_1                   0
chords_2_2                   0
chords_2_3                   0
chords_2_4                   0
chords_1_numeric_1           0
chords_1_numeric_2           0
chords_1_numeric_3           0
chords_1_numeric_4           0
chords_2_numeric_1           0
chords_2_numeric_2           0
chords_2_numeric_3           0
chords_2_numeric_4           0
normal_chords_1_numeric_1    0
normal_chords_1_numeric_2    0
normal_chords_1_numeric_3    0
normal_chords_1_numeric_4    0
normal_c

In [17]:
# Display the frame after the rows with nulls were removed.
df_csv

Unnamed: 0,id,name,artists,speechiness,instrumentalness,key,mode,tempo,loudness,theme,...,normal_chords_2_numeric_1,normal_chords_2_numeric_2,normal_chords_2_numeric_3,normal_chords_2_numeric_4,diff_chords_1_numeric_1,diff_chords_1_numeric_2,diff_chords_1_numeric_3,diff_chords_2_numeric_1,diff_chords_2_numeric_2,diff_chords_2_numeric_3
0,2ISUiwj8xXqHeUFcen0AIU,Grow A Pear,Kesha,0.0508,0.000341,0,1,120.011,-3.891,Last night I had enough of you I put down the ...,...,5.0,7.0,0.0,21.0,7,14,-16,2.0,-7.0,21.0
1,0lUSd7TCG8srh6HpIAEIWL,Power,Little Mix,0.2110,0.000157,7,1,172.966,-4.016,"Hold up, no you didn't bow, bow I ain't the ch...",...,-3.0,4.0,6.0,2.0,7,2,-4,7.0,2.0,-4.0
2,1YaVmBh7EAeR54FIjuFcb5,Naturally,Selena Gomez & The Scene,0.0511,0.000001,10,0,132.612,-5.406,"The theme of ""Naturally"" by Selena Gomez & The...",...,-10.0,-3.0,-5.0,-3.0,7,14,-16,7.0,-2.0,2.0
3,00ko8Vr9Hbw0hJC9laqBOj,Hannah Montana,flowerovlove,0.0809,0.000000,0,1,114.073,-7.922,"The song ""Hannah Montana"" by flowerovlove expl...",...,5.0,7.0,16.0,0.0,7,14,-16,2.0,9.0,-16.0
4,5iV4zM0LZKiiRoUj2zVWSD,Very Online Guy,Alvvays,0.0670,0.000002,7,1,143.039,-3.766,He's a very online guy He likes to hit reply H...,...,-7.0,0.0,-2.0,-7.0,7,14,-16,7.0,-2.0,-5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656,2fH6AFQ5oId9AW0b4YsZtN,Barakaldo,Lisasinson,0.0312,0.002650,2,0,187.074,-8.196,"The song ""Barakaldo"" by Lisasinson explores th...",...,-2.0,5.0,19.0,3.0,7,14,-16,7.0,14.0,-16.0
657,4OTdFcuAndBtoPJTsBIyQi,Lorelai,Fleet Foxes,0.0351,0.029700,2,1,144.023,-7.700,So guess I got old I was like trash on the sid...,...,-2.0,5.0,3.0,-2.0,7,14,-16,7.0,-2.0,-5.0
658,1iPeDwic3VEt0T75NyVoaG,The Dream Synopsis,The Last Shadow Puppets,0.0374,0.000000,4,1,129.906,-8.611,Well we were kissing It was secret We'd had to...,...,10.0,6.0,15.0,-4.0,7,14,-16,-4.0,9.0,-19.0
659,0LxeKwg9t7HOnyfv4bTALT,Suck It and See,Arctic Monkeys,0.0570,0.114000,4,1,128.291,-5.017,Your love is like a studded leather headlock Y...,...,-4.0,3.0,17.0,1.0,7,14,-16,7.0,14.0,-16.0


In [18]:
# Build the 'to_embbed_text' column with the project helper. The return value is
# not assigned, yet the next cell reads df_csv['to_embbed_text'], so the helper
# presumably adds the column to df_csv in place — TODO confirm in utils.functions.
to_embbed_text_column(df_csv)

Unnamed: 0,id,name,artists,speechiness,instrumentalness,key,mode,tempo,loudness,theme,...,normal_chords_2_numeric_2,normal_chords_2_numeric_3,normal_chords_2_numeric_4,diff_chords_1_numeric_1,diff_chords_1_numeric_2,diff_chords_1_numeric_3,diff_chords_2_numeric_1,diff_chords_2_numeric_2,diff_chords_2_numeric_3,to_embbed_text
0,2ISUiwj8xXqHeUFcen0AIU,Grow A Pear,Kesha,0.0508,0.000341,0,1,120.011,-3.891,Last night I had enough of you I put down the ...,...,7.0,0.0,21.0,7,14,-16,2.0,-7.0,21.0,name: Grow A Pear artists: Kesha speechiness: ...
1,0lUSd7TCG8srh6HpIAEIWL,Power,Little Mix,0.2110,0.000157,7,1,172.966,-4.016,"Hold up, no you didn't bow, bow I ain't the ch...",...,4.0,6.0,2.0,7,2,-4,7.0,2.0,-4.0,name: Power artists: Little Mix speechiness: 0...
2,1YaVmBh7EAeR54FIjuFcb5,Naturally,Selena Gomez & The Scene,0.0511,0.000001,10,0,132.612,-5.406,"The theme of ""Naturally"" by Selena Gomez & The...",...,-3.0,-5.0,-3.0,7,14,-16,7.0,-2.0,2.0,name: Naturally artists: Selena Gomez & The Sc...
3,00ko8Vr9Hbw0hJC9laqBOj,Hannah Montana,flowerovlove,0.0809,0.000000,0,1,114.073,-7.922,"The song ""Hannah Montana"" by flowerovlove expl...",...,7.0,16.0,0.0,7,14,-16,2.0,9.0,-16.0,name: Hannah Montana artists: flowerovlove spe...
4,5iV4zM0LZKiiRoUj2zVWSD,Very Online Guy,Alvvays,0.0670,0.000002,7,1,143.039,-3.766,He's a very online guy He likes to hit reply H...,...,0.0,-2.0,-7.0,7,14,-16,7.0,-2.0,-5.0,name: Very Online Guy artists: Alvvays speechi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656,2fH6AFQ5oId9AW0b4YsZtN,Barakaldo,Lisasinson,0.0312,0.002650,2,0,187.074,-8.196,"The song ""Barakaldo"" by Lisasinson explores th...",...,5.0,19.0,3.0,7,14,-16,7.0,14.0,-16.0,name: Barakaldo artists: Lisasinson speechines...
657,4OTdFcuAndBtoPJTsBIyQi,Lorelai,Fleet Foxes,0.0351,0.029700,2,1,144.023,-7.700,So guess I got old I was like trash on the sid...,...,5.0,3.0,-2.0,7,14,-16,7.0,-2.0,-5.0,name: Lorelai artists: Fleet Foxes speechiness...
658,1iPeDwic3VEt0T75NyVoaG,The Dream Synopsis,The Last Shadow Puppets,0.0374,0.000000,4,1,129.906,-8.611,Well we were kissing It was secret We'd had to...,...,6.0,15.0,-4.0,7,14,-16,-4.0,9.0,-19.0,name: The Dream Synopsis artists: The Last Sha...
659,0LxeKwg9t7HOnyfv4bTALT,Suck It and See,Arctic Monkeys,0.0570,0.114000,4,1,128.291,-5.017,Your love is like a studded leather headlock Y...,...,3.0,17.0,1.0,7,14,-16,7.0,14.0,-16.0,name: Suck It and See artists: Arctic Monkeys ...


In [19]:
# Compute the embedding vector of every row's "to_embbed_text".
# The texts are sent in batches (the endpoint accepts a list as `input`) instead
# of issuing one HTTP request per row.
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_BATCH_SIZE = 50

texts_to_embed = df_csv['to_embbed_text'].tolist()
embedding_vectors = []
for batch_start in range(0, len(texts_to_embed), EMBEDDING_BATCH_SIZE):
    response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=texts_to_embed[batch_start:batch_start + EMBEDDING_BATCH_SIZE],
    )
    # Each returned item carries the position of its input inside the batch;
    # order by it so vectors line up with the rows they belong to.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    embedding_vectors.extend(item.embedding for item in ordered_items)

# Align on df_csv's own index: it is no longer contiguous after the row drops.
df_csv['embedding'] = pd.Series(embedding_vectors, index=df_csv.index, dtype=object)

In [20]:
# The raw text is no longer needed once the embeddings exist. Rebinding (instead
# of inplace=True) together with errors='ignore' keeps the cell safe to re-run.
df_csv = df_csv.drop(columns=['to_embbed_text'], errors='ignore')

In [21]:
# Rename the embedding column to 'values', the name the upsert frame below uses for the vector.
df_vectorial_expanded = df_csv.rename({'embedding': 'values'}, axis='columns')

In [22]:
# Metadata is every column except the vector id and the embedding. 'metadata'
# itself is excluded as well, so re-running this cell does not nest the
# previously built metadata dict inside the new one.
metadata_columns = [
    col for col in df_vectorial_expanded.columns
    if col not in ('id', 'values', 'metadata')
]

# One dict per row, then passed through the project's clean_metadata helper.
df_vectorial_expanded['metadata'] = (
    df_vectorial_expanded[metadata_columns]
    .apply(lambda row: row.to_dict(), axis=1)
    .apply(clean_metadata)
)

# Frame holding exactly the three columns that are sent to Pinecone.
df_para_pinecone = df_vectorial_expanded[['id', 'values', 'metadata']].copy()

# upsert_from_dataframe splits the frame into batches on its own, so no manual
# slicing loop is needed; the batch size stays at 100 vectors per request.
batch_size = 100
if not df_para_pinecone.empty:
    index.upsert_from_dataframe(df_para_pinecone, batch_size=batch_size)

sending upsert requests:   0%|          | 0/100 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/1 [00:00<?, ?it/s]

sending upsert requests:   0%|          | 0/100 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/1 [00:00<?, ?it/s]

sending upsert requests:   0%|          | 0/100 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/1 [00:00<?, ?it/s]

sending upsert requests:   0%|          | 0/100 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/1 [00:00<?, ?it/s]

sending upsert requests:   0%|          | 0/100 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/1 [00:00<?, ?it/s]

sending upsert requests:   0%|          | 0/100 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/1 [00:00<?, ?it/s]

sending upsert requests:   0%|          | 0/57 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/1 [00:00<?, ?it/s]