# Generador de letras de canciones

## Exploración y preprocesamiento del dataset

### Carga inicial del dataset

In [None]:
import pandas as pd

# Load the rock-lyrics dataset from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/rock-music-dataset/tcc_ceds_music_rock.csv")

# Quick exploration: first rows, column summary, and one sample lyric.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would only add a stray "None" line.
df.info()
print(df['lyrics'].iloc[0])


### Limpieza de datos

In [12]:
# Drop every row that has a missing value in any column.
df = df.dropna()

# Replace newline characters with a space. `regex=True` is passed explicitly:
# newer pandas versions treat the pattern as a literal string by default, in
# which case r'\n' (backslash + "n") would never match a real newline.
df['Lyrics'] = df['lyrics'].str.replace(r'\n', ' ', regex=True)

### Tokenización y preprocesamiento de texto

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources used below: the tokenizer models and the
# stop-word lists.
for resource in ('punkt_tab', 'punkt', 'stopwords'):
    nltk.download(resource)

# English stop words, kept in a set so membership checks are cheap.
stop_words = set(stopwords.words('english'))

def preprocess_lyrics(lyrics):
    """Return *lyrics* lowercased and reduced to its alphabetic, non-stop-word tokens.

    The text is lowercased, split with NLTK's ``word_tokenize``, and every
    token that is not purely alphabetic or that appears in ``stop_words`` is
    discarded. The surviving tokens are joined with single spaces.
    """
    lowered = lyrics.lower()
    return ' '.join(
        token
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in stop_words
    )

# Store the cleaned text of every song in a new column.
df['Processed_Lyrics'] = df['lyrics'].apply(preprocess_lyrics)


### Análisis exploratorio

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Count how often each token occurs across all processed lyrics.
word_counts = Counter(
    token
    for text in df['Processed_Lyrics']
    for token in text.split()
)

# Bar chart of the 20 most frequent words.
common_words = word_counts.most_common(20)
words, counts = zip(*common_words)
plt.bar(words, counts)
plt.xticks(rotation=45)  # tilt the labels so they do not overlap
plt.show()

## Modelado con Deep Learning

### Definición del modelo

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Network: embedding -> two stacked LSTMs -> dense hidden layer -> softmax
# over the vocabulary.
# NOTE(review): `tokenizer` is created in a later cell of this notebook; that
# cell has to be executed before this one or the last layer raises NameError.
# NOTE(review): the embedding accepts 5000 token ids while the output layer is
# sized from the full `tokenizer.word_index` -- confirm this mismatch is
# intended.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(LSTM(128, return_sequences=True))  # pass the full sequence to the next LSTM
model.add(LSTM(128))
model.add(Dense(128, activation='relu'))
model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

### Tokenización y padding de secuencias

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit a word-level tokenizer on the processed lyrics and encode every song as
# a list of integer ids. Only the 5000 most frequent words keep their own id;
# the rest map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['Processed_Lyrics'])
sequences = tokenizer.texts_to_sequences(df['Processed_Lyrics'])

# Pad every sequence to the length of the longest one.
# Padding is added at the front ('pre') so the last column always holds the
# final real token of each song. The training step uses that last column as
# the prediction target; with 'post' padding it would be 0 (padding) for
# every song shorter than the longest.
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')

### Entrenamiento

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import json

# Inputs are all positions but the last; the target is the token id in the
# last position, one-hot encoded over the vocabulary.
# NOTE(review): the last column is only a meaningful target when the sequences
# are left-padded (padding='pre'); verify the padding mode used to build
# `padded_sequences`.
num_classes = len(tokenizer.word_index) + 1
X, last_tokens = padded_sequences[:, :-1], padded_sequences[:, -1]
y = to_categorical(last_tokens, num_classes=num_classes)

# Hold out 20% of the samples for validation (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
)

In [None]:
# Save the trained model and the tokenizer settings.
import os

# Create the output directory first: neither model.save() nor open() below
# can write into a directory that does not exist yet.
output_dir = "/kaggle/working/models"
os.makedirs(output_dir, exist_ok=True)

model.save(os.path.join(output_dir, "song_generator.h5"))

# Besides the vocabulary and the padded length, store the vocabulary cap and
# the OOV token so the same text-to-id encoding can be rebuilt at inference
# time.
tokenizer_config = {
    "word_index": tokenizer.word_index,
    "max_length": max_length,
    "num_words": tokenizer.num_words,
    "oov_token": tokenizer.oov_token,
}
config_path = os.path.join(output_dir, "tokenizer_config.json")
with open(config_path, "w", encoding="utf-8") as f:
    json.dump(tokenizer_config, f)