In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader, TensorDataset, Dataset

import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report



In [2]:
# Load train and test data
train_data = pd.read_csv("../data/train_data.csv")
test_data = pd.read_csv("../data/test_data.csv")

print(train_data.head())

# Split train and test data into features and targets
train_features = train_data["lyrics"]
train_targets = train_data["most_common_genre"]

test_features = test_data["lyrics"]
test_targets = test_data["most_common_genre"]

                       id                                             lyrics  \
0  1FAmKoufyAXMfzPPs9bsjA  i tied my bandana took my pack from the floor ...   
1  3QvPEv8XjHa73iYhaienWw  i want to live on the moon never see a human a...   
2  5VPFATm85G3P04Q5g8yxqr  bitch you know you can t parallel park anyway ...   
3  7J2jCftItt7htcOUdcMnpt  graceless falling slipping in the cold with no...   
4  4cBPzVIbDIQx0LIyauFAy0  madame morse estate stood five hundred years p...   

       artist_name most_common_genre  \
0  Waylon Jennings           country   
1   Phantom Planet              rock   
2    Isaiah Rashad           hip-hop   
3     Matt Pond PA             indie   
4       Ariel Pink               pop   

                                          genre_list  
0  ['country', 'country', 'rock', 'outlaw', 'coun...  
1                                    ['pop', 'rock']  
2  ['hip-hop', 'rap', 'tennessee', 'hip-hop', 'un...  
3                                ['philly', 'indie']  
4  

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(train_features)

x_k_train = tokenizer.texts_to_sequences(train_features)
x_k_test = tokenizer.texts_to_sequences(test_features)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

categories=list(train_targets.unique())

#print(X_train[2])
print(x_k_train[2])

maxlen = 200

x_k_train = pad_sequences(x_k_train, padding='post', maxlen=maxlen)
x_k_test = pad_sequences(x_k_test, padding='post', maxlen=maxlen)

[151, 3, 28, 3, 22, 10, 7580, 1203, 852, 17, 22, 10, 23, 36, 724, 50, 27, 17, 22, 23, 2, 1416, 58, 34, 128, 132, 15, 20, 62, 9, 1182, 252, 1, 16, 62, 36, 724, 50, 58, 41, 253, 45, 3, 220, 4, 41, 253, 211, 3, 42, 293, 293, 293, 17, 22, 10, 23, 36, 724, 50, 27, 17, 22, 23, 2, 1416, 58, 34, 128, 132, 15, 20, 62, 9, 1182, 252, 1, 16, 62, 36, 724, 50, 58, 41, 253, 45, 3, 220, 4, 41, 253, 211, 3, 42, 293, 293, 293, 65, 37, 9, 595, 1081, 900, 4, 9, 976, 51, 6, 172, 50, 139, 51, 6, 172, 50, 403, 3, 1683, 1185, 92, 8, 76, 3, 84, 10627, 71, 34, 11, 18, 1327, 40, 7, 76, 3, 170, 10627, 71, 34, 11, 18, 217, 1, 80, 178, 98, 6, 371, 1, 22, 10, 45, 31, 15, 9, 226, 35, 36, 209, 5, 49, 4, 13, 11, 6, 143, 50, 162, 41, 253, 33, 3, 37, 3, 383, 41, 119, 10, 410, 3, 37, 3, 167, 46, 3, 251, 21, 6, 861, 13, 11, 2, 220, 50, 27, 17, 22, 23, 2, 1416, 58, 34, 128, 132, 15, 20, 62, 9, 1182, 252, 1, 16, 62, 36, 724, 50, 58, 41, 253, 45, 3, 220, 4, 41, 253, 211, 3, 42, 293, 293, 293, 17, 22, 10, 23, 36, 724, 50, 27, 

In [4]:
# Convert the data into PyTorch tensors
train_features_tensor = torch.tensor(train_features.values, dtype=torch.float32)
train_targets_tensor = torch.tensor(train_targets.values, dtype=torch.long)

test_features_tensor = torch.tensor(test_features.values, dtype=torch.float32)
test_targets_tensor = torch.tensor(test_targets.values, dtype=torch.long)

# Combine the features and targets into PyTorch datasets
train_dataset = TensorDataset(train_features_tensor, train_targets_tensor)
test_dataset = TensorDataset(test_features_tensor, test_targets_tensor)

# Create PyTorch dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [5]:
import numpy as np
import tensorflow as tf
from   tensorflow import keras
from tensorflow.keras.preprocessing.text import text_to_word_sequence
# https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

max_features = 10000
sequence_length = 250

y_k_train = pd.get_dummies(train_targets)
y_k_test = pd.get_dummies(test_targets)

#initializer ='random_uniform'
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 50))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(1024, kernel_initializer='glorot_uniform', activation=tf.nn.relu))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense( len(categories), activation=tf.nn.sigmoid))
model.add(keras.layers.Softmax())
model.summary()

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(x_k_train, y_k_train, epochs=120, batch_size=512, validation_data=(x_k_test, y_k_test), verbose=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          2881300   
                                                                 
 dropout (Dropout)           (None, None, 50)          0         
                                                                 
 global_average_pooling1d (G  (None, 50)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1024)              52224     
                                                                 
 dropout_1 (Dropout)         (None, 1024)              0         
                                                                 
 dense_1 (Dense)             (None, 10)                10250     
                                                        

KeyboardInterrupt: 

In [None]:
#initializer ='random_uniform'
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 50, input_length=200))
model.add(keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(keras.layers.MaxPooling1D(pool_size=2))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(1024, kernel_initializer='glorot_uniform', activation=tf.nn.relu))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense( len(categories), activation=tf.nn.sigmoid))
model.add(keras.layers.Softmax())
model.summary()

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(x_k_train, y_k_train, epochs=120, batch_size=512, validation_data=(x_k_test, y_k_test), verbose=1)

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 200, 50)           2881300   
                                                                 
 conv1d_2 (Conv1D)           (None, 196, 128)          32128     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 98, 128)          0         
 1D)                                                             
                                                                 
 flatten_2 (Flatten)         (None, 12544)             0         
                                                                 
 dropout_12 (Dropout)        (None, 12544)             0         
                                                                 
 dense_25 (Dense)            (None, 1024)              12846080  
                                                     

In [7]:
from sentence_transformers import SentenceTransformer
transformer_model = SentenceTransformer('brunokreiner/lyrics-bert')

Downloading:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.31k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/659 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/84.0M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/383k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\bruno/.cache\\torch\\sentence_transformers\\brunokreiner_lyrics-bert\\1_Pooling\\config.json'

In [8]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("brunokreiner/lyrics-bert")

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\bruno/.cache\\torch\\sentence_transformers\\brunokreiner_lyrics-bert\\./1_Pooling\\config.json'

In [None]:
x_k_train = tokenizer.texts_to_sequences(train_features)
x_k_test = tokenizer.texts_to_sequences(test_features)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

categories=list(train_targets.unique())

#print(X_train[2])
print(x_k_train[2])

maxlen = 200

x_k_train = pad_sequences(x_k_train, padding='post', maxlen=maxlen)
x_k_test = pad_sequences(x_k_test, padding='post', maxlen=maxlen)

#initializer ='random_uniform'
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 50, input_length=200))
model.add(keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(keras.layers.MaxPooling1D(pool_size=2))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(1024, kernel_initializer='glorot_uniform', activation=tf.nn.relu))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense( len(categories), activation=tf.nn.sigmoid))
model.add(keras.layers.Softmax())
model.summary()

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(x_k_train, y_k_train, epochs=120, batch_size=512, validation_data=(x_k_test, y_k_test), verbose=1)

In [None]:
# Define your PyTorch models here
models = [MyModel1(), MyModel2(), MyModel3()]

# Define your loss function
criterion = nn.CrossEntropyLoss()

# Create a list to store the optimizer for each model
optimizers = [optim.Adam(model.parameters(), lr=0.001) for model in models]

# Create a dataframe to store the model accuracy, train loss, and test loss
df = pd.DataFrame(columns=['model', 'epoch', 'accuracy', 'train_loss', 'test_loss'])

# Train your models
num_epochs = 10

for model_idx, model in enumerate(models):
    for epoch in range(num_epochs):
        # Train the model
        model.train()
        train_loss = 0.0
        for inputs, labels in train_dataloader:
            optimizers[model_idx].zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizers[model_idx].step()
            train_loss += loss.item() * inputs.size(0)

        # Evaluate the model
        model.eval()
        total_correct = 0
        total_samples = 0
        test_loss = 0.0
        with torch.no_grad():
            for inputs, labels in test_dataloader:
                outputs = model(inputs)
                _, predictions = torch.max(outputs, 1)
                total_correct += (predictions == labels).sum().item()
                total_samples += labels.size(0)
                loss = criterion(outputs, labels)
                test_loss += loss.item() * inputs.size(0)
        accuracy = total_correct / total_samples
        train_loss = train_loss / len(train_dataloader.dataset)
        test_loss = test_loss / len(test_dataloader.dataset)
        print(f"Model {model_idx+1} - Epoch {epoch+1} Accuracy: {accuracy:.4f} Train Loss: {train_loss:.4f} Test Loss: {test_loss:.4f}")
        
        # Add the accuracy, train loss, and test loss to the dataframe
        df = df.append({'model': f'model{model_idx+1}', 'epoch': epoch+1, 'accuracy': accuracy, 'train_loss': train_loss, 'test_loss': test_loss}, ignore_index=True)

    # Save the dataframe as a CSV file for each model
    df.to_csv(f'model{model_idx+1}_accuracy.csv', index=False)
    df = df.iloc[0:0]

    # Generate the confusion matrix for the model on the test data
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            outputs = model(inputs)
            _, predictions = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predictions.cpu().numpy())
    cm = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    print(f"Model {model_idx+1} Confusion Matrix:")
    print(cm)
    print(f"Classification Report:")
    print(report)

NameError: name 'MyModel1' is not defined