In [None]:
# Imports
import pandas as pd
import re
import nltk
import numpy as np
import torch
import gensim
from gensim.models import Word2Vec
import ast

In [None]:
# Load the master dataset from disk. The 'tokens' column is parsed with
# ast.literal_eval, i.e. each cell is expected to hold a Python literal
# (presumably the repr of a list of token strings -- verify against the CSV).
vector_embeddings_df = pd.read_csv('master_dataset.csv')
vector_embeddings_df['tokens'] = vector_embeddings_df['tokens'].map(ast.literal_eval)

In [None]:
# Train a Word2Vec model on the tokenised documents of the dataset.
word2vec_model = Word2Vec(
    sentences=vector_embeddings_df['tokens'],
    vector_size=50,  # dimensionality of each learned word vector
    window=5,
    min_count=1,  # keep every token, even those that occur only once
    workers=4,
)

In [None]:
def apply_word_embeddings(tokens):
    """Look up the Word2Vec vector of every in-vocabulary token.

    Args:
        tokens: iterable of token strings belonging to one document.

    Returns:
        torch.Tensor of shape (n_known_tokens, vector_size). When no token is
        found in the vocabulary (for example an empty token list) the result
        has shape (0, vector_size) rather than a 1-D empty tensor, so that
        padding and per-dimension means downstream keep a consistent trailing
        dimension.
    """
    keyed_vectors = word2vec_model.wv
    embeddings = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if embeddings:
        # Stack into a single ndarray first: building a tensor directly from a
        # list of ndarrays is very slow and makes torch emit a UserWarning.
        embeddings_array = np.array(embeddings)
    else:
        # Preserve the embedding width and dtype even when nothing matched.
        embeddings_array = np.empty(
            (0, keyed_vectors.vector_size), dtype=keyed_vectors.vectors.dtype
        )
    return torch.tensor(embeddings_array)

vector_embeddings_df['embeddings'] = vector_embeddings_df['tokens'].apply(apply_word_embeddings)

In [None]:
from torch.nn.utils.rnn import pad_sequence
vectors = vector_embeddings_df['embeddings'].tolist()

# Convert each document's embeddings to a FloatTensor, then pad them to a
# common length so they can be stacked into one batch-first tensor.
float_sequences = [torch.FloatTensor(np.array(sequence)) for sequence in vectors]
torch_padded_tensor = pad_sequence(float_sequences, batch_first=True)
print(torch_padded_tensor.shape)

In [None]:
def calculate_mean(tensor):
  """Average a tensor over its first dimension and return the result as a NumPy array."""
  averaged = tensor.mean(dim=0)
  return averaged.numpy()

# One mean vector per document, expanded into one column per embedding dimension.
mean_embeddings = vector_embeddings_df['embeddings'].apply(calculate_mean)
embedding_dim = mean_embeddings.iloc[0].shape[0]  # width is read from the first document
embedding_columns = [f'embedding_{i}' for i in range(embedding_dim)]
mean_embeddings_df = pd.DataFrame(mean_embeddings.tolist(), columns=embedding_columns)
mean_embeddings_df.head()

In [None]:
# Re-assemble each token list into a single space-separated string.
vector_embeddings_df['cleaned_lyrics'] = vector_embeddings_df['tokens'].apply(' '.join)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Keep only the 84 highest-ranked terms as TF-IDF features.
tfidf_vectorizer = TfidfVectorizer(max_features=84)

# NOTE(review): this reads the 'cleaned_text' column, while the notebook builds
# 'cleaned_lyrics' just above -- confirm which column is intended.
corpus = vector_embeddings_df['cleaned_text']
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)  # Fit and transform the text data to get the TF-IDF matrix

In [None]:
# Densify the sparse TF-IDF matrix into a frame with one column per retained term.
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_dense = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(tfidf_dense, columns=tfidf_terms)
tfidf_df.head()

In [None]:
# Per-term totals: the "Frequency" column is the sum of TF-IDF weights over all
# documents (not a raw occurrence count), listed from largest to smallest.
feature_names = tfidf_vectorizer.get_feature_names_out()  # Get the feature names (words)
word_freq_counts = tfidf_matrix.sum(axis=0)  # Sum the TF-IDF values across all documents for each word
word_freq_df = (
    pd.DataFrame({
        'Word': feature_names,
        'Frequency': np.squeeze(np.asarray(word_freq_counts)),
    })
    .sort_values(by='Frequency', ascending=False)  # Frequency in descending order
)
print(word_freq_df)


In [None]:
import matplotlib.pyplot as plt

# Bar chart of the summed TF-IDF weight per term, drawn through an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(word_freq_df['Word'], word_freq_df['Frequency'], color='skyblue')
ax.set_xlabel('Word')
ax.set_ylabel('Frequency')
ax.set_title('Word Frequency Counts')
plt.setp(ax.get_xticklabels(), rotation=90)  # vertical labels so the words do not overlap
plt.show()

In [None]:
# Place the original columns, TF-IDF features and mean embeddings side by side
# (column-wise concat aligns the frames on their index).
feature_frames = [vector_embeddings_df, tfidf_df, mean_embeddings_df]
combined_features = pd.concat(feature_frames, axis=1)
mood_list = combined_features['mood']  # keep the labels before 'mood' is dropped from the features

In [None]:
# Remove identifier, raw-text, label and selected audio columns from the feature frame.
# Note: re-running this cell raises KeyError because the columns are already gone.
columns_to_drop = [
    'track_names', 'first_artists', 'lyrics', 'tokens', 'mood',
    'embeddings', 'cleaned_text', 'acousticness', 'mode', 'instrumentalness',
]
combined_features = combined_features.drop(columns_to_drop, axis=1)
print(combined_features.columns)

In [None]:
import pandas as pd
import numpy as np
import ast
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import StandardScaler

# Create feature and label sets:

# Reload the dataset from disk and rebuild the token list / joined-lyrics columns.
vector_embeddings_df = pd.read_csv('master_dataset.csv')
vector_embeddings_df['tokens'] = vector_embeddings_df['tokens'].map(ast.literal_eval)
vector_embeddings_df['cleaned_lyrics'] = vector_embeddings_df['tokens'].apply(' '.join)

# Labels: Happy -> 1, Sad -> 0 (any other mood value becomes NaN under .map).
y = vector_embeddings_df['mood'].map({'Happy': 1, 'Sad': 0})
lyrics = vector_embeddings_df['cleaned_lyrics']

# Numeric audio descriptors, standardised with a scaler fitted on the full dataset.
audio_features = [
    'danceability', 'energy', 'loudness', 'mode',
    'acousticness', 'instrumentalness', 'valence', 'tempo',
]
X_audio = vector_embeddings_df[audio_features]
scaler = StandardScaler()
X_audio_scaled = scaler.fit_transform(X_audio)

# Pre-trained BERT tokenizer and encoder used to embed the lyrics.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

def encode_lyrics(lyrics):
    """Return the BERT embedding at sequence position 0 for the given lyrics.

    Args:
        lyrics: text to encode (a single string, or a batch of strings accepted
            by the tokenizer). Inputs are truncated to at most 512 tokens.

    Returns:
        numpy.ndarray with one row per input, taken from the final hidden state
        at position 0 of each sequence (the [CLS] position for BERT).
    """
    inputs = tokenizer(lyrics, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: disable autograd so no computation graph is built and
    # retained for every call, which wastes memory and time.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    cls_embeddings = outputs.last_hidden_state[:, 0, :].detach().numpy()
    return cls_embeddings

# Encode each lyric on its own and stack the resulting rows into one matrix.
X_lyrics = np.vstack([encode_lyrics(lyric) for lyric in lyrics])

# Final feature matrix: scaled audio features followed by the lyric embeddings.
X_combined = np.hstack((X_audio_scaled, X_lyrics))

# Binary labels as a NumPy array (Happy -> 1, everything else -> 0). They are
# derived from the frame loaded in this cell rather than from `mood_list`, which
# only exists if an earlier cell has been run, so the labels are guaranteed to
# be row-aligned with X_combined.
y = np.array([1 if mood == 'Happy' else 0 for mood in vector_embeddings_df['mood']])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import warnings

# Pipeline: standardise the features, then fit a random forest. The fixed
# random_state makes the cross-validated scores repeatable between runs.
pipeline_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42))
])

# Parameter grid (keys follow the '<step name>__<parameter>' pipeline convention)
param_grid_rf = {
    'rf__min_samples_leaf': [8, 10, 12],
    'rf__max_depth': list(range(2, 6)),
    'rf__max_features': ["sqrt", "log2"]
}
grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv = 5, n_jobs = -1, scoring = 'accuracy')
# All warnings raised while fitting are silenced inside this context only.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_search_rf.fit(X_combined, y)

# best_score_ is the mean cross-validated accuracy of the best parameter set.
print(grid_search_rf.best_params_)
print("Accuracy: ", grid_search_rf.best_score_ * 100)

In [None]:
from sklearn.neural_network import MLPClassifier

# Fixed random_state so weight initialisation, and therefore the grid-search
# scores, are repeatable between runs.
mlp_clf = MLPClassifier(random_state=42)

# Param grid
param_grid_mlp = {
    'hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'activation': ['relu', 'tanh', 'logistic'],
    'alpha': [0.0001, 0.001, 0.01, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}

grid_search_mlp = GridSearchCV(mlp_clf, param_grid_mlp, cv = 10, n_jobs = -1, scoring = 'accuracy')
# All warnings raised while fitting are silenced inside this context only.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_search_mlp.fit(X_combined, y)

# best_score_ is the mean cross-validated accuracy of the best parameter set.
print(grid_search_mlp.best_params_)
print("Accuracy: ", grid_search_mlp.best_score_ * 100)
best_estimator = grid_search_mlp.best_estimator_
# NOTE: these predictions are made on the same rows the search was fitted on,
# so they are in-sample and must not be read as a generalisation estimate.
y_pred = best_estimator.predict(X_combined)
print("Prediction List:", y_pred)

In [None]:
import gc

import pandas as pd
import torch
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.optim import AdamW  # the AdamW shipped with transformers is deprecated; use PyTorch's
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup

# Training configuration
NUM_EPOCHS = 10
BATCH_SIZE = 8
SEED = 42

# Seed torch's RNG so batch shuffling is repeatable between runs.
torch.manual_seed(SEED)

# Load pre-trained RoBERTa model and tokenizer
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Labels: Happy -> 1, Sad -> 0. Only the raw 'lyrics' column is used as input.
master_df = pd.read_csv('master_dataset.csv')
y = master_df['mood'].map({'Happy': 1, 'Sad': 0})
columns_to_drop = ["track_names", "first_artists", "tokens", "embeddings", "danceability", "energy", "loudness", "mode", "acousticness", "instrumentalness", "valence", "tempo"]
cleaned_df = master_df.drop(columns=columns_to_drop)
X = cleaned_df['lyrics']

# Tokenize and encode lyrics (every lyric is truncated/padded to at most 128 tokens)
encoded_data = tokenizer(X.tolist(), padding=True, truncation=True, return_tensors='pt', max_length=128)
input_ids = encoded_data['input_ids']
attention_mask = encoded_data['attention_mask']

# Split data: ids, labels and masks are split together so their rows stay aligned
X_train_ids, X_val_ids, y_train, y_val, X_train_mask, X_val_mask = train_test_split(
    input_ids, y, attention_mask, test_size=0.2, random_state=SEED
)

y_train = torch.tensor(y_train.values)
y_val = torch.tensor(y_val.values)

# Move model to GPU (I was using Colab)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Create TensorDataset and DataLoader.
# shuffle=True so every epoch visits the training examples in a new order
# instead of replaying identical batches ten times.
train_data = TensorDataset(X_train_ids, X_train_mask, y_train)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

# Optimizer and scheduler.
# eps and weight_decay are set explicitly to the defaults of the deprecated
# transformers.AdamW so the optimiser hyper-parameters stay as before.
# NOTE(review): confirm against the transformers version used previously.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
total_steps = len(train_loader) * NUM_EPOCHS  # one scheduler step per batch
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

model.train()
for epoch in range(NUM_EPOCHS):
    total_loss = 0
    for batch in train_loader:
        b_input_ids, b_input_mask, b_labels = batch
        b_input_ids = b_input_ids.to(device)
        b_input_mask = b_input_mask.to(device)
        b_labels = b_labels.to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss

        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Loss: {avg_train_loss}')

    gc.collect() # Clear cache and collect garbage due to Colab limits
    torch.cuda.empty_cache()
    
# Persist the fine-tuned weights, then score the model on the validation split.
model.save_pretrained('fine_tuned_roberta')
model.eval()

val_data = TensorDataset(X_val_ids, X_val_mask, y_val)
val_loader = DataLoader(val_data, batch_size=8)
val_preds = []

# No gradients are needed while evaluating.
with torch.no_grad():
    for b_input_ids, b_input_mask, b_labels in val_loader:
        logits = model(
            input_ids=b_input_ids.to(device),
            attention_mask=b_input_mask.to(device),
        ).logits
        # Predicted class = index of the largest logit in each row.
        val_preds.extend(logits.argmax(dim=1).cpu().numpy())

print(classification_report(y_val, val_preds))

In [None]:
# New data:
new_lyrics = "Your new lyrics here..."
encoded_new_lyrics = tokenizer(new_lyrics, padding=True, truncation=True, return_tensors='pt', max_length=128)
model.eval()  # make sure dropout is disabled even if this cell is run on its own
with torch.no_grad():
    # The inputs must live on the same device as the model (GPU when available).
    new_lyrics_preds = model(
        input_ids=encoded_new_lyrics['input_ids'].to(model.device),
        attention_mask=encoded_new_lyrics['attention_mask'].to(model.device),
    )
    predicted_mood = torch.argmax(new_lyrics_preds.logits, dim=1).item()
    # Training encoded the labels as Happy -> 1 and Sad -> 0.
    mood_label = "Happy" if predicted_mood == 1 else "Sad"
    print(f"Predicted mood: {mood_label}")