### This is the notebook for lyrics classification using BERT.

* Authors: Alperen Demirci, Bora Dere

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
## import transformers library
import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split


In [4]:
# Read the cleaned, sampled lyrics dataset.
data = pd.read_csv("../data/cleaned_sampled_disorder_musics.csv")

# Integer class id for every disorder name; a name's position in the tuple
# is its id (depression -> 0 ... panic -> 5).
label_dict = {
    name: class_id
    for class_id, name in enumerate(
        ('depression', 'anxiety', 'bipolar', 'ptsd', 'borderline', 'panic')
    )
}

# Encode the `disorder` column; any name absent from `label_dict` maps to NaN.
labels = data['disorder'].map(label_dict)

In [5]:
# Display the loaded lyrics frame as the cell output.
data

Unnamed: 0,artist,lyric,type,title,user_id,src,disorder,lyric_length,unique_user_count
0,echo & the bunnymen,[verse 1]\nunder blue moon i saw you\nso soon ...,P,the killing moon,4353e884c1,SPOTIFY,anxiety,1159,436
1,louis tomlinson,[verse 1: bebe rexha]\ni know you say you know...,P,back to you (feat. bebe rexha & digital farm a...,b170e4337a,SPOTIFY,anxiety,2071,436
2,adele,[verse 1]\ndaydreamer\nsitting on the sea\nsoa...,P,daydreamer,57b5175168,SPOTIFY,anxiety,1008,436
3,one direction,"[verse 1: liam]\nyou're insecure, don't know w...",P,what makes you beautiful,8bdda90e71,SPOTIFY,anxiety,2308,436
4,beach weather,[verse 1: nick santino]\nlate night telephone\...,P,"sex, drugs, etc.",dd044ed3a8,SPOTIFY,anxiety,1242,436
...,...,...,...,...,...,...,...,...,...
3267,gorillaz,[hook: 2-d & brandon markell holmes]\ncircle o...,A,circle of friendz (feat. brandon markell holmes),4d1356518e,APPLE,ptsd,841,245
3268,the mighty mighty bosstones,[verse 1]\nwell he was fueled by a lack\ndrew ...,P,the rascal king,1bf579bc57,SPOTIFY,ptsd,1400,245
3269,nocap,"[pre-chorus]\nyeah, didn't write this song, bu...",A,ghetto angels,e77b72eef7,APPLE,ptsd,2461,245
3270,ynw melly,"[intro]\nyeah, yeah, yeah\nyoung nigga world\n...",A,virtual (blue balenciagas),7b53df78b5,APPLE,ptsd,2904,245


In [10]:
# Hold out 20% of the lyrics for validation. Stratifying on the encoded labels
# keeps the class proportions equal in both parts; the fixed seed makes the
# split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data["lyric"],
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Pair every lyric with its integer label, one two-column frame per split.
train_df = train_texts.to_frame(name='lyric').assign(label=train_labels)
val_df = val_texts.to_frame(name='lyric').assign(label=val_labels)

# Write both splits to disk without the pandas index.
train_df.to_csv("../data/train_musics.csv", index=False)
val_df.to_csv("../data/val_musics.csv", index=False)

In [13]:
# Split the non-lyric columns with the same seed, test size and stratification
# as the text split. Note that this rebinds `train_labels` / `val_labels`.
train_features, val_features, train_labels, val_labels = train_test_split(
    data.drop(columns='lyric'),
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Display the training metadata split as the cell output.
train_features

Unnamed: 0,artist,type,title,user_id,src,disorder,lyric_length,unique_user_count
2281,yg,T,toot it and boot it,7f20349dc1,SPOTIFY,panic,2584,23
1249,hailee steinfeld,P,love myself,3324437f73,SPOTIFY,borderline,2408,48
802,super furry animals,P,zoom!,8222eada89,SPOTIFY,bipolar,957,133
317,the maine,A,slip the noose (orchestral),d9d9416fb0,SPOTIFY,anxiety,1794,436
1364,girl in red,P,girls,48e275ed8e,SPOTIFY,borderline,1178,48
...,...,...,...,...,...,...,...,...
2072,the killers,T,mr. brightside,d066bd53a8,SPOTIFY,depression,1546,772
412,andrew bird,A,andalucia,6b17239a5b,SPOTIFY,anxiety,647,436
3237,neil diamond,A,"cherry, cherry",07a651a06f,APPLE,ptsd,1160,245
834,solange,A,"interlude: i got so much magic, you can have i...",94a96931df,APPLE,bipolar,307,133


In [16]:
# Overwrite the textual `disorder` column with the integer labels produced by
# the split, so the saved metadata carries the encoded class ids.
for frame, encoded in ((train_features, train_labels), (val_features, val_labels)):
    frame['disorder'] = encoded

# Write both metadata splits to disk without the pandas index.
train_features.to_csv("../data/train_musics_features.csv", index=False)
val_features.to_csv("../data/val_musics_features.csv", index=False)

In [17]:
# Reload the lyric training split from disk, rebinding `train_df` to the CSV
# contents (the file was written with index=False, so a fresh index is created).
train_df = pd.read_csv("../data/train_musics.csv")

In [18]:
# Display the reloaded lyric training split as the cell output.
train_df

Unnamed: 0,lyric,label
0,"[hook: ty dolla sign]\ni met her at the club, ...",5
1,"[written by julia michaels, justin tranter, os...",4
2,took you to a movie but you moved away\nfroze ...,2
3,"[chorus]\nbreak down, hysteric and young\nunco...",1
4,[verse 1]\ni've been hiding for so long\nthese...,4
...,...,...
2612,[verse 1]\ncoming out of my cage and i've been...,0
2613,[verse 1]\nandalucia\nwhen can i see you\nwhen...,1
2614,"baby loves me, yes, yes she does\nah, the girl...",3
2615,[a cappella interlude: nia andrews and kelly r...,2


In [25]:
# Display `val_df` as the cell output.
# NOTE(review): the saved output shows tweet rows with a `text` column rather
# than the `lyric` frame built earlier in this notebook, so this cell appears
# to have been executed against a different `val_df` -- confirm and re-run in order.
val_df

Unnamed: 0,text,label
56586,WAIT SO IF A DIFFERENT ACC HAS IT U CAN USE I...,3
39388,i &lt;3 spike #AnimalCrossing #ACNH #NintendoS...,0
46164,Got way too drunk last night and ruined my lif...,5
27178,I just voted for to win MV Congeniality Star ...,4
42087,What I'm trying to say is we're super casual a...,5
...,...,...
27502,I want to customize my car horn to play the An...,4
45085,Just finished watching and I NEED SEASON 4 #S...,5
675,I used to have a hourglass shaped body. I used...,1
59713,We will miss you. I remember that unforgett...,3


### Load the cleaned text data

In [4]:
# Load the pre-processed tweet splits. This rebinds `train_df` and `val_df`,
# which previously held the lyric splits, to the tweet data.
train_df = pd.read_csv("../data/processed_train_tweets.csv")
val_df = pd.read_csv("../data/processed_val_tweets.csv")

In [5]:
# Remove the stray `Unnamed: 0` column (presumably an index saved into the CSV).
# Rebinding instead of dropping in place, together with errors='ignore', keeps
# the cell idempotent: re-running it no longer raises KeyError once the column
# is already gone.
train_df = train_df.drop(columns='Unnamed: 0', errors='ignore')
val_df = val_df.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Drop validation rows containing any missing value, in place.
# NOTE: dropna does not renumber the index, so `val_df` keeps gaps where rows
# were removed; anything later aligned against a fresh 0..n-1 index must use
# positional values or reset the index first.
val_df.dropna(inplace=True)

In [7]:
# Display the processed tweet training frame as the cell output.
train_df

Unnamed: 0,text,label
0,# titlesformelaniasmemoir . want money give f ...,4
1,"apparently mr ford fidelli quite enjoy it , ev...",3
2,jajajajjaa aaaaa eso por lo que anduvimos habl...,4
3,dante demon form # ocs # hungrybeastsverse # b...,3
4,lol probably practice diligently pull,1
...,...,...
47995,meant suppress stress . stress need to . need ...,3
47996,"wan na gun cause rappin trappin , lots cappin .",4
47997,butternut favourite name bread . hah,4
47998,"know , give prince charles lot shit , big crim...",3


In [10]:
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel

import torch
from tqdm import tqdm
import pandas as pd

# Frame whose `text` column is embedded below (currently the validation split).
data = val_df

# Single source of truth for the checkpoint: both the tokenizer and the encoder
# are loaded from `model_name`, so switching to another BERT variant is a
# one-line change.
model_name = "mental/mental-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Ensure the model is in evaluation mode (no dropout) so embeddings are deterministic
model.eval()

# Function to extract features from text
def extract_features(text):
    """Embed `text` with the module-level `tokenizer` and `model`.

    The input is tokenized with truncation and padding to exactly 128 tokens,
    passed through the model without gradient tracking, and the last-layer
    hidden state at sequence position 0 (the [CLS] token) is returned.

    Args:
        text: The text to embed.

    Returns:
        A NumPy array holding the [CLS] embedding, with size-1 dimensions
        squeezed out.
    """
    encoded = tokenizer(
        text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        last_hidden = model(**encoded).last_hidden_state

    # Position 0 along the sequence axis is the [CLS] token.
    return last_hidden[:, 0, :].squeeze().numpy()

# Process dataset and extract features (one embedding per row of `data['text']`)
features = [
    extract_features(text)
    for text in tqdm(data['text'], desc="Extracting Features")
]

# Convert features to a DataFrame and save them
features_df = pd.DataFrame(features)

# Attach labels by POSITION, not by index. `features_df` has a fresh 0..n-1
# index, whereas `data` may have gaps in its index (rows removed by dropna);
# assigning the Series directly would align on index labels and silently
# shift labels or insert NaN.
features_df['label'] = data['label'].to_numpy()

output_path = "mental-bert_features_test.csv"
features_df.to_csv(output_path, index=False)

# Report the file that was actually written.
print(f"Feature extraction complete! Saved to {output_path}.")


Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Extracting Features: 100%|██████████| 11999/11999 [06:24<00:00, 31.21it/s]


Feature extraction complete! Saved to bert_features.csv.
