### This notebook performs comment classification using BERT.

* Authors: Alperen Demirci, Bora Dere

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
## import transformers library
import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split


In [16]:
# Load the cleaned tweets; the `disorder` column holds the class name of each row.
data = pd.read_csv("../data/cleaned_anon_disorder_tweets.csv")

# Integer class id for every disorder name expected in the `disorder` column.
label_dict = {
    'depression': 0,
    'anxiety': 1,
    'bipolar': 2,
    'ptsd': 3,
    'borderline': 4,
    'panic': 5,
}

labels = data['disorder'].map(label_dict)

# Series.map yields NaN for any value that is not a key of label_dict. Fail loudly
# here instead of letting NaN labels flow silently into later cells.
if labels.isna().any():
    raise ValueError(
        "Unmapped disorder values: "
        f"{sorted(data.loc[labels.isna(), 'disorder'].astype(str).unique())}"
    )

In [17]:
# Show the loaded DataFrame as this cell's output
data

Unnamed: 0,user_id,text,disorder,word_count,char_count,text_len
0,96a0386390,Times have changed now Marion we use somethi...,anxiety,23,113,113
1,19c057a617,He has done some good things regardless of h...,anxiety,10,56,56
2,5e6675bfee,"Even if we fight a lot, I still want you in my...",anxiety,13,51,51
3,166b8a2abe,I never liked cilantro before &amp; actually...,anxiety,19,103,103
4,b4fde68e4d,"I've resorted to saying ""I'm ok"" when anyone a...",anxiety,18,86,86
...,...,...,...,...,...,...
59995,ff636b39ee,\n\n#MentalHealthAwareness \n...,ptsd,5,108,108
59996,3799f77d9e,"Hi, I'm Spider Cat.\nIt was this or Migraine S...",ptsd,12,64,64
59997,3dfef3d920,So do we still believe that Enrico Tarrio was...,ptsd,17,78,78
59998,12a95b032b,No matter how bad we ended if you ever need an...,ptsd,15,79,79


In [18]:
# Hold out 20% of the rows for validation; the split is stratified on the
# labels and seeded so that it is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data["text"],
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=42,
)

In [22]:
def _labelled_frame(texts, targets):
    """Pair a text Series with its label Series in a two-column DataFrame."""
    return pd.DataFrame({'text': texts, 'label': targets})


train_df = _labelled_frame(train_texts, train_labels)
val_df = _labelled_frame(val_texts, val_labels)

# Persist both splits; index=False keeps the row index out of the files.
train_df.to_csv("../data/train_tweets.csv", index=False)
val_df.to_csv("../data/val_tweets.csv", index=False)

In [23]:
# Show the training split as this cell's output
train_df

Unnamed: 0,text,label
21505,#TitlesForMelaniasMemoir. All I wanted was mo...,4
54750,Apparently Mr Ford and Fidelli quite enjoy...,3
22053,Jajajajjaa aaaaa eso por lo que anduvimos hab...,4
55377,Dante Demon Form\n#OCs #HungryBeastsVerse #Blo...,3
3997,Lol he probably practiced very diligently to ...,1
...,...,...
51581,I meant suppress not stress. Stress too if y...,3
22891,wanna gun me down cause i be rappin and trappi...,4
20662,Butternut has to be my favourite name for a b...,4
52885,"You know, we give Prince Charles a lot of shit...",3


In [25]:
# Show the validation split as this cell's output
val_df

Unnamed: 0,text,label
56586,WAIT SO IF A DIFFERENT ACC HAS IT U CAN USE I...,3
39388,i &lt;3 spike #AnimalCrossing #ACNH #NintendoS...,0
46164,Got way too drunk last night and ruined my lif...,5
27178,I just voted for to win MV Congeniality Star ...,4
42087,What I'm trying to say is we're super casual a...,5
...,...,...
27502,I want to customize my car horn to play the An...,4
45085,Just finished watching and I NEED SEASON 4 #S...,5
675,I used to have a hourglass shaped body. I used...,1
59713,We will miss you. I remember that unforgett...,3


### Load the cleaned text data

In [13]:
# Read the preprocessed train / validation splits from disk
train_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed_train_tweets.csv",
        "../data/processed_val_tweets.csv",
    )
)

In [14]:
# Remove the 'Unnamed: 0' column (presumably a saved row index -- confirm against
# whatever wrote the processed CSVs). Rebinding with errors='ignore' instead of
# inplace=True keeps this cell idempotent: re-running it no longer raises KeyError
# once the column is already gone.
train_df = train_df.drop(columns='Unnamed: 0', errors='ignore')
val_df = val_df.drop(columns='Unnamed: 0', errors='ignore')

In [15]:
# Discard every validation row that contains at least one missing value
val_df = val_df.dropna(axis=0, how="any")

In [16]:
# Show the preprocessed training split as this cell's output
train_df

Unnamed: 0,text,label
0,# titlesformelaniasmemoir . want money give f ...,4
1,"apparently mr ford fidelli quite enjoy it , ev...",3
2,jajajajjaa aaaaa eso por lo que anduvimos habl...,4
3,dante demon form # ocs # hungrybeastsverse # b...,3
4,lol probably practice diligently pull,1
...,...,...
47995,meant suppress stress . stress need to . need ...,3
47996,"wan na gun cause rappin trappin , lots cappin .",4
47997,butternut favourite name bread . hah,4
47998,"know , give prince charles lot shit , big crim...",3


In [18]:
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel

import torch
from tqdm import tqdm
import pandas as pd

data = train_df  # frame whose 'text' column is embedded further down in this cell

# Checkpoint loaded by this cell. Naming it once guarantees the tokenizer and
# the model always come from the same checkpoint.
checkpoint = "google-bert/bert-base-multilingual-cased"

# NOTE(review): `model_name` is not used to load anything here and names a
# different checkpoint than the one loaded above; it stays bound only so that
# any later cell that reads it keeps working.
model_name = "mental/mental-bert-base-uncased"

# Initialize BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Ensure the model is in evaluation mode
model.eval()

# Function to extract features from text
def extract_features(text):
    """Embed `text` and return the position-0 ([CLS]) vector as a NumPy array.

    Relies on the module-level `tokenizer` and `model`. The input is truncated
    or padded to exactly 128 tokens, and the forward pass runs with gradient
    tracking disabled.
    """
    encoded = tokenizer(
        text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Inference only: skip building the autograd graph
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Take the first token of every sequence as the fixed-size representation;
    # squeeze() then removes any size-1 dimensions before converting to NumPy.
    return hidden_states[:, 0, :].squeeze().numpy()

# Process dataset and extract features (one forward pass per text)
features = []
for text in tqdm(data['text'], desc="Extracting Features"):
    features.append(extract_features(text))

# Convert features to a DataFrame and save them
output_path = "multi-bert_features_train.csv"
features_df = pd.DataFrame(features)
# Attach labels by position. features_df has a fresh 0..n-1 index, so assigning
# the Series itself would align on index labels and silently misplace labels or
# produce NaN whenever `data` carries a non-default index (e.g. after dropna).
features_df['label'] = data['label'].to_numpy()
features_df.to_csv(output_path, index=False)

# Report the file that was actually written (the old message named a different file)
print(f"Feature extraction complete! Saved to {output_path}.")


Extracting Features: 100%|██████████| 48000/48000 [24:02<00:00, 33.27it/s]


Feature extraction complete! Saved to bert_features.csv.
