In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import  AdamW
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import pandas as pd
import numpy as np
import spacy
import re
import math
import gc
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from transformers import BertTokenizer
import ast
from torch.nn.utils.rnn import pad_sequence


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# class PositionalEncoding(nn.Module):

#     def __init__(self, d_model, dropout=0.1, max_len=24000):
#         super(PositionalEncoding, self).__init__()
#         self.dropout = nn.Dropout(p=dropout)

#         pe = torch.zeros(max_len, d_model)
#         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(0).transpose(0, 1)
#         self.register_buffer('pe', pe)

#     def forward(self, x):
#         x = x + self.pe[:x.size(0), :]
#         return self.dropout(x)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=23187):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=0.1)

        pe = torch.zeros((max_len, d_model)) 
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


    
    
class TransformerTextClassifier(nn.Module):
    def __init__(self,ntoken, ninp, nhead, nhid, nlayers, num_classes, dropout=0.5, norm_first=False):
        super(TransformerTextClassifier, self).__init__()
        self.pos_encoder = PositionalEncoding(ninp)
        encoder_layers = nn.TransformerEncoderLayer(ninp, nhead, nhid, dropout, norm_first=norm_first)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.fc = nn.Linear(ninp, num_classes-1)
        self.activation =nn.LeakyReLU
        
        self.init_weights()

    def forward(self, src, src_mask):
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = output.mean(dim=1)  # Pooling layer (e.g., mean pooling)
        output = self.fc(output)
        # output = torch.sigmoid(output)  
        return output

    def init_weights(self):
        initrange = 0.1
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain('relu'))
                m.bias.data.zero_()


    # def init_weights(self):
    #     initrange = 0.1
    #     self.fc.weight.data.uniform_(-initrange, initrange)
    #     self.fc.bias.data.zero_()



class IDDataset(Dataset):
    def __init__(self, input_ids, labels):
        self.input_ids = input_ids
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        id = self.input_ids[idx]
        label = self.labels[idx]

        return {
            'input_ids': id,
            'labels': label
        }

class TextDataset(Dataset):
    def __init__(self, token_ids, labels):
        self.token_ids = token_ids
        self.labels = labels

    def __len__(self):
        return len(self.token_ids)

    def __getitem__(self, idx):
        token_id = self.token_ids[idx]
        label = self.labels[idx]
        
        return {
            'input_ids': torch.tensor(token_id, dtype=torch.long),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [2]:
def preprocess_text(txt:str):

    txt = re.sub('[^a-zA-Z]', ' ', txt)
    txt = txt.lower()
    txt = " ".join(txt.split())

    doc = nlp(txt)

    tokens_filtered = []

    for token in doc:
        if token.is_stop or token.is_punct:
            continue

        tokens_filtered.append(token.lemma_)

    return " ".join(tokens_filtered)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_and_convert(text):
    # Tokenize the text using spaCy
    spaCy_tokens = [token.text for token in nlp(text)]
    
    # Convert spaCy tokens to strings
    token_strings = [str(token) for token in spaCy_tokens]
    
    # Map token strings to numerical IDs using the pre-trained tokenizer
    token_ids = tokenizer.convert_tokens_to_ids(token_strings)
    
    return token_ids

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
# Load the CSV file
data = pd.read_csv('mental-health.csv')
# Display the first few rows of the DataFrame
print(data.head())
data = data.drop_duplicates(ignore_index = True)
df_null_values = data.isnull().sum().to_frame().rename(columns = {0:'count'})
df_null_values['%'] = (df_null_values['count'] / len(data)) * 100
df_null_values



                                                text         label
0  I recently went through a breakup and she said...    depression
1  I do not know how to navigate these feelings, ...    depression
2  So I have been with my bf for 5 months , and h...    depression
3  I am so exhausted of this. Just when I think I...  SuicideWatch
4  I have been severly bullied since i was 5 till...    depression


Unnamed: 0,count,%
text,0,0.0
label,0,0.0


In [5]:
data['text_prep'] = data['text'].apply(preprocess_text)


In [6]:
data['token_id'] =  data['text_prep'].apply(tokenize_and_convert)



In [7]:
LABELS = data['label'].unique()
label2id = dict(zip(LABELS, np.arange(len(LABELS), dtype = np.float32)))
data['label_prep'] = data['label'].map(label2id)


In [8]:
data.dropna(subset=['text_prep'], inplace=True)

In [9]:
data.to_csv('preprocessed_data.csv', index=False)

In [4]:
##########
## load data if the preprocessed instead of reprocessed
data = pd.read_csv('preprocessed_data.csv')


In [5]:
def convert_string_to_array(s):
    return np.array(ast.literal_eval(s))

data['token_id']  = data['token_id'] .apply(convert_string_to_array)

In [6]:
data.head()

Unnamed: 0,text,label,text_prep,token_id,label_prep
0,I recently went through a breakup and she said...,depression,recently go breakup say want friend say try ta...,"[3728, 2175, 19010, 2360, 2215, 2767, 2360, 30...",0.0
1,"I do not know how to navigate these feelings, ...",depression,know navigate feeling new feeling stretch unde...,"[2113, 22149, 3110, 2047, 3110, 7683, 3305, 27...",0.0
2,"So I have been with my bf for 5 months , and h...",depression,bf month tell depressed week particular happen...,"[28939, 3204, 2425, 14777, 2733, 3327, 4148, 2...",0.0
3,I am so exhausted of this. Just when I think I...,SuicideWatch,exhausted think finally rest think maybe thing...,"[9069, 2228, 2633, 2717, 2228, 2672, 2518, 270...",1.0
4,I have been severly bullied since i was 5 till...,depression,severly bully till result depressed misanthrop...,"[100, 20716, 6229, 2765, 14777, 100, 100, 3674...",0.0


In [7]:
X = data['token_id']
y = data['label_prep']


SEED = 1235
train_ids_0, test_ids, train_labels_0, test_labels = tts(X, y, test_size = 0.1, random_state = SEED)
train_ids, val_ids, train_labels, val_labels = tts(train_ids_0, train_labels_0, test_size = 0.2, random_state = SEED)

In [8]:
# Hyperparameters
num_attention_heads = int(np.random.uniform(2, 4))
num_attention_heads = 8
embedding_size = int(np.random.uniform(120, 170)) # ninp should be bigger
embedding_size = embedding_size - embedding_size % num_attention_heads
nhidden = int(np.random.uniform(50, 300))
# nhidden = 70
nlayers = int(np.random.uniform(2, 12))
# nlayers = 4
Dropout = np.random.uniform(0.1, 0.3)
criterion = nn.CrossEntropyLoss()
learning_rate = np.random.uniform(1e-3, 0.01)

num_epochs = int(np.random.uniform(2, 4))
batch_size = 32




In [None]:
for lexeme in list(nlp.vocab)[540:940]:
    # Access lexeme attributes such as orth (the text), is_alpha, is_stop, etc.
    print("Text:", lexeme.text)
    print("Text:", lexeme.orth_)
    

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torch.cuda.set_device(0)

In [10]:

#old
# vectorizer = TfidfVectorizer()
# train_vectorizer = vectorizer.fit_transform(train_texts)
# val_vectorizer   = vectorizer.transform(val_texts)
# test_vectorizer  = vectorizer.transform(test_texts)

# train_vectorizer = train_vectorizer.toarray()
# val_vectorizer   = val_vectorizer.toarray()
# test_vectorizer  = test_vectorizer.toarray()

# _, ntoken = train_vectorizer.shape


# train_dataset = TextDataset(train, train_labels)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_dataset = TensorDataset(torch.tensor(val_vectorizer, dtype=torch.float), torch.tensor(val_labels_array, dtype=torch.float))
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# test_dataset = TensorDataset(torch.tensor(test_vectorizer, dtype=torch.float), torch.tensor(test_labels_array, dtype=torch.float))


#new
ntoken = len(nlp.vocab)
val_labels_array = val_labels.values.astype(float)
test_labels_array = test_labels.values.astype(float)

# train_dataset = TextDataset(train_ids, train_labels)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# val_dataset = TextDataset(val_ids, val_labels)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# test_dataset = TextDataset(test_ids, test_labels) 
padded_train = pad_sequence([torch.tensor(seq) for seq in train_ids], batch_first=True, padding_value=0)
padded_val = pad_sequence([torch.tensor(seq) for seq in val_ids], batch_first=True, padding_value=0)
padded_test = pad_sequence([torch.tensor(seq) for seq in test_ids], batch_first=True, padding_value=0)

train_dataset = TensorDataset(torch.tensor(padded_train, dtype=torch.float), torch.tensor(train_labels, dtype=torch.float))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(torch.tensor(padded_val, dtype=torch.float), torch.tensor(val_labels_array, dtype=torch.float))
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

test_dataset =TensorDataset(torch.tensor(padded_test, dtype=torch.float), torch.tensor(test_labels_array, dtype=torch.float))


  train_dataset = TensorDataset(torch.tensor(padded_train, dtype=torch.float), torch.tensor(train_labels, dtype=torch.float))
  val_dataset = TensorDataset(torch.tensor(padded_val, dtype=torch.float), torch.tensor(val_labels_array, dtype=torch.float))
  test_dataset =TensorDataset(torch.tensor(padded_test, dtype=torch.float), torch.tensor(test_labels_array, dtype=torch.float))


In [11]:
# Print the hyperparameters
print("Hyperparameters=")
print(f"num_attention_heads= {num_attention_heads}")
print(f"embedding_size= {embedding_size}")
print(f"nhidden= {nhidden}")
print(f"nlayers= {nlayers}")
print(f"dropout= {Dropout}")
print("criterion=", criterion)
print(f"learning_rate= {learning_rate}")
print(f"num_epochs= {num_epochs}")
print(f"batch_size= {batch_size}")
print(f"ntoken= {ntoken}")

Hyperparameters=
num_attention_heads= 8
embedding_size= 152
nhidden= 259
nlayers= 9
dropout= 0.11944741052411867
criterion= CrossEntropyLoss()
learning_rate= 0.0025338915511665923
num_epochs= 3
batch_size= 32
ntoken= 764


In [14]:
# model = TransformerModel(ntoken, embedding_size, num_attention_heads, nhidden, nlayers, Dropout, norm_first=True).to(device)
model = TransformerTextClassifier( ntoken, embedding_size, num_attention_heads, nhidden, nlayers,2, Dropout, norm_first=True).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)



In [None]:
## in order to run on a trained checkpoint: 

checkpoint = torch.load("model_checkpoint5.pth")
model.load_state_dict(checkpoint)

In [15]:
for epoch in range(num_epochs):
    model.train()
    total_correct=0.0
    total_samples=0.0
    for batch_idx, (texts, labels) in enumerate(train_loader):
    # for (texts, labels) in train_dataset:
        model.train()
        # Convert texts and labels to tensors if necessary
        texts = texts.to(device).long()  
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(texts, None)
        loss = criterion(outputs.squeeze(), labels.float())
        loss.backward()
        optimizer.step()

        predictions = (outputs > 0.5).float()
        total_correct += (predictions[:,0] == labels).sum().item()
        total_samples += labels.size(0)
        
        if batch_idx % 50 == 1:
            accuracy = total_correct / total_samples
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item()}, train_acc: {100. * accuracy:.2f}%")
            total_correct = 0.0
            total_samples = 0.0

 
    # Validation
    model.eval()
    with torch.no_grad():
        total_correct = 0
        total_samples = 0
        for texts, labels in val_loader:
            texts = texts.to(device).long()  
            labels = labels.to(device)
            outputs = model(texts, None)
            predictions = (outputs > 0.5).float()
            total_correct += (predictions[:,0] == labels).sum().item()
            total_samples += labels.size(0)
            accuracy = total_correct / total_samples

    print(f'Epoch [{epoch+1}/{num_epochs}], index : {batch_idx}, Val Acc: {accuracy:.2f}%')

    # Free GPU memory
    torch.cuda.empty_cache()

../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [25,0,0], thread: [96,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [25,0,0], thread: [97,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [25,0,0], thread: [98,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [25,0,0], thread: [99,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [25,0,0], thread: [100,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [25,0,0], thread: [101,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [25,0,0],

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [None]:
import math
def accurary_est(y_pred, y_label):
  y_pred = [0 if torch.sigmoid(i) < 0.5 else 1 for i in torch.tensor(y_pred)]
  return sum([1 for res in range(len(y_pred)) if y_pred[res] == y_label[res]]) / len(y_pred)

In [None]:
# Save the model
torch.save(model.state_dict(), 'model_checkpoint5.pth')

In [None]:
# training results
model.eval()
with torch.no_grad():
    total_correct = 0
    total_samples = 0
    for texts, labels in train_loader:
        texts = texts.to(device).long()  
        labels = labels.to(device)
        outputs = model(texts, None)
        outputs = torch.sigmoid(outputs) 
        predictions = (outputs > 0.5).float()
        total_correct += (predictions[:,0] == labels).sum().item()
        total_samples += labels.size(0)
        accuracy = total_correct / total_samples
        # print(predictions[:,0])
        # print(f'total_correct [{predictions.shape}, total_samples : {labels.shape}')
print(f'total_correct {total_correct}, total_samples : {total_samples}, Val Acc: {100. * accuracy:.2f}%')

NameError: name 'model' is not defined

In [None]:
# Validation
model.eval()
with torch.no_grad():
    total_correct = 0
    total_samples = 0
    for texts, labels in val_loader:
        texts = texts.to(device).long()  
        labels = labels.to(device)
        outputs = model(texts, None)
        # outputs = torch.sigmoid(outputs) 
        predictions = (outputs > 0.5).float()
        total_correct += (predictions[:,0] == labels).sum().item()
        total_samples += labels.size(0)
        accuracy = total_correct / total_samples
        # print(predictions[:,0])
        # print(f'total_correct [{predictions.shape}, total_samples : {labels.shape}')
print(f'total_correct {total_correct}, total_samples : {total_samples}, Val Acc: {100. * accuracy:.2f}%')

total_correct 2411, total_samples : 3663, Val Acc: 65.82%


In [None]:
# Test results
model.eval()
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
with torch.no_grad():
    total_correct = 0
    total_samples = 0

    for texts, labels in test_loader:
        texts = texts.to(device).long()  
        labels = labels.to(device)
        outputs = model(texts, None)
        outputs = torch.sigmoid(outputs) 
        predictions = (outputs > 0.5).float()
        total_correct += (predictions[:,0] == labels).sum().item()
        total_samples += labels.size(0)
        accuracy = total_correct / total_samples
print(f'total_correct {total_correct}, total_samples : {total_samples}, test Acc: {100. * accuracy:.2f}%')

total_correct 1308, total_samples : 2035, test Acc: 64.28%


In [None]:
torch.cuda.empty_cache()
optimizer.zero_grad()

In [None]:
def deallocate_tensors():
    for obj in gc.get_objects():
        if torch.is_tensor(obj) :
            if obj.device.type == 'cuda':
                del obj  # This removes the reference to the tensor
    torch.cuda.empty_cache()  # This releases any remaining GPU memory not in use

deallocate_tensors()



In [None]:
print(torch.cuda.memory_reserved()/(1024 ** 3))

# Release cached memory
torch.cuda.empty_cache()
gc.collect()

# After releasing cached memory
print(torch.cuda.memory_reserved()/(1024 ** 3))

0.27734375


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:

def check_gpu_capacity():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        properties = torch.cuda.get_device_properties(device)
        total_memory = properties.total_memory / (1024 ** 3)  # Convert bytes to gigabytes
        memory_allocated = torch.cuda.memory_allocated(device) / (1024 ** 3)  # Allocated memory in use
        memory_cached = torch.cuda.memory_reserved(device) / (1024 ** 3)  # Cached but not currently in use
        free_memory = total_memory - memory_allocated - memory_cached
        print(f"Total GPU Memory: {total_memory:.2f} GB")
        print(f"Allocated Memory: {memory_allocated:.2f} GB")
        print(f"Cached Memory: {memory_cached:.2f} GB")
        print(f"Free Memory: {free_memory:.2f} GB")
    else:
        print("CUDA is not available.")
# gc.collect()
check_gpu_capacity()


Total GPU Memory: 10.91 GB
Allocated Memory: 0.11 GB
Cached Memory: 0.28 GB
Free Memory: 10.52 GB


In [None]:
# import torch.nn as nn
# class block(nn.Module):
# 	def __init__(self):
# 		super(block, self).__init__()
# 		self.attention = nn.MultiheadAttention(embeds_size, num_heads, batch_first=True)
# 		self.ffn = nn.Sequential(
# 			nn.Linear(embeds_size, 2 * embeds_size),
# 			nn.LeakyReLU(),
# 			nn.Linear(2 * embeds_size, embeds_size),
# 		)
# 		self.drop1 = nn.Dropout(drop_prob)
# 		self.drop2 = nn.Dropout(drop_prob)
# 		self.ln1 = nn.LayerNorm(embeds_size)
# 		self.ln2 = nn.LayerNorm(embeds_size)

# 	def forward(self, hidden_state):
# 		attn, _ = self.attention(hidden_state, hidden_state, hidden_state, need_weights=False)
# 		attn = self.drop1(attn)
# 		out = self.ln1(hidden_state + attn)
# 		observed = self.ffn(out)
# 		observed = self.drop2(observed)
# 		return self.ln2(out + observed)
	

# class transformer(nn.Module):
#     def __init__(self):
#         super(transformer, self).__init__()

#         self.tok_emb = nn.Embedding(vocab_size, embeds_size)
#         self.pos_emb = nn.Embedding(block_size, embeds_size)
#         self.block = block()
#         self.ln1 = nn.LayerNorm(embeds_size)
#         self.ln2 = nn.LayerNorm(embeds_size)

#         self.classifier_head = nn.Sequential(
#             nn.Linear(embeds_size, embeds_size),
#             nn.LeakyReLU(),
#             nn.Dropout(drop_prob),
#             nn.Linear(embeds_size, embeds_size),
#             nn.LeakyReLU(),
#             nn.Linear(embeds_size, num_classes),
#             nn.Softmax(dim=1),
#         )

#         print("number of parameters: %.2fM" % (self.num_params()/1e6,))

#     def num_params(self):
#         n_params = sum(p.numel() for p in self.parameters())
#         return n_params

#     def forward(self, seq):
#         B,T = seq.shape
#         embedded = self.tok_emb(seq)
#         embedded = embedded + self.pos_emb(torch.arange(T, device=device))
#         output = self.block(embedded)
#         output = output.mean(dim=1)
#         output = self.classifier_head(output)
#         return output
    

# model = transformer()
# model.to(device)
# vocab_size = 20000

# block_size = 200
# embeds_size = 100
# num_classes = 2
# drop_prob = 0.13
# batch_size = 32
# epochs = 30
# num_heads = 4
# head_size = embeds_size // num_heads
# model_path = 'model_classification.pth'
# model_loader = False


#     for epoch in range(epochs):
# 	losses = 0
# 	for (inputs, targets) in train_data:
# 		inputs = inputs.to(device)
# 		targets = targets.to(device)
# 		output = model(inputs)
# 		loss = model_loss(output, targets)
# 		model_optimizer.zero_grad()
# 		loss.backward()
# 		model_optimizer.step()
# 		losses += loss.item()
# 	print(f'[{epoch}][Train]', losses)
# 	model.eval()
# 	test_loss = 0
# 	passed = 0
# 	for (inputs, targets) in test_data:
# 		with torch.no_grad():
# 			inputs = inputs.to(device)
# 			targets = targets.to(device)
# 			outputs = model(inputs)
# 			if outputs.argmax() == targets.argmax():
# 				passed += 1
# 	model.train()
# 	print(f'[{epoch}][Test]', ', accuracy', passed / len(dataset_y))