In [1]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 67.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 45.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [2]:
import pandas as pd
import numpy as np

In [5]:
df =  pd.read_csv('Emotional_Support_Dataset_new.csv',index_col=0)

In [6]:
df.head()

Unnamed: 0,informational_support,emotional_support,esteem_support,tangible_support,network_support,Question_Combined
0,1,1,0,0,0,"trouble sleeping? I am a 16yo male, and I have..."
1,0,1,1,0,0,Is it cancer??? A few days ago I started getti...
2,1,1,0,0,0,*doctor advice or information needed please*? ...
3,1,1,0,0,0,is yelling all the time for little things a me...
4,1,1,0,0,0,How can I prevnet haveing a heart attack? I'm ...


In [7]:
df['Question_Combined'] = df['Question_Combined'].str.lower()

In [8]:
df.columns

Index(['informational_support', 'emotional_support', 'esteem_support',
       'tangible_support', 'network_support', 'Question_Combined'],
      dtype='object')

In [9]:
df['emotional_esteem'] = df['emotional_support'] + df['esteem_support']
df['emotional_esteem'] = df['emotional_esteem'].apply(lambda x: 1 if x>=1 else 0)

df['informational_tangible'] = df['informational_support'] + df['tangible_support']
df['informational_tangible'] = df['informational_tangible'].apply(lambda x: 1 if x>=1 else 0)

In [10]:
df.drop(columns=['emotional_support', 'esteem_support',
                'informational_support', 'tangible_support'],
        inplace=True)
df.head()

Unnamed: 0,network_support,Question_Combined,emotional_esteem,informational_tangible
0,0,"trouble sleeping? i am a 16yo male, and i have...",1,1
1,0,is it cancer??? a few days ago i started getti...,1,0
2,0,*doctor advice or information needed please*? ...,1,1
3,0,is yelling all the time for little things a me...,1,1
4,0,how can i prevnet haveing a heart attack? i'm ...,1,1


In [11]:
df = df[['Question_Combined', 'network_support',
         'emotional_esteem', 'informational_tangible']]
df.head()

Unnamed: 0,Question_Combined,network_support,emotional_esteem,informational_tangible
0,"trouble sleeping? i am a 16yo male, and i have...",0,1,1
1,is it cancer??? a few days ago i started getti...,0,1,0
2,*doctor advice or information needed please*? ...,0,1,1
3,is yelling all the time for little things a me...,0,1,1
4,how can i prevnet haveing a heart attack? i'm ...,0,1,1


In [12]:
y = df.iloc[:, 1:].values.astype(float)

In [13]:
unique, counts = np.unique(y[:, 2], return_counts=True)
print(np.asarray((unique, counts)).T)

[[0.000e+00 4.850e+02]
 [1.000e+00 1.015e+03]]


## Splitting the Data into train and test set

In [14]:
X = df['Question_Combined'].values

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [16]:
unique, counts = np.unique(y_train[:, 2], return_counts=True)
print(np.asarray((unique, counts)).T)

[[  0. 387.]
 [  1. 813.]]


### Set up GPU

In [17]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


In [18]:
from transformers import RobertaTokenizer,DistilBertTokenizer, BertTokenizer,BertModel, RobertaModel, AlbertModel, GPT2Model, DistilBertModel, AdamW, get_linear_schedule_with_warmup


In [19]:
model_name = 'bert-base-uncased'
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from transformers import BertTokenizer



# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:

        encoded_sent = tokenizer.encode_plus(
            text=sent,  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,                  # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [None]:
# Concatenate train data and test data
all_tweets = X

# Encode our concatenated data
encoded_tweets = [tokenizer.encode(sent, add_special_tokens=True) for sent in all_tweets]

# Find the maximum length
max_len = max([len(sent) for sent in encoded_tweets])
print('Max length: ', max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (875 > 512). Running this sequence through the model will result in indexing errors


Max length:  875


In [None]:
# Specify `MAX_LEN`
MAX_LEN = 512

# Print sentence 0 and its encoded token ids
token_ids = list(preprocessing_for_bert([X[0]])[0].squeeze().numpy())
print('Original: ', X[0])
print('Token IDs: ', token_ids)

# Run function `preprocessing_for_bert` on the train set and the validation set
print('Tokenizing data...')
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_test)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  a pain in the neck! (literally), please help? let me start by saying, i am playing the violin in 6 different groups as of now and working at a doctors office full time, so this may be stress or violin related pain. 

a few days ago i started getting this shoulder pain in my right shoulder, i passed it off as tendonitis from violin playing, today it has started hurting all over my shoulder, mainly in the front and back near the top (this is all near the joint). also, when i take a deep breath, i get this pain a few inches below my arm-pit on my side, over (or under?) my richs, this is also on my right side. when taking deep breaths it also hurts my shoulder more and a little spot on the lower, back/right side of my  neck.
what could this possibly be? it hurts like heck!
serious answers only please.
thank you
Token IDs:  [101, 1037, 3255, 1999, 1996, 3300, 999, 1006, 6719, 1007, 1010, 3531, 2393, 1029, 2292, 2033, 2707, 2011, 3038, 1010, 1045, 2572, 2652, 1996, 6710, 1999

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Convert other data types to torch.Tensor
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(torch.from_numpy(y_test))

batch_size = 8

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

  """


In [None]:
%%time
import torch
import torch.nn as nn
from transformers import BertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):


    def __init__(self, freeze_bert=True):

        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, 3

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained(model_name)

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            # nn.Dropout(0.4),
            nn.Linear(H, D_out),
            # nn.Sigmoid()
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

CPU times: user 45 µs, sys: 0 ns, total: 45 µs
Wall time: 48.9 µs


crossentropy, crossentropywithlogits

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=True)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=1e-4,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

## Train Model

In [None]:
import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            # print(logits, b_labels)
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)

        if evaluation == True:


            val_loss, val_accuracy = evaluate(model, val_dataloader)

            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        # print(logits, b_labels)
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        # preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = ((logits>0.5)*1.0 == b_labels).all(dim=1).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [None]:
# for id, mask, label in train_dataloader:
#     id, mask, label = [item.to(device) for item in [id, mask, label]]
#     logits = bert_classifier(id, mask)
#     # print(logits, label)
#     loss = loss_fn(logits, b_labels)
#     print(loss)
#     break

In [None]:
set_seed(42)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=25)
train(bert_classifier, train_dataloader, val_dataloader, epochs=25, evaluation=True)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   1.406461   |     -      |     -     |   7.71   
   1    |   40    |   1.344267   |     -      |     -     |   4.93   
   1    |   60    |   1.484607   |     -      |     -     |   5.02   
   1    |   80    |   1.499253   |     -      |     -     |   5.01   
   1    |   100   |   1.506781   |     -      |     -     |   5.04   
   1    |   120   |   1.421890   |     -      |     -     |   5.11   
   1    |   140   |   1.454676   |     -      |     -     |   5.11   
   1    |   160   |   1.495421   |     -      |     -     |   5.13   
   1    |   180   |   1.562110   |     -      |     -     |   5.19   
   1    |   200   |   1.384369   |     -      |     -     |   5.25   
   1    |   212   |   1.444778   |     -      |     -     |   3.03   
----------------------------------------------------------------------

## Checking the performance on the input dataset

In [None]:
    train_set_predictions = []
    train_set_labels = []
    train_acc = []
    for batch in train_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        # Compute logits
        with torch.no_grad():
            logits = bert_classifier(b_input_ids, b_attn_mask)
            predictions = (logits>0.5)*1.0
            b_acc = (predictions == b_labels).all(dim=1) * 1.0

            train_set_labels.extend([i for i in b_labels.to('cpu').numpy()])
            train_set_predictions.extend([i for i in predictions.to('cpu').numpy()])
        train_acc.append(b_acc.mean().item())



In [None]:
train_set_labels = np.array(train_set_labels)
train_set_predictions = np.array(train_set_predictions)

In [None]:
unique, counts = np.unique(train_set_predictions[:, 1], return_counts=True)
print(np.asarray((unique, counts)).T)

[[   0. 1700.]]


In [None]:
unique, counts = np.unique(train_set_predictions[:, 2], return_counts=True)
print(np.asarray((unique, counts)).T)

[[  0. 905.]
 [  1. 795.]]


In [None]:
unique, counts = np.unique(train_set_predictions[:, 0], return_counts=True)
print(np.asarray((unique, counts)).T)

[[   0. 1700.]]


In [None]:
unique, counts = np.unique(train_set_labels[:, 2], return_counts=True)
print(np.asarray((unique, counts)).T)

[[0.000e+00 5.640e+02]
 [1.000e+00 1.136e+03]]


In [None]:
np.mean(train_acc)

0.21889671361502347

In [None]:
validation_set_predictions = []
validation_set_labels = []
valid_acc = []
for batch in val_dataloader:
    # Load batch to GPU
    b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
    validation_set_labels.extend([i for i in b_labels.to('cpu').numpy()])
    # Compute logits
    with torch.no_grad():
        logits = bert_classifier(b_input_ids, b_attn_mask)
        predictions = (logits>0.5)*1.0
        b_acc = (predictions == b_labels).all(dim=1) * 1.0
    validation_set_predictions.extend([i for i in predictions.to('cpu').numpy()])
    valid_acc.append(b_acc.mean().item())

In [None]:
np.mean(valid_acc)

0.1736111111111111

## Evaluations using scikit learn

In [None]:
validation_set_predictions =  np.array(validation_set_predictions)

In [None]:
validation_set_labels = np.array(validation_set_labels)

In [None]:
df.columns

Index(['Question_Combined', 'network_support', 'emotional_esteem',
       'informational_tangible'],
      dtype='object')

In [None]:
network_support_label = validation_set_labels[:, 0]
emotional_esteem_label = validation_set_labels[:, 1]
informational_tangible_label = validation_set_labels[:, 2]

In [None]:
network_support_predictions = validation_set_predictions[:, 0]
emotional_esteem_predictions = validation_set_predictions[:, 1]
informational_tangible_predictions = validation_set_predictions[:, 2]

In [None]:
from sklearn.metrics import classification_report

print(classification_report(network_support_label, network_support_predictions))

              precision    recall  f1-score   support

         0.0       0.58      1.00      0.73       247
         1.0       0.00      0.00      0.00       179

    accuracy                           0.58       426
   macro avg       0.29      0.50      0.37       426
weighted avg       0.34      0.58      0.43       426



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
print(classification_report(emotional_esteem_label, emotional_esteem_predictions))

              precision    recall  f1-score   support

         0.0       0.69      1.00      0.82       296
         1.0       0.00      0.00      0.00       130

    accuracy                           0.69       426
   macro avg       0.35      0.50      0.41       426
weighted avg       0.48      0.69      0.57       426



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
print(classification_report(informational_tangible_label, informational_tangible_predictions))

              precision    recall  f1-score   support

         0.0       0.30      0.45      0.36       146
         1.0       0.61      0.45      0.52       280

    accuracy                           0.45       426
   macro avg       0.45      0.45      0.44       426
weighted avg       0.50      0.45      0.46       426



In [None]:
unique, counts = np.unique(informational_tangible_label, return_counts=True)
print(np.asarray((unique, counts)).T)

[[  0. 146.]
 [  1. 280.]]
