In [None]:
# Mount Google Drive (Colab only); the CSV files read below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import torch
from pathlib import Path
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from textblob import TextBlob
import logging

logging.basicConfig(level=logging.ERROR)

%matplotlib inline

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and every CUDA device),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic runs.

    Args:
        seed (int): Seed value shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick kernels per run, which can
    # differ between runs; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False
#seed_everything(2020)
seed_everything(0)

In [None]:
!pip install folium==0.2.1 -q
!pip install textattack -q

In [None]:
data = pd.read_csv('/content/drive/My Drive/Tech4MentalHealth/data/Train.csv')

In [None]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Args:
        text (str): Raw input text.

    Returns:
        str: The cleaned text. Matched spans are deleted (not replaced by a
        space), so the whitespace around them is left untouched.
    '''
    text = text.lower()
    # Raw strings: the regex escapes must reach `re` unchanged; in a normal
    # string literal they are invalid Python escapes and trigger warnings.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Alias used with ``Series.apply`` in the following cells.
round1 = clean_text_round1

# Second cleaning pass
def clean_text_round2(text):
    '''Delete curly quotes, ellipsis characters and newlines that survived the first pass.'''
    for pattern in ('[‘’“”…]', '\n'):
        text = re.sub(pattern, '', text)
    return text


def round2(x):
    '''Wrapper around ``clean_text_round2`` for use with ``Series.apply``.'''
    return clean_text_round2(x)

In [None]:
# Run both cleaning passes over the raw text and keep the result as a one-column frame.
data_clean = data.text.apply(round1).apply(round2).to_frame()

In [None]:
# Overwrite the raw text column with the cleaned text (this mutates `data`).
data.text = data_clean.text
data.head()

Unnamed: 0,ID,text,label
0,SUAVK39Z,i feel that it was better i dieam happy,Depression
1,9JDAGUV3,why do i get hallucinations,Drugs
2,419WR1LQ,i am stresseed due to lack of financial suppor...,Depression
3,6UY7DX6Q,why is life important,Suicide
4,FYC0FTFB,how could i be helped to go through the depres...,Depression


In [None]:
# Integer-encode the labels: y_fac[0] holds the codes, y_fac[1] the label names in code order.
y_fac = data['label'].factorize()
examples = data[['text', 'label']].assign(label=y_fac[0])

In [None]:
# Persist the text/label pairs; the textattack command below reads this file.
examples.to_csv('examples.csv', index=False)

AUGMENTATION_RECIPE_NAMES 

{
    "wordnet": "textattack.augmentation.WordNetAugmenter",
    "embedding": "textattack.augmentation.EmbeddingAugmenter",
    "charswap": "textattack.augmentation.CharSwapAugmenter",
}

In [None]:
# Augment the `text` column of examples.csv with the WordNet recipe.
# NOTE(review): the next cell reads 'augment.csv', presumably this command's default output path — confirm.
!textattack augment --csv examples.csv --input-column text --recipe wordnet --num-words-to-swap 2 --transformations-per-example 1 #--exclude-original

In [None]:
augment_data = pd.read_csv('augment.csv')
augment_data.head()

Unnamed: 0,text,label
0,i feel that it was better i dieam happy,0
1,i feel that it was respectable i dieam glad,0
2,why do i get hallucinations,1
3,why do i amaze hallucination,1
4,i am stresseed due to lack of financial suppor...,0


In [None]:
from sklearn.utils import resample, shuffle

In [None]:
# Fixed random_state so the shuffled order is the same after every kernel restart.
augment_data = shuffle(augment_data, random_state=0)
augment_data.head()

Unnamed: 0,text,label
18,is heaven open for us who smoke bhang,1
342,is there a peer counselling programme,0
467,what could i do to mess with frustration,1
851,what are the upshot of bhang,1
950,how can i get rid of depression,0


In [None]:
from sklearn.model_selection import train_test_split

# The augmented data is not used here; the two lines below are kept disabled.
#X = augment_data.text.values
#y = augment_data.label.values
X = data.text.values
y = y_fac[0] #data.label.values

# 70/30 split, stratified on y so both parts keep the class proportions.
X_train, X_val, y_train, y_val =\
    train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [None]:
# Read the test split and run it through the same two cleaning passes as the training text
test_data = pd.read_csv('/content/drive/My Drive/Tech4MentalHealth/data/Test.csv')
test_data.text = test_data.text.apply(round1).apply(round2)

test_data.head()

Unnamed: 0,ID,text
0,02V56KMO,how to overcome bad feelings and emotions
1,03BMGTOK,i feel like giving up in life
2,03LZVFM6,i was so depressed feel like got no strength t...
3,0EPULUM5,i feel so low especially since i had no one to...
4,0GM4C5GD,can i be successful when i am a drug addict


In [None]:
#y_train

In [None]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

if not use_gpu:
    print('No GPU available, using the CPU instead.')
else:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: Tesla P4


In [None]:
import nltk
# Uncomment to download "stopwords"
nltk.download("stopwords")
from nltk.corpus import stopwords

_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading the list only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_preprocessing(s, stop_words=None):
    """Normalise a sentence for the TF-IDF baseline.

    Lowercases, expands ``'t`` to `` not``, drops ``@name`` mentions, removes
    punctuation except ``?``, removes stop words (keeping ``not`` and ``can``)
    and collapses whitespace.

    NOTE: a later cell defines another ``text_preprocessing`` (for BERT) that
    replaces this one as soon as it is executed.

    Args:
        s (str): Input sentence.
        stop_words (collection of str, optional): Words to drop. Defaults to
            NLTK's English stop-word list.

    Returns:
        str: The cleaned, single-spaced sentence.
    """
    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'. The stop-word collection is
    # resolved once per call instead of being re-read for every word.
    if stop_words is None:
        stop_words = _english_stopwords()
    always_keep = ('not', 'can')
    s = " ".join(word for word in s.split()
                 if word not in stop_words or word in always_keep)
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()

    return s

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocess text
X_train_preprocessed = np.array([text_preprocessing(text) for text in X_train])
X_val_preprocessed = np.array([text_preprocessing(text) for text in X_val])

# Calculate TF-IDF
tf_idf = TfidfVectorizer()

# Fit on the training split only; the validation split is transformed with that fitted vectorizer.
X_train_tfidf = tf_idf.fit_transform(X_train_preprocessed)
X_val_tfidf = tf_idf.transform(X_val_preprocessed)

CPU times: user 397 ms, sys: 56.9 ms, total: 454 ms
Wall time: 456 ms


In [None]:
#X_train_preprocessed[:10]

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score, StratifiedShuffleSplit, cross_validate

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Compute predicted probabilities
# Multinomial Naive Bayes baseline on the TF-IDF features of the 70% training split.
nb_model = MultinomialNB(alpha=1.8)
nb_model.fit(X_train_tfidf, y_train)
probs = nb_model.predict_proba(X_val_tfidf)

# Evaluate the classifier
# (`evaluate_roc` is not defined in the visible part of this notebook, so the call stays disabled.)
#evaluate_roc(probs, y_val)

In [None]:
# Preprocess every training text and refit the shared `tf_idf` vectorizer on all of it.
# NOTE: this refits the vectorizer object that was fitted on X_train above, and the name
# `train` is rebound to the training function defined further down in the notebook.
train = np.array([text_preprocessing(text) for text in X])
train_idf = tf_idf.fit_transform(train)

In [None]:
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

In [None]:
# Vectorise inside the pipeline so the TF-IDF vocabulary and IDF weights are
# learned from each fold's training part only. Fitting TF-IDF on all rows
# before cross-validation lets the held-out rows shape the features they are
# scored on.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

clf = make_pipeline(
    TfidfVectorizer(preprocessor=text_preprocessing),  # same cleaning as the split above
    MultinomialNB(),
)
cv_results_nb = cross_validate(clf, X, y, cv=cv, scoring='accuracy', return_train_score=True)

In [None]:
np.mean(cv_results_nb['train_score']), np.mean(cv_results_nb['test_score'])

(0.8426829268292682, 0.764516129032258)

## Finetuning BERT

In [None]:
!pip install transformers -q #==2.11.0 -q

In [None]:
def text_preprocessing(text):
    """Light clean-up applied to a sentence before BERT tokenization.

    Drops ``@name`` mentions that are followed by a whitespace character,
    turns ``&amp;`` back into ``&`` and collapses whitespace runs.
    """
    substitutions = (
        (r'(@.*?)[\s]', ' '),  # '@name' together with the whitespace character after it
        (r'&amp;', '&'),       # HTML-escaped ampersand
        (r'\s+', ' '),         # any run of whitespace becomes one space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data, max_len=None):
    """Tokenize sentences into fixed-length BERT inputs.

    Each sentence is cleaned with ``text_preprocessing``, wrapped in
    ``[CLS]``/``[SEP]``, mapped to token ids and truncated or padded to
    ``max_len``.

    Args:
        data: Iterable of raw sentences (str).
        max_len (int, optional): Length every sequence is truncated/padded to.
            Defaults to the notebook-level ``MAX_LEN`` at call time.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: ``(input_ids, attention_masks)``,
        one row of ``max_len`` entries per sentence.
    """
    if max_len is None:
        max_len = MAX_LEN

    input_ids = []
    attention_masks = []

    for sent in data:
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent),  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=max_len,             # Max length to truncate/pad
            padding='max_length',           # Replaces the deprecated pad_to_max_length=True
            truncation=True,
            return_attention_mask=True,     # Return attention mask
        )
        input_ids.append(encoded_sent['input_ids'])
        attention_masks.append(encoded_sent['attention_mask'])

    # Explicit integer dtype so an empty input still yields index tensors.
    input_ids = torch.tensor(input_ids, dtype=torch.long)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long)

    return input_ids, attention_masks



In [None]:
# Concatenate train data and test data
all_data = np.concatenate([data.text.values, test_data.text.values])

# Encode our concatenated data
encoded_texts = [tokenizer.encode(sent, add_special_tokens=True) for sent in all_data]

# Find the maximum length
max_len = max([len(sent) for sent in encoded_texts])
print('Max length: ', max_len)

Max length:  38


In [None]:
# Specify `MAX_LEN`
# The previous cell's recorded output reports a longest encoded sentence of 38 tokens,
# so 100 does not truncate that data; `preprocessing_for_bert` pads every sequence to this length.
MAX_LEN = 100

In [None]:
# Print sentence 0 and its encoded token ids
token_ids = list(preprocessing_for_bert([X[0]])[0].squeeze().numpy())
print('Original: ', X[0])
print('Token IDs: ', token_ids)

# Run function `preprocessing_for_bert` on the train set and the validation set
print('Tokenizing data...')
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)

Original:  i feel that it was better i dieam happy
Token IDs:  [101, 1045, 2514, 2008, 2009, 2001, 2488, 1045, 3280, 3286, 3407, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Tokenizing data...


#### Data loader

In [None]:

# Convert other data types to torch.Tensor
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [None]:
%%time
#import torch
import torch.nn as nn
from transformers import BertModel
import transformers

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward classification head.

    Args:
        freeze_bert (bool): When True, the encoder weights get
            ``requires_grad = False`` and only the head is trained.
        hidden_size (int): Width of the head's hidden layer.
        num_labels (int): Number of logits produced per example.
        dropout (float): Dropout probability used inside the head.
        model_name (str): Checkpoint passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, freeze_bert=False, hidden_size=50, num_labels=4,
                 dropout=0.2, model_name='bert-base-uncased'):
        super().__init__()

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained(model_name)

        # Take the encoder width from the loaded config instead of hard-coding
        # it (768 for bert-base-uncased), so other checkpoints work as well.
        encoder_dim = self.bert.config.hidden_size

        # Feed-forward head: Linear -> ReLU -> Dropout -> Linear
        self.classifier = nn.Sequential(
            nn.Linear(encoder_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_labels)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        Args:
            input_ids: Token-id tensor passed to BERT as ``input_ids``.
            attention_mask: Mask tensor passed to BERT as ``attention_mask``.

        Returns:
            torch.Tensor: Output of the classification head (raw logits).
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        return self.classifier(last_hidden_state_cls)

CPU times: user 43 µs, sys: 21 µs, total: 64 µs
Wall time: 69.1 µs


lr=5e-5,    # Default learning rate
eps=1e-8    # Default epsilon value
no dropout

In [None]:
# from collections import  Counter

# labels_count = Counter(list(y))
# label_weights = [labels_count[0]/labels_count[0], labels_count[0]/labels_count[1], labels_count[0]/labels_count[2], labels_count[0]/labels_count[3]]
# label_weights = torch.from_numpy(np.array(label_weights))
# label_weights

In [None]:
import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()


def train(model,optimizer, scheduler, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Fine-tune ``model`` and print a progress table for every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns logits.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        val_dataloader: Validation batches; required when ``evaluation`` is set.
        epochs (int): Number of passes over ``train_dataloader``.
        evaluation (bool): When truthy, run ``evaluate`` after every epoch.

    Raises:
        ValueError: If ``evaluation`` is requested without a ``val_dataloader``.
    """
    # Fail before training starts rather than after the first full epoch.
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):

        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)

        if evaluation:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):
    """Compute the mean loss and the accuracy of ``model`` on a validation set.

    Both metrics are weighted by batch size, so a smaller final batch does not
    count as much as a full one (averaging per-batch means over-weights it).

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns logits.
        val_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        tuple[float, float]: ``(val_loss, val_accuracy)`` with the accuracy in
        percent; both are ``nan`` when the loader yields no examples.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    loss_sum, n_correct, n_examples = 0.0, 0, 0

    with torch.no_grad():
        for batch in val_dataloader:
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            logits = model(b_input_ids, b_attn_mask)
            n_batch = b_labels.size(0)

            # `loss_fn` is the notebook's nn.CrossEntropyLoss(), which returns the
            # batch mean; scale it back to a sum before accumulating.
            loss_sum += loss_fn(logits, b_labels).item() * n_batch

            preds = torch.argmax(logits, dim=1).flatten()
            n_correct += (preds == b_labels).sum().item()
            n_examples += n_batch

    if n_examples == 0:
        return float('nan'), float('nan')

    val_loss = loss_sum / n_examples
    val_accuracy = 100.0 * n_correct / n_examples

    return val_loss, val_accuracy

In [None]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader, activation='sigmoid'):
    """Run ``model`` over a dataloader and return per-class scores.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns logits.
        test_dataloader: Yields batches whose first two items are the input ids
            and the attention mask; any further items are ignored.
        activation (str): ``'sigmoid'`` (default, the behaviour this notebook
            used) squashes every logit independently, so rows need not sum to 1.
            ``'softmax'`` yields a distribution over classes, which matches the
            cross-entropy loss the model is trained with.

    Returns:
        numpy.ndarray: One row per example, one column per class.

    Raises:
        ValueError: If ``activation`` is not ``'sigmoid'`` or ``'softmax'``.
    """
    if activation not in ('sigmoid', 'softmax'):
        raise ValueError(f"activation must be 'sigmoid' or 'softmax', got {activation!r}")

    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Send the inputs to wherever the model's weights live.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    all_logits = []

    with torch.no_grad():
        for batch in test_dataloader:
            b_input_ids, b_attn_mask = tuple(t.to(target_device) for t in batch[:2])
            # Move each result to the CPU right away so logits do not pile up on the GPU.
            all_logits.append(model(b_input_ids, b_attn_mask).cpu())

    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    if activation == 'softmax':
        probs = torch.softmax(all_logits, dim=1)
    else:
        probs = torch.sigmoid(all_logits)

    return probs.numpy()

In [None]:
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

def initialize_model(train_dataloader, lr, epochs=4):
    """Create the classifier, its optimizer and a linear learning-rate schedule.

    Args:
        train_dataloader: Training loader; its length sets the number of
            optimizer steps per epoch.
        lr (float): Peak learning rate.
        epochs (int): Number of epochs the schedule is laid out for.

    Returns:
        tuple: ``(bert_classifier, optimizer, scheduler)``.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer. eps and weight_decay are set explicitly to the
    # values the transformers implementation used, because torch.optim.AdamW
    # has different defaults.
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=lr,
                      eps=1e-6,
                      weight_decay=0.0
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler (no warm-up, linear decay to zero)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [None]:
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

In [None]:
def crossVall(X, y, test_data, k, batch_size=16, lr=1e-4, epochs=3):
    """Train one BERT classifier per stratified fold and predict the test set with each.

    Args:
        X: Array of training sentences, indexable by integer arrays.
        y: Array of integer class labels aligned with ``X``.
        test_data: Frame whose ``text`` column holds the test sentences.
        k (int): Number of stratified folds.
        batch_size (int): Batch size for the train, validation and test loaders.
        lr (float): Learning rate passed to ``initialize_model``.
        epochs (int): Training epochs per fold.

    Returns:
        list: One array of test-set scores per fold, as returned by ``bert_predict``.
    """
    predictions = []
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)

    # The test inputs are identical for every fold, so tokenize them only once.
    test_inputs, test_masks = preprocessing_for_bert(test_data.text)
    test_dataset = TensorDataset(test_inputs, test_masks)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

    for train_index, val_index in kf.split(X, y):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        train_inputs, train_masks = preprocessing_for_bert(X_train)
        val_inputs, val_masks = preprocessing_for_bert(X_val)

        train_labels = torch.tensor(y_train)
        val_labels = torch.tensor(y_val)

        train_data = TensorDataset(train_inputs, train_masks, train_labels)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

        # Create the DataLoader for our validation set
        val_data = TensorDataset(val_inputs, val_masks, val_labels)
        val_sampler = SequentialSampler(val_data)
        val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

        # Same seed for every fold, so folds differ only in their data.
        seed_everything(0)
        bert_classifier, optimizer, scheduler = initialize_model(train_dataloader, lr=lr, epochs=epochs)
        train(bert_classifier, optimizer, scheduler, train_dataloader, val_dataloader, epochs=epochs, evaluation=True)

        predictions.append(bert_predict(bert_classifier, test_dataloader))

        # Release this fold's model before the next one is created, so two
        # models never sit in GPU memory at the same time.
        del bert_classifier, optimizer, scheduler
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print(f'******************************************************************************************************')
    return predictions

In [None]:
predictions = crossVall(X, y, test_data, 5)

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.981169   |     -      |     -     |   7.33   
   1    |   30    |   0.817433   |     -      |     -     |   3.45   
----------------------------------------------------------------------
   1    |    -    |   0.928351   |  0.545982  |   80.99   |   11.55  
----------------------------------------------------------------------


 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   2    |   20    |   0.516825   |     -      |     -     |   7.31   
   2    |   30    |   0.382502   |     -      |     -     |   3.44   
----------------------------------------------------------------------
   2    |    -    |   0.473495   |  0.398597  |   87.50   |   11.53  
----------------------------------------------------------------

In [None]:
# Average the per-fold score arrays element-wise.
probs = np.mean(predictions, axis=0)

# Column c of `probs` belongs to the class whose factorize() code is c, so take
# the label names in that order. A hand-written list in submission-file order
# attaches scores to the wrong class names (the explicit index mapping used for
# `sub` further down shows that Alcohol and Drugs are in a different position).
cols = list(y_fac[1])

In [None]:
probs[:5]

array([[0.668847  , 0.32125652, 0.5935748 , 0.37265247],
       [0.87168264, 0.36309373, 0.29112014, 0.33808142],
       [0.872165  , 0.36667728, 0.28786415, 0.34008616],
       [0.8714949 , 0.3618462 , 0.2931542 , 0.3377226 ],
       [0.37495834, 0.75758165, 0.33942074, 0.5437318 ]], dtype=float32)

In [None]:
subcv = pd.read_csv('/content/drive/My Drive/Tech4MentalHealth/SampleSubmission.csv')

In [None]:
# Assign by label name in factorize() order: column c of `probs` is the class
# with code c, which is not the column order of the submission file.
subcv[list(y_fac[1])] = probs

In [None]:
subcv.head()

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.726372,0.039653,0.900649,0.330668
1,03BMGTOK,0.998814,0.082974,0.215542,0.065024
2,03LZVFM6,0.998832,0.086674,0.226346,0.062449
3,0EPULUM5,0.998834,0.087499,0.230338,0.062268
4,0GM4C5GD,0.104409,0.988425,0.051728,0.670135


In [None]:
subcv = subcv.round(4)
subcv.head()

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.8469,0.1094,0.802,0.3109
1,03BMGTOK,0.9964,0.0752,0.3789,0.0753
2,03LZVFM6,0.9964,0.0923,0.3619,0.0787
3,0EPULUM5,0.9964,0.0918,0.3727,0.0767
4,0GM4C5GD,0.2208,0.957,0.1455,0.5096


In [None]:
subcv.isnull().sum()

ID            0
Depression    0
Alcohol       0
Suicide       0
Drugs         0
dtype: int64

In [None]:
# index=False like every other save in this notebook; otherwise the row index
# is written as an extra unnamed first column.
subcv.to_csv('bert_cv_5.csv', index=False)

In [None]:
# set_seed(2020)    # Set seed for reproducibility
seed_everything(0)
# One-epoch run on the module-level train_dataloader, evaluated on val_dataloader after the epoch.
bert_classifier_70, optimizer_70, scheduler_70 = initialize_model(train_dataloader, lr=0.00015765668045462564, epochs=1)
train(bert_classifier_70, optimizer_70, scheduler_70, train_dataloader, val_dataloader, epochs=1, evaluation=True)

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.721568   |     -      |     -     |   2.99   
   1    |   40    |   0.342444   |     -      |     -     |   2.84   
   1    |   60    |   0.298672   |     -      |     -     |   2.84   
   1    |   66    |   0.281306   |     -      |     -     |   0.79   
----------------------------------------------------------------------
   1    |    -    |   0.442733   |  0.224324  |   93.32   |   10.59  
----------------------------------------------------------------------


Training complete!


In [None]:
# Build the test DataLoader here: no module-level `test_dataloader` exists at
# this point (crossVall only creates a local one, and the cell that defines it
# comes later), so a fresh Restart & Run All would stop with a NameError.
test_inputs, test_masks = preprocessing_for_bert(test_data.text)
test_dataset = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=32)

preds = bert_predict(bert_classifier_70, test_dataloader)

In [None]:
preds[:10]

array([[5.8435512e-01, 1.7746354e-02, 3.5293564e-01, 4.4962913e-02],
       [9.9776292e-01, 4.8930751e-04, 1.5237356e-03, 2.2400389e-04],
       [9.9750996e-01, 4.1609415e-04, 1.8229393e-03, 2.5101993e-04],
       [9.9751842e-01, 5.3944153e-04, 1.7058271e-03, 2.3631383e-04],
       [2.6553471e-03, 4.5908135e-01, 1.3007770e-02, 5.2525556e-01],
       [9.9578530e-01, 5.7565846e-04, 3.2330388e-03, 4.0603071e-04],
       [9.8777997e-01, 8.3185476e-04, 1.0293129e-02, 1.0950903e-03],
       [9.9811256e-01, 4.1482673e-04, 1.3134124e-03, 1.5915725e-04],
       [2.5347032e-04, 5.9013732e-04, 5.6118594e-04, 9.9859530e-01],
       [2.0346777e-04, 2.0202547e-03, 8.9474872e-04, 9.9688160e-01]],
      dtype=float32)

In [None]:
sub_70 = pd.read_csv('/content/drive/My Drive/Tech4MentalHealth/SampleSubmission.csv')
# Label names in the order of their factorize() codes, i.e. the column order of
# the model outputs (this differs from the submission file's column order).
cols = list(y_fac[1])

In [None]:
# Assign by label name in factorize() order so every score column lands under
# the class it was predicted for.
# NOTE(review): this stores `probs` (the cross-validation average), not `preds`
# from bert_classifier_70 computed above — confirm which one this file should hold.
sub_70[list(y_fac[1])] = probs
sub_70.head()

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.668847,0.321257,0.593575,0.372652
1,03BMGTOK,0.871683,0.363094,0.29112,0.338081
2,03LZVFM6,0.872165,0.366677,0.287864,0.340086
3,0EPULUM5,0.871495,0.361846,0.293154,0.337723
4,0GM4C5GD,0.374958,0.757582,0.339421,0.543732


In [None]:
sub_70.to_csv('bert_cv_2FC_3epochs.csv', index=False)

### Train on full dataset

In [None]:
# Concatenate the train set and the validation set
full_train_data = torch.utils.data.ConcatDataset([train_data, val_data])
full_train_sampler = RandomSampler(full_train_data)
full_train_dataloader = DataLoader(full_train_data, sampler=full_train_sampler, batch_size=32)

# Train the Bert Classifier on the entire training data
# No validation loader is passed, so `train` runs with its default evaluation=False.
#set_seed(2020)
seed_everything(0)
bert_classifier, optimizer, scheduler = initialize_model(full_train_dataloader, lr=1e-4, epochs=3)
train(bert_classifier, optimizer, scheduler, full_train_dataloader, epochs=3)

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.980031   |     -      |     -     |   6.43   
   1    |   40    |   0.456961   |     -      |     -     |   6.11   
   1    |   60    |   0.325661   |     -      |     -     |   6.11   
   1    |   76    |   0.230265   |     -      |     -     |   4.71   
----------------------------------------------------------------------


 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   2    |   20    |   0.128234   |     -      |     -     |   6.42   
   2    |   40    |   0.116622   |     -      |     -     |   6.11   
   2    |   60    |   0.091163   |     -      |     -     |   6.10   
   2    |   76    |   0.070930   |     -      |     -     |   4.72   
------------------------------------------------------------------

## Prediction on test set

In [None]:
# # Run `preprocessing_for_bert` on the test set
# print('Tokenizing data...')
test_inputs, test_masks = preprocessing_for_bert(test_data.text)

# Create the DataLoader for our test set
# SequentialSampler keeps the batches in the row order of test_data.
test_dataset = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=32)

In [None]:
# Compute predicted probabilities on the test set
probs_full = bert_predict(bert_classifier, test_dataloader)

In [None]:
probs_full[:5]

array([[0.41843504, 0.04451568, 0.98972964, 0.39334914],
       [0.999476  , 0.09855366, 0.10817511, 0.05696144],
       [0.99945253, 0.10343424, 0.12913641, 0.05879024],
       [0.9993673 , 0.10774244, 0.13338432, 0.05514483],
       [0.03806748, 0.9970703 , 0.0494557 , 0.5965507 ]], dtype=float32)

In [None]:
sub=pd.read_csv('/content/drive/My Drive/Tech4MentalHealth/SampleSubmission.csv')
sub.head(2)

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0,0,0,0
1,03BMGTOK,0,0,0,0


In [None]:
# Column c of `probs_full` is the class whose factorize() code is c; derive the
# name-to-column mapping from y_fac[1] instead of hard-coding the indices.
for class_idx, class_name in enumerate(y_fac[1]):
    sub[class_name] = probs_full[:, class_idx]

In [None]:
sub.head()

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.418435,0.393349,0.98973,0.044516
1,03BMGTOK,0.999476,0.056961,0.108175,0.098554
2,03LZVFM6,0.999453,0.05879,0.129136,0.103434
3,0EPULUM5,0.999367,0.055145,0.133384,0.107742
4,0GM4C5GD,0.038067,0.596551,0.049456,0.99707


In [None]:
sub.head()

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.398957,0.575294,0.971249,0.030159
1,03BMGTOK,0.99916,0.070718,0.118527,0.103383
2,03LZVFM6,0.998964,0.080317,0.12756,0.13361
3,0EPULUM5,0.998946,0.068862,0.113266,0.126298
4,0GM4C5GD,0.030975,0.845648,0.083442,0.988877


In [None]:
sub.to_csv('bert_full_samp_3epochs_2.csv', index=False)

In [None]:
cols = ['Depression', 'Alcohol', 'Suicide', 'Drugs']

In [None]:
dep = sub[sub['Depression'] > 0.98]
dep.shape

(113, 5)

In [None]:
#test_data.loc[dep.ID]

In [None]:
al = sub[sub['Alcohol'] > 0.94]
al.shape

(54, 5)

In [None]:
suicide = sub[sub['Suicide'] > 0.70]
suicide.shape

(10, 5)

In [None]:
drugs = sub[sub['Drugs'] > 0.90]
drugs.shape

(18, 5)

In [None]:
def pseudoLabel(df_sub, df_test, labels, thresholds=None, random_state=None):
    """Collect confidently predicted test rows as pseudo-labelled training data.

    For every label in ``labels`` that has a threshold, the rows of ``df_sub``
    whose score for that label is strictly above the threshold are looked up in
    ``df_test`` by ID and tagged with the label.

    Args:
        df_sub: Frame with an ``ID`` column and one score column per label.
        df_test: Test frame indexed by ID.
        labels: Label names to process; labels without a threshold are skipped.
        thresholds (dict, optional): Maps label name to its exclusive minimum
            score. Defaults to the cut-offs chosen in the cells above.
        random_state: Seed forwarded to ``DataFrame.sample`` for the final
            shuffle; ``None`` gives a different order on every call.

    Returns:
        pandas.DataFrame: The selected rows of ``df_test`` plus a ``label``
        column, in shuffled order. A row that clears the threshold of several
        labels appears once per label.
    """
    if thresholds is None:
        thresholds = {'Depression': 0.98, 'Alcohol': 0.94, 'Drugs': 0.90, 'Suicide': 0.70}

    parts = []
    for label in labels:
        if label not in thresholds:
            continue
        confident = df_sub[df_sub[label] > thresholds[label]]
        # .assign returns a new frame, so the lookup result is not modified in place.
        part = df_test.loc[confident.ID].assign(label=label)
        parts.append(part.reset_index(drop=True))

    if not parts:
        return pd.DataFrame(columns=list(df_test.columns) + ['label'])

    combined = pd.concat(parts).reset_index(drop=True)
    return combined.sample(frac=1, random_state=random_state)

In [None]:
# ID-indexed copy of the test frame (the ID column is kept) for label-based lookups.
test = test_data.set_index('ID', drop=False)

In [None]:
pseudoLabelData = pseudoLabel(sub, test, cols)

In [None]:
pseudoLabelData.to_csv('pseudoLabelData.csv', index=False)