# Stress Analysis in Social Media

In [4]:
import warnings
import random
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import torch

from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import KFold

In [5]:
import logging
import sys

# Suppress log records of every severity (threshold set to the largest int)
# so library logging does not flood the notebook output.
logging.disable(level=sys.maxsize)


## Data Preprocessing
We'll need to transform our data into a format BERT understands. This involves two steps. First, we create `InputExample` objects using the constructor provided in the BERT library.

- `text_a` is the text we want to classify, which in this case is the `text` column of our DataFrame.
- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.
- `label` is the label for our example, i.e. True, False

In [9]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# NOTE(review): BATCH_SIZE is not wired into `custom_args` below -- training
# uses a per-step batch of 1 with gradient accumulation. Kept because other
# cells may read it.
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3
RESULTS_FILE = "best-results.txt"

warnings.filterwarnings('ignore')

# Seed every RNG in play so repeated runs are comparable.
seed = 1256
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# %%
# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------
path = '../data/'
# path = '/content/Insight_Stress_Analysis/data/'
train = pd.read_csv(path + 'dreaddit-train.csv', encoding="ISO-8859-1")
test = pd.read_csv(path + 'dreaddit-test.csv', encoding="ISO-8859-1")

train['text'] = train['text'].astype(str)
test['text'] = test['text'].astype(str)

# .copy() yields independent frames, so the assignments below modify real
# columns instead of a slice of `train` / `test` (SettingWithCopyWarning).
train_data = train[['text', 'label']].copy()
test_data = test[['text', 'label']].copy()
train_data['text'] = train_data['text'].astype(str)
test_data['text'] = test_data['text'].astype(str)
# Keep labels as integers: the model predicts integer class ids, so string
# labels would never compare equal to the predictions when scoring.
train_data['label'] = train_data['label'].astype(int)
test_data['label'] = test_data['label'].astype(int)

train_data.columns = ["text", "labels"]
test_data.columns = ["text", "labels"]
# Tab-separated copies on disk; the training call below reads the train file
# by path because lazy loading is enabled.
test_data.to_csv('test_l.csv', sep="\t", index=False)
train_data.to_csv('train_l.csv', sep="\t", index=False)

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
custom_args = {'fp16': False,  # not using mixed precision
               'train_batch_size': 1,
               'eval_batch_size': 1,
               'gradient_accumulation_steps': 5,
               'do_lower_case': True,
               'max_seq_length': 256,
               'learning_rate': LEARNING_RATE,  # using lower learning rate
               'overwrite_output_dir': True,  # important for CV
               "use_early_stopping": True,
               "early_stopping_delta": 0.01,
               "early_stopping_metric": "acc",
               "early_stopping_metric_minimize": False,
               "early_stopping_patience": 3,
               'num_train_epochs': NUM_TRAIN_EPOCHS,
               #"wandb_project": "bert-sarcasm",
               #"silent": True,
               "lazy_loading": True,
               "save_model_every_epoch": False,
               "save_eval_checkpoints": False
               }

# The variable keeps its historical name `roberta`, but the checkpoint that is
# actually loaded is bert-base-uncased.
# Fall back to CPU when no GPU is visible instead of failing at construction.
roberta = ClassificationModel("bert", "bert-base-uncased", num_labels=2,
                              args=custom_args,
                              use_cuda=torch.cuda.is_available())
roberta.train_model('train_l.csv')

# ---------------------------------------------------------------------------
# Evaluation
# ---------------------------------------------------------------------------
# predict() takes the raw texts themselves, not a path to a file.
predictions, _ = roberta.predict(test_data['text'].tolist())

# After the rename above the label column is called "labels".
y_true = test_data['labels']
accuracy = accuracy_score(y_true, predictions)
f1 = f1_score(y_true, predictions)

# Open the results file only once there is something to write, and let the
# context manager close it even if the write fails. One line per run.
with open(RESULTS_FILE, "a") as f:
    f.write(f"accuracy={accuracy}\tf1={f1}\n")

RuntimeError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 6.00 GiB total capacity; 4.22 GiB already allocated; 78.56 MiB free; 4.28 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Ask PyTorch to release GPU memory it has cached but is not currently using,
# as a cleanup step after the out-of-memory failure of the training cell.
# NOTE(review): presumably memory still referenced by live objects (e.g. the
# `roberta` model) is not freed by this call -- confirm, and drop those
# references first if usage stays high.
torch.cuda.empty_cache()