# Text Polarity Classification

## Import and preprocessing

In [1]:
# Mount Google Drive so the dataset CSVs under /content/drive can be read
from google.colab import drive
drive.mount('/content/drive')

# NLTK resources used further down in this notebook:
#   stopwords -> stop-word filtering in preprocess_text
#   wordnet   -> WordNetLemmatizer and the wordnet-backed SynonymAug augmenter
#   punkt     -> tokenizer models behind nltk.word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

import pandas as pd
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import tensorflow as tf
import torch

# Data augmentation
!pip install nlpaug
import nlpaug.augmenter.word as naw

# Finetuning

!pip install transformers
!pip install simpletransformers
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from simpletransformers.classification import ClassificationModel, ClassificationArgs

Mounted at /content/drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading hugging

### GPU check (Colab)

In [2]:
# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise,
# and report which one was selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [3]:
# Print the NVIDIA driver/CUDA versions and the current memory/utilisation of the attached GPU
!nvidia-smi

Sun May  7 15:21:43 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    11W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Load training and test data

In [14]:
# Read the training and test CSVs from Drive and discard the leading column of each
# (presumably a saved row index -- confirm against the files). The test split is
# bound to the name `valid` for the rest of the notebook.
train = pd.read_csv('/content/drive/MyDrive/NCU/Data Science and Machine Learning/Assignment 3/Datasets/train.csv')
train = train.drop(columns=train.columns[0])
valid = pd.read_csv('/content/drive/MyDrive/NCU/Data Science and Machine Learning/Assignment 3/Datasets/test.csv')
valid = valid.drop(columns=valid.columns[0])

In [5]:
# Preview the first rows of the training frame as a quick sanity check of the load
train.head()

Unnamed: 0,TEXT,LABEL
0,director dirk shafer and co-writer greg hinton...,0
1,"a charming , quirky and leisurely paced scotti...",1
2,"the price was good , and came quickly though ...",1
3,i was looking forward to this game for a coupl...,0
4,arguably the year 's silliest and most incoher...,0


### Augmented Dataset

In [6]:
# Load the GPT-2 augmented training set from Drive.
aug_train = pd.read_csv('/content/drive/MyDrive/NCU/Data Science and Machine Learning/Assignment 3/Datasets/GPT2_train.csv')
# NOTE(review): unlike train/valid, the leading column is NOT dropped here. The disabled
# line below would not do that either -- it reads from `valid`, not `aug_train`, and would
# overwrite the GPT-2 data with the test frame. Confirm the CSV layout before enabling it.
#aug_train = valid.drop(valid.columns[0], axis=1)

- Shuffle for GPT-2 set only!

In [7]:
# Bind the GPT-2 frame to the name the shuffle cell reads (plain alias, no copy is made)
to_be_shuffled = aug_train

In [8]:
# Shuffle every row and renumber the index from 0. The seed is fixed so that a
# Restart & Run All reproduces the same row order (and therefore the same batches).
shuffled = to_be_shuffled.sample(frac=1, random_state=42).reset_index(drop=True)

- Set GPT2 as the training set for the model

In [9]:
# Make the shuffled GPT-2 frame the active training set.
# NOTE(review): this cell has execution count 9 while the CSV-loading cell has 14, so in the
# saved run `train` was reloaded from train.csv afterwards -- confirm which set was trained on.
train = shuffled

### Preprocessing (stopwords and punctuation removal, lemmatizing)

- Preprocess text

In [15]:
def preprocess_text(text):
    """Normalize one sentence into a cleaned, space-joined token string.

    Steps, in order: lowercase, delete every character listed in
    ``string.punctuation``, tokenize with ``nltk.word_tokenize``, drop NLTK
    English stop words, then lemmatize the survivors with ``WordNetLemmatizer``.

    Args:
        text: The sentence to clean (a ``str``).

    Returns:
        The remaining lemmas joined by single spaces; ``""`` when nothing survives.
    """
    # Materialise the stop-word list once per call and as a set. Evaluating
    # stopwords.words('english') inside the filter would re-read the whole list
    # for every token and scan it linearly.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Punctuation is stripped before tokenizing, so tokens never carry it.
    stripped = text.lower().translate(str.maketrans("", "", string.punctuation))
    tokens = nltk.word_tokenize(stripped)

    # Filter and lemmatize in one pass; the result equals filtering first and
    # lemmatizing afterwards because each token is handled independently.
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return " ".join(lemmas)

- Data augmentation

In [16]:
def augment_data(df, n_augmentations, augmenter=None):
    """Return ``df`` followed by ``n_augmentations`` augmented copies of every row.

    For each input row the ``TEXT`` value is passed to ``augmenter.augment``
    ``n_augmentations`` times; every result becomes a new row that keeps all the
    other column values (e.g. ``LABEL``) of the source row. Augmented rows are
    appended after the original rows and the index is renumbered from 0.

    Args:
        df: Frame with at least a ``TEXT`` column. It is not modified.
        n_augmentations: Number of augmented variants per row; values <= 0
            produce no extra rows.
        augmenter: Optional object exposing ``augment(text)``. Defaults to
            ``naw.SynonymAug(aug_src='wordnet')``, created only when needed.

    Returns:
        A new DataFrame with ``len(df) * (1 + max(n_augmentations, 0))`` rows.
    """
    records = []

    if n_augmentations > 0:
        if augmenter is None:
            augmenter = naw.SynonymAug(aug_src='wordnet')

        for source in df.to_dict('records'):
            for _ in range(n_augmentations):
                augmented_text = augmenter.augment(source['TEXT'])
                # Some augmenters hand back a list of strings rather than one string.
                if isinstance(augmented_text, list):
                    augmented_text = ' '.join(augmented_text)
                # Copy the whole row so every column is carried over by name,
                # whatever the column count or order.
                record = dict(source)
                record['TEXT'] = augmented_text
                records.append(record)

    print(f'Augmentation number: {n_augmentations}')

    if not records:
        # Nothing to append: skip concatenating an empty object-dtype frame,
        # which would degrade the column dtypes.
        return df.reset_index(drop=True)

    # Build the augmented frame in one go instead of growing it row by row.
    augmented = pd.DataFrame(records, columns=df.columns)
    return pd.concat([df, augmented], ignore_index=True)


- Apply augmentation and preprocess

In [17]:
def text_valid_prepr(train_df, valid_df, n_augmentations=0):
    """Augment and clean the training frame, and clean the validation texts.

    The training frame is enlarged with ``augment_data`` and its ``TEXT`` column
    is then run through ``preprocess_text``. The validation texts go through the
    same ``preprocess_text`` so the model is evaluated on inputs normalized the
    same way as the ones it was trained on.

    Args:
        train_df: Training frame with a ``TEXT`` column (passed to ``augment_data``).
        valid_df: Validation frame with a ``TEXT`` column.
        n_augmentations: Augmented variants to create per training row.

    Returns:
        ``(train_aug, valid_texts)``: the augmented, preprocessed training frame
        and a plain ``list`` of preprocessed validation strings.
    """
    # Increase training data
    print(f'Increasing training set {n_augmentations} times...')
    train_aug = augment_data(train_df, n_augmentations)

    # Process text sentences
    print(f'The training set has now {len(train_aug["TEXT"])} rows')
    print('Preprocessing text...')
    train_aug['TEXT'] = train_aug['TEXT'].apply(preprocess_text)

    print('Preprocessing the validation data...')
    # Apply the same cleaning as for training; returning the raw sentences here
    # would leave training and prediction inputs in different forms.
    valid_texts = [preprocess_text(sent) for sent in valid_df['TEXT']]

    print('Preprocessing finished!')

    return train_aug, valid_texts

# Rebinds `train` to the augmented + preprocessed frame and `valid` to a plain list of texts.
# Not idempotent: once `valid` is a list, re-running this cell without reloading the CSVs
# fails at valid_df['TEXT'] inside text_valid_prepr.
train, valid = text_valid_prepr(train, valid, 2)

Increasing training set 2 times...
Augmentation number: 2
The training set has now 6000 rows
Preprocessing text...
Preprocessing the validation data...
Preprocessing finished!


- Shuffle the augmented basic training set

In [18]:
# Shuffle so original and augmented rows are interleaved, then renumber the index.
# The seed is fixed so that a Restart & Run All reproduces the same row order.
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

- Calculating max_length

In [19]:
def max_length(train_df):
    """Return the largest NLTK word-token count among the texts of ``train_df``.

    Args:
        train_df: Frame with a ``TEXT`` column of strings.

    Returns:
        The maximum of ``len(nltk.word_tokenize(text))`` over ``train_df['TEXT']``,
        or 0 when the frame has no rows.
    """
    # Read from the argument, not from a notebook-level `train`, so the result
    # always describes the frame the caller passed in.
    return max(
        (len(nltk.word_tokenize(sent)) for sent in train_df['TEXT']),
        default=0,
    )

# Longest training text measured in NLTK word tokens (word_tokenize), not in model subword tokens
print(max_length(train))

30


## Finetuning RoBERTa/BERT

### Soft prompting

- Single

In [20]:
# Define the single prompt string and the prompt token; train_eval reads both
# as module-level names and forwards them to model.train_model.

# Alternative wording, currently disabled:
#prompt = "Please rate the sentiment of the following sentence as positive or negative:"
prompts = "Your task is to determine if the following sentence has a positive or negative sentiment:"
prompt_token = "<s>"

- Multiple

In [None]:
# Alternative to the single prompt: running this cell rebinds `prompts` from one
# string to a list of seven phrasings. The cell has no execution count in the saved
# notebook, so it was not part of the recorded run.
prompts = [
    "Please rate the sentiment of the following sentence as positive or negative:",
    "Is the sentiment of the following sentence positive or negative?",
    "Does the following sentence express positive or negative sentiment?",
    "Your task is to determine if the following sentence has a positive or negative sentiment.",
    "Is the following sentence conveying a positive or negative tone?",
    "Please classify the sentiment of the following sentence as positive or negative:",
    "Based on the following sentence, is the sentiment positive or negative?"
]

### Change hyperparameters of the transformer

In [21]:
# Training configuration handed to ClassificationModel through its `args` parameter.
trainer_args = dict(
    # Model identity and working directories
    model_type='roberta',
    model_name='roberta-large',
    output_dir='outputs/',
    cache_dir='cache/',
    # Mixed-precision settings
    fp16=True,
    fp16_opt_level='O1',
    # Sequence length and batching
    max_seq_length=64,
    train_batch_size=32,
    eval_batch_size=8,
    gradient_accumulation_steps=1,
    # Optimisation
    num_train_epochs=7,
    weight_decay=0,
    learning_rate=2e-5,
    adam_epsilon=1e-8,  # scientific notation: 1e-8 = 1 * 10^-8
    # NOTE(review): both a warmup ratio and warmup_steps=0 are set; presumably the
    # ratio takes effect when the step count is 0 -- confirm in the library docs.
    warmup_ratio=0.08,
    warmup_steps=0,
    max_grad_norm=1.0,
    # Logging, evaluation and checkpointing
    logging_steps=50,
    evaluate_during_training=False,
    save_steps=2000,
    eval_all_checkpoints=True,
    use_tensorboard=True,
    overwrite_output_dir=True,
    reprocess_input_data=False,
)

### Train

In [22]:
def train_eval(train_df, valid_df, n_submission):
  """Fine-tune roberta-large on ``train_df``, predict ``valid_df`` and write a submission CSV.

  Reads the notebook-level names ``trainer_args``, ``prompts`` and ``prompt_token``.

  Args:
    train_df: Training data, passed unchanged to ``model.train_model``.
    valid_df: Inputs to classify, passed unchanged to ``model.predict``.
    n_submission: Tag interpolated into the output file name
      ``submission_{n_submission}.csv``.

  Returns:
    None. The predictions are written to the CSV (single ``LABEL`` column, with the
    row index and header included); ``raw_outputs`` is discarded.
  """

  # A fresh model is built on every call, configured from the module-level trainer_args.
  model = ClassificationModel(
      'roberta', 
      'roberta-large', 
      args = trainer_args,
      )
  
  print("training the model...")

  # NOTE(review): simpletransformers documents the extra keyword arguments of
  # train_model as additional evaluation metrics (name -> function), so `prompt` and
  # `prompt_token` presumably do not change the model input -- confirm. If prompting
  # is intended, the prompt likely has to be prepended to the texts themselves.
  model.train_model(train_df, prompt = prompts,
      prompt_token = prompt_token)

  #model.train_model(train_df)

  print("model trained!")
  print(" ")
  print("running predictions...")
  
  predictions, raw_outputs = model.predict(valid_df)

  submission_csv = pd.DataFrame(predictions, columns = ['LABEL'])

  # index=True writes the positional row index as the first CSV column.
  submission_csv.to_csv (f'submission_{n_submission}.csv', index=True, header=True)

In [23]:
# Tag for this run; train_eval writes the predictions to submission_<tag>.csv
submission = 'sh_2aug_RB_9_7'
train_eval(train, valid, submission)

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifie

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

training the model...




  0%|          | 0/6000 [00:00<?, ?it/s]

Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

Running Epoch 0 of 7:   0%|          | 0/188 [00:00<?, ?it/s]

Running Epoch 1 of 7:   0%|          | 0/188 [00:00<?, ?it/s]

Running Epoch 2 of 7:   0%|          | 0/188 [00:00<?, ?it/s]

Running Epoch 3 of 7:   0%|          | 0/188 [00:00<?, ?it/s]

Running Epoch 4 of 7:   0%|          | 0/188 [00:00<?, ?it/s]

Running Epoch 5 of 7:   0%|          | 0/188 [00:00<?, ?it/s]

Running Epoch 6 of 7:   0%|          | 0/188 [00:00<?, ?it/s]

model trained!
 
running predictions...


  0%|          | 0/11000 [00:00<?, ?it/s]

  0%|          | 0/1375 [00:00<?, ?it/s]