# GPT2 Augmentation technique

### Libraries

In [1]:
# Mount Google Drive at /content/drive (Colab-only import); the training CSV
# read later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Unpinned install: the run recorded in this notebook resolved to transformers 4.28.1.
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m


In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pandas as pd

import string
import nltk
# NLTK resources needed by preprocess_text further down:
nltk.download('stopwords')  # word list read via stopwords.words('english')
nltk.download('wordnet')  # presumably the data backing WordNetLemmatizer
nltk.download('punkt')  # presumably the models behind nltk.word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Load model

In [None]:
# Fetch the pretrained 'gpt2' checkpoint: tokenizer first, then the LM-head model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

### Load set

In [None]:
# Read the training CSV from the mounted Drive, then build the working frame
# by discarding the file's first column.
TRAIN_CSV_PATH = '/content/drive/MyDrive/NCU/Data Science and Machine Learning/Assignment 3/Datasets/train.csv'
train = pd.read_csv(TRAIN_CSV_PATH)
rwtrain = train.drop(columns=[train.columns[0]])

In [None]:
# Preview the first rows of the working frame before any text preprocessing.
rwtrain.head()

Unnamed: 0,TEXT,LABEL
0,director dirk shafer and co-writer greg hinton...,0
1,"a charming , quirky and leisurely paced scotti...",1
2,"the price was good , and came quickly though ...",1
3,i was looking forward to this game for a coupl...,0
4,arguably the year 's silliest and most incoher...,0


### Preprocessing

In [None]:
def preprocess_text(text):
    """Normalise a piece of text into a lowercase, stop-word-free, lemmatised string.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenise with ``nltk.word_tokenize``, drop English
    stop words, lemmatise the remaining tokens with ``WordNetLemmatizer`` and
    join them back together with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned text as a single space-separated string (empty when no
        token survives the filtering).
    """
    # convert text to lowercase
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # tokenize the text into words
    words = nltk.word_tokenize(text)
    # Build the stop-word set once per call. Evaluating
    # stopwords.words('english') inside the comprehension re-fetched the whole
    # list for every token and then did a linear list scan on top of that.
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # lemmatize the remaining words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # join the words back into a single string
    return " ".join(words)

In [None]:
# Replace TEXT with its cleaned form. This overwrites the column, so running
# the cell a second time preprocesses the already-cleaned text again.
rwtrain['TEXT'] = rwtrain['TEXT'].map(preprocess_text)

In [None]:
# Preview the first rows again to check the effect of preprocess_text on TEXT.
rwtrain.head()

Unnamed: 0,TEXT,LABEL
0,director dirk shafer cowriter greg hinton ride...,0
1,charming quirky leisurely paced scottish comed...,1
2,price good came quickly though prime membership,1
3,looking forward game couple numextend,0
4,arguably year silliest incoherent movie,0


### Augmentation strategy

In [None]:
def generate_synthetic_samples(prompt, num_samples, temperature = 1.1, max_length = 43):
    """Sample GPT-2 outputs seeded with ``prompt`` for use as synthetic text.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        prompt: Seed text. It is truncated to at most ``max_length`` tokens.
        num_samples: Number of sequences to sample.
        temperature: Sampling temperature forwarded to ``model.generate``.
        max_length: Token limit used both for truncating the prompt and as
            the ``max_length`` of the generated sequence. Defaults to 43, the
            value that was previously hard-coded.

    Returns:
        A list of decoded strings with special tokens stripped. The list is
        empty when ``num_samples`` is not positive or when the prompt
        tokenises to nothing.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which cannot be inferred by generate() when pad == eos.
    encoded = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=max_length)
    input_ids = encoded['input_ids']
    print(f'Generating {num_samples} samples...')

    if num_samples <= 0:
        return []
    # An empty prompt (e.g. a text made only of stop words after
    # preprocessing) gives generate() nothing to condition on, so skip it
    # instead of failing in the middle of a long run.
    if input_ids.shape[-1] == 0:
        return []

    # NOTE(review): a prompt that already fills max_length tokens leaves no
    # room for new tokens, so the "synthetic" text would presumably just echo
    # the truncated prompt -- confirm and consider reserving headroom.
    outputs = model.generate(
        input_ids,
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=temperature,
        # One batched call instead of num_samples sequential generate() calls.
        num_return_sequences=num_samples,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

### Generating data

In [None]:
# Recorded CPU runtimes: 97' with 2 augmentations, 207' with 4 augmentations.
# Generate synthetic samples for every row of the preprocessed training frame.

# Sampling is stochastic: fix the torch seed so that re-running this cell on
# the same software stack reproduces the same synthetic text.
SEED = 42
torch.manual_seed(SEED)

augmentation_factor = 2  # Set the desired number of synthetic samples per original sample
augmented_data = []
# Walk the two needed columns directly rather than materialising a Series per
# row with iterrows(); the index is kept only for the progress message.
for index, original_sample, original_label in zip(rwtrain.index, rwtrain['TEXT'], rwtrain['LABEL']):
    print(f'Processing row index: {index}...')

    synthetic_samples = generate_synthetic_samples(original_sample, augmentation_factor)

    # Keep the original row, then add every synthetic variant under the same label.
    augmented_data.append({'TEXT': original_sample, 'LABEL': original_label})
    augmented_data.extend(
        {'TEXT': synthetic_sample, 'LABEL': original_label}
        for synthetic_sample in synthetic_samples
    )


### Constructing augmented dataset

In [None]:
# Turn the accumulated {'TEXT', 'LABEL'} records (originals plus synthetic
# samples) into a single DataFrame.
augmented_dataset = pd.DataFrame.from_records(augmented_data)

In [None]:
# Write the augmented dataset to a CSV file in the working directory,
# leaving the DataFrame index out of the file.
OUTPUT_CSV_PATH = '2T11_GPT_train.csv'
augmented_dataset.to_csv(OUTPUT_CSV_PATH, index=False)