<a href="https://colab.research.google.com/github/CIS6930-NLP/final_project/blob/main/CIS6930_GPT2_Preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
%%capture
!pip install transformers
!pip install datasets

In [2]:
import tensorflow as tf
import re
import pandas as pd

from transformers import AutoTokenizer
from datasets import load_dataset

# Load Data

In [4]:
# Mount Google Drive so the CSV files and output folders under
# /content/drive are reachable from this Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
# Folder on the mounted Drive that holds the three CSV splits.
base_url = "/content/drive/MyDrive/CIS6930_NLP_Group_Project/Updated Data/"

# Load the train / validation / test CSVs into a single DatasetDict.
dataset = load_dataset(
    'csv',
    data_files={
        split: base_url + csv_name
        for split, csv_name in (
            ("train", "train.csv"),
            ("validation", "valid.csv"),
            ("test", "test.csv"),
        )
    },
)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-1997639f0d244ea4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-1997639f0d244ea4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Drop the columns that are not needed downstream; only 'context', 'prompt'
# and 'utterance' remain afterwards.
dataset = dataset.remove_columns(
    [
        'Unnamed: 0',
        'conv_id',
        'utterance_idx',
        'speaker_idx',
        'selfeval',
        'tags',
    ]
)

In [7]:
# Sanity check: show the remaining columns and row count of the train split.
print(dataset['train'])

Dataset({
    features: ['context', 'prompt', 'utterance'],
    num_rows: 84167
})


# Preprocess

## Emotion Preprocessing

In [8]:
# Emotion preprocessing: fold the 32 fine-grained emotion labels into 8 coarse
# groups. Each entry below is (group label, labels mapped to that group); the
# resulting dict maps every fine-grained label to its group label.
emotions = {
    label: group
    for group, labels in (
        ("excited", ('excited', 'surprised', 'joyful')),
        ('afraid', ('afraid', 'terrified', 'anxious', 'apprehensive')),
        ("disgusted", ('disgusted', 'embarrassed', 'guilty', 'ashamed')),
        ("annoyed", ('angry', 'annoyed', 'jealous', 'furious')),
        ("grateful", ('faithful', 'trusting', 'grateful', 'caring', 'hopeful')),
        ("disappointed", ('sad', 'disappointed', 'devastated', 'lonely', 'nostalgic', 'sentimental')),
        ("impressed", ('proud', 'impressed', 'content')),
        ("prepared", ('anticipating', 'prepared', 'confident')),
    )
    for label in labels
}
# Alias of the same dict object, kept for any code that refers to it by this name.
dicttt = emotions

In [9]:
# Add a "new_context" column holding the grouped emotion for each row's
# "context" label. A label missing from `emotions` would raise KeyError here.
dataset = dataset.map(lambda example: {"new_context": emotions[example["context"]]})

Map:   0%|          | 0/84167 [00:00<?, ? examples/s]

Map:   0%|          | 0/12077 [00:00<?, ? examples/s]

Map:   0%|          | 0/10973 [00:00<?, ? examples/s]

In [10]:
# Sanity check: every split should now list the extra 'new_context' column.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['context', 'prompt', 'utterance', 'new_context'],
        num_rows: 84167
    })
    validation: Dataset({
        features: ['context', 'prompt', 'utterance', 'new_context'],
        num_rows: 12077
    })
    test: Dataset({
        features: ['context', 'prompt', 'utterance', 'new_context'],
        num_rows: 10973
    })
})


## Text Preprocessing

In [11]:
# Ordered (pattern, replacement) rewrites applied by preprocess_sentence.
# Order matters: the specific contractions ("won't", "can't") must run before
# the generic "n't" rule, and "n't" must run before the dropped-g rule.
_CONTRACTION_RULES = (
  (r"i'm", "i am"),
  (r"he's", "he is"),
  (r"she's", "she is"),
  (r"it's", "it is"),
  (r"that's", "that is"),
  (r"what's", "what is"),
  (r"where's", "where is"),
  (r"how's", "how is"),
  (r"\'ll", " will"),
  (r"\'ve", " have"),
  (r"\'re", " are"),
  (r"\'d", " would"),
  (r"won't", "will not"),
  (r"can't", "cannot"),
  (r"n't", " not"),
  # Dropped-g forms such as "goin'" -> "going". The lookahead keeps the rule
  # from firing inside possessives ("john's" must not become "johngs").
  (r"n'(?![a-z])", "ng"),
  (r"'bout", "about"),
)


def preprocess_sentence(sentence):
  """Normalise one piece of text for tokenizer training.

  Steps, in order:
    1. lowercase and strip surrounding whitespace;
    2. put spaces around the punctuation marks ? . ! ,
    3. expand common English contractions (see _CONTRACTION_RULES);
    4. replace every run of characters other than letters and ? . ! , with a
       single space (this also collapses repeated spaces and removes quotes,
       digits and leftover apostrophes);
    5. strip again.

  Args:
    sentence: the raw text (str).

  Returns:
    The cleaned, lowercased string.
  """
  sentence = sentence.lower().strip()
  sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
  for pattern, replacement in _CONTRACTION_RULES:
    sentence = re.sub(pattern, replacement, sentence)
  # A single pass is enough here: spaces and double quotes are themselves
  # outside the kept character set, so they are folded into the same runs.
  sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
  return sentence.strip()

In [12]:
def preprocess_example(example):
    """Clean the four text columns of one dataset row.

    Runs preprocess_sentence over "context", "prompt", "utterance" and
    "new_context" (in that order), writing each result back into `example`,
    and returns the same `example` object.
    """
    for column in ("context", "prompt", "utterance", "new_context"):
        example[column] = preprocess_sentence(example[column])
    return example

In [13]:
# Clean the text columns of every row in all three splits.
dataset = dataset.map(preprocess_example)

Map:   0%|          | 0/84167 [00:00<?, ? examples/s]

Map:   0%|          | 0/12077 [00:00<?, ? examples/s]

Map:   0%|          | 0/10973 [00:00<?, ? examples/s]

In [14]:
# Only the train split is used below to build the tokenizer training corpus.
train_dataset = dataset['train']

## Tokenizer

In [15]:
# Load the pretrained GPT-2 tokenizer; it is the starting point from which
# `new_tokenizer` is trained further down.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [18]:
# Check that a "fast" tokenizer was loaded (this cell displayed True).
# NOTE(review): presumably checked because train_new_from_iterator needs a fast tokenizer — confirm.
tokenizer.is_fast

True

### Batch Iterator

In [16]:
# Number of rows per slice when the train split is read in batches.
batch_size = 1000

In [17]:
def batch_iterator():
    """Yield the 'text' column of `train_dataset` in consecutive slices.

    Each yielded item is the list of 'text' values for up to `batch_size`
    rows. Both `train_dataset` and `batch_size` are read from the notebook's
    global scope when the generator is consumed, so `train_dataset` must
    already carry a 'text' column at that point.
    """
    total_rows = len(train_dataset)
    for start in range(0, total_rows, batch_size):
        chunk = train_dataset[start : start + batch_size]
        yield chunk['text']

### Concatenate Columns

In [19]:
def concat_columns(example):
    """Add a "text" field joining prompt, utterance and context with spaces.

    The row is modified in place and the same object is returned.
    """
    pieces = (example["prompt"], example["utterance"], example["context"])
    example["text"] = " ".join(pieces)
    return example

In [20]:
# Add the combined "text" column to the train split; batch_iterator() reads it.
train_dataset = train_dataset.map(concat_columns)

Map:   0%|          | 0/84167 [00:00<?, ? examples/s]

In [21]:
# Materialise every batch of the "text" column as a list of lists.
# NOTE(review): `all_texts` is not referenced again in the visible cells (the
# tokenizer is trained from batch_iterator() instead) — confirm it is still needed.
all_texts = [train_dataset[i : i + batch_size]["text"] for i in range(0, len(train_dataset), batch_size)]

### Train new tokenizer

In [22]:
# Train a new tokenizer from the GPT-2 one on the batched train text, with a
# vocabulary size of 2**13 (8192).
new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=2**13)

# Save Tokenizer and Dataset

## Save Tokenizer

In [23]:
# Write the newly trained tokenizer's files to the project folder on Drive.
new_tokenizer.save_pretrained('/content/drive/MyDrive/CIS6930_NLP_Group_Project/gpt2_tokenizer')

('/content/drive/MyDrive/CIS6930_NLP_Group_Project/gpt2_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/CIS6930_NLP_Group_Project/gpt2_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/CIS6930_NLP_Group_Project/gpt2_tokenizer/vocab.json',
 '/content/drive/MyDrive/CIS6930_NLP_Group_Project/gpt2_tokenizer/merges.txt',
 '/content/drive/MyDrive/CIS6930_NLP_Group_Project/gpt2_tokenizer/added_tokens.json',
 '/content/drive/MyDrive/CIS6930_NLP_Group_Project/gpt2_tokenizer/tokenizer.json')

## Save Dataset

In [24]:
# Save all three preprocessed splits to the project folder on Drive.
dataset.save_to_disk('/content/drive/MyDrive/CIS6930_NLP_Group_Project/dataset')

Saving the dataset (0/1 shards):   0%|          | 0/84167 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12077 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10973 [00:00<?, ? examples/s]

## Push to hub

In [28]:
%%capture
!pip install huggingface_hub

In [29]:
# Interactive login: the access token is typed at the prompt, so it is not
# stored in the notebook source.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid.
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential he

In [30]:
# Upload the preprocessed train / validation / test splits to the Hub.
dataset.push_to_hub("aegrif/CIS6930_DAAGR_Empathetic_Dialogues")



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/85 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [31]:
# Publish the tokenizer that was retrained on this corpus (`new_tokenizer`),
# not the stock GPT-2 `tokenizer` it was derived from.
new_tokenizer.push_to_hub("aegrif/CIS6930_DAAGR_GPT2_TrainedTokenizer")

CommitInfo(commit_url='https://huggingface.co/aegrif/CIS6930_DAAGR_GPT2_TrainedTokenizer/commit/2a970f3c55dcefd6c269d01257ab6a9b9baa7ef1', commit_message='Upload tokenizer', commit_description='', oid='2a970f3c55dcefd6c269d01257ab6a9b9baa7ef1', pr_url=None, pr_revision=None, pr_num=None)