# Imports

## Install Packages

In [None]:
%%capture
!pip install transformers
!pip install datasets

## Import packages

In [None]:
import tensorflow as tf
import re
import pandas as pd

from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, Features

## Import and mount google drive

In [None]:
# Mount Google Drive at /content/drive; the trained tokenizer is saved under
# this mount point further down in the notebook.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Load Data

In [None]:
# Fetch the Empathetic Dialogues dataset (with all of its splits) from the Hugging Face hub
dataset = load_dataset("aegrif/CIS6930_DAAGR_Empathetic_Dialogues")

Downloading readme:   0%|          | 0.00/750 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/aegrif___parquet/aegrif--CIS6930_DAAGR_Empathetic_Dialogues-5ad2fb5e4e0762af/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/10973 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12077 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/84167 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/aegrif___parquet/aegrif--CIS6930_DAAGR_Empathetic_Dialogues-5ad2fb5e4e0762af/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Keep a handle on the "train" split; the tokenizer below is trained on this split only
train_dataset = dataset["train"]

# Tokenizer (GPT-2)

In [None]:
# Instantiate a tokenizer using the AutoTokenizer class from transformers.
# This loads the pretrained "gpt2" tokenizer; train_new_from_iterator() is
# called on it later to derive the new tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Check whether the tokenizer is a "fast" tokenizer; as the last expression of
# the cell, the flag is displayed as the cell output.
# NOTE(review): train_new_from_iterator() used below is presumably only
# available on fast tokenizers — confirm against the transformers docs.
tokenizer.is_fast

True

### Batch Iterator

In [None]:
# Number of dataset rows taken per slice when batching the training split
# (read by batch_iterator() and by the all_texts cell).
batch_size = 1000

In [None]:
# Define a generator to iterate over a dataset column in batches
def batch_iterator(dataset=None, size=None, column="text"):
    """Yield consecutive batches of one column of a dataset.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping from column name to the values of that slice.
            Defaults to the notebook-level ``train_dataset``.
        size: Number of rows per batch. Defaults to the notebook-level
            ``batch_size``.
        column: Name of the column whose values are yielded.

    Yields:
        The ``column`` values of each successive slice of ``size`` rows; the
        last batch may be shorter.

    Raises:
        ValueError: If the batch size is not a positive integer (raised when
            the generator is first advanced).
    """
    # Resolve the globals lazily so that calling batch_iterator() with no
    # arguments keeps working exactly as before.
    if dataset is None:
        dataset = train_dataset
    if size is None:
        size = batch_size
    # A negative step would make range() empty and silently yield nothing,
    # which would train a tokenizer on no data; fail loudly instead.
    if size <= 0:
        raise ValueError("batch size must be a positive integer")
    for start in range(0, len(dataset), size):
        yield dataset[start : start + size][column]

### Concatenate Columns

In [None]:
# Build the "text" field of an example by joining, in this order, its "prompt",
# "previous_utterance", "context" and "utterance" columns with single spaces
def concat_columns(example):
    """Add a space-joined "text" entry to ``example`` and return it.

    The example is modified in place: ``example["text"]`` is set to the values
    of "prompt", "previous_utterance", "context" and "utterance" separated by
    one space each. The same object is returned.
    """
    source_columns = ("prompt", "previous_utterance", "context", "utterance")
    example["text"] = " ".join(example[name] for name in source_columns)
    return example

In [None]:
# Apply concat_columns() to every example of the training split.
# The result of map() is bound back to train_dataset, so from here on the
# split carries the "text" column that batch_iterator() reads.
train_dataset = train_dataset.map(concat_columns)

Map:   0%|          | 0/84167 [00:00<?, ? examples/s]

In [None]:
# Materialise the "text" column as a list of batches, one entry per slice of
# batch_size rows.
# NOTE(review): no later cell visible here reads all_texts; the training cell
# consumes batch_iterator() instead.
all_texts = [
    train_dataset[start : start + batch_size]["text"]
    for start in range(0, len(train_dataset), batch_size)
]

# Train

In [None]:
# Train a new tokenizer from the pretrained one, feeding it the text batches
# yielded by batch_iterator(); the target vocabulary size is 2**13 = 8192
new_tokenizer = tokenizer.train_new_from_iterator(
    batch_iterator(),
    vocab_size=8192,
)

# Save Tokenizer

## Local

In [None]:
# Write the trained tokenizer's files to the project folder on the mounted
# Google Drive; the returned tuple of written paths is shown as cell output
new_tokenizer.save_pretrained(
    "/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/GPT2_Tokenizer"
)

('/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/GPT2_Tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/GPT2_Tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/GPT2_Tokenizer/vocab.json',
 '/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/GPT2_Tokenizer/merges.txt',
 '/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/GPT2_Tokenizer/added_tokens.json',
 '/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/GPT2_Tokenizer/tokenizer.json')

## Save Dataset

## Push to Huggingface hub
*Requires a Hugging Face access token (entered in the login cell below).*

In [None]:
%%capture
!pip install huggingface_hub

In [None]:
# Log in to the Hugging Face hub from the shell; the CLI prompts interactively
# for an access token, so nothing secret is stored in the notebook source.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Publish the tokenizer that was trained on the dialogue data (new_tokenizer).
# `tokenizer` is still the stock pretrained "gpt2" tokenizer, so pushing it
# would upload the unmodified GPT-2 vocabulary to the "TrainedTokenizer" repo.
new_tokenizer.push_to_hub("aegrif/CIS6930_DAAGR_GPT2_TrainedTokenizer")

CommitInfo(commit_url='https://huggingface.co/aegrif/CIS6930_DAAGR_GPT2_TrainedTokenizer/commit/36df0cf1499915055d23f783c666d420af889dd8', commit_message='Upload tokenizer', commit_description='', oid='36df0cf1499915055d23f783c666d420af889dd8', pr_url=None, pr_revision=None, pr_num=None)