# Imports

## Install Packages

In [1]:
%%capture
!pip install datasets

## Import packages

In [2]:
import re
import pandas as pd

from datasets import load_dataset, Dataset, Features

## Import and mount google drive

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Load Data

In [73]:
# Directory (on the mounted Google Drive) that holds the three CSV split files
base_url = "/content/drive/MyDrive/CIS6930_NLP_Group_Project/Updated Data/"

# Read the CSV files with load_dataset's 'csv' builder; the data_files keys
# ("train", "validation", "test") become the split names of the result
dataset = load_dataset(
    'csv',
    data_files={
        "train": base_url + "train.csv",
        "validation": base_url + "valid.csv",
        "test": base_url + "test.csv",
    },
)



  0%|          | 0/3 [00:00<?, ?it/s]

In [74]:
# Show, one per line: the column names of every split, then the feature types
# and the column names of the training split
print(
    dataset.column_names,
    dataset['train'].features,
    dataset['train'].column_names,
    sep='\n',
)

{'train': ['Unnamed: 0', 'conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'], 'validation': ['Unnamed: 0', 'conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'], 'test': ['Unnamed: 0', 'conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags']}
{'Unnamed: 0': Value(dtype='int64', id=None), 'conv_id': Value(dtype='string', id=None), 'utterance_idx': Value(dtype='int64', id=None), 'context': Value(dtype='string', id=None), 'prompt': Value(dtype='string', id=None), 'speaker_idx': Value(dtype='int64', id=None), 'utterance': Value(dtype='string', id=None), 'selfeval': Value(dtype='string', id=None), 'tags': Value(dtype='string', id=None)}
['Unnamed: 0', 'conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags']


In [75]:
# Spot-check one raw training row (index 46925) before any preprocessing
print(dataset['train'][46925])

{'Unnamed: 0': 46927, 'conv_id': 'hit:6841_conv:13682', 'utterance_idx': 1, 'context': 'ashamed', 'prompt': 'I am not happy with the way my kids behaved this summer. They were so lazy', 'speaker_idx': 27, 'utterance': '                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                My kids were so lazy this summer', 'selfeval': '5|5|5_5|5|5', 'tags': None}


# Preprocess

## Emotion Preprocessing

In [76]:
# Define a dictionary that groups the 32 fine-grained emotion labels into 8 categories.
# Each pair below is (category, labels that are mapped to that category).
emotions = {
    label: category
    for category, labels in (
        ("excited", ("excited", "surprised", "joyful")),
        ("afraid", ("afraid", "terrified", "anxious", "apprehensive")),
        ("disgusted", ("disgusted", "embarrassed", "guilty", "ashamed")),
        ("annoyed", ("angry", "annoyed", "jealous", "furious")),
        ("grateful", ("faithful", "trusting", "grateful", "caring", "hopeful")),
        ("disappointed", ("sad", "disappointed", "devastated", "lonely", "nostalgic", "sentimental")),
        ("impressed", ("proud", "impressed", "content")),
        ("prepared", ("anticipating", "prepared", "confident")),
    )
    for label in labels
}
# Second name bound to the same dictionary object (not a copy)
dicttt = emotions

In [77]:
# Add a "new_context" column: look up each example's "context" label in the
# "emotions" dictionary and store the category it maps to
dataset = dataset.map(
    lambda row: {"new_context": emotions[row["context"]]}
)



In [78]:
# Show the splits with their columns and row counts after adding "new_context"
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags', 'new_context'],
        num_rows: 84167
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags', 'new_context'],
        num_rows: 12077
    })
    test: Dataset({
        features: ['Unnamed: 0', 'conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags', 'new_context'],
        num_rows: 10973
    })
})


In [79]:
# Define a function that cleans the text by replacing '_comma_' with ',' and '_quote_' with "'"
def clean_text(text):
    """Replace the placeholder tokens in a text with the punctuation they stand for.

    '_comma_' becomes ',' and '_quote_' becomes "'". If ``text`` is a list,
    only its first element is cleaned and returned.
    """
    cleaned = text[0] if isinstance(text, list) else text
    for placeholder, symbol in (('_comma_', ','), ('_quote_', "'")):
        cleaned = cleaned.replace(placeholder, symbol)
    return cleaned

# Define a function that cleans the "prompt" and "utterance" columns of a batch using clean_text()
def clean_columns(batch):
    """Return cleaned versions of the "prompt" and "utterance" columns of a batch.

    ``batch`` is indexed by column name and each column is iterated value by
    value; every value is passed through clean_text(). Only the two cleaned
    columns are returned, in the order "prompt", "utterance".
    """
    return {
        column: [clean_text(value) for value in batch[column]]
        for column in ('prompt', 'utterance')
    }
# Run clean_columns() over every split, one batch of examples at a time
for split in ('train', 'validation', 'test'):
    dataset[split] = dataset[split].map(clean_columns, batched=True)

Map:   0%|          | 0/84167 [00:00<?, ? examples/s]

Map:   0%|          | 0/12077 [00:00<?, ? examples/s]

Map:   0%|          | 0/10973 [00:00<?, ? examples/s]

In [80]:
# Re-inspect the same training row (index 46925) after the emotion mapping and placeholder cleaning
print(dataset['train'][46925])

{'Unnamed: 0': 46927, 'conv_id': 'hit:6841_conv:13682', 'utterance_idx': 1, 'context': 'ashamed', 'prompt': 'I am not happy with the way my kids behaved this summer. They were so lazy', 'speaker_idx': 27, 'utterance': '                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                My kids were so lazy this summer', 'selfeval': '5|5|5_5|5|5', 'tags': None, 'new_context': 'disgusted'}


## Text Preprocessing

In [63]:
# Preprocess each sentence using regular expressions and replace contractions and abbreviations
def preprocess_sentence(sentence):
  """Lowercase a sentence, expand common contractions, and drop unsupported characters.

  Processing order:
    1. Lowercase, trim, and turn the typographic apostrophe (U+2019) into "'"
       so the contraction rules also apply to text written with curly quotes.
    2. Expand contractions ("i'm" -> "i am", "won't" -> "will not", ...).
    3. Restore a dropped final "g" ("goin'" -> "going"), but only for a word
       ending in "in'" that is not followed by another letter. Possessives
       such as "john's" and quoted fragments such as "'n'" are left alone.
    4. Replace every run of characters other than ASCII letters and ? . ! ,
       with a single space, then trim. Digits are removed by this step.

  Args:
    sentence: The text to normalize (a str).

  Returns:
    The normalized, lowercase text.
  """
  sentence = sentence.lower().strip().replace("\u2019", "'")

  # Applied in order: the specific forms ("won't", "can't") must run before the
  # generic "n't" rule, and the "in'" rule must run after "n't" has been expanded.
  rewrite_rules = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"'ll", " will"),
    (r"'ve", " have"),
    (r"'re", " are"),
    (r"'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    # Previously every "n'" became "ng", which turned "john's" into "johngs".
    (r"(?<=[a-z])in'(?![a-z])", "ing"),
    (r"'bout", "about"),
  )
  for pattern, replacement in rewrite_rules:
    sentence = re.sub(pattern, replacement, sentence)

  # One pass both removes unsupported characters and collapses whitespace,
  # because spaces are part of the replaced runs.
  sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)

  return sentence.strip()

In [81]:
# Preprocess each example in the dataset by applying preprocess_sentence() to relevant attributes
def preprocess_example(example):
    """Normalize the text fields of one example with preprocess_sentence().

    The "context", "prompt", "utterance" and "new_context" entries are
    overwritten in place, in that order, and the same ``example`` object is
    returned.
    """
    for field in ("context", "prompt", "utterance", "new_context"):
        example[field] = preprocess_sentence(example[field])
    return example

In [82]:
# Apply preprocess_example() to every example of the dataset (called without
# batched=True, so the function receives one example at a time) and keep the result
dataset = dataset.map(preprocess_example)

Map:   0%|          | 0/84167 [00:00<?, ? examples/s]

Map:   0%|          | 0/12077 [00:00<?, ? examples/s]

Map:   0%|          | 0/10973 [00:00<?, ? examples/s]

In [85]:
# Spot-check one training row (index 29703) after text preprocessing
print(dataset['train'][29703])

{'Unnamed: 0': 29705, 'conv_id': 'hit:4330_conv:8660', 'utterance_idx': 5, 'context': 'sad', 'prompt': 'life can be really not a happy place. it pains me that so many do not love or care and hurt others.', 'speaker_idx': 296, 'utterance': 'thank you! i have tried..it is been a very traumatic year in so many ways..and i did not think it could get any worse a few years ago..wrong! i hear you with pain! please try every treatment you can. diet plays a huge role also chiropractor, nuerofeedback, different herbs! i found eating right, exercising, herbs, vitamins, organic food helped with my pain caused by doctors i had tons of neuropathy, pain and fibro like symptoms in addition to the trauma of deaths, abuse etc. i hope you feel better and you inspire me right now that i do not have it so bad!!!', 'selfeval': '5|5|5_5|5|5', 'tags': None, 'new_context': 'disappointed'}


In [86]:
# Import the Dataset class from the datasets package
from datasets import Dataset

# Define a function that returns the text of the previous utterance when it is the
# immediately preceding turn of the same conversation, and '<|start|>' otherwise.
def add_previous_utterance(example, prev_example):
    """Return the utterance that directly precedes ``example`` in its conversation.

    ``prev_example`` counts as the preceding turn only when it has the same
    'conv_id' and its 'utterance_idx' is exactly one less than that of
    ``example``. In every other case the marker '<|start|>' is returned.
    """
    # Different conversation: nothing precedes this utterance.
    if example['conv_id'] != prev_example['conv_id']:
        return '<|start|>'
    # Same conversation, but not the immediately preceding turn.
    if example['utterance_idx'] != prev_example['utterance_idx'] + 1:
        return '<|start|>'
    return prev_example['utterance']

# Define a function that builds the list of "previous_utterance" values for every example of a dataset split
def add_previous_utterance_column(dataset):
    """Build the "previous_utterance" value for every example of ``dataset``.

    The examples are visited in iteration order. The first one gets the marker
    '<|start|>'; each later one gets whatever add_previous_utterance() returns
    for it and the example visited just before it.

    Returns:
        A list with one entry per example, in the same order.
    """
    previous_utterances = []
    last_seen = None
    for row in dataset:
        previous_utterances.append(
            '<|start|>' if last_seen is None else add_previous_utterance(row, last_seen)
        )
        last_seen = row
    return previous_utterances

# Loop over the train, validation, and test splits of the dataset.
# Both steps are guarded so that re-running this cell on an already processed
# dataset is a no-op instead of raising on a missing or duplicate column.
for split in ['train', 'validation', 'test']:
    split_data = dataset[split]

    # Remove the unwanted columns that are still present in the split
    stale_columns = [
        name
        for name in ['Unnamed: 0', 'speaker_idx', 'selfeval', 'tags']
        if name in split_data.column_names
    ]
    if stale_columns:
        split_data = split_data.remove_columns(stale_columns)

    # Add a "previous_utterance" column built by add_previous_utterance_column(),
    # unless the split already has one
    if 'previous_utterance' not in split_data.column_names:
        prev_utterance_col = add_previous_utterance_column(split_data)
        split_data = split_data.add_column('previous_utterance', prev_utterance_col)

    dataset[split] = split_data

In [87]:
# Spot-check training row 29704 to confirm its "previous_utterance" value
# (the row printed earlier, index 29703, is the preceding turn of the same conversation)
print(dataset['train'][29704])

{'conv_id': 'hit:4330_conv:8660', 'utterance_idx': 6, 'context': 'sad', 'prompt': 'life can be really not a happy place. it pains me that so many do not love or care and hurt others.', 'utterance': 'that is exactly what i have been going through along with horrible doctors that did nothing for me. i just went high nutrient vegan months ago and am improving my leaps and bounds. i follow dr. joel furhman. you can keep going and your journey is giving you compassion for others that are going through horrible things. take heart and i hope you feel better.', 'new_context': 'disappointed', 'previous_utterance': 'thank you! i have tried..it is been a very traumatic year in so many ways..and i did not think it could get any worse a few years ago..wrong! i hear you with pain! please try every treatment you can. diet plays a huge role also chiropractor, nuerofeedback, different herbs! i found eating right, exercising, herbs, vitamins, organic food helped with my pain caused by doctors i had tons

# Save Dataset

In [88]:
# Save the preprocessed dataset (all splits) to a directory on the mounted Google Drive
dataset.save_to_disk('/content/drive/MyDrive/CIS6930_NLP_Group_Project/Final Code Files/dataset')

Saving the dataset (0/1 shards):   0%|          | 0/84167 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12077 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10973 [00:00<?, ? examples/s]

## Push to Hugging Face Hub
*Requires a Hugging Face access token.*

In [None]:
%%capture
!pip install huggingface_hub

In [61]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [89]:
# Upload the preprocessed dataset to the Hugging Face Hub repository named below
# NOTE(review): presumably authenticates with the token saved by `huggingface-cli login` — confirm
dataset.push_to_hub("aegrif/CIS6930_DAAGR_Empathetic_Dialogues")



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/85 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/750 [00:00<?, ?B/s]