# Data Preprocessing

## Imports

In [1]:
import os
import random
import tarfile
import requests
import warnings
import pandas as pd
from datasets import logging as dlog
from transformers import logging as tlog
from transformers import RobertaTokenizer
from datasets import Features, Dataset
from datasets.load import load_dataset, load_dataset_builder, load_from_disk

In [2]:
data_path = './../data'
cache_path = './../.cache'

global_seed = 100
random.seed(global_seed)
tlog.set_verbosity_error()
dlog.set_verbosity_error()
warnings.filterwarnings("ignore")

!hostname
!export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/scratch/as14229/envs_dirs/NLP/lib/

gr024.hpc.nyu.edu


## <br>

---

## Process

### Tokenizer Setup


In [9]:
MAX_LENGTH=128
checkpoint = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)

content_key = 'content'

def tokenize_function(examples):
    return tokenizer(examples[content_key], padding="max_length", max_length=MAX_LENGTH ,truncation=True)

def tokenize_dataset(data_dir,train_set,test_set,key='content'):
    # Content key
    global content_key
    content_key = key

    # Sort
    train_set = train_set.sort(column='label')
    test_set = test_set.sort(column='label')
    
    # Tokenize
    tokenized_train_set = train_set.map(tokenize_function, batched=True)
    tokenized_test_set = test_set.map(tokenize_function, batched=True)

    # Format
    tokenized_train_set = tokenized_train_set.rename_column("label","labels")
    tokenized_test_set = tokenized_test_set.rename_column("label","labels")
    
    if content_key!='content':
        tokenized_train_set = tokenized_train_set.rename_column(content_key,"content")
        tokenized_test_set = tokenized_test_set.rename_column(content_key,"content")
    
    tokenized_train_set = tokenized_train_set.with_format("torch")
    tokenized_test_set = tokenized_test_set.with_format("torch")

    # Save
    tokenized_train_set.save_to_disk(data_dir+'/tokenized/train')
    tokenized_test_set.save_to_disk(data_dir+'/tokenized/test')

### DBpedia

#### Download and Load

In [7]:
dataset_name = 'dbpedia_14'

data_dir = os.path.join(data_path, dataset_name)
os.makedirs(data_dir,exist_ok=True)

dataset = load_dataset(dataset_name, cache_dir=cache_path, save_infos=True)
db_info = load_dataset_builder(dataset_name).info

train_set = dataset['train']
test_set = dataset['test']

  0%|          | 0/2 [00:00<?, ?it/s]

#### Tokenize

In [None]:
tokenize_dataset(data_dir,train_set,test_set)

### Yahoo Answers

#### Download

In [8]:
dataset_name = 'yahoo_answers'

data_dir = os.path.join(data_path, dataset_name)

os.makedirs(data_dir,exist_ok=True)
csv_dir = os.path.join(data_dir,'csv')

if not os.path.exists(csv_dir):
    data_url = "https://docs.google.com/uc?id=0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU&amp\
        ;export=download&confirm=t&uuid=f25e7f13-9597-4e0b-9061-c9fe340eaa8e"
    tar_file = os.path.join(data_dir,'yahoo_answers_csv.tar.gz')
    # download
    with open(tar_file,'wb') as file:
        file.write(requests.get(data_url,allow_redirects=True).content)
    # extract
    with tarfile.open(tar_file) as file:
        file.extractall(data_dir)
    # delete
    os.remove(tar_file)
    # rename
    os.rename(os.path.join(data_dir,'yahoo_answers_csv'),csv_dir)

#### Load

In [10]:
data_header = ['label', 'question_title', 'question_content', 'content']
classes = ['Society & Culture', 'Science & Mathematics', 'Health', \
    'Education & Reference', 'Computers & Internet', 'Sports', 'Business & Finance', 'Entertainment & Music',\
    'Family & Relationships', 'Politics & Government']
feature_dict = {'label': {'names': classes, '_type':'ClassLabel', 'id':None},
    'question_title': {'dtype': 'string', '_type':'Value', 'id':None},
    'question_content': {'dtype': 'string', '_type':'Value', 'id':None},
    'content': {'dtype': 'string', '_type':'Value', 'id':None}
    }
features = Features.from_dict(feature_dict)

train_df = pd.read_csv(os.path.join(csv_dir,'train.csv'),names=data_header).dropna(axis=0,subset=['content']).reset_index(drop=True)
test_df = pd.read_csv(os.path.join(csv_dir,'test.csv'),names=data_header).dropna(axis=0,subset=['content']).reset_index(drop=True)

train_df.label -= 1
test_df.label -= 1

train_set = Dataset.from_pandas(train_df,features=features)
test_set = Dataset.from_pandas(test_df,features=features)

#### Tokenize

In [None]:
tokenize_dataset(data_dir,train_set,test_set)

### AG News

#### Download and Load

In [10]:
dataset_name = 'ag_news'

data_dir = os.path.join(data_path, dataset_name)
os.makedirs(data_dir,exist_ok=True)

dataset = load_dataset(dataset_name, cache_dir=cache_path, save_infos=True)
db_info = load_dataset_builder(dataset_name).info

train_set = dataset['train']
test_set = dataset['test']

  0%|          | 0/2 [00:00<?, ?it/s]

#### Tokenize

In [11]:
tokenize_dataset(data_dir,train_set,test_set,key='text')

  0%|          | 0/120 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

### Yelp Reviews

#### Download and Load

In [12]:
dataset_name = 'yelp_review_full'

data_dir = os.path.join(data_path, dataset_name)
os.makedirs(data_dir,exist_ok=True)

dataset = load_dataset(dataset_name, cache_dir=cache_path, save_infos=True)
db_info = load_dataset_builder(dataset_name).info
train_set = dataset['train']
test_set = dataset['test']

  0%|          | 0/2 [00:00<?, ?it/s]

#### Tokenize

In [13]:
tokenize_dataset(data_dir,train_set,test_set,key='text')

  0%|          | 0/650 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

## <br>

---

## Test

### Load

In [3]:
dataset_name = 'yahoo_answers'
data_dir = os.path.join(data_path,dataset_name)

tokenized_train_set = load_from_disk(data_dir+'/tokenized/train')
tokenized_test_set  = load_from_disk(data_dir+'/tokenized/test')

In [6]:
tokenized_train_set.info

DatasetInfo(description='', citation='', homepage='', license='', features={'labels': ClassLabel(names=['Society & Culture', 'Science & Mathematics', 'Health', 'Education & Reference', 'Computers & Internet', 'Sports', 'Business & Finance', 'Entertainment & Music', 'Family & Relationships', 'Politics & Government'], id=None), 'question_title': Value(dtype='string', id=None), 'question_content': Value(dtype='string', id=None), 'content': Value(dtype='string', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)

### Info

In [4]:
print('Dataset:',dataset_name)
print('\n',tokenized_train_set.features["labels"].names) # All label names ` 
print()
print(tokenized_train_set.features["labels"]._str2int) # Mapping from labels to integer
print()
print(tokenized_test_set)
print(tokenized_train_set)

Dataset: yahoo_answers

 ['Society & Culture', 'Science & Mathematics', 'Health', 'Education & Reference', 'Computers & Internet', 'Sports', 'Business & Finance', 'Entertainment & Music', 'Family & Relationships', 'Politics & Government']

{'Society & Culture': 0, 'Science & Mathematics': 1, 'Health': 2, 'Education & Reference': 3, 'Computers & Internet': 4, 'Sports': 5, 'Business & Finance': 6, 'Entertainment & Music': 7, 'Family & Relationships': 8, 'Politics & Government': 9}

Dataset({
    features: ['labels', 'question_title', 'question_content', 'content', 'input_ids', 'attention_mask'],
    num_rows: 58966
})
Dataset({
    features: ['labels', 'question_title', 'question_content', 'content', 'input_ids', 'attention_mask'],
    num_rows: 1375421
})


### Distribution

In [5]:
def get_class_distribution(data):
    classes = [*range(tokenized_train_set.features["labels"].num_classes)]
    distribution = {clss:0 for clss in sorted(classes)}
    for val in data['labels'].numpy():
        distribution[val] += 1 
    return distribution

print(get_class_distribution(tokenized_test_set))
print(get_class_distribution(tokenized_train_set))

{0: 5936, 1: 5999, 2: 5874, 3: 5910, 4: 5736, 5: 5996, 6: 5917, 7: 5897, 8: 5760, 9: 5941}
{0: 138700, 1: 139991, 2: 136996, 3: 137633, 4: 134149, 5: 139890, 6: 137916, 7: 137577, 8: 133902, 9: 138667}


## <br>

---

## Debug

In [3]:
import torch
from datasets import concatenate_datasets

In [5]:
columns_to_keep = {'input_ids', 'attention_mask'}

data_dir = './../data'
other_datasets = ['yelp_review_full','ag_news']       

datasets = []
for dataset_name in other_datasets:
    dataset = load_from_disk(os.path.join(data_dir,dataset_name,'tokenized/train'))
    labels = dataset['labels']
    content = dataset['content']
    dataset = dataset.remove_columns(list(set(dataset.features.keys())-columns_to_keep))
    dataset = dataset.add_column('labels', labels.tolist())
    dataset = dataset.add_column('content', content)
    datasets.append(dataset)

other_data = concatenate_datasets(datasets)

In [6]:
other_data.column_names

['input_ids', 'attention_mask', 'labels', 'content']

In [12]:
print(datasets[0].shape[0] + datasets[1].shape[0], other_data.shape[0])

770000 770000


In [15]:
targets = torch.tensor([[1],[2],[3],[4]])
preds = torch.tensor([1,3,3,5])

In [16]:
wrong_idx = (preds != targets.view_as(preds)).nonzero().flatten()

In [20]:
dataset = load_from_disk(os.path.join('./../data/','yahoo_answers','tokenized/train'))

In [13]:
(preds != targets.view_as(preds)).logical_not()

tensor([ True, False,  True, False])

In [14]:
(preds != targets.view_as(preds))

tensor([False,  True, False,  True])

In [21]:
dataset.select(wrong_idx)

Dataset({
    features: ['labels', 'question_title', 'question_content', 'content', 'input_ids', 'attention_mask'],
    num_rows: 2
})

In [29]:
dataset.select([150000,0]).sort(column='labels')[:]

{'labels': tensor([0, 1]),
 'question_title': ['Which race is considered the Majority now? Blacks? Hispanics? or Whites?',
  'Try to solve this economical problem....and math also?'],
 'question_content': [None,
  'If we have corporation that produces some product and we are given informations on the size of costs depending of production level\\n\\nProduction (x)   1   ;   4   ;   5   ;   7   ; 10\\nCosts              2.5  ;  6.5 ;  7.8 ; 10.6 ; 15.6\\n\\n1) Define the function of costs if it has the form of\\n a x^2 + bx + c\\n\\n2) Find the production level for which we have the minimal average costs.\\n\\n3)  Define elasticity (E)  of total costs   and  elasticity of average costs \\n\\nTry, it will help me a lot'],
 'content': ['According to the 2005 CIA World Fact (an official statistics agency), America racial composition is:\\n\\n    * White\\n          o 81.7%, or 241 million (includes those who declared themselves as white-Hispanics; those of Middle Eastern and North African d

In [30]:
correct_samples_cw_idx = {0:[1,2], 1:[1,2], 2:[1,2]}
wrong_samples_cw_idx = {0:[1,2], 1:[1,2], 2:[1,2]}

In [31]:
{cr_key:len(cr_val)/(len(cr_val)+len(wr_val)) for (cr_key, cr_val),(_, wr_val) in zip(correct_samples_cw_idx.items(), wrong_samples_cw_idx.items())}

{0: 0.5, 1: 0.5, 2: 0.5}

In [35]:
[*range(10)]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [9]:
open('./test.log','w')

<_io.TextIOWrapper name='./test.log' mode='w' encoding='UTF-8'>

In [38]:
a = [1,2,3]
torch.save(a,'./a.pt')

In [39]:
b = torch.load('./a.pt')

In [40]:
b

[1, 2, 3]

In [11]:
import logging
from tqdm import trange
from time import sleep
from tqdm.contrib.logging import logging_redirect_tqdm

LOG = logging.getLogger(__name__)

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    with logging_redirect_tqdm():
        for i in trange(9):
            sleep(1)
            LOG.info("console logging redirected to `tqdm.write()`")

INFO:__main__:console logging redirected to `tqdm.write()`
INFO:__main__:console logging redirected to `tqdm.write()`
INFO:__main__:console logging redirected to `tqdm.write()`
INFO:__main__:console logging redirected to `tqdm.write()`
INFO:__main__:console logging redirected to `tqdm.write()`
INFO:__main__:console logging redirected to `tqdm.write()`
INFO:__main__:console logging redirected to `tqdm.write()`
INFO:__main__:console logging redirected to `tqdm.write()`
INFO:__main__:console logging redirected to `tqdm.write()`
100%|██████████| 9/9 [00:09<00:00,  1.01s/it]
