# Data Preprocessing

## Imports

In [1]:
import os
import random
import tarfile
import requests
import warnings
import pandas as pd
from datasets import logging as dlog
from transformers import logging as tlog
from transformers import RobertaTokenizer
from datasets import Features, Dataset
from datasets.load import load_dataset, load_dataset_builder, load_from_disk

In [2]:
global_seed = 100
random.seed(global_seed)
tlog.set_verbosity_error()
dlog.set_verbosity_error()
warnings.filterwarnings("ignore")

!hostname
!export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/scratch/as14229/envs_dirs/NLP/lib/

cm001.hpc.nyu.edu


## Download

### DBPedia

#### Download and Load

In [3]:
# Locations shared by the rest of the notebook (relative to the notebook dir).
dataset_name = 'dbpedia_14'
cache_dir = './../.cache'
data_path = './../data'

# Per-dataset output directory.
data_dir = os.path.join(data_path, dataset_name)
os.makedirs(data_dir, exist_ok=True)

dataset = load_dataset(dataset_name, cache_dir=cache_dir, save_infos=True)
# Use the same cache_dir as load_dataset so the builder does not fall back to
# the library's default cache location.
db_info = load_dataset_builder(dataset_name, cache_dir=cache_dir).info
train_set = dataset['train']
test_set = dataset['test']

  0%|          | 0/2 [00:00<?, ?it/s]

### Yahoo Answers

#### Download

In [4]:
dataset_name = 'yahoo_answers_10'
data_dir = os.path.join(data_path, dataset_name)

os.makedirs(data_dir, exist_ok=True)
csv_dir = os.path.join(data_dir, 'csv')

# Download and unpack only when the extracted CSV directory is missing, so the
# cell can be re-run cheaply.
if not os.path.exists(csv_dir):
    # Adjacent string literals instead of a backslash continuation: the
    # continuation embedded the next line's indentation inside the URL, and the
    # HTML-escaped "&amp;" is written here as the plain "&" query separator.
    data_url = (
        "https://docs.google.com/uc?id=0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU"
        "&export=download&confirm=t&uuid=f25e7f13-9597-4e0b-9061-c9fe340eaa8e"
    )
    tar_file = os.path.join(data_dir, 'yahoo_answers_csv.tar.gz')
    # download (fail loudly on an HTTP error instead of saving an error page
    # that would only surface later as a confusing tarfile failure)
    response = requests.get(data_url, allow_redirects=True, timeout=60)
    response.raise_for_status()
    with open(tar_file, 'wb') as file:
        file.write(response.content)
    # extract
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # acceptable only as long as the download source is trusted.
    with tarfile.open(tar_file) as file:
        file.extractall(data_dir)
    # delete
    os.remove(tar_file)
    # rename
    os.rename(os.path.join(data_dir, 'yahoo_answers_csv'), csv_dir)

#### Load

In [26]:
# Column names assigned to the CSV files via `names=` when reading them.
data_header = ['label', 'question_title', 'question_content', 'content']
# Class names; list position is the integer label id.
classes = [
    'Society & Culture', 'Science & Mathematics', 'Health',
    'Education & Reference', 'Computers & Internet', 'Sports',
    'Business & Finance', 'Entertainment & Music',
    'Family & Relationships', 'Politics & Government',
]
feature_dict = {
    'label': {'names': classes, '_type': 'ClassLabel', 'id': None},
    'question_title': {'dtype': 'string', '_type': 'Value', 'id': None},
    'question_content': {'dtype': 'string', '_type': 'Value', 'id': None},
    'content': {'dtype': 'string', '_type': 'Value', 'id': None},
}
features = Features.from_dict(feature_dict)

# Rows without `content` are dropped because that column is what gets tokenized.
train_df = (
    pd.read_csv(os.path.join(csv_dir, 'train.csv'), names=data_header)
    .dropna(axis=0, subset=['content'])
    .reset_index(drop=True)
)
test_df = (
    pd.read_csv(os.path.join(csv_dir, 'test.csv'), names=data_header)
    .dropna(axis=0, subset=['content'])
    .reset_index(drop=True)
)

# Shift labels down by one so they index into `classes` (0-based).
# Item assignment is used instead of attribute assignment, which is the
# unambiguous way to write to a DataFrame column.
train_df['label'] -= 1
test_df['label'] -= 1

# NOTE(review): `cache_dir` was dropped here - Dataset.from_pandas builds an
# in-memory dataset and, per its documented signature, does not accept it.
train_set = Dataset.from_pandas(train_df, features=features)
test_set = Dataset.from_pandas(test_df, features=features)

#### Sort

In [27]:
# Order both splits by class id. The sort column is passed positionally so the
# call does not depend on the keyword's name.
# NOTE(review): the keyword appears to have been renamed from `column` in later
# `datasets` releases - confirm against the installed version.
train_set = train_set.sort('label', keep_in_memory=True)
test_set = test_set.sort('label', keep_in_memory=True)

## Tokenize

### Define

In [37]:
# Tokenizer settings: pretrained checkpoint and the fixed sequence length that
# every example is padded/truncated to.
checkpoint = "roberta-base"
MAX_LENGTH = 128
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
def tokenize_function(examples):
    """Tokenize the ``content`` field of a batch of examples.

    Every sequence is padded to, and truncated at, ``MAX_LENGTH`` tokens using
    the module-level ``tokenizer``.

    Args:
        examples: Mapping with a ``"content"`` entry holding the text(s).

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    texts = examples["content"]
    encoded = tokenizer(
        texts,
        padding="max_length",
        max_length=MAX_LENGTH,
        truncation=True,
    )
    return encoded

### Process


In [None]:
# Tokenize each split in batches, rename the target column to "labels", and
# have the dataset return torch-formatted values.
tokenized_train_set = (
    train_set
    .map(tokenize_function, batched=True)
    .rename_column("label", "labels")
    .with_format("torch")
)

tokenized_test_set = (
    test_set
    .map(tokenize_function, batched=True)
    .rename_column("label", "labels")
    .with_format("torch")
)

### Save

In [33]:
# Persist both tokenized splits under <data_dir>/tokenized/.
tokenized_train_set.save_to_disk(
    data_dir + '/tokenized/train'
)
tokenized_test_set.save_to_disk(
    data_dir + '/tokenized/test'
)

## Test

### Load

In [5]:
# Reload the tokenized splits from <data_dir>/tokenized/.
tokenized_train_set = load_from_disk(
    data_dir + '/tokenized/train'
)
tokenized_test_set = load_from_disk(
    data_dir + '/tokenized/test'
)

### Info

In [27]:
# Inspect the label feature and the overall structure of both splits.
label_feature = tokenized_train_set.features["labels"]
print(label_feature.names)  # All label names
print()
# Mapping from label names to integer ids, built with the public `str2int`
# method instead of reading the private `_str2int` attribute.
print({name: label_feature.str2int(name) for name in label_feature.names})
print()
print(tokenized_train_set)
print(tokenized_test_set)

['Society & Culture', 'Science & Mathematics', 'Health', 'Education & Reference', 'Computers & Internet', 'Sports', 'Business & Finance', 'Entertainment & Music', 'Family & Relationships', 'Politics & Government']

{'Society & Culture': 0, 'Science & Mathematics': 1, 'Health': 2, 'Education & Reference': 3, 'Computers & Internet': 4, 'Sports': 5, 'Business & Finance': 6, 'Entertainment & Music': 7, 'Family & Relationships': 8, 'Politics & Government': 9}

Dataset({
    features: ['labels', 'question_title', 'question_content', 'content', 'input_ids', 'attention_mask'],
    num_rows: 1375421
})
Dataset({
    features: ['labels', 'question_title', 'question_content', 'content', 'input_ids', 'attention_mask'],
    num_rows: 58966
})


### Distribution

In [28]:
def get_class_distribution(data):
    """Count how many examples of each class a split contains.

    Args:
        data: Dataset-like object exposing ``features["labels"].num_classes``
            and a ``"labels"`` column whose value has a ``.numpy()`` method.

    Returns:
        dict mapping every class id in ``range(num_classes)`` to its number of
        occurrences (0 for classes that never appear). A label outside that
        range is reported under its own key rather than being dropped.
    """
    from collections import Counter  # local import keeps the cell self-contained

    # Read the class count from the split that was passed in, not from a
    # notebook-level global, so the function works for any dataset.
    num_classes = data.features["labels"].num_classes
    distribution = dict.fromkeys(range(num_classes), 0)
    # tolist() yields plain Python ints; Counter tallies them in one pass.
    distribution.update(Counter(data['labels'].numpy().tolist()))
    return distribution

# Per-class example counts: train split first, then test split.
for split in (tokenized_train_set, tokenized_test_set):
    print(get_class_distribution(split))

{0: 138700, 1: 139991, 2: 136996, 3: 137633, 4: 134149, 5: 139890, 6: 137916, 7: 137577, 8: 133902, 9: 138667}
{0: 5936, 1: 5999, 2: 5874, 3: 5910, 4: 5736, 5: 5996, 6: 5917, 7: 5897, 8: 5760, 9: 5941}
