# Data parsing
This notebook contains examples of how to parse the XML datasets.

## Set up
To use the custom modules defined in `src`, we first make sure that the working directory is the root folder.

In [None]:
import os

# Name of the repository root folder; the imports from `src` in later cells
# only resolve when this folder is the working directory.
root_folder = 'transformers-on-a-diet'

# Remember the starting point so it can be restored if the search fails.
start_dir = os.getcwd()

# Walk up one directory at a time until the working directory ends with the
# root folder name.
while not os.getcwd().endswith(root_folder):
    previous_dir = os.getcwd()
    os.chdir('../')
    # At the filesystem root, '../' leaves the directory unchanged. Without
    # this check the loop would spin forever when the notebook is started
    # outside of the repository.
    if os.getcwd() == previous_dir:
        os.chdir(start_dir)
        raise FileNotFoundError(
            f"Could not find '{root_folder}' in any parent directory of '{start_dir}'."
        )

## Manual usage
You can call the functions of the `data` module manually for the most flexibility:

In [None]:
import numpy as np

from src.data import parse, balance, add_unlabeled, to_dataset

# Parse the training file into a single DataFrame (no validation data)
df = parse('mams', 'data/mams/train.xml')

# Parse the same file with a validation split; this returns two DataFrames
df_train, df_validation = parse('mams', 'data/mams/train.xml', validation_split=0.2)

# Quickly balance the training dataset using the default settings
df_balanced = balance(df_train)

# Or set the balancing settings manually (this replaces the result above)
df_balanced = balance(df_train, balance_by='label', balance_method=np.max)

# First parse a second dataset to use as unlabeled data.
# The 'semeval14' parser must be given the SemEval-2014 file; pointing it at
# the MAMS file would only re-use the MAMS training data as "unlabeled" data.
# NOTE(review): assumes the file lives at data/semeval14/train.xml, mirroring
# the data/mams/ layout - confirm against the repository's data folder.
df_unlabeled = parse('semeval14', 'data/semeval14/train.xml')

# Then add the unlabeled data to the balanced training data
df_combined = add_unlabeled(df_balanced, df_unlabeled, unlabeled_ratio=0.5)

# Create a dataset that can easily be used by TensorFlow
x, y = to_dataset(df_combined, ['positive', 'neutral', 'negative', 'none'])

## Quick usage
Alternatively, use the `Preprocessor` class to do the parsing with less code.

In [None]:
from src.data import Preprocessor

preprocessor = Preprocessor()

# Training data as DataFrames: with `create_dataset=False` and a validation
# split, the call is unpacked into a training and a validation DataFrame.
df_train, df_val = preprocessor.parse_train(
    'mams',
    'train.xml',
    create_dataset=False,
    validation_split=0.2,
    unlabeled_ratio=0.5,
    unlabeled_data=[('semeval14', 'train.xml')],
)

# Training data as a dataset: the same call without `create_dataset=False`
# and without a validation split is unpacked into inputs and targets.
x, y = preprocessor.parse_train(
    'mams',
    'train.xml',
    unlabeled_ratio=0.5,
    unlabeled_data=[('semeval14', 'train.xml')],
)

# Test data, again unpacked into inputs and targets
x, y = preprocessor.parse_test('mams', 'test.xml')