In [None]:
import os.path
from typing import *
import datasets
from datasets import Dataset
from tqdm.auto import tqdm
import project_paths as pp

In [None]:
def build_IMDB_sentiment_analysis_dataset(dataset_type: Literal['train', 'test'] = 'train') -> int:
    '''Build a Hugging Face Dataset from the IMDB sentiment analysis dataset.

    Reads every ``.txt`` review file found directly inside the ``pos`` and ``neg``
    sub-folders of the requested split, collects them into a Hugging Face Dataset
    with 'text' and 'label' columns (the label is the string 'pos' or 'neg'), and
    saves that dataset into the split folder itself.

    Files are read in sorted file-name order ('pos' folder first, then 'neg') so the
    row order of the saved dataset is reproducible across runs and file systems.

    Args:
        dataset_type: Either 'train' or 'test' to specify which dataset split to build

    Returns:
        int: The number of examples in the created dataset

    Raises:
        ValueError: If dataset_type is not 'train' or 'test'
    '''
    # Input validation: reject anything but the two known splits before touching disk.
    if dataset_type not in ('train', 'test'):
        raise ValueError('Invalid value encountered for "dataset_type"')
    dataset_type_folder_path = os.path.join(pp.aclImdb_dataset_folder_path, dataset_type)

    # Reviews are stored as individual text files in one folder per sentiment label.
    label_and_respective_folder_path = {
        'pos': os.path.join(dataset_type_folder_path, 'pos'),
        'neg': os.path.join(dataset_type_folder_path, 'neg')
    }
    texts = []
    labels = []

    for label, folder_path in label_and_respective_folder_path.items():
        # os.listdir() returns entries in arbitrary order; sort for a deterministic dataset.
        item_names = sorted(os.listdir(folder_path))
        # Context manager guarantees the bar is closed even if reading a file fails.
        with tqdm(item_names, desc=f'{dataset_type}/{label}', dynamic_ncols=True) as progress_bar:
            for item_name in progress_bar:
                # Skip anything that is not a review text file.
                if os.path.splitext(item_name)[1] != '.txt':
                    continue
                with open(os.path.join(folder_path, item_name), encoding='utf-8') as file:
                    texts.append(file.read())
                labels.append(label)

    dataset = Dataset.from_dict({'text': texts, 'label': labels})
    # The serialized dataset is written next to the raw 'pos'/'neg' folders of the split.
    dataset.save_to_disk(dataset_type_folder_path)

    return len(dataset)

In [None]:
# Build and save both splits. Only the last expression of a cell is rendered, so the
# per-split example counts are collected and displayed together instead of dropping
# the 'train' count.
split_sizes = {
    split: build_IMDB_sentiment_analysis_dataset(dataset_type=split)
    for split in ('train', 'test')
}
split_sizes