In [1]:
import os
import pandas as pd
import json
from collections import Counter
import re

# Preprocessing configuration
NUM_EXAMPLES = 5_000  # rows sampled from the corpus (5000, 20000, or 100000)
LIMIT_VOCAB = True  # True: truncate the vocabulary by coverage; False: keep every word
VOCAB_COVERAGE = 0.9  # share of all word occurrences the truncated vocabulary must cover
TEST_SPLIT = 0.1  # share of the sampled rows held out as the test set

# Echo the configuration in one call; the "\n" embedded in the last item
# leaves a blank line after the summary.
print(
    f"Number of Examples: {NUM_EXAMPLES}",
    f"Limit Vocabulary: {LIMIT_VOCAB}",
    f"Vocabulary Coverage: {VOCAB_COVERAGE * 100}%\n",
    sep="\n",
)

# Paths
# Raw corpus location; the path is relative, so it is resolved against the
# current working directory at run time.
path = '../raw/wiki-103-train.parquet'

# Load the dataframe
# The steps below read the 'text' column of this frame and, if it is already
# present, reuse a 'clean_text' column instead of recomputing it.
df = pd.read_parquet(path)

# Text normalisation helper used to build the 'clean_text' column
def clean_text(text):
    """Return *text* reduced to ASCII letters and whitespace.

    Digits, punctuation and every other character outside ``a-z``/``A-Z``
    are deleted outright (not replaced by a space), after which leading and
    trailing whitespace is trimmed. *text* must be a string.
    """
    return re.sub(r'[^a-zA-Z\s]', '', text).strip()

# Select the specified number of examples.
# Sampling is done BEFORE cleaning so the regex pass only touches the
# NUM_EXAMPLES rows that are kept rather than the whole corpus. Which rows
# are drawn is determined by the frame's length and random_state, so this
# picks the same rows as sampling after cleaning would.
# NOTE: as a consequence `df` itself is left without a 'clean_text' column;
# only `df_reduced` carries it.
df_reduced = df.sample(n=NUM_EXAMPLES, random_state=42)

# Apply text cleaning if 'clean_text' column doesn't exist
if 'clean_text' not in df_reduced.columns:
    print("Cleaning text...")
    # Non-string cells (e.g. missing values) are mapped to an empty string.
    df_reduced['clean_text'] = df_reduced['text'].apply(
        lambda x: clean_text(x) if isinstance(x, str) else ''
    )

# Whitespace tokenizer shared by the counting and indexing steps
def tokenize(text):
    """Lower-case *text* and split it on whitespace.

    Anything that is not a ``str`` yields an empty list.
    """
    if not isinstance(text, str):
        return []
    return text.lower().split()

print(f"Tokenizing {NUM_EXAMPLES} examples...")

# Tally how often each token occurs across the sampled documents
word_counter = Counter(
    token
    for document in df_reduced['clean_text']
    for token in tokenize(document)
)

# Choose the vocabulary: either every observed word, or only the most
# frequent words needed to reach VOCAB_COVERAGE of all word occurrences
if not LIMIT_VOCAB:
    limited_vocab = list(word_counter)
else:
    print(f"Limiting vocabulary based on {VOCAB_COVERAGE * 100}% coverage...")

    total_word_count = sum(word_counter.values())
    limited_vocab = []
    cumulative_count = 0

    # Walk the words from most to least frequent and stop at the first word
    # whose inclusion brings the covered share up to the target
    for word, count in word_counter.most_common():
        limited_vocab.append(word)
        cumulative_count += count
        if cumulative_count / total_word_count >= VOCAB_COVERAGE:
            break

    print(f"Vocabulary size limited to {len(limited_vocab)} words.")

# Build the two lookup tables. Index 0 is reserved for the unknown token,
# so vocabulary words are numbered from 1 in vocabulary order.
word_to_index = {word: idx for idx, word in enumerate(limited_vocab, start=1)}
word_to_index["<unk>"] = 0  # fallback for words outside the vocabulary
# Inverse mapping: pair each index with the word it was assigned to
index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))

# Map a text onto vocabulary indices
def text_to_indices(text, word_to_index):
    """Return the vocabulary index of every token in *text*.

    *text* is split with ``tokenize``; tokens that are not keys of
    *word_to_index* are given the index stored under ``"<unk>"``.
    """
    indices = []
    for token in tokenize(text):
        indices.append(word_to_index.get(token, word_to_index["<unk>"]))
    return indices

# Encode every cleaned document as a list of vocabulary indices
print("Converting text to indexed sequences...\n")
cleaned_documents = df_reduced['clean_text']
# args= forwards word_to_index as the second positional argument of each call
df_reduced['indexed_text'] = cleaned_documents.apply(text_to_indices, args=(word_to_index,))

# Create a directory for the output based on hyperparameters, so runs with
# different settings do not overwrite each other.
vocab_size_str = str(len(limited_vocab)) if LIMIT_VOCAB else "all"
# Round away floating-point noise before truncating: 0.29 * 100 evaluates to
# 28.999999999999996, which a bare int() would label as 28 percent.
# Genuinely fractional percentages (e.g. 95.5) are still truncated as before.
coverage_percent = int(round(VOCAB_COVERAGE * 100, 6)) if LIMIT_VOCAB else 100
dir_name = f"{NUM_EXAMPLES}_examples_{vocab_size_str}_vocab_{coverage_percent}_percent"
output_dir = f'../preprocessed/{dir_name}'
# exist_ok=True lets the script be re-run with the same settings
os.makedirs(output_dir, exist_ok=True)

# Split the dataset into train and test.
# df_reduced comes from df.sample, so its rows are already in random order
# and a positional cut gives a random split.
# round(..., 6) removes floating-point noise before truncation: with
# TEST_SPLIT = 0.9 and 10 rows, (1 - 0.9) * 10 evaluates to
# 0.9999999999999998, which a bare int() would turn into 0 training rows.
train_size = int(round((1 - TEST_SPLIT) * len(df_reduced), 6))
# .iloc makes it explicit that the cut is by row position, not index label
df_train = df_reduced.iloc[:train_size]
df_test = df_reduced.iloc[train_size:]

# Columns written to both CSV files
export_columns = ['clean_text', 'indexed_text']

# Write the training split
csv_train_path = f'{output_dir}/wiki-103_train.csv'
train_export = df_train[export_columns]
train_export.to_csv(csv_train_path, index=False)
print(f"Train dataset saved to: {csv_train_path}")

# Write the held-out test split
csv_test_path = f'{output_dir}/wiki-103_test.csv'
test_export = df_test[export_columns]
test_export.to_csv(csv_test_path, index=False)
print(f"Test dataset saved to: {csv_test_path}")

# Persist both lookup tables as JSON files in the output directory
word_to_index_path = f'{output_dir}/word_to_index.json'
index_to_word_path = f'{output_dir}/index_to_word.json'

# NOTE: JSON object keys are always strings, so the integer keys of
# index_to_word are written as "0", "1", ...
for mapping, json_path in (
    (word_to_index, word_to_index_path),
    (index_to_word, index_to_word_path),
):
    with open(json_path, 'w') as f:
        json.dump(mapping, f)

print(f"Word-to-index saved to: {word_to_index_path}")
print(f"Index-to-word saved to: {index_to_word_path}")


Number of Examples: 5000
Limit Vocabulary: True
Vocabulary Coverage: 90.0%

Cleaning text...
Tokenizing 5000 examples...
Limiting vocabulary based on 90.0% coverage...
Vocabulary size limited to 8093 words.
Converting text to indexed sequences...

Train dataset saved to: ../preprocessed/5000_examples_8093_vocab_90_percent/wiki-103_train.csv
Test dataset saved to: ../preprocessed/5000_examples_8093_vocab_90_percent/wiki-103_test.csv
Word-to-index saved to: ../preprocessed/5000_examples_8093_vocab_90_percent/word_to_index.json
Index-to-word saved to: ../preprocessed/5000_examples_8093_vocab_90_percent/index_to_word.json
