In [2]:
import numpy as np
import pandas as pd
import pyarrow

In [5]:
df = pd.read_parquet('train-00000-of-00001.parquet')

In [6]:
# Remove empty or NaN rows
df_cleaned = df[df['text'].notnull() & (df['text'] != '')]

In [7]:
df_cleaned.shape

(23767, 1)

In [10]:
import multiprocessing as mp
import string
import spacy
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline

In [11]:
nlp = spacy.load("en_core_web_sm")

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, nlp=nlp, n_jobs=1, verbose=True):
        """
        Text preprocessing transformer includes steps:
            1. Punctuation removal
            2. Stop words removal
            3. Lemmatization

        nlp  - spacy model
        n_jobs - parallel jobs to run
        verbose - if True, it will print progress
        """
        self.nlp = nlp
        self.n_jobs = n_jobs
        self.verbose = verbose

    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()

        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(cores)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()

        return data

    def _preprocess_part(self, part):
        # Initialize a counter for tracking processed rows
        processed_rows = 0
        total_rows = len(part)

        # Apply preprocessing with tracking
        result = part.apply(lambda x: self._process_and_track(x, processed_rows, total_rows))

        return result

    def _process_and_track(self, text, processed_rows, total_rows):
        processed_rows += 1
        # if self.verbose:
        #     print(f"Processed {processed_rows} / {total_rows} rows")
        return self._preprocess_text(text)

    def _preprocess_text(self, text):
        doc = self.nlp(text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _remove_punct(self, doc):
        return (t for t in doc if t.text not in string.punctuation)

    def _remove_stop_words(self, doc):
        return (t for t in doc if not t.is_stop)

    def _lemmatize(self, doc):
        return ' '.join(t.lemma_ for t in doc)

# Example usage
text_preprocessor = Pipeline(steps=[
    ('normalize', TextPreprocessor(n_jobs=-1, verbose=True))
])

# Apply to only the first 5 rows
df_cleaned['Text_After_Clean'] = text_preprocessor.fit_transform(df_cleaned['text'])

  return bound(*args, **kwds)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Text_After_Clean'] = text_preprocessor.fit_transform(df_cleaned['text'])


In [12]:
df_cleaned['Text_After_Clean'].shape

(23767,)

In [13]:
import collections

# Tokenize the text column
df_cleaned['Tokenized_Text'] = df_cleaned['Text_After_Clean'].apply(lambda x: x.split())

# Flatten the list of tokens into one big list
all_words = [word for tokens in df_cleaned['Tokenized_Text'] for word in tokens]

# Create a vocabulary (set of unique words)
vocabulary = set(all_words)

# Or create a frequency dictionary (word counts)
word_freq = collections.Counter(all_words)

# Print vocabulary size and most common words (optional)
print(f"Vocabulary size: {len(vocabulary)}")
print("Most common words:", word_freq.most_common(10))

Vocabulary size: 65247
Most common words: [('@-@', 16906), ('–', 3934), ('@.@', 3194), ('time', 2984), ('year', 2940), ('@,@', 2699), ('include', 2671), ('game', 2515), ('1', 2257), ('work', 1997)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Tokenized_Text'] = df_cleaned['Text_After_Clean'].apply(lambda x: x.split())


In [14]:
import re

# Define a function to filter unwanted tokens
def is_valid_token(token):
    # Exclude tokens with special characters, numbers, and single characters
    if re.search(r'[^a-zA-Z]', token):
        return False
    if len(token) <= 1:
        return False
    return True

# Clean the tokenized text
df_cleaned['Cleaned_Tokens'] = df_cleaned['Tokenized_Text'].apply(lambda tokens: [t for t in tokens if is_valid_token(t)])

# Flatten the cleaned list of tokens
cleaned_words = [word for tokens in df_cleaned['Cleaned_Tokens'] for word in tokens]

# Recalculate the vocabulary and word frequency
cleaned_vocabulary = set(cleaned_words)
cleaned_word_freq = collections.Counter(cleaned_words)

# Print the cleaned vocabulary size and most common words
print(f"Cleaned vocabulary size: {len(cleaned_vocabulary)}")
print("Most common cleaned words:", cleaned_word_freq.most_common(10))

Cleaned vocabulary size: 59419
Most common cleaned words: [('time', 2984), ('year', 2940), ('include', 2671), ('game', 2515), ('work', 1997), ('song', 1995), ('play', 1910), ('later', 1910), ('write', 1857), ('season', 1808)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Cleaned_Tokens'] = df_cleaned['Tokenized_Text'].apply(lambda tokens: [t for t in tokens if is_valid_token(t)])


In [15]:
df_cleaned.head()

Unnamed: 0,text,Text_After_Clean,Tokenized_Text,Cleaned_Tokens
1,= Valkyria Chronicles III = \n,Valkyria Chronicles III \n,"[Valkyria, Chronicles, III]","[Valkyria, Chronicles, III]"
3,Senjō no Valkyria 3 : Unrecorded Chronicles (...,Senjō Valkyria 3 Unrecorded Chronicles japan...,"[Senjō, Valkyria, 3, Unrecorded, Chronicles, j...","[Valkyria, Unrecorded, Chronicles, japanese, l..."
4,"The game began development in 2010 , carrying...",game begin development 2010 carry large port...,"[game, begin, development, 2010, carry, large,...","[game, begin, development, carry, large, porti..."
5,"It met with positive sales in Japan , and was...",meet positive sale Japan praise japanese wes...,"[meet, positive, sale, Japan, praise, japanese...","[meet, positive, sale, Japan, praise, japanese..."
7,= = Gameplay = = \n,Gameplay \n,[Gameplay],[Gameplay]


In [16]:
from collections import defaultdict

# Build vocabulary and word-to-index mapping
vocab = defaultdict(int)
for tokens in df_cleaned['Cleaned_Tokens']:
    for word in tokens:
        vocab[word] += 1

# Sort vocabulary by frequency and create word-to-index mapping
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)
word_to_idx = {word: idx for idx, (word, _) in enumerate(sorted_vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

vocab_size = len(word_to_idx)
print(f"Vocabulary Size: {vocab_size}")

Vocabulary Size: 59419


In [17]:
# import pandas as pd

# Save the word_to_idx dictionary as a DataFrame and then to a CSV file
word_to_idx_df = pd.DataFrame(list(word_to_idx.items()), columns=['word', 'index'])
word_to_idx_df.to_csv('vocabulary_main.csv', index=False)

In [18]:
import multiprocessing as mp

# Define a function to generate skip-gram pairs for a single token list
def generate_skipgram_pairs(tokens, window_size=5):
    pairs = []
    for i, center_word in enumerate(tokens):
        context_window = tokens[max(i - window_size, 0): i] + tokens[i + 1: i + window_size + 1]
        for context_word in context_window:
            # Use get() to handle any out-of-vocabulary words gracefully
            center_idx = word_to_idx.get(center_word)
            context_idx = word_to_idx.get(context_word)
            if center_idx is not None and context_idx is not None:
                pairs.append((center_idx, context_idx))
    return pairs

# Token list
token_list = df_cleaned['Cleaned_Tokens'].tolist()

# Use multiprocessing Pool to parallelize the task
with mp.Pool(mp.cpu_count()) as pool:
    skipgram_pairs_list = pool.map(generate_skipgram_pairs, token_list)

# Flatten the list of lists
skipgram_pairs = [pair for sublist in skipgram_pairs_list for pair in sublist]

print(f"Number of Skip-Gram pairs: {len(skipgram_pairs)}")

Number of Skip-Gram pairs: 8269070


In [19]:
import pandas as pd

# Convert the list of pairs into a DataFrame
skipgram_df = pd.DataFrame(skipgram_pairs, columns=['Center_Word_Index', 'Context_Word_Index'])

# Save to CSV
skipgram_df.to_csv('pairs_main.csv', index=False)

In [29]:
import multiprocessing as mp

# Define a function to generate CBOW pairs with fixed context size
def generate_cbow_pairs_fixed_length(tokens, window_size=5):
    pairs = []
    
    # Flatten the list of token lists to treat all tokens as one continuous sequence
    all_tokens = [token for tokens in tokens for token in tokens]
    
    for i, center_word in enumerate(all_tokens):
        # Get the left and right context, ensuring we have exactly window_size
        left_context = all_tokens[max(i - window_size, 0): i]
        right_context = all_tokens[i + 1: i + window_size + 1]

        # Combine left and right context, ensuring total length is fixed
        context_window = left_context[-window_size:] + right_context[:window_size]

        # Ensure the center word is in the vocabulary
        center_idx = word_to_idx.get(center_word)
        if center_idx is not None:
            # Convert context words to indices, filtering out any None values
            context_indices = [word_to_idx.get(word) for word in context_window if word_to_idx.get(word) is not None]

            # If there are fewer context words than expected, pad with a placeholder
            while len(context_indices) < 2 * window_size:
                context_indices.append(word_to_idx.get('<PAD>', 0))  # Using 0 as index for <PAD> if not found

            # Create CBOW pair: (context, center word)
            pairs.append((context_indices[:2 * window_size], center_idx))  # Ensure fixed size of 10 (2 * window_size)
    
    return pairs

# Token list (list of lists)
token_lists = df_cleaned['Cleaned_Tokens'].tolist()

# Use multiprocessing Pool to parallelize the task
with mp.Pool(mp.cpu_count()) as pool:
    cbow_pairs_list = pool.map(generate_cbow_pairs_fixed_length, [token_lists])

# Flatten the list of lists
cbow_pairs = [pair for sublist in cbow_pairs_list for pair in sublist]

print(f"Number of CBOW pairs: {len(cbow_pairs)}")


Number of CBOW pairs: 887772


In [30]:
cbow_pairs

[([3561, 722, 3002, 33806, 3561, 0, 0, 0, 0, 0], 3002),
 ([3002, 722, 3002, 33806, 3561, 782, 0, 0, 0, 0], 3561),
 ([3002, 3561, 3002, 33806, 3561, 782, 308, 0, 0, 0], 722),
 ([3002, 3561, 722, 33806, 3561, 782, 308, 3002, 0, 0], 3002),
 ([3002, 3561, 722, 3002, 3561, 782, 308, 3002, 17582, 0], 33806),
 ([3002, 3561, 722, 3002, 33806, 782, 308, 3002, 17582, 1446], 3561),
 ([3561, 722, 3002, 33806, 3561, 308, 3002, 17582, 1446, 356], 782),
 ([722, 3002, 33806, 3561, 782, 3002, 17582, 1446, 356, 3002], 308),
 ([3002, 33806, 3561, 782, 308, 17582, 1446, 356, 3002, 3561], 3002),
 ([33806, 3561, 782, 308, 3002, 1446, 356, 3002, 3561, 722], 17582),
 ([3561, 782, 308, 3002, 17582, 356, 3002, 3561, 722, 453], 1446),
 ([782, 308, 3002, 17582, 1446, 3002, 3561, 722, 453, 749], 356),
 ([308, 3002, 17582, 1446, 356, 3561, 722, 453, 749, 5266], 3002),
 ([3002, 17582, 1446, 356, 3002, 722, 453, 749, 5266, 116], 3561),
 ([17582, 1446, 356, 3002, 3561, 453, 749, 5266, 116, 6], 722),
 ([1446, 356, 3002

In [31]:
df_cbow_pairs = pd.DataFrame(cbow_pairs, columns=['Context', 'Center_Word'])

# Save the DataFrame to a CSV file
csv_file_path = 'cbow_pairs.csv'
df_cbow_pairs.to_csv(csv_file_path, index=False)

print(f"CBOW pairs saved to {csv_file_path}")

CBOW pairs saved to cbow_pairs.csv
