# Part 1: Data Preprocessing and Vocabulary Construction

## Installation Requirements

In [1]:
# Install required packages
!pip install numpy pandas matplotlib requests tqdm scikit-learn



## Dataset Download Functions

In [2]:
import os
import requests
from tqdm import tqdm

def download_file(url, filename):
    """Download file with progress bar"""
    if os.path.exists(filename):
        print(f"{filename} already exists. Skipping download.")
        return
    
    print(f"Downloading {filename}...")
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    
    with open(filename, 'wb') as f, tqdm(
        desc=filename,
        total=total_size,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as pbar:
        for data in response.iter_content(chunk_size=1024):
            size = f.write(data)
            pbar.update(size)
    print(f"Downloaded {filename}")

# Download Wikipedia dataset (using smaller enwik8 for faster training)
def download_wikipedia():
    """Download Wikipedia enwik8 dataset"""
    url = "http://mattmahoney.net/dc/enwik8.zip"
    download_file(url, "enwik8.zip")
    
    # Unzip
    import zipfile
    with zipfile.ZipFile("enwik8.zip", 'r') as zip_ref:
        zip_ref.extractall(".")
    print("Wikipedia dataset extracted!")

# Download Linux Kernel Code
def download_linux_kernel():
    """Download Linux kernel source code dataset"""
    url = "https://cs.stanford.edu/people/karpathy/char-rnn/linux_input.txt"
    download_file(url, "linux_input.txt")
    print("Linux kernel dataset downloaded!")


## Execute Downloads

In [3]:
# Download both datasets
download_wikipedia()
download_linux_kernel()

enwik8.zip already exists. Skipping download.
Wikipedia dataset extracted!
linux_input.txt already exists. Skipping download.
Linux kernel dataset downloaded!


## Preprocessing Functions

In [4]:
import re
from collections import Counter
import numpy as np

def preprocess_text_natural_language(text, max_chars=10000000):
    """
    Preprocess natural language text (Wikipedia)
    Remove special characters except full stop
    """
    # Take only first max_chars to speed up processing
    text = text[:max_chars]
    
    # Remove special characters except alphanumeric, space, and full stop
    text = re.sub(r'[^a-zA-Z0-9 \.]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'\.+', '.', text)
    
    return text

def preprocess_code(text, max_chars=10000000):
    """
    Preprocess code text (Linux Kernel)
    Keep special characters but split by newlines
    """
    # Take only first max_chars
    text = text[:max_chars]
    
    # Convert to lowercase for consistency
    text = text.lower()
    
    # Replace tabs with spaces
    text = text.replace('\t', ' ')
    
    # Replace multiple spaces with single space
    text = re.sub(' +', ' ', text)
    
    return text

def build_vocabulary(text, is_code=False):
    """
    Build vocabulary from preprocessed text
    Returns: word_to_idx, idx_to_word, word_counts
    """
    if is_code:
        # Split by newlines for code, treat each line as a sentence
        lines = text.split('\n')
        # Split each line into words
        words = []
        for line in lines:
            line = line.strip()
            if line:  # Skip empty lines
                words.extend(line.split())
                words.append('<EOS>')  # End of statement marker
    else:
        # Split by periods for natural language
        sentences = text.split('.')
        words = []
        for sentence in sentences:
            sentence = sentence.strip()
            if sentence:
                words.extend(sentence.split())
                words.append('<EOS>')  # End of sentence marker
    
    # Count word frequencies
    word_counts = Counter(words)
    
    # Create vocabulary (keep all unique words)
    unique_words = ['<PAD>', '<UNK>', '<EOS>'] + sorted(word_counts.keys())
    
    # Remove duplicates
    unique_words = list(dict.fromkeys(unique_words))
    
    word_to_idx = {word: idx for idx, word in enumerate(unique_words)}
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}
    
    return word_to_idx, idx_to_word, word_counts

def create_training_data(text, word_to_idx, context_length=5, is_code=False):
    """
    Create X, y pairs for training
    X: context_length previous words
    y: next word to predict
    """
    if is_code:
        lines = text.split('\n')
        words = []
        for line in lines:
            line = line.strip()
            if line:
                words.extend(line.split())
                words.append('<EOS>')
    else:
        sentences = text.split('.')
        words = []
        for sentence in sentences:
            sentence = sentence.strip()
            if sentence:
                words.extend(sentence.split())
                words.append('<EOS>')
    
    # Convert words to indices
    word_indices = [word_to_idx.get(word, word_to_idx['<UNK>']) for word in words]
    
    X = []
    y = []
    
    # Create context windows
    for i in range(len(word_indices) - context_length):
        context = word_indices[i:i + context_length]
        target = word_indices[i + context_length]
        X.append(context)
        y.append(target)
    
    return np.array(X, dtype=np.int64), np.array(y, dtype=np.int64)


## Process Wikipedia Dataset

In [5]:
print("=" * 80)
print("PROCESSING WIKIPEDIA DATASET (Category I: Natural Language)")
print("=" * 80)

# Read the file
with open('enwik8', 'rb') as f:
    raw_text = f.read().decode('utf-8', errors='ignore')

# Preprocess
print("\nPreprocessing text...")
wiki_text = preprocess_text_natural_language(raw_text, max_chars=5000000)  # Use 5MB for faster processing

# Build vocabulary
print("Building vocabulary...")
wiki_word_to_idx, wiki_idx_to_word, wiki_word_counts = build_vocabulary(wiki_text, is_code=False)

# Print statistics
print(f"\nüìä WIKIPEDIA DATASET STATISTICS:")
print(f"   Vocabulary size: {len(wiki_word_to_idx)}")
print(f"   Total words in corpus: {sum(wiki_word_counts.values())}")

# Get 10 most frequent words (excluding special tokens)
most_common = wiki_word_counts.most_common(13)  # Get extra to exclude special tokens
most_common_filtered = [w for w in most_common if w[0] not in ['<PAD>', '<UNK>', '<EOS>']][:10]
print(f"\n   üîù 10 Most Frequent Words:")
for i, (word, count) in enumerate(most_common_filtered, 1):
    print(f"      {i}. '{word}': {count} occurrences")

# Get 10 least frequent words
least_common = wiki_word_counts.most_common()[-10:]
print(f"\n   üîª 10 Least Frequent Words:")
for i, (word, count) in enumerate(least_common, 1):
    print(f"      {i}. '{word}': {count} occurrences")

# Create training data with context length 5
print("\n\nCreating training data (context_length=5)...")
wiki_X, wiki_y = create_training_data(wiki_text, wiki_word_to_idx, context_length=5, is_code=False)

print(f"   Training samples created: {len(wiki_X)}")
print(f"   X shape: {wiki_X.shape}")
print(f"   y shape: {wiki_y.shape}")

# Show example (FIXED)
print(f"\n   Example training pair:")
context_words = [wiki_idx_to_word[int(idx)] for idx in wiki_X[100].tolist()]
target_word = wiki_idx_to_word[int(wiki_y[100])]
print(f"   Context (X): {context_words}")
print(f"   Target (y):  {target_word}")

# Save processed data
import pickle

wiki_data = {
    'X': wiki_X,
    'y': wiki_y,
    'word_to_idx': wiki_word_to_idx,
    'idx_to_word': wiki_idx_to_word,
    'word_counts': wiki_word_counts,
    'vocab_size': len(wiki_word_to_idx),
    'text_sample': wiki_text[:10000]  # Save a sample for reference
}

with open('wikipedia_processed.pkl', 'wb') as f:
    pickle.dump(wiki_data, f)

print("\n‚úÖ Wikipedia data saved to 'wikipedia_processed.pkl'")


PROCESSING WIKIPEDIA DATASET (Category I: Natural Language)

Preprocessing text...
Building vocabulary...

üìä WIKIPEDIA DATASET STATISTICS:
   Vocabulary size: 78683
   Total words in corpus: 674778

   üîù 10 Most Frequent Words:
      1. 'the': 38601 occurrences
      2. 'of': 24013 occurrences
      3. 'and': 16622 occurrences
      4. 'in': 14576 occurrences
      5. 'to': 11893 occurrences
      6. 'a': 11694 occurrences
      7. 'is': 6792 occurrences
      8. 'as': 5091 occurrences
      9. 'for': 4375 occurrences
      10. 'that': 4347 occurrences

   üîª 10 Least Frequent Words:
      1. 'id1116id': 1 occurrences
      2. 'id15899620id': 1 occurrences
      3. 'timestamp20020909t020201ztimestamp': 1 occurrences
      4. 'commentmerge': 1 occurrences
      5. 'samoatext': 1 occurrences
      6. 'titleaustraliageographytitle': 1 occurrences
      7. 'id1117id': 1 occurrences
      8. 'id15899621id': 1 occurrences
      9. 'timestamp20020804t102336ztimestamp': 1 occurrences
 

## Process Linux Kernel Dataset

In [6]:
print("\n" + "=" * 80)
print("PROCESSING LINUX KERNEL DATASET (Category II: Structured Code)")
print("=" * 80)

# Read the file
with open('linux_input.txt', 'r', encoding='utf-8', errors='ignore') as f:
    raw_code = f.read()

# Preprocess
print("\nPreprocessing code...")
linux_text = preprocess_code(raw_code, max_chars=5000000)  # Use 5MB for faster processing

# Build vocabulary
print("Building vocabulary...")
linux_word_to_idx, linux_idx_to_word, linux_word_counts = build_vocabulary(linux_text, is_code=True)

# Print statistics
print(f"\nüìä LINUX KERNEL DATASET STATISTICS:")
print(f"   Vocabulary size: {len(linux_word_to_idx)}")
print(f"   Total words in corpus: {sum(linux_word_counts.values())}")

# Get 10 most frequent words
most_common = linux_word_counts.most_common(13)
most_common_filtered = [w for w in most_common if w not in ['<PAD>', '<UNK>', '<EOS>']][:10]
print(f"\n   üîù 10 Most Frequent Words:")
for i, (word, count) in enumerate(most_common_filtered, 1):
    print(f"      {i}. '{word}': {count} occurrences")

# Get 10 least frequent words
least_common = linux_word_counts.most_common()[-10:]
print(f"\n   üîª 10 Least Frequent Words:")
for i, (word, count) in enumerate(least_common, 1):
    print(f"      {i}. '{word}': {count} occurrences")

# Create training data
print("\n\nCreating training data (context_length=5)...")
linux_X, linux_y = create_training_data(linux_text, linux_word_to_idx, context_length=5, is_code=True)

print(f"   Training samples created: {len(linux_X)}")
print(f"   X shape: {linux_X.shape}")
print(f"   y shape: {linux_y.shape}")

# Show example
print(f"\n   Example training pair:")
context_words = [linux_idx_to_word[int(idx)] for idx in linux_X[100].tolist()]
target_word = linux_idx_to_word[int(linux_y[100])]
print(f"   Context (X): {context_words}")
print(f"   Target (y):  {target_word}")


# Save processed data
linux_data = {
    'X': linux_X,
    'y': linux_y,
    'word_to_idx': linux_word_to_idx,
    'idx_to_word': linux_idx_to_word,
    'word_counts': linux_word_counts,
    'vocab_size': len(linux_word_to_idx),
    'text_sample': linux_text[:10000]
}

with open('linux_processed.pkl', 'wb') as f:
    pickle.dump(linux_data, f)

print("\n‚úÖ Linux kernel data saved to 'linux_processed.pkl'")



PROCESSING LINUX KERNEL DATASET (Category II: Structured Code)

Preprocessing code...
Building vocabulary...

üìä LINUX KERNEL DATASET STATISTICS:
   Vocabulary size: 90882
   Total words in corpus: 779608

   üîù 10 Most Frequent Words:
      1. '<EOS>': 165787 occurrences
      2. '*': 28256 occurrences
      3. '=': 22173 occurrences
      4. 'if': 15275 occurrences
      5. 'the': 14721 occurrences
      6. '{': 14702 occurrences
      7. '}': 13567 occurrences
      8. '*/': 10962 occurrences
      9. '/*': 9882 occurrences
      10. 'struct': 8870 occurrences

   üîª 10 Least Frequent Words:
      1. 'buffer_b;': 1 occurrences
      2. 'out_dec:': 1 occurrences
      3. 'atomic_dec(&cpu_buffer_a->record_disabled);': 1 occurrences
      4. 'atomic_dec(&cpu_buffer_b->record_disabled);': 1 occurrences
      5. 'export_symbol_gpl(ring_buffer_swap_cpu);': 1 occurrences
      6. 'ring_buffer_alloc_read_page': 1 occurrences
      7. 'ring_buffer_read_page.': 1 occurrences
      8. '

## Verification

In [7]:
print("\n" + "=" * 80)
print("DATA PREPROCESSING COMPLETE!")
print("=" * 80)
print("\nüì¶ Files created:")
print("   1. wikipedia_processed.pkl")
print("   2. linux_processed.pkl")
print("\n‚ú® Ready for model training!")


DATA PREPROCESSING COMPLETE!

üì¶ Files created:
   1. wikipedia_processed.pkl
   2. linux_processed.pkl

‚ú® Ready for model training!
