In [None]:
def print_allowed_characters(results):
    """Print the allowed letters, digits and symbols of every dataset.

    Args:
        results: Mapping of dataset name to a result dict whose ``'config'``
            entry holds ``'allowed_letters'``, ``'allowed_digits'`` and
            ``'allowed_symbols'``.

    The total is taken from ``config['total_whitelist_chars']`` when that key
    is stored. The config assembled by ``main_processing`` does not store it,
    so the total falls back to the combined size of the three character sets
    instead of raising ``KeyError``.
    """
    print("\n=== ALLOWED CHARACTERS SUMMARY ===")
    for dataset_name, data in results.items():
        config = data['config']
        letters = config['allowed_letters']
        digits = config['allowed_digits']
        symbols = config['allowed_symbols']

        # Prefer a stored total; otherwise derive it from the three sets.
        total = config.get('total_whitelist_chars')
        if total is None:
            total = len(letters) + len(digits) + len(symbols)

        print(f"\n{dataset_name.upper()}:")
        print(f"  Letters ({len(letters)}): {letters}")
        print(f"  Digits ({len(digits)}): {digits}")
        print(f"  Symbols ({len(symbols)}): {symbols}")
        print(f"  Total allowed: {total} characters")

# Print the summary for the notebook-level `results` mapping
# (presumably the value returned by main_processing() or loaded from the pickle — TODO confirm).
print_allowed_characters(results)


=== ALLOWED CHARACTERS SUMMARY ===

SIMPLE_WIKIPEDIA:
  Letters (52): abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
  Digits (10): 0123456789
  Symbols (35): !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	

  Total allowed: 97 characters

THE_ALGORITHMS_CODE:
  Letters (52): abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
  Digits (10): 0123456789
  Symbols (35): !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	

  Total allowed: 97 characters

GUTENBERG:
  Letters (52): abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
  Digits (10): 0123456789
  Symbols (35): !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	

  Total allowed: 97 characters

CARTIGRATIS:
  Letters (62): abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZăâîșțĂÂÎȘȚ
  Digits (10): 0123456789
  Symbols (35): !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	

  Total allowed: 107 characters

NEWSGROUP:
  Letters (52): abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
  Digits (10): 0123456789
  Symbols (35): !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	

  Total allowed

In [None]:
import os
import pickle
import json
import string
import re
from collections import Counter, defaultdict
import sys
sys.path.append('..')
from config.config import TOP_N_WORDS

def is_valid_word(word, allowed_letters, min_length=2):
    """Return True when *word* is long enough and built only from allowed letters.

    Args:
        word: Candidate token to validate.
        allowed_letters: Container of characters a word may consist of.
        min_length: Smallest accepted word length (default 2).
    """
    if len(word) >= min_length:
        # Reject on the first character outside the allowed alphabet.
        for symbol in word:
            if symbol not in allowed_letters:
                return False
        return True
    return False

def extract_words_from_text(text, allowed_letters):
    """Extract valid, lowercased words from *text*.

    A word is a maximal run of characters from ``allowed_letters`` that also
    passes ``is_valid_word`` (minimum length, every character allowed).

    Args:
        text: The text to scan.
        allowed_letters: Iterable of characters that may appear in a word.

    Returns:
        list[str]: The valid words in order of appearance, lowercased.
        Empty when ``allowed_letters`` is empty.
    """
    unique_letters = sorted(set(allowed_letters))
    if not unique_letters:
        # An empty alphabet would build the invalid pattern "[]+" (re.error);
        # no word can match in that case anyway.
        return []

    # Escape so characters such as '-', '^' or ']' stay literal in the class.
    char_class = re.escape(''.join(unique_letters))
    word_pattern = f'[{char_class}]+'

    # Matching is case-insensitive, but is_valid_word() below still requires
    # every character to be literally present in allowed_letters.
    candidates = re.findall(word_pattern, text, re.IGNORECASE)

    return [
        candidate.lower()  # Normalize to lowercase
        for candidate in candidates
        if is_valid_word(candidate, allowed_letters)
    ]

def calculate_remaining_character_pool(results, top_n_words=1000):
    """Compute, per dataset, the characters left after typing the top N words.

    For each dataset the characters of the ``top_n_words`` most frequent words
    (weighted by each word's absolute count) are subtracted from the absolute
    character counts; characters whose count drops to zero or below are
    removed from the pool.

    Args:
        results: Mapping of dataset name to a result dict containing
            ``'character_frequencies'``, ``'word_frequencies'`` and ``'stats'``.
        top_n_words: Number of leading entries of ``'word_frequencies'`` to use.

    Returns:
        dict: A shallow copy of each dataset result with an extra
        ``'<top_n_words>_remaining_characters_pool'`` entry.
    """
    enriched = {}

    for name, entry in results.items():
        # Snapshot of the absolute count of every tracked character.
        baseline = {
            symbol: info['absolute']
            for symbol, info in entry['character_frequencies'].items()
        }

        # Characters consumed by typing each top word as often as it occurs.
        typed = Counter()
        for record in entry['word_frequencies'][:top_n_words]:
            occurrences = record['absolute']
            for letter in record['word']:
                typed[letter] += occurrences

        # Keep only characters that still have occurrences left.
        leftover = {}
        for symbol, available in baseline.items():
            balance = available - typed.get(symbol, 0)
            if balance > 0:
                leftover[symbol] = balance

        leftover_total = sum(leftover.values())
        leftover_frequencies = {
            symbol: {
                'absolute': amount,
                'relative': amount / leftover_total if leftover_total > 0 else 0,
            }
            for symbol, amount in leftover.items()
        }

        augmented = entry.copy()
        augmented[f'{top_n_words}_remaining_characters_pool'] = {
            'remaining_pool': dict(leftover),
            'remaining_frequencies': leftover_frequencies,
            'total_remaining_characters': leftover_total,
            'characters_used_in_top_words': dict(typed),
            'original_total_characters': entry['stats']['total_characters'],
            'characters_subtracted': sum(typed.values()),
        }
        enriched[name] = augmented

    return enriched

def main_processing():
    """Analyze character and word frequencies for every dataset directory.

    Every sub-directory of the raw-data root is treated as one dataset. All
    files below it are read as UTF-8 (undecodable bytes are ignored), allowed
    characters are tallied per category, and valid words are counted via
    ``extract_words_from_text``. The remaining character pool for the top
    ``TOP_N_WORDS`` words is then added and the combined results are written
    to ``frequency_analysis.pkl`` and ``frequency_analysis.json``.

    Returns:
        dict | None: Results keyed by dataset name, or ``None`` when the
        raw-data root directory does not exist.
    """

    # Configuration
    root_dir = '../../../data/text/raw'
    output_dir = '../../../data/text/processed'
    os.makedirs(output_dir, exist_ok=True)

    print("Starting dataset processing...")

    # Discover datasets
    if not os.path.exists(root_dir):
        print(f"ERROR: Root directory does not exist: {root_dir}")
        return

    datasets = [item for item in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, item))]
    print(f"Found {len(datasets)} datasets: {datasets}")

    # Digits and symbols are the same for every dataset; the sets give O(1)
    # membership tests while the strings are what gets stored in the config.
    allowed_digits = string.digits
    allowed_symbols = string.punctuation + " \t\n"
    digit_set = frozenset(allowed_digits)
    symbol_set = frozenset(allowed_symbols)

    # Process each dataset
    results = {}
    for dataset_name in datasets:
        # The Romanian dataset additionally allows its diacritic letters.
        if dataset_name.lower() == 'cartigratis':
            allowed_letters = string.ascii_letters + "ăâîșțĂÂÎȘȚ"
        else:
            allowed_letters = string.ascii_letters
        letter_set = frozenset(allowed_letters)

        # Initialize counters
        char_counter = Counter()
        category_counters = {
            'letters': Counter(),
            'digits': Counter(),
            'symbols': Counter()
        }
        word_counter = Counter()

        total_chars = 0
        total_words = 0

        # Walk through all files in the dataset directory
        dataset_path = os.path.join(root_dir, dataset_name)
        for root, _dirs, files in os.walk(dataset_path):
            for file in files:
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()

                    # Every character counts toward the total, allowed or not.
                    total_chars += len(content)

                    # Tally the file once, then classify each distinct
                    # character instead of testing every occurrence.
                    for char, occurrences in Counter(content).items():
                        if char in letter_set:
                            category = 'letters'
                        elif char in digit_set:
                            category = 'digits'
                        elif char in symbol_set:
                            category = 'symbols'
                        else:
                            continue
                        char_counter[char] += occurrences
                        category_counters[category][char] += occurrences

                    # Extract and count valid words
                    valid_words = extract_words_from_text(content, allowed_letters)
                    word_counter.update(valid_words)
                    total_words += len(valid_words)

                except Exception as e:
                    # Best effort: report the file and continue with the rest.
                    print(f"Error processing file {file_path}: {e}")

        # Character frequencies relative to ALL characters read
        char_frequencies = {
            char: {
                'absolute': count,
                'relative': count / total_chars if total_chars > 0 else 0
            }
            for char, count in char_counter.items()
        }

        # Category-specific frequencies, relative to the category total.
        # NOTE(review): 'relative' and 'category_relative' hold the same value
        # here — confirm whether 'relative' was meant to be the global share.
        category_frequencies = {}
        for category, counter in category_counters.items():
            category_total = sum(counter.values())
            category_frequencies[category] = {}
            for char, count in counter.items():
                relative_freq = count / category_total if category_total > 0 else 0
                category_frequencies[category][char] = {
                    'absolute': count,
                    'relative': relative_freq,
                    'category_relative': relative_freq
                }

        # Convert word counter to a list sorted by descending frequency
        sorted_words = []
        for word, count in word_counter.most_common():
            relative_freq = count / total_words if total_words > 0 else 0
            sorted_words.append({
                'word': word,
                'absolute': count,
                'relative': relative_freq,
                'percentage': relative_freq * 100
            })

        # Prepare result for this dataset
        results[dataset_name] = {
            'config': {
                'allowed_letters': allowed_letters,
                'allowed_digits': allowed_digits,
                'allowed_symbols': allowed_symbols,
                'min_word_length': 2
            },
            'stats': {
                'total_characters': total_chars,
                'unique_characters': len(char_counter),
                'total_words': total_words,
                'unique_words': len(word_counter)
            },
            'character_frequencies': char_frequencies,
            'category_frequencies': category_frequencies,
            'word_frequencies': sorted_words
        }
        print(f"Processed {dataset_name}: {total_chars} chars, {len(word_counter)} unique words")

        # Show sample of top words for debugging
        print(f"  Top 10 words: {[w['word'] for w in sorted_words[:10]]}")

    # Calculate remaining character pool after typing top N words
    print(f"\nCalculating remaining character pool for top {TOP_N_WORDS} words...")
    results_with_remaining_pool = calculate_remaining_character_pool(results, TOP_N_WORDS)

    # Save results
    pickle_file = os.path.join(output_dir, 'frequency_analysis.pkl')
    with open(pickle_file, 'wb') as f:
        pickle.dump(results_with_remaining_pool, f)

    json_file = os.path.join(output_dir, 'frequency_analysis.json')
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(results_with_remaining_pool, f, ensure_ascii=False, indent=2)

    print("\nProcessing complete. Results saved to:")
    print(f"  Pickle: {pickle_file}")
    print(f"  JSON: {json_file}")

    return results_with_remaining_pool

# Additional utility function for analysis
def analyze_word_patterns(results, dataset_name):
    """Print a debugging summary of one dataset's word list.

    Reports the number of unique words, the distribution of word lengths, and
    any of the first 50 words that contain a digit or are a single character.

    Args:
        results: Mapping of dataset name to a result dict with a
            ``'word_frequencies'`` list of ``{'word': ...}`` entries.
        dataset_name: Key of the dataset to inspect.
    """
    if dataset_name not in results:
        print(f"Dataset {dataset_name} not found")
        return None

    entries = results[dataset_name]['word_frequencies']

    print(f"\nAnalysis for {dataset_name}:")
    print(f"Total unique words: {len(entries)}")

    # How many words exist for each word length
    lengths = Counter(len(entry['word']) for entry in entries)

    print("Word length distribution:")
    for size, how_many in sorted(lengths.items()):
        print(f"  Length {size}: {how_many} words")

    # Flag odd-looking tokens among the 50 most frequent words
    flagged = [
        entry['word']
        for entry in entries[:50]
        if len(entry['word']) == 1 or any(ch.isdigit() for ch in entry['word'])
    ]

    if not flagged:
        print("No suspicious words in top 50")
    else:
        print(f"Suspicious words found: {flagged}")
    return None

# Function to load and analyze existing pickle file
def load_and_update_pickle(pickle_path=None):
    """Reload saved results, add the remaining character pool, and re-save.

    Args:
        pickle_path: Path of the pickle to load; defaults to
            ``'../../../data/text/processed/frequency_analysis.pkl'``.

    Returns:
        dict | None: The updated results, or ``None`` when the pickle file
        does not exist.

    The updated results are written next to the source pickle as
    ``frequency_analysis_updated_top_<TOP_N_WORDS>.pkl`` and ``.json``.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only point this
    at files produced by a trusted run.
    """
    source_path = (
        '../../../data/text/processed/frequency_analysis.pkl'
        if pickle_path is None
        else pickle_path
    )

    if not os.path.exists(source_path):
        print(f"ERROR: Pickle file does not exist: {source_path}")
        return None

    # Read the previously saved analysis
    with open(source_path, 'rb') as source:
        loaded = pickle.load(source)

    print(f"Loaded existing results with {len(loaded)} datasets")

    # Recompute the remaining pool for the configured number of top words
    print(f"Calculating remaining character pool for top {TOP_N_WORDS} words...")
    refreshed = calculate_remaining_character_pool(loaded, TOP_N_WORDS)

    # Outputs live in the same directory as the source pickle
    target_dir = os.path.dirname(source_path)
    pickle_target = os.path.join(target_dir, f'frequency_analysis_updated_top_{TOP_N_WORDS}.pkl')
    json_target = os.path.join(target_dir, f'frequency_analysis_updated_top_{TOP_N_WORDS}.json')

    with open(pickle_target, 'wb') as sink:
        pickle.dump(refreshed, sink)

    with open(json_target, 'w', encoding='utf-8') as sink:
        json.dump(refreshed, sink, ensure_ascii=False, indent=2)

    print(f"Updated results saved to:")
    print(f"  Pickle: {pickle_target}")
    print(f"  JSON: {json_target}")

    return refreshed

# Execute processing
if __name__ == "__main__":
    results = main_processing()

    # main_processing() returns None when the raw-data directory is missing;
    # skip the follow-up analysis instead of failing on a membership test.
    if results is not None:
        # Analyze Romanian data specifically
        if 'cartigratis' in results:
            analyze_word_patterns(results, 'cartigratis')

        # Example of accessing the new remaining character pool data
        pool_key = f'{TOP_N_WORDS}_remaining_characters_pool'
        for dataset_name, dataset_result in results.items():
            if pool_key in dataset_result:
                remaining_data = dataset_result[pool_key]
                print(f"\n{dataset_name} - Remaining character pool after typing top {TOP_N_WORDS} words:")
                print(f"  Total remaining characters: {remaining_data['total_remaining_characters']}")
                print(f"  Characters subtracted: {remaining_data['characters_subtracted']}")
                print(f"  Original total: {remaining_data['original_total_characters']}")

ModuleNotFoundError: No module named 'config'

In [None]:
import pickle
import pandas as pd

# Read the saved frequency analysis
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('./processed/frequency_analysis.pkl', 'rb') as f:
    results = pickle.load(f)

# The longest word list bounds how many ranks can be shown
max_words = max(len(data['word_frequencies']) for data in results.values()) if results else 0

# Show at most the top 1000 ranks
all_datasets = list(results.keys())
top_n = min(1000, max_words)

# One 'Rank' column plus one column of words per dataset
table_data = {'Rank': range(1, top_n + 1)}

for dataset_name in all_datasets:
    words = results[dataset_name]['word_frequencies']
    dataset_words = [entry['word'] for entry in words[:top_n]]
    # Pad shorter word lists with empty cells so every column has top_n rows
    dataset_words.extend([''] * (top_n - len(dataset_words)))
    table_data[dataset_name] = dataset_words

df = pd.DataFrame(table_data)

# Header followed by the full table
print(f"Top {top_n} words comparison across datasets")
print(f"Datasets: {', '.join(all_datasets)}")
print("="*80)
print(df.to_string(index=False, max_rows=1000, max_colwidth=30))

Top 1000 words comparison across datasets
Datasets: simple_wikipedia, the_algorithms_code, gutenberg, cartigratis, newsgroup
 Rank simple_wikipedia      the_algorithms_code     gutenberg   cartigratis     newsgroup
    1              the                      the           the            de           the
    2               of                       if           and            în            to
    3               in                      int            of            să            of
    4              and                   return            to            nu           and
    5               is                       of            in            pe            in
    6               to                     node          that            se            is
    7              was                      for            he            cu          that
    8               it                       to            it            la            ax
    9               he                       is           was    

In [None]:
import pickle
import pandas as pd
from IPython.display import display, HTML

# Load the pickle file
# NOTE(review): pickle.load can execute arbitrary code; only open files
# written by a trusted run of the processing step.
with open('./processed/frequency_analysis.pkl', 'rb') as f:
    results = pickle.load(f)

# Function to create frequency tables for a dataset
def create_frequency_tables(dataset_name, results):
    """Build the four character-frequency tables for one dataset.

    Args:
        dataset_name: Key of the dataset inside ``results``.
        results: Mapping of dataset name to a result dict containing
            ``'character_frequencies'`` and ``'category_frequencies'``.

    Returns:
        tuple | None: ``(all_chars, letters, symbols, digits)`` DataFrames with
        ``'Character'`` and ``'Percentage'`` columns, sorted by descending
        frequency; ``None`` (after printing a message) when the dataset is
        unknown. A missing or empty category yields an empty DataFrame.
    """
    if dataset_name not in results:
        print(f"Dataset '{dataset_name}' not found!")
        return

    data = results[dataset_name]
    char_frequencies = data['character_frequencies']
    category_frequencies = data['category_frequencies']

    def _percentage_table(frequencies, value_key, show_repr):
        """Return a DataFrame of characters ranked by ``frequencies[c][value_key]``."""
        # Sort on the raw value: ordering by the 4-decimal formatted string
        # would mis-rank values that differ only below the rounding.
        ranked = sorted(
            frequencies.items(),
            key=lambda item: item[1][value_key],
            reverse=True,
        )
        return pd.DataFrame([
            {
                # repr() makes whitespace and escape sequences visible
                'Character': repr(char) if show_repr else char,
                'Percentage': f"{info[value_key] * 100:.4f}%"
            }
            for char, info in ranked
        ])

    # Table 1: All characters, global percentage
    table1 = _percentage_table(char_frequencies, 'relative', True)
    # Table 2: Letters only
    table2 = _percentage_table(category_frequencies.get('letters', {}), 'category_relative', False)
    # Table 3: Symbols only
    table3 = _percentage_table(category_frequencies.get('symbols', {}), 'category_relative', True)
    # Table 4: Numbers only
    table4 = _percentage_table(category_frequencies.get('digits', {}), 'category_relative', False)

    return table1, table2, table3, table4

# Function to display tables side by side
def display_tables_side_by_side(dataset_name, results):
    """Render the four frequency tables of a dataset next to each other.

    Builds the tables with ``create_frequency_tables``, converts each one to
    an HTML table with ids ``table1``..``table4``, and displays them in a
    flex container. Does nothing when the tables cannot be built.

    Args:
        dataset_name: Key of the dataset inside ``results``.
        results: Mapping of dataset name to its analysis result.
    """
    frames = create_frequency_tables(dataset_name, results)
    if frames is None:
        return

    all_chars, letters, symbols, digits = frames

    # Shared rendering options for every table
    def as_html(frame, element_id):
        return frame.to_html(index=False, classes='table-sm', table_id=element_id)

    table1_html = as_html(all_chars, 'table1')
    table2_html = as_html(letters, 'table2')
    table3_html = as_html(symbols, 'table3')
    table4_html = as_html(digits, 'table4')

    # Stylesheet plus one scrollable wrapper per table
    combined_html = f"""
    <style>
        .tables-container {{
            display: flex;
            gap: 20px;
            flex-wrap: wrap;
        }}
        .table-wrapper {{
            flex: 1;
            min-width: 200px;
            max-height: 600px;
            overflow-y: auto;
        }}
        .table-wrapper h4 {{
            text-align: center;
            margin-bottom: 10px;
            color: #333;
        }}
        .table-sm {{
            font-size: 12px;
            width: 100%;
        }}
        .table-sm th, .table-sm td {{
            padding: 4px 8px;
            text-align: left;
        }}
        .table-sm th {{
            background-color: #f8f9fa;
            font-weight: bold;
        }}
        .table-sm tr:nth-child(even) {{
            background-color: #f8f9fa;
        }}
    </style>
    
    <h2>Character Frequency Analysis for: {dataset_name}</h2>
    <div class="tables-container">
        <div class="table-wrapper">
            <h4>All Characters (Global %)</h4>
            {table1_html}
        </div>
        <div class="table-wrapper">
            <h4>Letters Only (%)</h4>
            {table2_html}
        </div>
        <div class="table-wrapper">
            <h4>Symbols Only (%)</h4>
            {table3_html}
        </div>
        <div class="table-wrapper">
            <h4>Numbers Only (%)</h4>
            {table4_html}
        </div>
    </div>
    """

    display(HTML(combined_html))

# Display tables for each dataset
# List the datasets, then render each one followed by a divider
divider = "\n" + "="*80 + "\n"
print("Available datasets:", list(results.keys()))
print(divider)

for dataset_name in results:
    display_tables_side_by_side(dataset_name, results)
    print(divider)

Available datasets: ['simple_wikipedia', 'the_algorithms_code', 'gutenberg', 'cartigratis', 'newsgroup']




Character,Percentage
' ',16.1697%
'e',9.0362%
'a',6.7394%
't',5.9729%
'n',5.5977%
'i',5.5493%
'o',5.4126%
'r',4.8917%
's',4.8147%
'h',3.4445%

Character,Percentage
e,11.7394%
a,8.7555%
t,7.7597%
n,7.2722%
i,7.2093%
o,7.0318%
r,6.3550%
s,6.2550%
h,4.4749%
l,3.9748%

Character,Percentage
' ',78.6497%
'\n',5.6012%
'.',5.5504%
"','",4.4773%
"'""'",1.8353%
')',0.8379%
'(',0.8371%
'-',0.8363%
"""'""",0.5149%
'|',0.2795%

Character,Percentage
1,21.5936%
0,17.9999%
2,15.8280%
9,12.7755%
8,6.0855%
3,5.3895%
7,5.2216%
5,5.2144%
6,4.9853%
4,4.9068%






Character,Percentage
' ',23.8369%
'e',6.5553%
't',5.0971%
'r',4.0658%
'i',3.7536%
'n',3.5585%
'a',3.5395%
'\n',3.2972%
's',3.2594%
'o',2.9798%

Character,Percentage
e,12.1961%
t,9.4830%
r,7.5644%
i,6.9835%
n,6.6206%
a,6.5853%
s,6.0641%
o,5.5439%
l,4.0813%
u,3.4138%

Character,Percentage
' ',56.8117%
'\n',7.8584%
"','",4.0425%
')',3.2815%
'(',3.2806%
'.',3.1138%
'=',1.9110%
'/',1.7857%
';',1.7476%
'_',1.4552%

Character,Percentage
1,18.8860%
0,15.7723%
2,12.6107%
3,9.6430%
5,8.3636%
4,8.2095%
6,6.9144%
7,6.6706%
9,6.5387%
8,6.3913%






Character,Percentage
' ',16.3645%
'e',9.5734%
't',6.6509%
'a',5.9682%
'o',5.8744%
'n',5.2249%
'i',4.9385%
's',4.7807%
'h',4.5546%
'r',4.4954%

Character,Percentage
e,12.3995%
t,8.6142%
a,7.7300%
o,7.6085%
n,6.7673%
i,6.3964%
s,6.1920%
h,5.8992%
r,5.8225%
d,4.1760%

Character,Percentage
' ',75.5537%
'\n',9.6765%
"','",6.9096%
'.',4.1895%
';',0.8217%
'-',0.6180%
'?',0.4108%
'_',0.3852%
'!',0.3533%
':',0.2712%

Character,Percentage
1,23.2491%
2,11.8368%
0,10.0680%
3,9.4884%
5,9.3381%
8,8.1127%
4,7.9394%
6,7.7905%
7,6.4323%
9,5.7448%






Character,Percentage
' ',15.6248%
'e',8.9102%
'i',7.6959%
'a',7.6228%
'r',5.1685%
'u',4.8935%
'n',4.8757%
't',4.5224%
'c',3.9280%
'l',3.4080%

Character,Percentage
e,11.7442%
i,10.1436%
a,10.0473%
r,6.8123%
u,6.4500%
n,6.4264%
t,5.9609%
c,5.1774%
l,4.4920%
s,4.2715%

Character,Percentage
' ',73.4304%
'\n',8.2634%
"','",6.3977%
'.',5.0006%
'-',2.9731%
'!',0.7442%
'%',0.6192%
'?',0.5338%
':',0.5062%
'&',0.4171%

Character,Percentage
7,16.3487%
1,15.8791%
0,11.5999%
2,10.9767%
8,9.3307%
5,9.0982%
4,8.0651%
9,7.6729%
3,6.8141%
6,4.2146%






Character,Percentage
' ',16.6518%
'e',7.7053%
't',5.6713%
'o',5.0586%
'a',4.9891%
'i',4.4845%
'n',4.3427%
's',4.1694%
'r',3.9614%
'h',2.8830%

Character,Percentage
e,11.1220%
t,8.1860%
o,7.3017%
a,7.2014%
i,6.4730%
n,6.2684%
s,6.0182%
r,5.7179%
h,4.1615%
l,3.7999%

Character,Percentage
' ',59.0650%
'\n',8.4639%
'.',5.6482%
'-',4.7816%
'>',2.6905%
"','",2.6211%
"""'""",1.7577%
':',1.7497%
'=',1.2433%
')',1.1234%

Character,Percentage
1,16.7532%
0,14.6127%
2,11.1921%
3,9.9532%
5,9.0167%
9,8.6985%
4,8.5710%
6,7.4525%
8,7.1940%
7,6.5560%




