In [1]:
# Install required libraries (run this first if packages aren't installed)
%pip install PyPDF2 tabula-py pandas openpyxl

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import PyPDF2
import pandas as pd
import tabula
import os
from pathlib import Path

# Set the PDF file path
pdf_path = "Tamazight-English-Dictionary-2007.pdf"
print(f"Working with PDF: {pdf_path}")
print(f"File exists: {os.path.exists(pdf_path)}")

Working with PDF: Tamazight-English-Dictionary-2007.pdf
File exists: True


In [3]:
# Get basic PDF information without reading all content
def preview_pdf_info(pdf_path):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        num_pages = len(pdf_reader.pages)
        
        print(f"PDF Information:")
        print(f"- Total pages: {num_pages}")
        print(f"- File size: {os.path.getsize(pdf_path) / (1024*1024):.2f} MB")
        
        # Read first page sample
        print(f"\n--- Sample from first page ---")
        first_page = pdf_reader.pages[0]
        sample_text = first_page.extract_text()[:500]
        print(sample_text)
        
        return num_pages

num_pages = preview_pdf_info(pdf_path)

PDF Information:
- Total pages: 153
- File size: 3.67 MB

--- Sample from first page ---
 
 
Acknowledgement  
   
 
This monolingual Tamazight Dictionary is the product of several months of serious work, dedication and determination of PCV/ Morocco Stacy Alboher (2005- 2007).  
 Many thanks go to all those who have contributed to the collection of words contained in this dictionary or helped verify the diverse meanings of these words.   While Stacy stated that this work is by no means exhaustive, I strongly believe that it has great utility in helping PCVs / PCTs with their Tamazig


In [4]:
# %pip install jpype1

# Extract all tables from the PDF
print("Extracting tables from PDF...")
try:
    # Extract tables from all pages (this might take a moment)
    all_tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)
    print(f"Successfully extracted {len(all_tables)} tables from the PDF")
except Exception as e:
    print(f"Error extracting tables: {e}")
    # Fallback: try first 10 pages
    all_tables = tabula.read_pdf(pdf_path, pages='1-10', multiple_tables=True)
    print(f"Extracted {len(all_tables)} tables from first 10 pages")

Extracting tables from PDF...
Successfully extracted 238 tables from the PDF


In [5]:
# Display information about each table and show samples
for i, table in enumerate(all_tables):  # Show first 5 tables
    print(f"\n{'='*60}")
    print(f"TABLE {i+1}")
    print(f"{'='*60}")
    print(f"Shape: {table.shape} (rows x columns)")
    print(f"Columns: {list(table.columns)}")
    
    # Clean the table (remove empty rows/columns)
    all_tables[i] = table.dropna(how='all').dropna(axis=1, how='all')
    print(f"Shape after cleaning: {all_tables[i].shape}")
    
    print(f"\nFirst 5 rows:")
    print(all_tables[i].head())
    
    if len(all_tables) > 5 and i == 4:
        print(f"\n... and {len(all_tables) - 5} more tables")


TABLE 1
Shape: (57, 3) (rows x columns)
Columns: ['1', 'yan; yiwn/yat, yiwt', 'Unnamed: 0']
Shape after cleaning: (38, 2)

First 5 rows:
   1 yan; yiwn/yat, yiwt
0  2            sin/snat
1  3          krad/kradt
2  4                rbعa
3  5                xmsa
4  6                 sţa

TABLE 2
Shape: (39, 2) (rows x columns)
Columns: ['a person or place\r(indicates movement)', 'Zars']
Shape after cleaning: (39, 2)

First 5 rows:
  a person or place\r(indicates movement)       Zars
0                             a while ago   šHal aya
1                       abnormal (person)       nHya
2                                abortion    l-ijhaď
3                                   about    ġif; xf
4                                   above  afla; nig

TABLE 3
Shape: (40, 2) (rows x columns)
Columns: ['Allah (figurative\rspeech)', 'bu itran']
Shape after cleaning: (40, 2)

First 5 rows:
  Allah (figurative\rspeech)                bu itran
0                  alligator                 ttimsaH
1  

In [6]:
lengths = []
for t in all_tables:
    lengths.append(len(t.columns))

print(lengths)

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 3, 3]


In [7]:
for i, t in enumerate(all_tables):
    print(([i, len(t.columns)], t.iloc[-1].to_numpy()))

([0, 2], array(['a lot (number)', '.عtan; gudin'], dtype=object))
([1, 2], array(['Allah', 'rbbi'], dtype=object))
([2, 2], array(['art', 'Lfnn'], dtype=object))
([3, 2], array(['bangles', 'dama'], dtype=object))
([4, 2], array(['beetle', 'Abxxuš'], dtype=object))
([5, 2], array(['blister (n)', 'tilfxt'], dtype=object))
([6, 2], array(['bracelet', 'azbi; azvi; tazvit;'], dtype=object))
([7, 2], array(['bubble (noun)', 'tabuwlt; tibiwlt'], dtype=object))
([8, 2], array(['calf (cow)', 'Taعjliyt'], dtype=object))
([9, 2], array(['cedar', 'larz'], dtype=object))
([10, 2], array(['church', 'lkanisa'], dtype=object))
([11, 2], array(['come here!', 'addud'], dtype=object))
([12, 2], array(['cough (n)', 'takuHut; tusut'], dtype=object))
([13, 2], array(['curry (n)', 'lkurnb'], dtype=object))
([14, 2], array(['director', 'lmudir'], dtype=object))
([15, 2], array(['drawing of water', 'igm'], dtype=object))
([16, 2], array(['eggs', 'tiglay'], dtype=object))
([17, 2], array(['extremist', 'mutatari

In [8]:
print(all_tables[232].iloc[-1].to_numpy())

[nan nan 'Honduras' 'alhinduras']


In [9]:
for i, t in enumerate(all_tables):
    #display the index of the table if it contains na or not
    print(i) if t.isna().any().to_numpy().any() else None

34
52
67
68
86
125
144
165
182
183
232


In [10]:
# Corrected section boundaries based on your data
sections = {
    'english_tamazight': {'start': 0, 'end': 86, 'columns': 2, 'has_header': False},
    'tamazight_english': {'start': 87, 'end': 183, 'columns': 2, 'has_header': False},
    'verbs': {'start': 184, 'end': 231, 'columns': 4, 'has_header': True},
    'countries': {'start': 232, 'end': 235, 'columns': 2, 'has_header': False},
    'god_phrases': {'start': 236, 'end': 237, 'columns': 3, 'has_header': True}
}

# Tables that need to be split (4 columns -> two 2-column tables)
tables_to_split = [34, 67, 86, 125, 144, 165, 182, 183, 232]

print("Corrected Section Analysis:")
for section_name, info in sections.items():
    count = info['end'] - info['start'] + 1
    print(f"{section_name}: Tables {info['start']}-{info['end']} ({count} tables, {info['columns']} columns)")

print(f"\nTables to split: {tables_to_split}")
print(f"Original table count: {len(all_tables)}")

Corrected Section Analysis:
english_tamazight: Tables 0-86 (87 tables, 2 columns)
tamazight_english: Tables 87-183 (97 tables, 2 columns)
verbs: Tables 184-231 (48 tables, 4 columns)
countries: Tables 232-235 (4 tables, 2 columns)
god_phrases: Tables 236-237 (2 tables, 3 columns)

Tables to split: [34, 67, 86, 125, 144, 165, 182, 183, 232]
Original table count: 238


In [11]:
# Function to split 4-column tables into two 2-column tables
def split_table(table, table_idx):
    """Split a 4-column table into two 2-column tables"""
    if table.shape[1] >= 4:
        # Split into first 2 columns and last 2 columns
        table1 = table.iloc[:, :2].copy()
        table2 = table.iloc[:, 2:4].copy()
        
        print(f"  Split table {table_idx}: {table.shape} -> {table1.shape} + {table2.shape}")
        return [table1, table2]
    else:
        print(f"  Warning: Table {table_idx} has only {table.shape[1]} columns, cannot split")
        return [table]

# Create new list with split tables
split_tables = []
split_mapping = {}  # Track original index to new indices

for i, table in enumerate(all_tables):
    if i in tables_to_split:
        split_result = split_table(table, i)
        start_idx = len(split_tables)
        split_tables.extend(split_result)
        split_mapping[i] = list(range(start_idx, len(split_tables)))
        print(f"Table {i} split into indices {split_mapping[i]}")
    else:
        split_tables.append(table)
        split_mapping[i] = [len(split_tables) - 1]

print(f"\nTotal tables after splitting: {len(split_tables)} (expected 247)")

  Split table 34: (54, 4) -> (54, 2) + (54, 2)
Table 34 split into indices [34, 35]
  Split table 67: (53, 4) -> (53, 2) + (53, 2)
Table 67 split into indices [68, 69]
  Split table 86: (27, 4) -> (27, 2) + (27, 2)
Table 86 split into indices [88, 89]
  Split table 125: (43, 4) -> (43, 2) + (43, 2)
Table 125 split into indices [128, 129]
  Split table 144: (44, 4) -> (44, 2) + (44, 2)
Table 144 split into indices [148, 149]
  Split table 165: (43, 4) -> (43, 2) + (43, 2)
Table 165 split into indices [170, 171]
  Split table 182: (43, 4) -> (43, 2) + (43, 2)
Table 182 split into indices [188, 189]
  Split table 183: (42, 4) -> (42, 2) + (42, 2)
Table 183 split into indices [190, 191]
  Split table 232: (42, 4) -> (42, 2) + (42, 2)
Table 232 split into indices [240, 241]

Total tables after splitting: 247 (expected 247)


In [12]:
# Clean all tables and remove any rows with NaN/null values
cleaned_tables = []

for i, table in enumerate(split_tables):
    # Basic cleaning
    cleaned = table.dropna(how='all').dropna(axis=1, how='all')
    
    # Drop any rows that contain NaN values
    cleaned = cleaned.dropna()
    
    # Remove empty string rows if any
    if len(cleaned.columns) > 0:
        # Check if all values in a row are empty strings
        mask = ~(cleaned == '').all(axis=1)
        cleaned = cleaned[mask]
    
    cleaned_tables.append(cleaned)
    
    if i < 10 or len(cleaned) == 0:  # Show first 10 and any empty tables
        print(f"Table {i}: {table.shape} -> {cleaned.shape}")

print(f"\nCleaned {len(cleaned_tables)} tables")

# Update all_tables reference
all_tables = cleaned_tables

Table 0: (38, 2) -> (38, 2)
Table 1: (39, 2) -> (39, 2)
Table 2: (40, 2) -> (40, 2)
Table 3: (40, 2) -> (40, 2)
Table 4: (39, 2) -> (39, 2)
Table 5: (39, 2) -> (39, 2)
Table 6: (38, 2) -> (38, 2)
Table 7: (38, 2) -> (38, 2)
Table 8: (37, 2) -> (37, 2)
Table 9: (36, 2) -> (36, 2)

Cleaned 247 tables


In [13]:
for i, t in enumerate(all_tables):
    print(([i, len(t.columns)], t.iloc[-1].to_numpy()))

([0, 2], array(['a lot (number)', '.عtan; gudin'], dtype=object))
([1, 2], array(['Allah', 'rbbi'], dtype=object))
([2, 2], array(['art', 'Lfnn'], dtype=object))
([3, 2], array(['bangles', 'dama'], dtype=object))
([4, 2], array(['beetle', 'Abxxuš'], dtype=object))
([5, 2], array(['blister (n)', 'tilfxt'], dtype=object))
([6, 2], array(['bracelet', 'azbi; azvi; tazvit;'], dtype=object))
([7, 2], array(['bubble (noun)', 'tabuwlt; tibiwlt'], dtype=object))
([8, 2], array(['calf (cow)', 'Taعjliyt'], dtype=object))
([9, 2], array(['cedar', 'larz'], dtype=object))
([10, 2], array(['church', 'lkanisa'], dtype=object))
([11, 2], array(['come here!', 'addud'], dtype=object))
([12, 2], array(['cough (n)', 'takuHut; tusut'], dtype=object))
([13, 2], array(['curry (n)', 'lkurnb'], dtype=object))
([14, 2], array(['director', 'lmudir'], dtype=object))
([15, 2], array(['drawing of water', 'igm'], dtype=object))
([16, 2], array(['eggs', 'tiglay'], dtype=object))
([17, 2], array(['extremist', 'mutatari

In [14]:
# Create mapping from new table indices to sections
def get_section_for_original_index(orig_idx):
    """Get section name for original table index"""
    for section_name, info in sections.items():
        if info['start'] <= orig_idx <= info['end']:
            return section_name
    return None

# Build new section mappings
new_sections = {name: [] for name in sections.keys()}
table_to_section = {}

current_new_idx = 0
for orig_idx in range(len(split_mapping)):
    section_name = get_section_for_original_index(orig_idx)
    if section_name:
        # Get the new indices for this original table
        new_indices = split_mapping[orig_idx]
        for new_idx in new_indices:
            if new_idx < len(all_tables):
                new_sections[section_name].append(all_tables[new_idx])
                table_to_section[new_idx] = section_name

print("Tables per section after splitting:")
for section_name, tables in new_sections.items():
    print(f"{section_name}: {len(tables)} tables")
    
    # Show sample from first table in each section
    if tables and len(tables[0]) > 0:
        print(f"  Sample from first table: {tables[0].shape}")
        print(f"  Columns: {list(tables[0].columns)}")
        if len(tables[0]) > 0:
            print(f"  First row: {tables[0].iloc[0].tolist()}")
            print(f"  Last row: {tables[0].iloc[-1].tolist()} ")
        
        print(f"  Sample from last table: {tables[-1].shape}")
        print(f"  Columns: {list(tables[-1].columns)}")
        if len(tables[-1]) > 0:
            print(f"  First row: {tables[-1].iloc[0].tolist()}")
            print(f"  Last row: {tables[-1].iloc[-1].tolist()} ")

Tables per section after splitting:
english_tamazight: 90 tables
  Sample from first table: (38, 2)
  Columns: ['1', 'yan; yiwn/yat, yiwt']
  First row: ['2', 'sin/snat']
  Last row: ['a lot (number)', '.عtan; gudin'] 
  Sample from last table: (22, 2)
  Columns: ['you (F, pl) (indirect', '-awnt']
  First row: ['you (M)', 'ky']
  Last row: ['zoo', 'lHadiqa n lHayawan'] 
tamazight_english: 102 tables
  Sample from first table: (35, 2)
  Columns: ['.عada', 'to infect']
  First row: ['.عadi', 'normal']
  Last row: ['.عniġ', 'I think so; maybe;\rperhaps; probably'] 
  Sample from last table: (33, 2)
  Columns: ['zwar', 'first (adv)']
  First row: ['zwu', 'to dry']
  Last row: ['zعtut', 'monkey; barbary'] 
verbs: 48 tables
  Sample from first table: (27, 4)
  Columns: ['English', 'Infinitive', 'Present continuous', '1st person past']
  First row: ['to absorb', 'su', 'ssa', 'swiġ']
  Last row: ['to applaud', 'ut rršš', 'kkat rršš', 'wtġ rršš'] 
  Sample from last table: (22, 4)
  Columns: ['

In [15]:
# Fix column names by converting them to data rows where needed
def fix_table_headers(section_tables, section_name, section_info):
    """Convert misinterpreted column names back to data rows"""
    fixed_tables = []
    
    for i, table in enumerate(section_tables):
        if len(table) == 0:
            fixed_tables.append(table)
            continue
            
        # Create a copy to avoid modifying original
        fixed_table = table.copy()
        
        if section_info['has_header'] and i == 0:
            # First table in Verbs/God Phrases: keep original headers
            print(f"  Table {i}: Keeping original headers {list(fixed_table.columns)}")
            
        else:
            # All other tables: column names are actually data
            current_columns = list(fixed_table.columns)
            print(f"  Table {i}: Converting column names to data row: {current_columns}")
            
            # Create new row from column names
            new_row = pd.DataFrame([current_columns], columns=fixed_table.columns)
            
            # Add this row as first row
            fixed_table = pd.concat([new_row, fixed_table], ignore_index=True)
            
            # Set proper column names based on section
            if section_name == 'english_tamazight':
                fixed_table.columns = ['EN_source', 'ZGH_target']
            elif section_name == 'tamazight_english':
                fixed_table.columns = ['ZGH_source', 'EN_target']
            elif section_name == 'countries':
                fixed_table.columns = ['Country_EN', 'Country_ZGH']
            elif section_name == 'verbs':
                # Use headers from first table
                fixed_table.columns = ['English', 'Infinitive', 'Present continuous', '1st person past']
            elif section_name == 'god_phrases':
                # Use headers from first table
                fixed_table.columns = ['Phrase', 'English translation', 'When used']
        
        fixed_tables.append(fixed_table)
    
    return fixed_tables

# Apply header fixes to all sections
print("Fixing column headers across all sections...")
fixed_sections = {}

for section_name, section_tables in new_sections.items():
    print(f"\nFixing {section_name}:")
    section_info = sections[section_name]
    fixed_tables = fix_table_headers(section_tables, section_name, section_info)
    fixed_sections[section_name] = fixed_tables
    
    # Show sample of first fixed table
    if fixed_tables and len(fixed_tables[0]) > 0:
        print(f"  Sample after fixing - Shape: {fixed_tables[0].shape}")
        print(f"  Columns: {list(fixed_tables[0].columns)}")
        print(f"  First row: {fixed_tables[0].iloc[0].tolist()}")

Fixing column headers across all sections...

Fixing english_tamazight:
  Table 0: Converting column names to data row: ['1', 'yan; yiwn/yat, yiwt']
  Table 1: Converting column names to data row: ['a person or place\r(indicates movement)', 'Zars']
  Table 2: Converting column names to data row: ['Allah (figurative\rspeech)', 'bu itran']
  Table 3: Converting column names to data row: ['artist', 'Lfnnan']
  Table 4: Converting column names to data row: ['bangs (hair)', 'tawnza']
  Table 5: Converting column names to data row: ['beets', 'Lbarba']
  Table 6: Converting column names to data row: ['blond', 'ašhabun; azrwal']
  Table 7: Converting column names to data row: ['Unnamed: 0', 'dblij**']
  Table 8: Converting column names to data row: ['bubbles', 'tibiwlin']
  Table 9: Converting column names to data row: ['calf (female)', 'Tigizt']
  Table 10: Converting column names to data row: ['ceiling*', 'sqf']
  Table 11: Converting column names to data row: ['churn, made from\ranimal skin

In [16]:
# Combine the fixed tables within each section
final_sections = {}

for section_name, section_tables in fixed_sections.items():
    if not section_tables:
        final_sections[section_name] = pd.DataFrame()
        continue
    
    print(f"\nCombining {section_name}...")
    
    # Filter out empty tables
    non_empty_tables = [t for t in section_tables if len(t) > 0]
    
    if not non_empty_tables:
        final_sections[section_name] = pd.DataFrame()
        continue
    
    # Simply concatenate all tables (headers are now properly handled)
    combined_df = pd.concat(non_empty_tables, ignore_index=True)
    
    # Final cleanup - remove any remaining NaN rows
    combined_df = combined_df.dropna()
    
    final_sections[section_name] = combined_df
    print(f"  Final shape: {combined_df.shape}")
    print(f"  Columns: {list(combined_df.columns)}")
    
    # Show sample
    if len(combined_df) > 0:
        print(f"  Sample entries:")
        for i in range(min(3, len(combined_df))):
            print(f"    {combined_df.iloc[i].tolist()}")
        
        # Show last few entries to verify data integrity
        print(f"  Last entries:")
        for i in range(max(0, len(combined_df)-2), len(combined_df)):
            print(f"    {combined_df.iloc[i].tolist()}")


Combining english_tamazight...
  Final shape: (3548, 2)
  Columns: ['EN_source', 'ZGH_target']
  Sample entries:
    ['1', 'yan; yiwn/yat, yiwt']
    ['2', 'sin/snat']
    ['3', 'krad/kradt']
  Last entries:
    ['zipper', 'ssnslt']
    ['zoo', 'lHadiqa n lHayawan']

Combining tamazight_english...
  Final shape: (3959, 2)
  Columns: ['ZGH_source', 'EN_target']
  Sample entries:
    ['.عada', 'to infect']
    ['.عadi', 'normal']
    ['.عanwa', 'wrong (not true)']
  Last entries:
    ['zعm ġif', 'to be comfortable']
    ['zعtut', 'monkey; barbary']

Combining verbs...
  Final shape: (1384, 4)
  Columns: ['English', 'Infinitive', 'Present continuous', '1st person past']
  Sample entries:
    ['to absorb', 'su', 'ssa', 'swiġ']
    ['to accept', 'qbl', 'tqbal', 'qblġ']
    ['to accept', 'qvl', 'tqval', 'qvlġ']
  Last entries:
    ['to yawn', 'fa', 'tfa', 'faġ']
    ['to yell', 'sġuy', 'sġuyyu', 'sġuyġ']

Combining countries...
  Final shape: (151, 2)
  Columns: ['Country_EN', 'Country_ZGH'

In [17]:
def clean_for_hf_export(df):
    """Clean DataFrame to ensure HF datasets compatibility"""
    cleaned_df = df.copy()
    
    # Replace problematic characters within text
    for col in cleaned_df.columns:
        if cleaned_df[col].dtype == 'object':  # String columns
            cleaned_df[col] = cleaned_df[col].astype(str).str.replace('\t', ' ', regex=False)
            cleaned_df[col] = cleaned_df[col].str.replace('\n', ' ', regex=False)
            cleaned_df[col] = cleaned_df[col].str.replace('\r', ' ', regex=False)
            # Remove extra whitespace
            cleaned_df[col] = cleaned_df[col].str.strip()
    
    # Remove rows where any cell is empty or just whitespace
    mask = True
    for col in cleaned_df.columns:
        mask = mask & (cleaned_df[col].str.strip() != '') & (cleaned_df[col] != 'nan')
    
    cleaned_df = cleaned_df[mask]
    return cleaned_df

def save_hf_compatible_tsv(df, filepath):
    """Save DataFrame as HF-compatible TSV with proper cleaning"""
    # Clean the data first
    cleaned_df = clean_for_hf_export(df)
    
    # Save with proper TSV formatting
    cleaned_df.to_csv(filepath, sep='\t', index=False, encoding='utf-8', 
                     lineterminator='\n', quoting=1)  # quoting=1 ensures proper escaping
    
    return len(cleaned_df)

# Save each section as HF-compatible TSV
output_dir = Path("data")
output_dir.mkdir(exist_ok=True)

saved_files = []

for section_name, df in final_sections.items():
    if len(df) == 0:
        print(f"\nSkipping empty section: {section_name}")
        continue
    
    filename = output_dir / f"{section_name}.tsv"
    
    # Save with HF-compatible cleaning
    final_rows = save_hf_compatible_tsv(df, filename)
    saved_files.append(filename)
    
    print(f"\nSaved {section_name}: {len(df)} -> {final_rows} rows -> {filename}")

print(f"\nFINAL SUMMARY:")
print(f"- Split 9 concatenated tables into 18 separate tables")
print(f"- Total tables processed: 247")
print(f"- All NaN/null rows removed")
print(f"- Applied HF datasets compatibility cleaning")
print(f"- Saved {len(saved_files)} TSV files in 'data/'")


Saved english_tamazight: 3548 -> 3548 rows -> data\english_tamazight.tsv

Saved tamazight_english: 3959 -> 3959 rows -> data\tamazight_english.tsv

Saved verbs: 1384 -> 1384 rows -> data\verbs.tsv

Saved countries: 151 -> 151 rows -> data\countries.tsv

Saved god_phrases: 31 -> 31 rows -> data\god_phrases.tsv

FINAL SUMMARY:
- Split 9 concatenated tables into 18 separate tables
- Total tables processed: 247
- All NaN/null rows removed
- Applied HF datasets compatibility cleaning
- Saved 5 TSV files in 'data/'
