In [4]:
import pandas as pd

# --- 1. Load Data ---
# Both input tables are read from the current working directory.
print("Loading data...")
try:
    labeled_df = pd.read_csv("labeled_morpheme_list.csv")
    nahuatl_names_df = pd.read_csv("nahuatl_names_review.csv")
except FileNotFoundError as e:
    print("Error: One of the CSV files was not found. Please ensure both 'labeled_morpheme_list.csv' and 'nahuatl_names_review.csv' are in the same directory as this script.")
    print(f"Details: {e}")
    # Abort with a non-zero status so callers can detect the failure.
    # The bare `exit()` helper is meant for the interactive shell, is not
    # guaranteed to exist in every runtime, and would report success (0).
    raise SystemExit(1) from e
else:
    # Only reached when both files were read without error.
    print("Data loaded successfully.")

# --- 2. Process labeled_morpheme_list.csv ---
print("Processing labeled_morpheme_list.csv...")

# The 'morphemes' column is not carried into the output; remove it when present.
if 'morphemes' in labeled_df.columns:
    labeled_df = labeled_df.drop('morphemes', axis=1)
    print("Dropped 'morphemes' column from labeled_morpheme_list.csv.")


def _join_unique_labels(labels):
    """Return the distinct non-null labels of one ID group, joined by '; '."""
    return '; '.join(labels.dropna().unique())


# Collapse the table to one row per ID.
# The descriptive columns are expected to repeat identically within an ID,
# so the group's 'first' value is kept for each of them; only 'Label' is
# merged across the rows of the group.
_aggregation_spec = {
    column: (column, 'first')
    for column in ('official_name', 'type', 'text_subchapters', 'illustrations', 'synonyms')
}
_aggregation_spec['Label'] = ('Label', _join_unique_labels)

_grouped_by_id = labeled_df.groupby('ID')
processed_labeled_df = _grouped_by_id.agg(**_aggregation_spec).reset_index()

print(f"Processed {len(processed_labeled_df)} unique IDs from labeled_morpheme_list.csv.")

# --- 3. Identify Missing IDs from nahuatl_names_review.csv ---
print("Identifying entries with no morphemes...")

# IDs that already have a row in the processed morpheme table
ids_with_morphemes = set(processed_labeled_df['ID'])

# Rows of nahuatl_names_df whose ID is absent from that set
missing_ids_df = nahuatl_names_df[~nahuatl_names_df['ID'].isin(ids_with_morphemes)].copy()

# Normalise the frame to the output layout unconditionally. Doing this only
# when rows exist would leave an empty frame that still carries every column
# of nahuatl_names_review.csv (and no 'Label'), and concatenating that frame
# later would add stray all-NaN columns to the final output.
required_cols = ["ID", "official_name", "type", "text_subchapters", "illustrations", "synonyms"]
for col in required_cols:
    if col not in missing_ids_df.columns:
        print(f"Warning: Column '{col}' not found in 'nahuatl_names_review.csv'. It will be added as NaN.")
        missing_ids_df[col] = pd.NA

# Explicit copy so the 'Label' assignment below writes to an independent
# frame rather than to a column selection of another one.
missing_ids_df = missing_ids_df[required_cols].copy()
# Entries without morphemes get the literal placeholder "NA" as their label.
missing_ids_df['Label'] = "NA"

if missing_ids_df.empty:
    print("No IDs found in nahuatl_names_review.csv that were missing from labeled_morpheme_list.csv.")
else:
    print(f"Found {len(missing_ids_df)} IDs with no morphemes to add.")

# --- 4. Combine DataFrames ---
print("Combining dataframes...")
# Stack the per-ID morpheme rows on top of the rows for IDs without
# morphemes, renumbering the index from zero.
_frames_to_stack = [processed_labeled_df, missing_ids_df]
final_df = pd.concat(_frames_to_stack, ignore_index=True)
print(f"Final DataFrame has {len(final_df)} entries.")

# --- 5. Save the primary output ---
# Written without the index column.
output_filename_main = "NAHUATL_FOR_REVIEW.csv"
final_df.to_csv(path_or_buf=output_filename_main, index=False)
print(f"Final merged DataFrame saved to '{output_filename_main}'")

# --- 6. Process 'Label' column for unique morpheme and translation pairs ---
print("\nProcessing 'Label' column for unique morpheme and translation pairs...")
unique_morphemes = set()

for label_entry in final_df['Label'].dropna():
    if label_entry == "NA":
        # Placeholder label of entries that have no morphemes; nothing to parse.
        continue
    # Individual "morpheme/translation" fragments are separated by '; '.
    for fragment in label_entry.split('; '):
        # partition() cuts at the first '/' only, so a translation may itself
        # contain slashes; a fragment without '/' yields an empty translation.
        morpheme, _, translation = fragment.partition('/')
        morpheme = morpheme.strip()
        translation = translation.strip()
        if not morpheme and not translation:
            # An empty label (or a stray separator) would otherwise add a
            # meaningless ('', '') row to the review file.
            continue
        unique_morphemes.add((morpheme, translation))

# Sort the pairs so the output file has the same row order on every run;
# iterating the set directly gives an arbitrary order.
morpheme_df = pd.DataFrame(sorted(unique_morphemes), columns=['morpheme', 'translation'])

# Save the morpheme DataFrame (without the index column)
output_filename_morpheme = "MORPHEME_FOR_REVIEW.csv"
morpheme_df.to_csv(output_filename_morpheme, index=False)
print(f"Unique morpheme and translation pairs saved to '{output_filename_morpheme}'")

print("\nScript execution complete!")

Loading data...
Data loaded successfully.
Processing labeled_morpheme_list.csv...
Dropped 'morphemes' column from labeled_morpheme_list.csv.
Processed 208 unique IDs from labeled_morpheme_list.csv.
Identifying entries with no morphemes...
Found 57 IDs with no morphemes to add.
Combining dataframes...
Final DataFrame has 265 entries.
Final merged DataFrame saved to 'NAHUATL_FOR_REVIEW.csv'

Processing 'Label' column for unique morpheme and translation pairs...
Unique morpheme and translation pairs saved to 'MORPHEME_FOR_REVIEW.csv'

Script execution complete!
