In [25]:
import pandas as pd
from pathlib import Path
import os

print("Libraries imported.")

Libraries imported.


In [16]:
# --- Input: the cleaned corpus CSV that will be read ---
CLEANED_CORPUS_CSV = "text_corpus_UGCE1_10k_cleaned.csv"

# --- Output: plain-text file that receives the Hindi text ---
OUTPUT_HINDI_FILE = "cleaned_hindi.txt"

# --- CSV column that holds the Hindi text ---
HINDI_COLUMN = "hi_text"

# --- Encoding used when the output file is opened ---
FILE_ENCODING = "utf-8"

print("Configuration set.")

Configuration set.


In [18]:
cleaned_csv_path = Path(CLEANED_CORPUS_CSV)
output_hindi_path = Path(OUTPUT_HINDI_FILE)


def _to_corpus_line(value) -> str:
    """Render one CSV cell as exactly one physical line of text (no newline).

    Args:
        value: A single cell taken from the Hindi column.

    Returns:
        An empty string for missing values (NaN/None/NA), so the row still
        occupies one line instead of being written as the literal text
        "nan"; otherwise ``str(value)`` with any embedded line breaks
        replaced by single spaces.
    """
    if pd.isna(value):
        return ""
    # An embedded line break would turn one row into several output lines
    # and break the row <-> line correspondence of the output file.
    return str(value).replace("\r\n", " ").replace("\r", " ").replace("\n", " ")


# --- Load Cleaned DataFrame ---
if not cleaned_csv_path.is_file():
    print(f"Error: Cleaned corpus CSV not found at '{cleaned_csv_path}'. Cannot proceed.")
else:
    print(f"Loading cleaned corpus: {cleaned_csv_path}...")
    try:
        df_cleaned = pd.read_csv(cleaned_csv_path)
        print(f"Loaded cleaned corpus with {len(df_cleaned)} rows.")

        # --- Check if Hindi column exists ---
        if HINDI_COLUMN not in df_cleaned.columns:
            print(f"Error: Hindi column '{HINDI_COLUMN}' missing from {cleaned_csv_path}")
            df_cleaned = None  # Indicate failure
        else:
            # --- Write the Hindi text to the output file ---
            print(f"Writing Hindi text to '{output_hindi_path}'...")

            # Ensure output directory exists
            output_hindi_path.parent.mkdir(parents=True, exist_ok=True)

            count = 0        # lines written (always one per DataFrame row)
            blank_count = 0  # rows that produced an empty line
            with open(output_hindi_path, 'w', encoding=FILE_ENCODING) as f_out_hindi:
                # Iterate directly over the column; every row yields exactly one line
                for hindi_text in df_cleaned[HINDI_COLUMN]:
                    line = _to_corpus_line(hindi_text)
                    if not line:
                        blank_count += 1
                    f_out_hindi.write(line + '\n')
                    count += 1

            print(f"\nFinished writing Hindi text file.")
            print(f" - Lines written to '{output_hindi_path}': {count}")
            if blank_count:
                # Surface missing data instead of hiding it inside the corpus
                print(f" - Rows with no Hindi text (written as empty lines): {blank_count}")

    except Exception as e:
        # Best-effort cell: report the problem rather than raising in the notebook
        print(f"An error occurred during CSV loading or writing: {e}")

print("\nScript finished.")

Loading cleaned corpus: text_corpus_UGCE1_10k_cleaned.csv...
Loaded cleaned corpus with 9171 rows.
Writing Hindi text to 'cleaned_hindi.txt'...

Finished writing Hindi text file.
 - Lines written to 'cleaned_hindi.txt': 9171

Script finished.


In [27]:
import os
from pathlib import Path
import sys # Still potentially useful for error messages

print("Libraries imported.")

Libraries imported.


In [29]:
# --- !!! EDIT THESE PATHS !!! ---

# First file: the one that may be renamed (normally ends in .phones or .ph)
phones_file_path_str = "cleaned_parallel.phones"

# Second file: its line count is the reference the first file is compared to
other_file_path_str = "cleaned_parallel.hi"

# --- End Configuration ---

# Wrap both strings in Path objects
phones_path, other_path = Path(phones_file_path_str), Path(other_file_path_str)

print(f"File 1 (to check and rename): {phones_path}")
print(f"File 2 (for comparison): {other_path}")

File 1 (to check and rename): cleaned_parallel.phones
File 2 (for comparison): cleaned_parallel.hi


In [31]:
def count_lines(file_path: Path) -> int | None:
    """Counts the number of lines in a file.

    Args:
        file_path: Path object for the file.

    Returns:
        The number of lines as an integer, or None if an error occurs.
    """
    try:
        count = 0
        # Use utf-8 encoding as it's common for text data
        with open(file_path, 'r', encoding='utf-8') as f:
            for _ in f:
                count += 1
        return count
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"Error reading file '{file_path}': {e}")
        return None

print("Line counting function defined.")

Line counting function defined.


In [33]:
def check_and_rename_notebook(phones_file: Path, other_file: Path):
    """
    Rename ``phones_file`` to a ``.txt`` suffix when it has the same number
    of lines as ``other_file``. Meant to be called from a notebook cell:
    progress and errors are reported with print() and nothing is returned.

    Args:
        phones_file: File to check and, on a match, rename. A suffix other
            than '.phones' / '.ph' only produces a warning.
        other_file: File whose line count is used as the reference.
    """
    print("\n--- Starting Check ---")

    # --- Both inputs must be existing regular files ---
    for candidate in (phones_file, other_file):
        if not candidate.is_file():
            print(f"Error: Input file '{candidate}' does not exist or is not a file.")
            print("--- Check Aborted ---")
            return  # Stop execution in this cell

    # An unexpected extension is reported, but the check carries on regardless
    if phones_file.suffix.lower() not in ('.phones', '.ph'):
        print(f"Warning: First file '{phones_file.name}' does not have a standard '.phones' or '.ph' extension.")

    # --- Count lines, first file first ---
    line_totals = []
    for candidate in (phones_file, other_file):
        print(f"Counting lines in '{candidate.name}'...")
        line_totals.append(count_lines(candidate))
    phones_count, other_count = line_totals

    # count_lines signals a read failure with None
    if any(total is None for total in line_totals):
        print("Could not count lines in one or both files. Aborting.")
        print("--- Check Aborted ---")
        return

    print("\nLine Counts:")
    print(f" - '{phones_file.name}': {phones_count}")
    print(f" - '{other_file.name}': {other_count}")

    # --- Different lengths: report and leave the file untouched ---
    if phones_count != other_count:
        print("\nLine counts DO NOT match.")
        print(f"Files have different lengths ({phones_count} vs {other_count}).")
        print(f"File '{phones_file.name}' was NOT renamed.")
        print("--- Check Finished ---")
        return

    # --- Same length: rename to .txt ---
    print("\nLine counts match.")

    rename_target = phones_file.with_suffix('.txt')
    print(f"Attempting to rename '{phones_file.name}' to '{rename_target.name}'...")

    # Refuse to continue when the destination is already present
    if rename_target.exists():
        print(f"Error: Target file '{rename_target}' already exists. Cannot rename.")
        print("Please move or delete the existing .txt file if you want to proceed.")
        print("--- Rename Skipped ---")
        return

    try:
        phones_file.rename(rename_target)
        print(f"Successfully renamed '{phones_file.name}' to '{rename_target.name}'.")
    except OSError as e:
        print(f"Error renaming file: {e}")
        print("Check file permissions or if the file is in use.")
        print("--- Rename Failed ---")
        return

    print("--- Check Finished ---")

print("Main checking/renaming function defined.")

Main checking/renaming function defined.


In [35]:
# Run the check-and-rename step on phones_path / other_path, the Path objects
# built in the path configuration cell earlier in this notebook.
check_and_rename_notebook(phones_path, other_path)

print("\nNotebook cell execution finished.")


--- Starting Check ---
Counting lines in 'cleaned_parallel.phones'...
Counting lines in 'cleaned_parallel.hi'...

Line Counts:
 - 'cleaned_parallel.phones': 9171
 - 'cleaned_parallel.hi': 9171

Line counts match.
Attempting to rename 'cleaned_parallel.phones' to 'cleaned_parallel.txt'...
Successfully renamed 'cleaned_parallel.phones' to 'cleaned_parallel.txt'.
--- Check Finished ---

Notebook cell execution finished.
