# Extract the polysemic rows of the concise and detailed data into separate folders

In [2]:
import os
import glob
import pandas as pd

# Source folder with the full CSVs, and destination folder for the polysemic subset.
input_folder = '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/concise'
output_folder = '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/polysemy_concise'

# Make sure the destination exists before anything is written into it.
os.makedirs(output_folder, exist_ok=True)

# Every CSV sitting directly inside the source folder gets processed.
csv_files = glob.glob(os.path.join(input_folder, '*.csv'))

for source_path in csv_files:
    base_name = os.path.basename(source_path)
    table = pd.read_csv(source_path)

    # Abort as soon as a file carries a 'polysemic' entry other than 0 or 1.
    flags = table['polysemic']
    if (~flags.isin([0, 1])).any():
        raise ValueError(f"File {base_name} has invalid 'polysemic' values.")

    # Keep only the rows flagged as polysemic and write them out under the
    # same file name, without the pandas index column.
    table.loc[flags == 1].to_csv(os.path.join(output_folder, base_name), index=False)

In [4]:
import os
import glob
import pandas as pd

# Where the full "detailed" CSVs live, and where their polysemic subset goes.
input_folder = '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/detailed'
output_folder = '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/polysemy_detailed'

# The destination folder is created on demand; an existing one is left alone.
os.makedirs(output_folder, exist_ok=True)

# Collect the CSVs found directly in the source folder.
csv_files = glob.glob(os.path.join(input_folder, '*.csv'))

for csv_path in csv_files:
    file_name = os.path.basename(csv_path)
    frame = pd.read_csv(csv_path)

    # The 'polysemic' column must contain nothing but 0 and 1; otherwise stop.
    is_binary = frame['polysemic'].isin([0, 1])
    if (~is_binary).any():
        raise ValueError(f"File {file_name} has invalid 'polysemic' values.")

    # Select the polysemic rows (flag equal to 1).
    polysemic_rows = frame.loc[frame['polysemic'] == 1]

    # Write the subset to the destination under the original file name,
    # leaving the pandas index out of the file.
    target_path = os.path.join(output_folder, file_name)
    polysemic_rows.to_csv(target_path, index=False)


# Write the polysemy rows back into the detailed data

In [5]:
import os
import glob
import pandas as pd

# Folder holding the polysemy-only CSVs, and the folder whose CSVs get patched.
input_folder = '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/polysemy_detailed'
output_folder = '/local1/bryanzhou008/Dialect/multimodal-dialectal-bias/data/text/detailed'


def restore_polysemy_rows(input_folder, output_folder, rows_per_word=6):
    """Copy the rows of each CSV in ``input_folder`` back into its counterpart.

    For every ``*.csv`` in ``input_folder``, the file with the same name in
    ``output_folder`` is loaded, and for each distinct ``Dialect_Word`` of the
    input file the matching rows of the output file are overwritten with the
    input rows. Row positions in the output file are kept; rows for words that
    do not occur in the input file are left untouched. The output file is
    rewritten only after every word of that file passed validation.

    Parameters
    ----------
    input_folder : str
        Folder containing the CSVs whose rows should be copied back.
    output_folder : str
        Folder containing the same-named CSVs that receive the rows.
    rows_per_word : int, default 6
        Number of rows each ``Dialect_Word`` must have in both files.

    Raises
    ------
    ValueError
        If the two files do not have the same set of columns, or if a word
        does not have exactly ``rows_per_word`` rows in both files.
    FileNotFoundError
        If an input CSV has no same-named file in ``output_folder``
        (raised by ``pd.read_csv``).
    """
    # glob returns files in arbitrary order; sort so the processing order
    # and the printed log are stable across runs.
    for input_file in sorted(glob.glob(os.path.join(input_folder, "*.csv"))):
        filename = os.path.basename(input_file)
        output_file = os.path.join(output_folder, filename)
        print(f"Processing file: {filename}")

        # Load the input and output CSV files
        input_df = pd.read_csv(input_file)
        output_df = pd.read_csv(output_file)

        # The replacement below is positional, so both frames must expose the
        # same columns in the same order. Refuse mismatching schemas and
        # reorder the input columns to follow the output file.
        if set(input_df.columns) != set(output_df.columns):
            raise ValueError(
                f"In file '{filename}', input and output CSVs have different columns: "
                f"{list(input_df.columns)} vs {list(output_df.columns)}."
            )
        input_df = input_df[list(output_df.columns)]

        # For each unique Dialect_Word, check counts and replace corresponding rows
        for word in input_df["Dialect_Word"].unique():
            input_rows = input_df[input_df["Dialect_Word"] == word]
            output_rows = output_df[output_df["Dialect_Word"] == word]

            if len(input_rows) != rows_per_word or len(output_rows) != rows_per_word:
                raise ValueError(f"In file '{filename}', for Dialect_Word '{word}', expected exactly {rows_per_word} rows in both input and output CSVs. Got {len(input_rows)} and {len(output_rows)} respectively.")

            # Overwrite the matching rows in place (k-th input row -> k-th
            # matching output row), preserving the overall row order of the
            # output CSV.
            output_df.loc[output_rows.index] = input_rows.values

        # Save the updated output CSV
        output_df.to_csv(output_file, index=False)


restore_polysemy_rows(input_folder, output_folder)


Processing file: aae.csv
Processing file: bre.csv
Processing file: che.csv
Processing file: ine.csv
Processing file: sge.csv
