# Change Column Structure

In [1]:
import pandas as pd

# Target layout: the columns kept in the reordered file, left to right.
new_order = [
    "Dialect_Word", "SAE_Word",
    "Dialect_Prompt", "SAE_Prompt",
    "person_in_prompt", "dialect_word_count",
    "polysemic", "Polysemy_Prompt",
]

# Source CSV, and the sibling file that receives the reordered copy.
file_path = "/Users/bryan/Desktop/wkdir/Dialect/multimodal-dialectal-bias/data/text/basic/ine.csv"
output_file_path = "/Users/bryan/Desktop/wkdir/Dialect/multimodal-dialectal-bias/data/text/basic/ine_reordered.csv"

# Load the file and keep only the listed columns, in the listed order.
# Selecting with a list of labels raises KeyError if any of them is absent.
df = pd.read_csv(file_path)[new_order]

# The result goes to a separate file so the source CSV stays untouched.
# (To overwrite the source instead, pass `file_path` to `to_csv`.)
df.to_csv(output_file_path, index=False)

# Preview the reordered frame.
df.head()


Unnamed: 0,Dialect_Word,SAE_Word,Dialect_Prompt,SAE_Prompt,person_in_prompt,dialect_word_count,polysemic,Polysemy_Prompt
0,math,monastery,an old math,an old monastery,0,1,1,a student solving math problems on a chalkboard
1,math,monastery,a math on a hill,a monastery on a hill,0,1,1,a professor exploring complex math equations
2,math,monastery,a small math,a small monastery,0,1,1,a teacher explaining math concepts to a class
3,math,monastery,a painting of a math,a painting of a monastery,0,1,1,a pile of math textbooks on a desk
4,math,monastery,a busy math,a busy monastery,1,1,1,a shirt with math equations


In [6]:
import pandas as pd

# Path to the CSV file. NOTE: this file is rewritten in place at the end of the cell.
file_path = "/Users/bryan/Desktop/wkdir/Dialect/multimodal-dialectal-bias/data/text/basic/sge.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Columns this file may lack, with the value used to fill them:
# - dialect_word_count is filled with 1s
# - Polysemy_Prompt is filled with empty strings
# A column is only added when it is absent. Because the source file is
# overwritten below, assigning unconditionally would wipe any values already
# stored in these columns every time the cell is re-run.
missing_column_defaults = {
    "dialect_word_count": 1,
    "Polysemy_Prompt": "",
}
for column, default in missing_column_defaults.items():
    if column not in df.columns:
        df[column] = default

# Define the new column order
new_order = [
    "Dialect_Word",
    "SAE_Word",
    "Dialect_Prompt",
    "SAE_Prompt",
    "person_in_prompt",
    "dialect_word_count",
    "polysemic",
    "Polysemy_Prompt"
]

# Reorder the DataFrame columns (columns not listed above are dropped)
df = df[new_order]

# Overwrite the original CSV file with the updated DataFrame
df.to_csv(file_path, index=False)

# Display the first few rows of the updated DataFrame for verification
df.head()


Unnamed: 0,Dialect_Word,SAE_Word,Dialect_Prompt,SAE_Prompt,person_in_prompt,dialect_word_count,polysemic,Polysemy_Prompt
0,teh tarik,milk tea,a painting of teh tarik,a painting of milk tea,0,1,0,
1,teh tarik,milk tea,a photo of teh tarik,a photo of milk tea,0,1,0,
2,teh tarik,milk tea,a shop selling teh tarik,a shop selling milk tea,0,1,0,
3,teh tarik,milk tea,a large teh tarik,a large milk tea,1,1,0,
4,teh tarik,milk tea,a small teh tarik,a small milk tea,1,1,0,


In [11]:
import pandas as pd

# Path to the CSV file. NOTE: this file is rewritten in place at the end of the cell.
file_path = "/Users/bryan/Desktop/wkdir/Dialect/multimodal-dialectal-bias/data/text/complex/sge.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Columns this file may lack, with the value used to fill them:
# - 'polysemic' is filled with 0s
# - 'Polysemy_Prompt' is filled with empty strings
# A column is only added when it is absent. Because the source file is
# overwritten below, assigning unconditionally would reset any polysemy
# annotations already stored in the file every time the cell is re-run.
missing_column_defaults = {
    "polysemic": 0,
    "Polysemy_Prompt": "",
}
for column, default in missing_column_defaults.items():
    if column not in df.columns:
        df[column] = default

# Define the new column order
new_order = [
    "Dialect_Word",
    "SAE_Word",
    "Dialect_Prompt",
    "SAE_Prompt",
    "person_in_prompt",
    "dialect_word_count",
    "polysemic",
    "Polysemy_Prompt"
]

# Reorder the DataFrame columns (columns not listed above are dropped)
df = df[new_order]

# Overwrite the original CSV file with the updated DataFrame
df.to_csv(file_path, index=False)

# Display the first few rows for verification (rich display instead of print)
df.head()


  Dialect_Word  SAE_Word                 Dialect_Prompt  \
0    teh tarik  milk tea   a plastic cup with teh tarik   
1    teh tarik  milk tea  a cup of teh tarik on a table   
2    teh tarik  milk tea       a shop selling teh tarik   
3    teh tarik  milk tea       a man drinking teh tarik   
4    teh tarik  milk tea     a woman drinking teh tarik   

                      SAE_Prompt  person_in_prompt  dialect_word_count  \
0    a plastic cup with milk tea                 0                   1   
1  a cup of teh tarik on a table                 0                   1   
2        a shop selling milk tea                 0                   1   
3        a man drinking milk tea                 1                   1   
4      a woman drinking milk tea                 1                   1   

   polysemic Polysemy_Prompt  
0          0                  
1          0                  
2          0                  
3          0                  
4          0                  


In [25]:
import pandas as pd

# Define the file paths
dialect = "ine"
basic_file = f"/Users/bryan/Desktop/wkdir/Dialect/multimodal-dialectal-bias/data/text/basic/{dialect}.csv"
complex_file = f"/Users/bryan/Desktop/wkdir/Dialect/multimodal-dialectal-bias/data/text/complex/{dialect}.csv"

# Read the CSV files into DataFrames
df_basic = pd.read_csv(basic_file)
df_complex = pd.read_csv(complex_file)

# Check that both files have the same number of rows
same_length = len(df_basic) == len(df_complex)
if not same_length:
    print(f"Row count mismatch: basic file has {len(df_basic)} rows while complex file has {len(df_complex)} rows.")
else:
    print(f"Both files have the same number of rows: {len(df_basic)}")

# Compare the 'Dialect_Word' column element-wise (assuming the rows are in the
# same order). Only the rows present in both files are compared: comparing two
# Series of different lengths raises
# "ValueError: Can only compare identically-labeled Series objects".
n_common = min(len(df_basic), len(df_complex))
basic_words = df_basic["Dialect_Word"].iloc[:n_common].reset_index(drop=True)
complex_words = df_complex["Dialect_Word"].iloc[:n_common].reset_index(drop=True)

# NaN != NaN evaluates to True, so rows where the word is missing in BOTH files
# would otherwise be reported as mismatches.
both_missing = basic_words.isna() & complex_words.isna()
mismatch_mask = (basic_words != complex_words) & ~both_missing

if mismatch_mask.any():
    print("Found mismatches in the 'Dialect_Word' column at the following rows:")
    mismatches = pd.DataFrame({
        "Row": basic_words.index[mismatch_mask],
        "Basic_Dialect_Word": basic_words[mismatch_mask],
        "Complex_Dialect_Word": complex_words[mismatch_mask]
    })
    print(mismatches)
elif same_length:
    print("The 'Dialect_Word' column matches exactly in both files.")
else:
    # The overlapping rows agree, but the files are not the same length, so
    # do not claim an exact match.
    print(f"The 'Dialect_Word' column matches for the first {n_common} rows, but the row counts differ.")


Both files have the same number of rows: 216
The 'Dialect_Word' column matches exactly in both files.


In [30]:
import pandas as pd

# Define the file paths
dialect = "che"
basic_file = f"/Users/bryan/Desktop/wkdir/Dialect/multimodal-dialectal-bias/data/text/basic/{dialect}.csv"
complex_file = f"/Users/bryan/Desktop/wkdir/Dialect/multimodal-dialectal-bias/data/text/complex/{dialect}.csv"

# Read the CSV files into DataFrames
basic_df = pd.read_csv(basic_file)
complex_df = pd.read_csv(complex_file)

# The copy below pairs rows by position (index alignment) and the result
# overwrites the complex file, so verify first that the two files line up.
# With differing row counts the assignment would silently produce NaN or drop
# flags instead of failing.
if len(basic_df) != len(complex_df):
    raise ValueError(
        f"Row count mismatch for '{dialect}': basic file has {len(basic_df)} rows "
        f"while complex file has {len(complex_df)} rows; refusing to overwrite {complex_file}."
    )

# Rows must describe the same dialect word in both files. Rows where the word
# is missing in both files are not counted (NaN != NaN evaluates to True).
basic_words = basic_df["Dialect_Word"]
complex_words = complex_df["Dialect_Word"]
words_differ = (basic_words != complex_words) & ~(basic_words.isna() & complex_words.isna())
if words_differ.any():
    raise ValueError(
        f"'Dialect_Word' differs between basic and complex files for '{dialect}' at rows "
        f"{list(basic_df.index[words_differ])}; refusing to overwrite {complex_file}."
    )

# Replace the 'polysemic' column in the complex file with the one from the basic file
complex_df['polysemic'] = basic_df['polysemic']

# Overwrite the complex file with the updated DataFrame
complex_df.to_csv(complex_file, index=False)

# Display the first few rows for verification (rich display instead of print)
complex_df.head()


  Dialect_Word SAE_Word                                     Dialect_Prompt  \
0       torque    truck  a large black torque driving on a neighborhood...   
1       torque    truck           a woman driving a green torque on a farm   
2       torque    truck                    a torque with a dog in the back   
3       torque    truck              a man driving a torque on the highway   
4       torque    truck           two kids sitting on the back of a torque   

                                          SAE_Prompt  person_in_prompt  \
0  a large black truck driving on a neighborhood ...                 0   
1            a woman driving a green truck on a farm                 1   
2                     a truck with a dog in the back                 0   
3               a man driving a truck on the highway                 1   
4            two kids sitting on the back of a truck                 1   

   dialect_word_count  polysemic  Polysemy_Prompt  
0                   1          1  