In [None]:
import re
import pandas as pd
from tqdm import tqdm

def parse_m2_file(m2_path, annotator_id=None):
    """
    Parse an M2 file into (source, target) sentence pairs.

    Each blank-line-separated block holds one "S <tokens>" line followed by
    zero or more edit lines of the form
    "A <start> <end>|||<type>|||<correction>|||...|||<annotator>".
    The edits of a single annotator are applied to the source tokens to build
    the corrected target sentence; mixing several annotators' edits on the
    same token list would corrupt the result.

    Args:
        m2_path: Path to the UTF-8 encoded M2 file.
        annotator_id: Annotator whose edits are applied. ``None`` (default)
            selects, per block, the annotator of the first edit line, so
            single-annotator files behave as before.

    Returns:
        A list of ``("gec: " + source, target)`` tuples, one per sentence
        block. A sentence with no applicable edits maps to itself.

    Raises:
        ValueError / IndexError: if an edit line's span is not two integers.
    """
    with open(m2_path, 'r', encoding='utf-8') as f:
        # Split on blank lines, tolerating separators that carry stray whitespace.
        blocks = re.split(r"\n\s*\n", f.read().strip())

    pairs = []
    for block in tqdm(blocks, desc=f"Parsing {m2_path}"):
        lines = block.strip().split("\n")
        if not lines or not lines[0].startswith("S "):
            continue
        original = lines[0][2:].strip().split()

        # Collect (start, end, correction, annotator) for each well-formed edit line.
        edits = []
        for line in lines[1:]:
            if not line.startswith("A "):
                continue
            parts = line.split("|||")
            if len(parts) < 3:
                continue  # too few fields to contain a correction
            span = [int(tok) for tok in parts[0][2:].split()]
            # The annotator id is the sixth field; fall back to "0" when absent.
            annotator = parts[5].strip() if len(parts) > 5 else "0"
            edits.append((span[0], span[1], parts[2].strip(), annotator))

        if annotator_id is None:
            chosen = edits[0][3] if edits else None
        else:
            chosen = str(annotator_id)

        corrected = original[:]
        offset = 0  # Net length change caused by the edits applied so far
        for start, end, correction, annotator in edits:
            if annotator != chosen:
                continue
            # noop edits carry a negative span and/or the "-NONE-" placeholder.
            if start < 0 or correction == "-NONE-":
                continue
            replacement = correction.split()  # empty list for deletions
            corrected[start + offset:end + offset] = replacement
            offset += len(replacement) - (end - start)

        # Add prefix "gec: " to input for T5
        source = " ".join(original)
        target = " ".join(corrected)
        pairs.append(("gec: " + source, target))

    return pairs

In [None]:
# Parse the W&I+LOCNESS train and dev M2 files into (input, target) pairs.
train_pairs_wi, val_pairs_wi = (
    parse_m2_file("wi+locness/m2/ABC.train.gold.bea19.m2"),
    parse_m2_file("wi+locness/m2/ABCN.dev.gold.bea19.m2"),
)

# Wrap each list of pairs in a two-column DataFrame.
df_train_wi, df_val_wi = (
    pd.DataFrame(pairs, columns=["input_text", "target_text"])
    for pairs in (train_pairs_wi, val_pairs_wi)
)


Parsing wi+locness/m2/ABC.train.gold.bea19.m2: 100%|██████████| 34308/34308 [00:00<00:00, 117804.29it/s]
Parsing wi+locness/m2/ABCN.dev.gold.bea19.m2: 100%|██████████| 4384/4384 [00:00<00:00, 106872.43it/s]


In [None]:
import csv
# Export the W&I+LOCNESS frames; every field is quoted and the index is omitted.
df_train_wi.to_csv(path_or_buf="t5_train_wi.csv", quoting=csv.QUOTE_ALL, index=False)
df_val_wi.to_csv(path_or_buf="t5_val_wi.csv", quoting=csv.QUOTE_ALL, index=False)

In [None]:
# Parse the FCE train and dev M2 files into (input, target) pairs.
train_pairs_fce, val_pairs_fce = (
    parse_m2_file("fce/m2/fce.train.gold.bea19.m2"),
    parse_m2_file("fce/m2/fce.dev.gold.bea19.m2"),
)

# Wrap each list of pairs in a two-column DataFrame.
df_train_fce, df_val_fce = (
    pd.DataFrame(pairs, columns=["input_text", "target_text"])
    for pairs in (train_pairs_fce, val_pairs_fce)
)

Parsing fce/m2/fce.train.gold.bea19.m2: 100%|██████████| 28350/28350 [00:00<00:00, 153812.60it/s]
Parsing fce/m2/fce.dev.gold.bea19.m2: 100%|██████████| 2191/2191 [00:00<00:00, 150901.00it/s]


In [None]:
# Export the FCE frames; every field is quoted and the index is omitted.
df_train_fce.to_csv(path_or_buf="t5_train_fce.csv", quoting=csv.QUOTE_ALL, index=False)
df_val_fce.to_csv(path_or_buf="t5_val_fce.csv", quoting=csv.QUOTE_ALL, index=False)

In [None]:
import pandas as pd

# Load both training CSVs. read_csv consumes the header row by itself, so the
# FCE file must not be read with skiprows=1: that discards the header and turns
# the first data row into the column names, which then no longer match df1's
# columns and make concat produce extra, NaN-padded columns.
# NOTE(review): no visible cell writes "t5_train.csv" (the W&I export is
# "t5_train_wi.csv") — confirm this is the intended input file.
df1 = pd.read_csv("t5_train.csv")
df2 = pd.read_csv("t5_train_fce.csv")

# Stack the rows of both frames and drop exact duplicate rows
df_train_merged = pd.concat([df1, df2], ignore_index=True).drop_duplicates()

# Save merged
df_train_merged.to_csv("t5_train_merged.csv", index=False)

In [None]:
# Load both validation CSVs. read_csv consumes the header row by itself, so the
# FCE file must not be read with skiprows=1: that discards the header and turns
# the first data row into the column names, misaligning the concat below.
# NOTE(review): no visible cell writes "t5_val.csv" (the W&I export is
# "t5_val_wi.csv") — confirm this is the intended input file.
df_val1 = pd.read_csv("t5_val.csv")
df_val2 = pd.read_csv("t5_val_fce.csv")

# Stack the rows of both frames and drop exact duplicate rows
df_val_merged = pd.concat([df_val1, df_val2], ignore_index=True).drop_duplicates()

df_val_merged.to_csv("t5_val_merged.csv", index=False)

In [None]:
# Reload the merged training CSV and list its columns.
# Author's note: the merge code above was wrong because the CSVs were
# accidentally stacked horizontally.
df = pd.read_csv(filepath_or_buffer="t5_train_merged.csv")
print(df.columns)

In [None]:
# First training file: column names come from the file's own header row.
df_wl = pd.read_csv(filepath_or_buffer="t5_train.csv")
# FCE training file: skip its header line and name the columns explicitly.
df_fce = pd.read_csv(
    "t5_train_fce.csv",
    names=["input_text", "target_text"],
    skiprows=1,
)

In [None]:
# Stack both training frames, then drop rows with missing values and exact duplicates.
df_merged = (
    pd.concat([df_wl, df_fce], ignore_index=True)
    .dropna()
    .drop_duplicates()
)

# Quick look at the resulting columns and the first rows
print(df_merged.columns)
print(df_merged.head())

In [None]:
# Summary of the merged training frame: size, per-column null counts, duplicate rows.
print("Total rows:", df_merged.shape[0])
print("Any nulls?", df_merged.isna().sum())
print("Any duplicates?", df_merged.duplicated().sum())

In [None]:
# Write the cleaned training merge to disk without the index column.
df_merged.to_csv(path_or_buf="t5_train_merged.csv", index=False)

In [None]:
# First validation file: column names come from the file's own header row.
df_wl_val = pd.read_csv(filepath_or_buffer="t5_val.csv")
# FCE validation file: skip its header line and name the columns explicitly.
df_fce_val = pd.read_csv(
    "t5_val_fce.csv",
    names=["input_text", "target_text"],
    skiprows=1,
)

In [None]:
# Stack both validation frames, then drop rows with missing values and exact duplicates.
df_merged = (
    pd.concat([df_wl_val, df_fce_val], ignore_index=True)
    .dropna()
    .drop_duplicates()
)

# Quick look at the resulting columns and the first rows
print(df_merged.columns)
print(df_merged.head())

In [None]:
# Summary of the merged validation frame: size, per-column null counts, duplicate rows.
print("Total rows:", df_merged.shape[0])
print("Any nulls?", df_merged.isna().sum())
print("Any duplicates?", df_merged.duplicated().sum())

In [None]:
# Write the cleaned validation merge to disk without the index column.
df_merged.to_csv(path_or_buf="t5_val_merged.csv", index=False)

In [None]:
# Load the FCE training CSV for a row-count check.
# NOTE(review): the variable name suggests W&I+LOCNESS, but the file read here
# is the FCE export — confirm which one was intended.
df_wl_len = pd.read_csv(filepath_or_buffer="t5_train_fce.csv")

In [None]:
# Row count before cleaning.
print(len(df_wl_len))
# drop_duplicates() and dropna() return new frames rather than modifying in
# place, so the result has to be bound to a name for the second count to
# reflect the cleaning.
df_wl_len_clean = df_wl_len.drop_duplicates().dropna()
# Row count after removing duplicate rows and rows with missing values.
print(len(df_wl_len_clean))
print(df_wl_len_clean.tail(10))