In [8]:
import os
import shutil
import re
import unicodedata
import pandas as pd

# Directory holding the processed corpus; each split lives in "<split>.csv" under it.
data_dir = "../Dataset/processed"

# Map split name -> CSV path, in train / val / test order.
paths = {
    split: os.path.join(data_dir, f"{split}.csv")
    for split in ("train", "val", "test")
}

def normalize_text(s):
    """Clean a single corpus cell into a plain, unwrapped string.

    Processing order:
      1. Missing values (anything ``pd.isna`` reports as NA) become ``""``.
      2. The value is converted with ``str()`` and normalized to Unicode NFC.
      3. Curly double/single quotes are replaced by their straight equivalents.
      4. Any leading/trailing run of whitespace and straight quotes is removed.

    Whitespace and quotes are trimmed together in step 4, so padding that sits
    *inside* wrapping quotes (``'" hello "'``) is removed as well, and a cell
    containing only quotes and spaces collapses to ``""``.

    Note: step 4 is unconditional, so text that legitimately starts or ends
    with a quotation mark or apostrophe loses that character too. Quotes and
    apostrophes in the interior of the text are left untouched.

    Args:
        s: The raw cell value (string, NaN/None, or any object ``str()`` accepts).

    Returns:
        The cleaned string; ``""`` for missing or effectively empty input.
    """
    if pd.isna(s):
        return ""
    s = unicodedata.normalize("NFC", str(s))
    s = s.replace("“", '"').replace("”", '"').replace("‘", "'").replace("’", "'")
    # Trim whitespace and quotes in one pass: stripping whitespace only before
    # the quotes would leave the inner padding of '" text "' behind.
    s = re.sub(r'^[\s"\']+|[\s"\']+$', '', s)
    return s

# Clean every split in place: normalize both text columns, drop rows where
# either side is empty, drop exact duplicate rows, then overwrite the CSV.
# A one-time "<file>.bak" copy is taken before the first overwrite.
text_cols = ("english", "igbo")

for name, p in paths.items():
    if not os.path.exists(p):
        print(f"skip (not found): {name} -> {p}")
        continue

    # Only create the backup if none exists, so re-running this cell never
    # replaces the copy of the original file with already-cleaned data.
    bak = p + ".bak"
    if not os.path.exists(bak):
        shutil.copy2(p, bak)

    df = pd.read_csv(p)

    # Check the schema explicitly. DataFrame.get(col, "") would hand back the
    # plain string "" for a missing column and fail on .apply with an opaque
    # AttributeError.
    missing = [c for c in text_cols if c not in df.columns]
    if missing:
        print(f"skip (missing columns {missing}): {name} -> {p}")
        continue

    before = len(df)

    for c in text_cols:
        df[c] = df[c].apply(normalize_text)

    # Keep only pairs where both sides still have text after normalization.
    df = df[(df["english"] != "") & (df["igbo"] != "")]
    # Compares all columns, so only fully identical rows are removed.
    df = df.drop_duplicates()

    after = len(df)
    df.to_csv(p, index=False)
    print(f"{name}: {before} -> {after} (removed {before-after})  saved:{p}  backup:{bak}")


train: 8632 -> 8309 (removed 323)  saved:../Dataset/processed/train.csv  backup:../Dataset/processed/train.csv.bak
val: 1079 -> 1068 (removed 11)  saved:../Dataset/processed/val.csv  backup:../Dataset/processed/val.csv.bak
test: 1079 -> 1069 (removed 10)  saved:../Dataset/processed/test.csv  backup:../Dataset/processed/test.csv.bak


In [9]:
import pandas as pd
# Reload the rewritten training split from disk and show its first rows.
train_csv = "../Dataset/processed/train.csv"
pd.read_csv(train_csv).head()


Unnamed: 0,english,igbo
0,Why did you leave your former place of work?,Gịnị mere i ji hapụ ebe ị na-arụ n'oge mbu?
1,"Eventually, it took me about two more calls to...","N'ikpeazụ, ọ were m ihe ugboro abụọ ọzọ iji me..."
2,News have previously read that governor Okoroc...,Akwụkwọ akụkọ ekwubuola na Gọvanọ Okorocha sị ...
3,"Soyinka: 'Sowore, Welcome To The Club' As Nobe...","Soyinka: 'Sowore, Nnọọ n'Otu' dịka Onye Oke ih..."
4,"according to an interview, some people express...",Ụfọdụ ndị anyị gbara ajụjụ ọnụ gosiri obi ụtọ ...


In [10]:
import pandas as pd
df = pd.read_csv("../Dataset/processed/train.csv")


def _wrapped_in_dquotes(column):
    """Mask of values in `column` that start or end with a straight double-quote."""
    text = column.str
    return text.startswith('"') | text.endswith('"')


# Sanity check on the reloaded file: count rows whose text still begins or
# ends with a straight double-quote, separately for each language column.
bad = _wrapped_in_dquotes(df['english'])
bad2 = _wrapped_in_dquotes(df['igbo'])

print("english rows with surrounding quotes:", bad.sum())
print("igbo rows with surrounding quotes:", bad2.sum())


english rows with surrounding quotes: 0
igbo rows with surrounding quotes: 0
