# Understanding dataset

- BCS
- FT
- LuxemBERT
- NC_lux

import libraries

In [33]:
import json
import pandas as pd

from utils.utils_nlp import split_paragraph

## NC_LUX

In [40]:
## NC_lux - Luxembourgish Only Text With Category Labels

# Every line of train.json is parsed as its own JSON document.
with open(
    "/Users/lujun.li/projects/mt_luxembourgish/data/NC_lux/train.json",
    "r",
    encoding="utf-8",
) as file:
    data = [json.loads(line) for line in file]

NC_lux_df = pd.DataFrame(data)

# Character count of each text.
NC_lux_df["length"] = NC_lux_df["text"].apply(len)

# Whatever split_paragraph returns is spread over the two new columns,
# in order (first item -> "sentences", second -> "sentences_count").
NC_lux_df[["sentences", "sentences_count"]] = NC_lux_df["text"].apply(
    lambda paragraph: pd.Series(split_paragraph(paragraph))
)

# Keep only the rows in which at least one sentence was counted.
has_sentences = NC_lux_df["sentences_count"] != 0
NC_lux_df = NC_lux_df[has_sentences]

NC_lux_df.describe()

Unnamed: 0,length,sentences_count
count,7055.0,7055.0
mean,1798.263359,12.881502
std,1646.203736,10.779453
min,12.0,1.0
25%,760.5,6.0
50%,1359.0,10.0
75%,2263.0,16.0
max,21138.0,171.0


## LuxemBERT

In [62]:
# L-RTE (LuxemBERT) - Luxembourgish data with negation labels, train split

L_RTE_path = "/Users/lujun.li/projects/mt_luxembourgish/data/LuxemBERT/L-RTE/train.tsv"

# Tab-separated file; the "index" column is first used as the row index ...
L_RTE_df = pd.read_csv(L_RTE_path, sep="\t", index_col="index")
# ... and then moved back into a regular column.
L_RTE_df = L_RTE_df.reset_index()
L_RTE_df.describe()

## LLama 70B Result filtering


In [1]:
import pandas as pd


# CSV holding the Llama 3 70B translations that the following cells filter.
translations_csv = "/home/lujun_li/projects/mt_luxembourgish/data/fake_targets/translation_llama3_70b_complete.csv"
df = pd.read_csv(translations_csv)

In [2]:
# Delete every newline character from the translations. Nothing is inserted
# in its place, so the text on both sides of a line break is joined directly.
df["translated_text"] = df["translated_text"].str.replace("\n", "", regex=False)

In [3]:
import fasttext

# Location of the fastText model binary.
# NOTE(review): presumably a language-identification model, judging by the
# "lang_dect" directory and how detect_language_from_row uses it - confirm.
model_path = "/home/lujun_li/projects/mt_luxembourgish/models/lang_dect/lid218e.bin"
# Loaded once at module level; detect_language_from_row reads this global.
model = fasttext.load_model(model_path)


def detect_language_from_row(row, k=1, threshold=0.9, lang_model=None):
    """
    Detect the language of a row's translated text and flag confident English.

    Args:
        row (pd.Series): A DataFrame row (or any mapping) with a
            'translated_text' entry holding the text to classify.
        k (int): Number of top language predictions to request.
        threshold (float): A prediction only counts as English when its
            probability is strictly greater than this value.
        lang_model: Object exposing ``predict(text, k=k)`` that returns two
            parallel sequences (labels, probabilities). When None, the
            module-level ``model`` is used.

    Returns:
        tuple: (list of (language, probability) pairs for the top k
        predictions, bool that is True when an 'eng_Latn' prediction has a
        probability above ``threshold``)
    """
    predictor = model if lang_model is None else lang_model

    text = row["translated_text"]  # Text to classify

    # Ask the model for its k most likely languages
    labels, probs = predictor.predict(text, k=k)

    results = []
    is_english = False

    for label, prob in zip(labels, probs):
        # Clean the label by removing the prefix '__label__'
        lang = label.replace("__label__", "")

        # Test this label's own probability. Comparing the whole `probs`
        # sequence here is not a per-label check and breaks once k > 1.
        if "eng_Latn" in lang and prob > threshold:
            is_english = True

        results.append((lang, prob))

    # Top k (language, probability) pairs and the English flag
    return results, is_english


def detect_quality_issues(row):
    """
    Return the code of the first quality rule a translation row violates.

    Rules are checked in this order and the first match wins:
        1: translated text contains "translated_text" or "----"
        2: translated text contains a line break
        3: translated text is shorter than half the source
        4: translated text is longer than 1.5 times the source
        5: source text contains both "<div" and "</div>"
        6: translated text contains both "<div" and "</div>"
        0: no rule matched

    Lengths for rules 3 and 4 are measured with spaces removed.

    Args:
        row: Row (or mapping) with string entries 'subsentence' (source
            text) and 'translated_text' (target text).

    Returns:
        int: The rule code listed above.
    """

    def has_div_markup(text):
        # Both an opening and a closing div fragment must be present.
        return "<div" in text and "</div>" in text

    target = row["translated_text"]

    # Rule 1: leftover "translated_text" marker or a "----" separator
    if any(marker in target for marker in ("translated_text", "----")):
        return 1

    # Rule 2: the paragraph contains a line break
    if "\n" in target:
        return 2

    # The source is only read once rules 1 and 2 have passed.
    source = row["subsentence"]
    source_length = len(source) - source.count(" ")
    target_length = len(target) - target.count(" ")

    # Rule 3: output shorter than half of the input
    if target_length < 0.5 * source_length:
        return 3

    # Rule 4: output more than 1.5 times as long as the input
    if target_length > 1.5 * source_length:
        return 4

    # Rule 5: HTML-like tags in the source text
    if has_div_markup(source):
        return 5

    # Rule 6: HTML-like tags in the translated text
    if has_div_markup(target):
        return 6

    return 0


# Apply the detection rules to every row
df["quality_issues"] = df.apply(detect_quality_issues, axis=1)

# Each entry is the (results, is_english) tuple from detect_language_from_row
df["language_detection"] = df.apply(detect_language_from_row, axis=1)

# Split the tuples into two new columns
detected_languages, english_flags = zip(*df["language_detection"])
df["lang_detection_translated"] = detected_languages
df["is_english_translated"] = english_flags

## Choose top 2 languages detected and if one language detected is Luxembourgish, then is_luxembourgish is True

In [4]:
# Report the size before any filtering
print("Initial dataset size:", df.shape)

# Keep only the rows whose translation was flagged as English
df = df[df["is_english_translated"].eq(True)]
print("Size after filtering by 'is_english_translated':", df.shape)

# Drop the rejected quality codes one at a time, in this order, and report
# the remaining size after each step
for rejected_code in (1, 5, 6, 4):
    df = df[df["quality_issues"].ne(rejected_code)]
    print(f"Size after filtering 'quality_issues' != {rejected_code}:", df.shape)

Initial dataset size: (135890, 14)
Size after filtering by 'is_english_translated': (130532, 14)
Size after filtering 'quality_issues' != 1: (129953, 14)
Size after filtering 'quality_issues' != 5: (128726, 14)
Size after filtering 'quality_issues' != 6: (128726, 14)
Size after filtering 'quality_issues' != 4: (128608, 14)


In [None]:
import pyarrow as pa
import pyarrow.parquet as pq

# Destination file.
# NOTE(review): the file is written with pyarrow.parquet below, so its content
# is Parquet even though the name ends in ".arrow" - confirm readers expect that.
arrow_file_path = (
    "/home/lujun_li/projects/mt_luxembourgish/data/fake_targets/filtered_data.arrow"
)

# Build an Arrow Table from the filtered DataFrame
arrow_table = pa.Table.from_pandas(df)

# Write the table to the destination path
pq.write_table(arrow_table, arrow_file_path)

print(f"Data saved to {arrow_file_path}")