<a href="https://colab.research.google.com/github/BrovkoD/unlp2025-data-gen/blob/master/cross_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install deep_translator



In [None]:
import ast
import random
import time
from difflib import SequenceMatcher

import numpy as np
import pandas as pd
from deep_translator import GoogleTranslator  # called by translate_text and map_trigger_words

In [None]:
# Load the training set; the path is hard-coded to /content (presumably the Colab working directory).
df = pd.read_parquet("/content/train.parquet")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,Новий огляд мапи DeepState від російського вій...,uk,True,"[euphoria, loaded_language]","[[27, 63], [65, 88], [90, 183], [186, 308]]"
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Недавно 95 квартал жёстко поглумился над русск...,ru,True,"[loaded_language, cherry_picking]","[[0, 40], [123, 137], [180, 251], [253, 274]]"
2,e6a427f1-211f-405f-bd8b-70798458d656,🤩\nТим часом йде евакуація Бєлгородського авто...,uk,True,"[loaded_language, euphoria]","[[55, 100]]"
3,1647a352-4cd3-40f6-bfa1-d87d42e34eea,В Україні найближчим часом мають намір посилит...,uk,False,,
4,9c01de00-841f-4b50-9407-104e9ffb03bf,"Расчёты 122-мм САУ 2С1 ""Гвоздика"" 132-й бригад...",ru,True,[loaded_language],"[[114, 144]]"


In [None]:
def translate_text(text, src_lang, target_lang, max_retries=2, base_delay=1.5):
    """Translate ``text`` with ``GoogleTranslator``, retrying on failure.

    Args:
        text: Text to translate.
        src_lang: Source language code passed to ``GoogleTranslator``.
        target_lang: Target language code passed to ``GoogleTranslator``.
        max_retries: Total number of translation attempts.
        base_delay: Base wait in seconds between attempts; it doubles after
            every failed attempt and gets up to 0.5 s of random jitter.

    Returns:
        The translator's result, or ``None`` when every attempt failed.

    Raises:
        NameError: If ``GoogleTranslator`` is not imported. This is a setup
            error rather than a failed request, so it is surfaced instead of
            being retried and reported as an untranslatable row.
    """
    for attempt in range(max_retries):
        try:
            return GoogleTranslator(source=src_lang, target=target_lang).translate(text)
        except NameError:
            # Retrying cannot fix a missing import; hiding it would mark every row as failed.
            raise
        except Exception as e:
            if attempt + 1 >= max_retries:
                # Nothing left to retry, so do not pay the back-off delay.
                print(f"[Retry {attempt+1}] Translation failed: {e}.")
                break
            wait_time = base_delay * (2 ** attempt) + random.uniform(0, 0.5)
            print(f"[Retry {attempt+1}] Translation failed: {e}. Retrying in {wait_time:.1f}s...")
            time.sleep(wait_time)

    # Final fallback after retries
    print("[!] Translation failed after all retries.")
    return None

def map_trigger_words(original_text, translated_text, original_trigger_words, src_lang, target_lang):
    """Project trigger-word spans from the original text onto its translation.

    Each ``[start, end]`` span is cut out of ``original_text``, translated on
    its own, and located in ``translated_text`` by exact substring search
    (first occurrence). Spans whose translation fails, is empty, or cannot be
    found verbatim are dropped.

    Args:
        original_text: Text the spans were annotated on.
        translated_text: Full translation of ``original_text``.
        original_trigger_words: Iterable of ``(start, end)`` pairs, or a
            missing value (``None`` or a scalar NaN).
        src_lang: Source language code passed to ``GoogleTranslator``.
        target_lang: Target language code passed to ``GoogleTranslator``.

    Returns:
        A list of ``[start, end]`` offsets into ``translated_text``, or
        ``None`` when there were no spans or none could be mapped.

    Raises:
        NameError: If ``GoogleTranslator`` is not imported (a setup error,
            not a per-phrase translation failure).
    """
    # A missing annotation can be None or a scalar NaN; is_scalar() keeps
    # pd.isna() away from list/array inputs, where it answers element-wise.
    if original_trigger_words is None or (
        pd.api.types.is_scalar(original_trigger_words) and pd.isna(original_trigger_words)
    ):
        return None

    mapped_indices = []
    for start, end in original_trigger_words:
        phrase = original_text[start:end]
        if not phrase.strip():
            # An empty phrase would "match" at offset 0 and yield a bogus [0, 0] span.
            continue
        try:
            translated_phrase = GoogleTranslator(source=src_lang, target=target_lang).translate(phrase)
        except NameError:
            raise
        except Exception:
            continue  # Skip if translation fails
        if not isinstance(translated_phrase, str) or not translated_phrase:
            continue
        match_start = translated_text.find(translated_phrase)
        if match_start != -1:
            mapped_indices.append([match_start, match_start + len(translated_phrase)])

    return mapped_indices if mapped_indices else None

In [None]:
# Duplicate dataset with translations
# Accumulators filled by the translation loop. The loop derives its resume offset
# from their combined length, so re-running this cell resets that offset to zero.
new_rows = []  # translated copies of source rows
unprocessed_rows = []  # source rows whose translation returned None

In [None]:
from itertools import islice

# Resume support: every row already stored in either accumulator is skipped on re-run.
skip_count = len(new_rows) + len(unprocessed_rows)

for _, row in islice(df.iterrows(), skip_count, None):
    src_lang = row["lang"]
    # Ukrainian rows go to Russian; rows in any other language go to Ukrainian.
    target_lang = "uk" if src_lang != "uk" else "ru"

    translated_text = translate_text(row["content"], src_lang, target_lang)

    if translated_text is not None:
        # Re-locate the annotated spans inside the translated text.
        new_trigger_words = map_trigger_words(
            row["content"], translated_text, row["trigger_words"], src_lang, target_lang
        )

        # Same record as the source row, with the translated fields swapped in.
        new_row = row.copy()
        new_row["content"] = translated_text
        new_row["lang"] = target_lang
        new_row["trigger_words"] = new_trigger_words

        new_rows.append(new_row)
    else:
        # Keep the failed source row so it stays counted in the resume offset.
        unprocessed_rows.append(row)

In [None]:
# Checkpoint both accumulators to CSV: translated rows and rows whose translation failed.
pd.DataFrame(new_rows).to_csv("new_data.csv", index=False)
pd.DataFrame(unprocessed_rows).to_csv("unp_data.csv", index=False)

In [None]:
# Collect the translated rows into a frame; each row keeps the index label of its source row.
new_df = pd.DataFrame(new_rows)

In [None]:
# Keep non-manipulative rows, plus manipulative rows whose spans were mapped onto the translation.
new_df_cleaned = new_df.loc[
    new_df["manipulative"].eq(False)
    | (new_df["manipulative"].eq(True) & new_df["trigger_words"].notna())
]

In [None]:
# Number of translated rows removed by the filter (manipulative rows left without mapped spans).
len(new_df) - len(new_df_cleaned)

751

In [None]:
# Append translated rows to original DataFrame
# Only the filtered translations (new_df_cleaned) are appended here.
df_augmented_clean = pd.concat([df, new_df_cleaned], ignore_index=True)

# Save the new dataset
df_augmented_clean.to_csv("trnslt_span_v1.csv", index=False)

# NOTE(review): "doubled" is approximate - new_df_cleaned excludes the rows the filter
# removed, so the result holds fewer than twice the original rows.
print("Dataset successfully doubled with translations!")

Dataset successfully doubled with translations!


In [None]:
# Manipulative translations for which no trigger span could be mapped.
not_full_df = new_df.loc[
    new_df["manipulative"].eq(True) & new_df["trigger_words"].isna()
]

In [None]:
# Plain-text dump of the rows that lost their spans (pandas truncates long frames when printing).
print(not_full_df)

                                        id  \
1     7159f802-6f99-4e9d-97bd-6f565a4a0fae   
2     e6a427f1-211f-405f-bd8b-70798458d656   
4     9c01de00-841f-4b50-9407-104e9ffb03bf   
8     ed5d2195-09b4-4837-82eb-b65244c8a7b2   
13    62d8357a-6f5f-4ee6-9469-650cf09848a6   
...                                    ...   
3799  5bd38311-80de-4e41-95ac-521f848ce43d   
3810  2494f496-0070-49bd-980d-98bc048b0be4   
3811  0e89894c-c5c1-42af-a40c-b9bb50d9ccda   
3814  97641255-a41b-42ed-8701-2db2a451f5e0   
3821  d7700072-24d9-443c-8bdb-b5cdd5530d86   

                                                content lang  manipulative  \
1     Нещодавно 95 -й квартал вразив російських Рамс...   uk          True   
2     🤩\nМежду тем, существует эвакуация автобусной ...   ru          True   
4     Розрахунки 122-мм Сау 2C1 "гвоздики" 132-ї бри...   uk          True   
8     ⚡\nПісля звільнення Соледара сприятливі перспе...   uk          True   
13    ​​\nРуслан Стрелец: После 10 лет задержки Укра...  

In [None]:
# Swap the empty translated-span column for the source rows' annotations, matched by id.
# NOTE(review): these offsets were annotated on the source-language text in df, while
# `content` here is the translation, so they may not line up with the translated text -
# confirm that is acceptable before using these rows for span-level training.
not_full_df = (
    not_full_df
    .drop(columns=["trigger_words"], errors="ignore")  # remove broken/missing column if needed
    .merge(df[["id", "trigger_words"]], how="left", on="id")
)

In [None]:
# Preview the merged rows: translated content next to the span annotations taken from df.
not_full_df.head()

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words
0,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Нещодавно 95 -й квартал вразив російських Рамс...,uk,True,"[loaded_language, cherry_picking]","[[0, 40], [123, 137], [180, 251], [253, 274]]"
1,e6a427f1-211f-405f-bd8b-70798458d656,"🤩\nМежду тем, существует эвакуация автобусной ...",ru,True,"[loaded_language, euphoria]","[[55, 100]]"
2,9c01de00-841f-4b50-9407-104e9ffb03bf,"Розрахунки 122-мм Сау 2C1 ""гвоздики"" 132-ї бри...",uk,True,[loaded_language],"[[114, 144]]"
3,ed5d2195-09b4-4837-82eb-b65244c8a7b2,⚡\nПісля звільнення Соледара сприятливі перспе...,uk,True,"[cherry_picking, cliche]","[[3, 66], [71, 94]]"
4,62d8357a-6f5f-4ee6-9469-650cf09848a6,​​\nРуслан Стрелец: После 10 лет задержки Укра...,ru,True,"[loaded_language, glittering_generalities, cli...","[[35, 44], [54, 74], [538, 575], [1338, 1502],..."


In [None]:
# Append translated rows to original DataFrame
# Unlike the v1 file, this also includes not_full_df (rows whose spans were taken from df by id).
df_augmented_all = pd.concat([df, new_df_cleaned, not_full_df], ignore_index=True)

# Save the new dataset
df_augmented_all.to_csv("trnslt_span_v2.csv", index=False)

print("Dataset successfully doubled with translations!")

Dataset successfully doubled with translations!


In [None]:
# Write the combined frame under the classification-dataset file name.
# Requires df_augmented_all to exist in the current kernel session; the recorded NameError
# shows this cell was last executed when it was not defined.
df_augmented_all.to_csv("trnslt_classification_v1.csv", index=False)

NameError: name 'df_augmented_all' is not defined

In [None]:
# Reload the saved CSV. This rebinds df, replacing the frame read from train.parquet.
# After the CSV round trip the list-valued columns come back as plain strings.
df = pd.read_csv("/content/trnslt_classification_v1.csv")

In [None]:
# Preview the reloaded frame; techniques and trigger_words are now string renderings.
df.head()

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,Новий огляд мапи DeepState від російського вій...,uk,True,['euphoria' 'loaded_language'],"[array([27, 63]) array([65, 88]) array([ 90, 1..."
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Недавно 95 квартал жёстко поглумился над русск...,ru,True,['loaded_language' 'cherry_picking'],"[array([ 0, 40]) array([123, 137]) array([180,..."
2,e6a427f1-211f-405f-bd8b-70798458d656,🤩\nТим часом йде евакуація Бєлгородського авто...,uk,True,['loaded_language' 'euphoria'],"[array([ 55, 100])]"
3,1647a352-4cd3-40f6-bfa1-d87d42e34eea,В Україні найближчим часом мають намір посилит...,uk,False,,
4,9c01de00-841f-4b50-9407-104e9ffb03bf,"Расчёты 122-мм САУ 2С1 ""Гвоздика"" 132-й бригад...",ru,True,['loaded_language'],"[array([114, 144])]"


In [None]:
import pandas as pd
import numpy as np
import ast

def clean(val):
    """Normalize a missing cell to ``None`` and pass every other value through.

    Args:
        val: A single cell value; may be a scalar, a string, or a list/array.

    Returns:
        ``None`` when ``val`` is a missing scalar (``None``, NaN, ...),
        otherwise ``val`` unchanged.
    """
    # pd.isna() on a list/array answers element-wise, and using that result in
    # `if` raises ValueError, so only scalars are tested for missingness.
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return None
    return val

# Drop one specific record by id. The .copy() detaches the filtered frame from the
# original, so the column assignments below no longer trigger SettingWithCopyWarning.
df = df[df["id"] != "ea6caa54-23d3-40c0-a8aa-a1a0e593329e"].copy()
# Normalize missing cells in both annotation columns to None.
df["techniques"] = df["techniques"].apply(clean)
df["trigger_words"] = df["trigger_words"].apply(clean)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["techniques"] = df["techniques"].apply(clean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["trigger_words"] = df["trigger_words"].apply(clean)


In [None]:
import re

def extract_arrays(text):
    """Parse a stringified ``trigger_words`` cell back into ``[[start, end], ...]``.

    Accepts both renderings that end up in the CSV: the numpy-style
    ``[array([27, 63]) array([65, 88])]`` and the plain nested-list form
    ``[[27, 63], [65, 88]]`` (how the Python lists built by
    ``map_trigger_words`` are written).

    Args:
        text: A cell value: a string rendering, a missing value, or an
            already-parsed list/array.

    Returns:
        ``None`` for a missing value; the input unchanged when it is not a
        string; otherwise a list of ``[start, end]`` integer pairs (empty
        when the string contains no pair).
    """
    if text is None:
        return None
    if not isinstance(text, str):
        # Only scalars are tested for missingness: on a list/array pd.isna()
        # answers element-wise and cannot be used as an `if` condition.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return None
        return text  # already a list

    # Match every bracketed integer pair, whether or not it is wrapped in array(...)
    matches = re.findall(r"\[\s*(-?\d+)\s*,\s*(-?\d+)\s*\]", text)
    return [[int(start), int(end)] for start, end in matches]

# Turn the stringified span column back into lists of [start, end] pairs.
df["trigger_words"] = df["trigger_words"].apply(extract_arrays)

In [None]:
# Preview the frame after parsing; trigger_words should show nested integer lists again.
df.head()

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,Новий огляд мапи DeepState від російського вій...,uk,True,['euphoria' 'loaded_language'],"[[27, 63], [65, 88], [90, 183], [186, 308]]"
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Недавно 95 квартал жёстко поглумился над русск...,ru,True,['loaded_language' 'cherry_picking'],"[[0, 40], [123, 137], [180, 251], [253, 274]]"
2,e6a427f1-211f-405f-bd8b-70798458d656,🤩\nТим часом йде евакуація Бєлгородського авто...,uk,True,['loaded_language' 'euphoria'],"[[55, 100]]"
3,1647a352-4cd3-40f6-bfa1-d87d42e34eea,В Україні найближчим часом мають намір посилит...,uk,False,,
4,9c01de00-841f-4b50-9407-104e9ffb03bf,"Расчёты 122-мм САУ 2С1 ""Гвоздика"" 132-й бригад...",ru,True,['loaded_language'],"[[114, 144]]"


In [None]:
# Save the cleaned frame as parquet, without the index.
df.to_parquet("trnslt_classification_v1.parquet", index=False)

In [None]:
# Reload the frame from parquet (presumably the file written by the to_parquet cell) and preview it.
df = pd.read_parquet("/content/trnslt_classification_v1.parquet")
df.head()

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,Новий огляд мапи DeepState від російського вій...,uk,True,['euphoria' 'loaded_language'],"[[27, 63], [65, 88], [90, 183], [186, 308]]"
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Недавно 95 квартал жёстко поглумился над русск...,ru,True,['loaded_language' 'cherry_picking'],"[[0, 40], [123, 137], [180, 251], [253, 274]]"
2,e6a427f1-211f-405f-bd8b-70798458d656,🤩\nТим часом йде евакуація Бєлгородського авто...,uk,True,['loaded_language' 'euphoria'],"[[55, 100]]"
3,1647a352-4cd3-40f6-bfa1-d87d42e34eea,В Україні найближчим часом мають намір посилит...,uk,False,,
4,9c01de00-841f-4b50-9407-104e9ffb03bf,"Расчёты 122-мм САУ 2С1 ""Гвоздика"" 132-й бригад...",ru,True,['loaded_language'],"[[114, 144]]"


In [None]:
# Sanity check: list any rows whose content is missing (the recorded output is an empty frame).
na_content_rows = df[df["content"].isna()]
print(na_content_rows)

Empty DataFrame
Columns: [id, content, lang, manipulative, techniques, trigger_words]
Index: []
