In [29]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter('ignore')

# **Arabic Dataset**

In [30]:
def readJSON(path):
    """Read a JSON Lines file into a single DataFrame.

    Each non-blank line is parsed on its own with ``pd.read_json(..., lines=True)``
    and the per-line frames are concatenated once at the end.

    Parameters
    ----------
    path : str or path-like
        Location of the ``.jsonl`` file; it is read as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One row per record with a fresh 0..n-1 index; an empty DataFrame
        when the file holds no records.
    """
    from io import StringIO  # local import keeps this cell self-contained

    frames = []
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            # Wrap in StringIO: passing a literal JSON string to read_json is deprecated.
            frames.append(pd.read_json(StringIO(line), lines=True))
    if not frames:
        return pd.DataFrame()
    # A single concat instead of growing the frame inside the loop (quadratic copying).
    return pd.concat(frames, ignore_index=True)

# Load the train and dev splits from their JSON Lines files.
# NOTE(review): the paths point at a mounted Drive folder (/content/drive/MyDrive) — adjust when running elsewhere.
train = readJSON('/content/drive/MyDrive/SharedTask/araieval24_task1_train.jsonl')
dev = readJSON('/content/drive/MyDrive/SharedTask/araieval24_task1_dev.jsonl')

In [31]:
# Preview the loaded training split (the notebook renders a truncated table).
train

Unnamed: 0,id,text,labels,type
0,7365,تحذيرات من حرب جديدة في حال فشل الانتخابات الق...,"[{'start': 0, 'end': 50, 'technique': 'Appeal_...",tweet
1,1400135121001488384,رب اجعل صباحي هذا غائم بلطفك، لا شر فيه ولا وق...,[],tweet
2,1175652922845216768,#عين_اليمن #26سبتمبر ثورة شعب صنعت تاريخة احتف...,"[{'start': 78, 'end': 89, 'technique': 'Name_C...",tweet
3,1270585163752316928,الحرب على #سورية | حملات تظليل مستمرة .. لمحاو...,"[{'start': 25, 'end': 30, 'technique': 'Loaded...",tweet
4,1395508740703535104,"📣 New Podcast! ""مزيج الخبرة والشباب.. خلطة سحر...","[{'start': 39, 'end': 50, 'technique': 'Name_C...",tweet
...,...,...,...,...
6992,MIS_166-curl_03_003,ووفق ما ذكرت مجلة “ناشيونال إنترست”، فإن التكه...,[{'technique': 'Obfuscation-Vagueness-Confusio...,paragraph
6993,MIS_2043-eurl_04_002,وقال المحامي حسين أبو مرار لـ”الغد”، ان البلاغ...,"[{'technique': 'Causal_Oversimplification', 't...",paragraph
6994,CeLoDzv2hYnjjoMNehJFNo_001,شهد مركز سانتك للمعارض والمؤتمرات في سنغافورة،...,[],paragraph
6995,aYrvVSACG3PpdKaQBiW9ij_001,جــدول مواعيد إقلاع و وصول الرحلات الدولية وال...,[],paragraph


# Arabic train dataset

In [63]:
# Work on a copy so the loaded `train` frame itself is left untouched
df = train.copy()

# Function to extract all techniques from the 'labels' column
def extract_techniques(label_list):
    """Return the distinct technique names found in one ``labels`` cell.

    Parameters
    ----------
    label_list : list of dict or str
        Span annotations, each a mapping with a ``'technique'`` key, or the
        string representation of such a list.

    Returns
    -------
    list of str
        Unique technique names in first-seen order; ``[]`` when the cell is
        empty or cannot be interpreted.
    """
    import ast  # local import: the notebook only imports ast in a later cell

    if isinstance(label_list, str):
        try:
            # literal_eval only accepts Python literals, unlike eval(), which
            # would execute arbitrary code embedded in the data.
            label_list = ast.literal_eval(label_list)
        except (ValueError, SyntaxError, TypeError):
            return []
    try:
        # dict.fromkeys de-duplicates while keeping a deterministic order
        # (the iteration order of a set of strings varies between sessions).
        return list(dict.fromkeys(item['technique'] for item in label_list))
    except (TypeError, KeyError):
        # Best-effort: missing values or malformed annotations yield no labels.
        return []

# Assemble the working frame: the raw text plus the technique names per row
technique_lists = df['labels'].apply(extract_techniques)
new_df = pd.DataFrame({'content': df['text'], 'techniques': technique_lists})


# Preview the first rows
print(new_df.head())

                                             content  \
0  تحذيرات من حرب جديدة في حال فشل الانتخابات الق...   
1  رب اجعل صباحي هذا غائم بلطفك، لا شر فيه ولا وق...   
2  #عين_اليمن #26سبتمبر ثورة شعب صنعت تاريخة احتف...   
3  الحرب على #سورية | حملات تظليل مستمرة .. لمحاو...   
4  📣 New Podcast! "مزيج الخبرة والشباب.. خلطة سحر...   

                                    techniques  
0  [Loaded_Language, Appeal_to_Fear-Prejudice]  
1                                           []  
2     [Loaded_Language, Name_Calling-Labeling]  
3                            [Loaded_Language]  
4                      [Name_Calling-Labeling]  


In [64]:
import pandas as pd

# Source label -> shared label; names missing from this table pass through unchanged
rename_map = {
    'Loaded_Language': 'loaded_language',
    'Doubt': 'fud',
    'Appeal_to_Fear-Prejudice': 'appeal_to_fear',
    'Whataboutism': 'whataboutism',
    'Appeal_to_Popularity': 'bandwagon',
    'Straw_Man': 'straw_man',
    'Conversation_Killer': 'cliche'
}

def rename_techniques(techniques):
    """Translate every name in ``techniques`` through ``rename_map``.

    Names without an entry in ``rename_map`` are returned as they are.
    """
    renamed = []
    for name in techniques:
        renamed.append(rename_map[name] if name in rename_map else name)
    return renamed

# Rewrite every row's technique list using the shared label names
renamed_lists = new_df['techniques'].apply(rename_techniques)
new_df['techniques'] = renamed_lists

In [65]:
# List of valid techniques
valid_techniques = ['loaded_language', 'cherry_picking', 'glittering_generalities',
                    'cliche', 'euphoria', 'fud', 'appeal_to_fear', 'whataboutism',
                    'bandwagon', 'straw_man']

# Keep only rows whose techniques are ALL in the valid list (rows with an empty
# list are kept too, since all() over nothing is True).
is_valid_row = new_df['techniques'].apply(lambda x: all(tech in valid_techniques for tech in x))
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not act on a slice of the unfiltered frame
# (the situation pandas reports as SettingWithCopyWarning).
new_df = new_df[is_valid_row].copy()


In [66]:
# Normalise the technique column before it is written out
import ast

# Step 1: any cell still stored as text is parsed into a real list.
parsed_lists = new_df['techniques'].apply(
    lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
)
# Step 2: rows left without any technique receive the explicit 'no_technique' label.
new_df['techniques'] = parsed_lists.apply(lambda cell: cell if cell else ['no_technique'])


In [68]:
# Store each technique list as its string form (e.g. "['technique1', 'technique2']")
new_df['techniques'] = new_df['techniques'].apply(
    lambda cell: cell if not isinstance(cell, list) else str(cell)
)

# Preview the frame to confirm the stored format
print(new_df.head())


                                              content  \
0   تحذيرات من حرب جديدة في حال فشل الانتخابات الق...   
1   رب اجعل صباحي هذا غائم بلطفك، لا شر فيه ولا وق...   
3   الحرب على #سورية | حملات تظليل مستمرة .. لمحاو...   
5   حال الملعب الجماعي يقلق نشطاء بأولاد النمة LIN...   
10  الفنان فلاح هاشم: امنيتي انتقال البلد لحالة ال...   

                               techniques  
0   ['loaded_language', 'appeal_to_fear']  
1                        ['no_technique']  
3                     ['loaded_language']  
5                        ['no_technique']  
10                       ['no_technique']  


In [69]:
from collections import Counter
# List of techniques to count
techniques_to_count = [
    'loaded_language', 'cherry_picking', 'glittering_generalities',
    'cliche', 'euphoria', 'fud', 'appeal_to_fear', 'whataboutism',
    'bandwagon', 'straw_man','no_technique'
]

# Function to split and count techniques
def count_individual_techniques(series):
    """Count technique occurrences across a column of stringified lists.

    Parameters
    ----------
    series : pandas.Series
        Cells shaped like ``"['fud', 'cliche']"``; missing cells are ignored.

    Returns
    -------
    collections.Counter
        Technique name -> number of occurrences.
    """
    # dropna(): a missing cell would otherwise come out of the string
    # operations as a float NaN and break the iteration below.
    all_techniques = (
        series.dropna()
        .str.strip('[]')
        .str.replace("'", "", regex=False)
        .str.split(', ')
    )
    return Counter(technique for sublist in all_techniques for technique in sublist if technique)

# Tally the techniques in the processed column
individual_counts = count_individual_techniques(new_df['techniques'])

# Report one total per tracked technique (a Counter yields 0 for absent names)
for name in techniques_to_count:
    print(f"Total '{name}':", individual_counts[name])

Total 'loaded_language': 1417
Total 'cherry_picking': 0
Total 'glittering_generalities': 0
Total 'cliche': 18
Total 'euphoria': 0
Total 'fud': 93
Total 'appeal_to_fear': 53
Total 'whataboutism': 12
Total 'bandwagon': 4
Total 'straw_man': 9
Total 'no_technique': 2391


In [71]:
# Save the processed train split (columns: content, techniques) without the index column.
# The file name is relative, so it is written to the current working directory.
new_df.to_csv("arabic_train_dataset.csv", index=False)

# Arabic dev dataset

In [48]:
# Work on a copy so the loaded `dev` frame itself is left untouched
df = dev.copy()

# Function to extract all techniques from the 'labels' column
def extract_techniques(label_list):
    """Return the distinct technique names found in one ``labels`` cell.

    Parameters
    ----------
    label_list : list of dict or str
        Span annotations, each a mapping with a ``'technique'`` key, or the
        string representation of such a list.

    Returns
    -------
    list of str
        Unique technique names in first-seen order; ``[]`` when the cell is
        empty or cannot be interpreted.
    """
    import ast  # local import: the notebook only imports ast in a later cell

    if isinstance(label_list, str):
        try:
            # literal_eval only accepts Python literals, unlike eval(), which
            # would execute arbitrary code embedded in the data.
            label_list = ast.literal_eval(label_list)
        except (ValueError, SyntaxError, TypeError):
            return []
    try:
        # dict.fromkeys de-duplicates while keeping a deterministic order
        # (the iteration order of a set of strings varies between sessions).
        return list(dict.fromkeys(item['technique'] for item in label_list))
    except (TypeError, KeyError):
        # Best-effort: missing values or malformed annotations yield no labels.
        return []

# Assemble the working frame: the raw text plus the technique names per row
technique_lists = df['labels'].apply(extract_techniques)
new_df = pd.DataFrame({'content': df['text'], 'techniques': technique_lists})


# Preview the first rows
print(new_df.head())

                                             content  \
0          هل هذا يعني ان السعوديه تاوي ارهابي؟ LINK   
1  المحلل السياسي ماجد الخطيب: المتشددون في #إيرا...   
2  التحــام من النــهر إلى البحــر.. جميع مقاهي ج...   
3  موسكو: البنتاغون تجاوز الحد المسموح به من الأس...   
4  ▶️ طهران تتهم تل أبيب بالوقوف وراء الهجوم ضد م...   

                                          techniques  
0                     [Doubt, Name_Calling-Labeling]  
1           [Loaded_Language, Name_Calling-Labeling]  
2           [Loaded_Language, Name_Calling-Labeling]  
3  [Appeal_to_Authority, Questioning_the_Reputation]  
4  [Loaded_Language, False_Dilemma-No_Choice, Que...  


In [49]:
import pandas as pd

# Source label -> shared label; names missing from this table pass through unchanged
rename_map = {
    'Loaded_Language': 'loaded_language',
    'Doubt': 'fud',
    'Appeal_to_Fear-Prejudice': 'appeal_to_fear',
    'Whataboutism': 'whataboutism',
    'Appeal_to_Popularity': 'bandwagon',
    'Straw_Man': 'straw_man',
    'Conversation_Killer': 'cliche'
}

def rename_techniques(techniques):
    """Translate every name in ``techniques`` through ``rename_map``.

    Names without an entry in ``rename_map`` are returned as they are.
    """
    renamed = []
    for name in techniques:
        renamed.append(rename_map[name] if name in rename_map else name)
    return renamed

# Rewrite every row's technique list using the shared label names
renamed_lists = new_df['techniques'].apply(rename_techniques)
new_df['techniques'] = renamed_lists

In [50]:
# List of valid techniques
valid_techniques = ['loaded_language', 'cherry_picking', 'glittering_generalities',
                    'cliche', 'euphoria', 'fud', 'appeal_to_fear', 'whataboutism',
                    'bandwagon', 'straw_man']

# Keep only rows whose techniques are ALL in the valid list (rows with an empty
# list are kept too, since all() over nothing is True).
is_valid_row = new_df['techniques'].apply(lambda x: all(tech in valid_techniques for tech in x))
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not act on a slice of the unfiltered frame
# (the situation pandas reports as SettingWithCopyWarning).
new_df = new_df[is_valid_row].copy()


In [51]:
# Normalise the technique column before it is written out
import ast

# Step 1: any cell still stored as text is parsed into a real list.
parsed_lists = new_df['techniques'].apply(
    lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
)
# Step 2: rows left without any technique receive the explicit 'no_technique' label.
new_df['techniques'] = parsed_lists.apply(lambda cell: cell if cell else ['no_technique'])

In [53]:
# Store each technique list as its string form (e.g. "['technique1', 'technique2']")
new_df['techniques'] = new_df['techniques'].apply(
    lambda cell: cell if not isinstance(cell, list) else str(cell)
)

In [55]:
from collections import Counter
# List of techniques to count
techniques_to_count = [
    'loaded_language', 'cherry_picking', 'glittering_generalities',
    'cliche', 'euphoria', 'fud', 'appeal_to_fear', 'whataboutism',
    'bandwagon', 'straw_man','no_technique'
]

# Function to split and count techniques
def count_individual_techniques(series):
    """Count technique occurrences across a column of stringified lists.

    Parameters
    ----------
    series : pandas.Series
        Cells shaped like ``"['fud', 'cliche']"``; missing cells are ignored.

    Returns
    -------
    collections.Counter
        Technique name -> number of occurrences.
    """
    # dropna(): a missing cell would otherwise come out of the string
    # operations as a float NaN and break the iteration below.
    all_techniques = (
        series.dropna()
        .str.strip('[]')
        .str.replace("'", "", regex=False)
        .str.split(', ')
    )
    return Counter(technique for sublist in all_techniques for technique in sublist if technique)

# Tally the techniques in the processed column
individual_counts = count_individual_techniques(new_df['techniques'])

# Report one total per tracked technique (a Counter yields 0 for absent names)
for name in techniques_to_count:
    print(f"Total '{name}':", individual_counts[name])

Total 'loaded_language': 183
Total 'cherry_picking': 0
Total 'glittering_generalities': 0
Total 'cliche': 2
Total 'euphoria': 0
Total 'fud': 9
Total 'appeal_to_fear': 11
Total 'whataboutism': 2
Total 'bandwagon': 1
Total 'straw_man': 0
Total 'no_technique': 279


In [56]:
# Save the processed dev split (columns: content, techniques) without the index column.
# The file name is relative, so it is written to the current working directory.
new_df.to_csv("arabic_dev_dataset.csv", index=False)

# **English Dataset**

In [17]:
# Load the English sentence-level fallacy dataset (its columns are shown by the next cell).
df1=pd.read_csv('/content/sentence_fallacy_data_normed.csv')

In [18]:
# Preview the raw English frame (the notebook renders a truncated table).
df1

Unnamed: 0.1,Unnamed: 0,source_url,test_name,grade,Logical Fallacy Types,sentence,canocalized_fallacy_types,fallacy_types
0,4506,https://quizizz.com/admin/quiz/5e5820781785c80...,Fallacies- 8th grade,8th grade,It tells you to like something because a lot o...,"""Everyone should like coffee: 95% of teachers ...",ad populum,it tells you to like something because a lot o...
1,45498,https://quizizz.com/admin/quiz/5dba05d87438e20...,Fallacy Practice,10th - 12th grade,The Bandwagon Fallacy,"the phrase ""Ninety percent of all people surve...",ad populum,bandwagon
2,1440,https://quizizz.com/admin/quiz/5f9607e2156afc0...,Logical Fallacies,11th grade,e) Bandwagon,I guess I should buy my 12-year-old daughter a...,ad populum,bandwagon
3,2893,https://quizizz.com/admin/quiz/5c58d3215714520...,Logical Fallacies,9th - University,Bandwagon Argument,I know the professor said the Bridges of Madis...,ad populum,bandwagon
4,2831,https://quizizz.com/admin/quiz/5f948dcbedafcd0...,Logical Fallacies,10th grade,bandwagon,"""Because everybody thinks this way, it must be...",ad populum,bandwagon
...,...,...,...,...,...,...,...,...
1664,44920,https://quizizz.com/admin/quiz/5e46e2e7e55b370...,Logic Guide Quiz,10th - 12th grade,Affirming a Disjunct,"P or Q.\nP.\nTherefore, not Q.\nP or Q.",affirming a disjunct,affirming a disjunct
1665,46423,https://quizizz.com/admin/quiz/5e5ec9b75911040...,Logic Guide Pt. 2,12th grade,Faulty Comparison,Broccoli has significantly less fat than the l...,inconsistent comparison,faulty comparison
1666,46429,https://quizizz.com/admin/quiz/5e5ec9b75911040...,Logic Guide Pt. 2,12th grade,False Attribution,I had this book that proved that leprechauns a...,false attribution,false attribution
1667,46430,https://quizizz.com/admin/quiz/5e5ec9b75911040...,Logic Guide Pt. 2,12th grade,Moving the Goalposts,"Issue A has been raised, and adequately answer...",moving the goalposts,moving the goalposts


In [19]:
import pandas as pd

# Map the spelling variants in 'fallacy_types' onto the shared label names
rename_map = {
    'loaded language': 'loaded_language',
    'cherry picking': 'cherry_picking',
    'glittering generalities': 'glittering_generalities',
    'cliche': 'cliche',
    'euphoria': 'euphoria',
    'fud': 'fud',
    'appeal to fear': 'appeal_to_fear',
    'whataboutism': 'whataboutism',
    'bandwagon': 'bandwagon',
    'band wagon': 'bandwagon',
    'straw man': 'straw_man',
    'strawman': 'straw_man',
    'straw man; strawman': 'straw_man',
    'stawman': 'straw_man'
}
# Apply the renaming to the 'fallacy_types' column; values without an entry in
# rename_map come back as NaN from map() and are restored by fillna().
df1['fallacy_types'] = df1['fallacy_types'].map(rename_map).fillna(df1['fallacy_types'])

# Filter the rows based on the desired fallacy types
techniques_to_keep = ['loaded_language', 'cherry_picking', 'glittering_generalities',
                      'cliche', 'euphoria', 'fud', 'appeal_to_fear', 'whataboutism',
                      'bandwagon', 'straw_man']

df1_filtered = df1[df1['fallacy_types'].isin(techniques_to_keep)]

# Select only the 'sentence' and 'fallacy_types' columns.
# .copy() detaches the result from df1, so the later cells that assign to and
# rename its columns work on an independent frame instead of a slice
# (the situation pandas reports as SettingWithCopyWarning).
df_filtered_columns = df1_filtered[['sentence', 'fallacy_types']].copy()

# Display the filtered DataFrame
print(df_filtered_columns.head())


                                            sentence fallacy_types
1  the phrase "Ninety percent of all people surve...     bandwagon
2  I guess I should buy my 12-year-old daughter a...     bandwagon
3  I know the professor said the Bridges of Madis...     bandwagon
4  "Because everybody thinks this way, it must be...     bandwagon
5  Everyone seems to support the changes in the v...     bandwagon


In [20]:
# Label distribution of the retained English examples.
df_filtered_columns['fallacy_types'].value_counts()

Unnamed: 0_level_0,count
fallacy_types,Unnamed: 1_level_1
bandwagon,144
straw_man,110
appeal_to_fear,14
loaded_language,11
cherry_picking,7
glittering_generalities,5


In [21]:
import pandas as pd

# Wrap every label in list notation ("label" -> "['label']"), the same string
# form the Arabic cells produce with str(list).
single_labels = [
    'loaded_language', 'cherry_picking', 'glittering_generalities',
    'cliche', 'euphoria', 'fud', 'appeal_to_fear', 'whataboutism',
    'bandwagon', 'straw_man',
]
rename_map = {label: f"['{label}']" for label in single_labels}

# Values without an entry in rename_map are kept unchanged.
label_column = df_filtered_columns['fallacy_types']
df_filtered_columns['fallacy_types'] = label_column.map(rename_map).fillna(label_column)

In [22]:
# Use the same column names as the Arabic datasets: 'content' and 'techniques'
df_filtered_columns = df_filtered_columns.rename(
    columns={'sentence': 'content', 'fallacy_types': 'techniques'}
)

In [23]:
# Save the filtered English examples (columns: content, techniques) without the index column.
# The file name is relative, so it is written to the current working directory.
df_filtered_columns.to_csv('filtered_dataset_from_eng.csv', index=False)

# **Combining All the Datasets**

In [73]:
# Reload the per-source CSVs. The three files saved above were written with
# relative names, so reading them from /content assumes that is the working directory.
# NOTE(review): df1 is reused here and no longer holds the raw English frame.
# NOTE(review): train_dataset.csv is not produced in this notebook — presumably the
# Ukrainian/Russian data, given the 'ukru' output name below; confirm its columns
# are 'content' and 'techniques'.
df1=pd.read_csv('/content/filtered_dataset_from_eng.csv')
df2=pd.read_csv('/content/arabic_train_dataset.csv')
df3=pd.read_csv('/content/arabic_dev_dataset.csv')
df4=pd.read_csv('/content/train_dataset.csv')

In [75]:
# Stack the four sources row-wise and renumber the index from 0.
combined_df = pd.concat([df1, df2, df3, df4], ignore_index=True)

In [None]:
# Save the combined dataset as it is at this point, i.e. before the preprocessing cell below runs.
combined_df.to_csv('combined_dataset_arab_eng_ukru.csv', index=False)

# **Data Preprocessing**

In [76]:
import pandas as pd
import re

# Emoji / pictograph character class, compiled once instead of on every call.
# The former range U+24C2-U+1F251 is deliberately NOT reproduced: it spanned
# almost the whole Basic Multilingual Plane above U+24C2 and therefore also
# deleted ordinary text such as CJK, Hangul, fullwidth forms and the Arabic
# presentation forms. Only the emoji code points from that span are listed.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U00002500-\U00002BEF"  # box drawing through miscellaneous symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"  # circled M
    "\U0000303D\U00003297\U00003299"  # part alternation mark, circled ideographs
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    flags=re.UNICODE
)

# Function to remove emojis
def remove_emojis(text):
    """Strip emoji and pictographic symbols from ``text``.

    Parameters
    ----------
    text : str
        Input text. Any non-string value (e.g. NaN for a missing cell) is
        returned unchanged instead of raising.

    Returns
    -------
    str
        ``text`` with every run of matched characters removed.
    """
    if not isinstance(text, str):
        return text
    return _EMOJI_PATTERN.sub(r'', text)

# Lowercase the text first, then strip emojis from the lowercased text
lowercased = combined_df['content'].str.lower()
combined_df['content'] = lowercased.apply(remove_emojis)


In [78]:
from collections import Counter

# List of techniques to count
techniques_to_count = [
    'loaded_language', 'cherry_picking', 'glittering_generalities',
    'cliche', 'euphoria', 'fud', 'appeal_to_fear', 'whataboutism',
    'bandwagon', 'straw_man','no_technique'
]

# Function to split and count techniques
def count_individual_techniques(series):
    """Count technique occurrences across a column of stringified lists.

    Parameters
    ----------
    series : pandas.Series
        Cells shaped like ``"['fud', 'cliche']"``; missing cells are ignored.

    Returns
    -------
    collections.Counter
        Technique name -> number of occurrences.
    """
    # dropna(): a missing cell would otherwise come out of the string
    # operations as a float NaN and break the iteration below.
    all_techniques = (
        series.dropna()
        .str.strip('[]')
        .str.replace("'", "", regex=False)
        .str.split(', ')
    )
    return Counter(technique for sublist in all_techniques for technique in sublist if technique)

# Tally the techniques across the combined dataset
individual_counts = count_individual_techniques(combined_df['techniques'])

# Report one total per tracked technique (a Counter yields 0 for absent names)
for name in techniques_to_count:
    print(f"Total '{name}':", individual_counts[name])

Total 'loaded_language': 3584
Total 'cherry_picking': 519
Total 'glittering_generalities': 488
Total 'cliche': 483
Total 'euphoria': 462
Total 'fud': 487
Total 'appeal_to_fear': 378
Total 'whataboutism': 172
Total 'bandwagon': 306
Total 'straw_man': 257
Total 'no_technique': 3903


# **Balancing All the Labels**

In [79]:
!pip install deep-translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [80]:
import pandas as pd
import nltk
from nltk.corpus import wordnet
from deep_translator import GoogleTranslator

# Download necessary NLTK resources
# (the WordNet corpus that synonym_replacement() queries through nltk.corpus.wordnet)
nltk.download('wordnet')

# Round-trip translation helper
def backtranslate(text, source_lang, intermediate_lang="en"):
    """Translate ``text`` into ``intermediate_lang`` and back into ``source_lang``.

    Both steps go through ``GoogleTranslator``. If anything raises, the error
    is printed and the original ``text`` is returned unchanged.
    """
    try:
        pivot_text = GoogleTranslator(source=source_lang, target=intermediate_lang).translate(text)
        round_trip = GoogleTranslator(source=intermediate_lang, target=source_lang).translate(pivot_text)
    except Exception as err:
        print(f"Translation Error: {err}")
        return text
    return round_trip

# Function for synonym replacement (SR)
def synonym_replacement(text):
    """Replace each whitespace-separated word with a WordNet synonym when one exists.

    For every word, the lemmas of its first WordNet synset are scanned and the
    first one whose name differs from the word itself is used. Words without
    a synset, or whose first synset offers no differing lemma, are kept.

    Parameters
    ----------
    text : str
        Text to augment; it is split on whitespace.

    Returns
    -------
    str
        The resulting words joined with single spaces.
    """
    new_words = []
    for word in text.split():
        synsets = wordnet.synsets(word)
        replacement = word
        if synsets:
            for lemma in synsets[0].lemmas():
                # WordNet joins multi-word lemma names with underscores.
                candidate = lemma.name().replace('_', ' ')
                # Skip the word itself: taking it would leave the text unchanged
                # and turn the "augmented" sample into a duplicate.
                if candidate.lower() != word.lower():
                    replacement = candidate
                    break
        new_words.append(replacement)
    return " ".join(new_words)

# Define under-represented techniques
under_represented_techniques = [
    'cherry_picking', 'glittering_generalities', 'cliche',
    'euphoria', 'fud', 'appeal_to_fear', 'whataboutism',
    'bandwagon', 'straw_man'
]

# Work on a copy of the combined dataset
df = combined_df.copy()


def _detect_language(text):
    """Guess a language code for ``text`` from the script it is written in.

    Returns 'ar' when an Arabic-block character occurs, 'uk' when a letter
    used in Ukrainian but not in Russian occurs, 'ru' for any other Cyrillic
    text and 'en' otherwise. This is a script heuristic, not a language model.
    """
    if any('\u0600' <= char <= '\u06ff' for char in text):
        return 'ar'
    if any(char in '\u0456\u0457\u0454\u0491\u0406\u0407\u0404\u0490' for char in text):  # і ї є ґ (both cases)
        return 'uk'
    if any('\u0400' <= char <= '\u04ff' for char in text):
        return 'ru'
    return 'en'


# Augmentation process
new_data = []

# Process samples
for _, row in df.iterrows():
    techniques = row["techniques"]
    content = row["content"]

    # Rows with a missing label or missing text cannot be augmented; skip them
    # instead of failing on the membership / character tests below.
    if not isinstance(techniques, (str, list)) or not isinstance(content, str):
        continue

    # When `techniques` is the stringified list read from CSV this is a substring test.
    if any(technique in techniques for technique in under_represented_techniques):
        # Previously only 'ar' or 'en' could be produced here, so the
        # back-translation branch for 'uk' / 'ru' could never run.
        lang = _detect_language(content)

        if lang in ['uk', 'ru']:
            # This branch sends translation requests for every matching row.
            # Step 1: Backtranslation
            backtranslated_text = backtranslate(content, lang)

            # Step 2: Apply Synonym Replacement (SR)
            # NOTE(review): backtranslate() returns text in `lang`, so the WordNet
            # lookup runs on non-English words here — confirm this is intended.
            sr_text = synonym_replacement(backtranslated_text)

            # Step 3: Re-translate (Backtranslation Again)
            final_text = backtranslate(sr_text, lang)
        else:
            # Simple Synonym Replacement for English and Arabic
            final_text = synonym_replacement(content)

        # Append new data
        new_data.append({
            "content": final_text,
            "techniques": techniques
        })

# NOTE(review): the commented-out names suggest this cell was run several times,
# each run stored under a different name; as written only augmented_df3 is
# assigned, so augmented_df1 / augmented_df2 exist only as leftover kernel state — confirm.
# augmented_df1 = pd.DataFrame(new_data)
# augmented_df2 = pd.DataFrame(new_data)
augmented_df3 = pd.DataFrame(new_data)



[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Write each augmentation run to its own file. Before this fix the first line
# used the undefined name `augmented_df`, and the second run was written to
# 'augmented3.csv' (then overwritten by the third), so 'augmented2.csv' never existed.
# NOTE(review): augmented_df1 / augmented_df2 are only assigned by the commented-out
# lines of the augmentation cell; they must exist from earlier runs of that cell.
augmented_df1.to_csv('augmented1.csv',index=False)
augmented_df2.to_csv('augmented2.csv',index=False)
augmented_df3.to_csv('augmented3.csv',index=False)

In [None]:
# Reload the three augmentation files; reading from /content assumes that is the
# working directory the relative-named CSVs were written to.
ag1=pd.read_csv('/content/augmented1.csv')
ag2=pd.read_csv('/content/augmented2.csv')
ag3=pd.read_csv('/content/augmented3.csv')

In [None]:
# Append the augmented samples to the combined data and renumber the index from 0.
# combined_df is the in-memory frame, i.e. with 'content' already lowercased and emoji-stripped.
df_augmented=pd.concat([combined_df,ag1,ag2,ag3],ignore_index=True)

In [None]:
# Save the final augmented dataset without the index column.
df_augmented.to_csv('combined_augmented_dataset_with_dev.csv',index=False)