In [1]:
pip install nlpaug


Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [2]:
import nlpaug.augmenter.word as naw
import nlpaug.model.word_stats as nmw
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')  # This line is usually for the Open Multilingual WordNet, check for Arabic availability


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:
under_represented_techniques = [
    'False_Dilemma-No_Choice',
    'Slogans',
    'Repetition',
    'Appeal_to_Time',
    'Conversation_Killer',
    'Red_Herring',
    'Appeal_to_Popularity',
    'Straw_Man',
    'Whataboutism',
    'Guilt_by_Association'
]


In [4]:
import pandas as pd
import nlpaug.augmenter.word as naw

def augment_text_by_technique(text, labels, target_technique):
    augmenter = naw.SynonymAug(aug_src='wordnet', lang='arb')
    augmented_text = text
    total_offset = 0  # Initialize the offset to track net changes in the text length

    # Sort labels to ensure correct order processing
    labels = sorted(labels, key=lambda x: x['start'])
    updated_labels = []

    for label in labels:
        # Adjust indices by current total offset
        adjusted_start_index = label['start'] + total_offset
        adjusted_end_index = label['end'] + total_offset

        # Check if current label needs augmentation
        if label['technique'] == target_technique and adjusted_end_index <= len(augmented_text):
            original_segment = augmented_text[adjusted_start_index:adjusted_end_index]
            augmented_segments = augmenter.augment(original_segment)
            if augmented_segments:
                augmented_segment = augmented_segments[0]
                # Update the augmented text
                augmented_text = augmented_text[:adjusted_start_index] + augmented_segment + augmented_text[adjusted_end_index:]
                # Calculate the change in length
                change_in_length = len(augmented_segment) - len(original_segment)
                total_offset += change_in_length

        # Update the label indices post-augmentation
        label['start'] = adjusted_start_index
        label['end'] = adjusted_start_index + len(augmented_text[adjusted_start_index:adjusted_end_index])
        updated_labels.append(label)

    return augmented_text, updated_labels


# Load your dataset
df = pd.read_csv('trainprop.csv')
df['labels'] = df['labels'].apply(eval)

# Define target technique
target_technique = "Slogans"

# Apply the augmentation function to each row and create a new dataframe of augmented texts
augmented_rows = []
for technique in under_represented_techniques:
    for index, row in df.iterrows():
      for _ in range(5):
          if any(label['technique'] == technique for label in row['labels']):
              augmented_text, new_labels = augment_text_by_technique(row['text'], row['labels'], technique)
              augmented_rows.append({'text': augmented_text, 'labels': new_labels})

# Create a DataFrame from the augmented data
augmented_df = pd.DataFrame(augmented_rows)

# Append the augmented data to the original dataframe using pd.concat, preserving the original
df_augmented = pd.concat([df, augmented_df], ignore_index=True)

# Optionally, save the augmented dataset to a new CSV to keep original data unchanged
# df_augmented.to_csv('train_augmented.csv', index=False)
# print("Augmentation complete. Original data preserved; augmented data appended and saved.")


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [5]:
df_augmented

Unnamed: 0,id,text,labels,type
0,7365,تحذيرات من حرب جديدة في حال فشل الانتخابات الق...,"[{'start': 0, 'end': 50, 'technique': 'Appeal_...",tweet
1,1400135121001488384,رب اجعل صباحي هذا غائم بلطفك، لا شر فيه ولا وق...,[],tweet
2,1175652922845216768,#عين_اليمن #26سبتمبر ثورة شعب صنعت تاريخة احتف...,"[{'start': 78, 'end': 89, 'technique': 'Name_C...",tweet
3,1270585163752316928,الحرب على #سورية | حملات تظليل مستمرة .. لمحاو...,"[{'start': 25, 'end': 30, 'technique': 'Loaded...",tweet
4,1395508740703535104,"📣 New Podcast! ""مزيج الخبرة والشباب.. خلطة سحر...","[{'start': 39, 'end': 50, 'technique': 'Name_C...",tweet
...,...,...,...,...
9807,,قامت الصحيفة البريطانية الرائدة “لندن أوبسرفات...,"[{'technique': 'Loaded_Language', 'text': 'الر...",
9808,,قامت الصحيفة البريطانية الرائدة “لندن أوبسرفات...,"[{'technique': 'Loaded_Language', 'text': 'الر...",
9809,,قامت الصحيفة البريطانية الرائدة “لندن أوبسرفات...,"[{'technique': 'Loaded_Language', 'text': 'الر...",
9810,,قامت الصحيفة البريطانية الرائدة “لندن أوبسرفات...,"[{'technique': 'Loaded_Language', 'text': 'الر...",


In [6]:
df_augmented.to_csv("aug.csv",index=False)

In [7]:
df1=df_augmented

In [8]:
def prepare_tag_token(df):
  new_df = pd.DataFrame()

  o_tag = "O"
  cnt = 0

  for index, rec in df.iterrows():
      ner_tags, ner_tokens = [], []

      text = rec['text']  # Access by column
#       if len(rec['labels']) == 0:
#           cnt += 1
#           continue

      ranges = []

      for r in rec['labels']:
          ranges.append((range(r["start"], r["end"] + 1), r["technique"]))

      tokens = text.split()
      token_ranges = []
      c = 0

      for i, token in enumerate(tokens):
          token_ranges.append((range(c, c + len(token)), token))
          c += len(token) + 1

      for token_range in token_ranges:
          is_found = False
          tag = None
          token = None

          for sub_range in ranges:
              if (token_range[0].start in sub_range[0] or token_range[0].stop - 1 in sub_range[0] or
                  (sub_range[0].start in token_range[0] and sub_range[0].stop - 1 in token_range[0])):
                tag = sub_range[1]
                token = token_range[1]
                is_found = True

          if not is_found:
              ner_tags.append(o_tag)
              ner_tokens.append(token_range[1])
          else:
              ner_tags.append(tag)
              ner_tokens.append(token)

      for i, tag in enumerate(ner_tags):
          if i == 0 and ner_tags[i] != o_tag:
              ner_tags[i] = f"B-{ner_tags[i]}"
          elif i > 0 and ner_tags[i] != o_tag:
              if ner_tags[i - 1].replace("B-", "").replace("I-", "") == ner_tags[i]:
                  ner_tags[i] = f"I-{ner_tags[i]}"
              else:
                  ner_tags[i] = f"B-{ner_tags[i]}"

      # Construct the new row dictionary
      new_row = rec.to_dict()
      new_row['tag'] = ner_tags
      new_row['token'] = ner_tokens

      # Append the new row to the new DataFrame
      new_df = pd.concat([new_df, pd.DataFrame([new_row])], ignore_index=True)
      cnt += 1

  new_df = new_df.drop(['id', 'text', 'labels'], axis=1)
  return new_df

In [9]:
df1 = prepare_tag_token(df1)

In [11]:
def clean_tags(tag_list):
    cleaned_tags = []
    for tag in tag_list:
        if '-' in tag:
            cleaned_tags.append(tag.split('-', 1)[-1])
        elif tag == 'O':
            cleaned_tags.append('No technique')
    return cleaned_tags

# Apply cleaning function to the 'tags' column
df1['cleaned_tags'] = df1['tag'].apply(clean_tags)
from itertools import chain

# Flatten the list of cleaned tags
all_tags = list(chain.from_iterable(df1['cleaned_tags']))

# Convert to series and count frequencies
tag_counts = pd.Series(all_tags).value_counts()
tag_counts

No technique                        224965
Loaded_Language                      19032
Questioning_the_Reputation           13160
Exaggeration-Minimisation             7922
False_Dilemma-No_Choice               6118
Name_Calling-Labeling                 5851
Appeal_to_Authority                   5320
Causal_Oversimplification             5236
Doubt                                 5217
Flag_Waving                           4821
Slogans                               3905
Conversation_Killer                   3335
Appeal_to_Time                        3254
Red_Herring                           3148
Repetition                            3093
Appeal_to_Fear-Prejudice              2905
Straw_Man                             2879
Appeal_to_Popularity                  2721
Appeal_to_Values                      2588
Obfuscation-Vagueness-Confusion       2576
Appeal_to_Hypocrisy                   2442
Whataboutism                          1851
Consequential_Oversimplification      1841
Guilt_by_As