## Mount drive

In [None]:
# Mount Google Drive in the Colab runtime; the dataset paths used further down
# are written relative to ./drive/MyDrive.
# force_remount=True is passed so re-running this cell mounts the drive again.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Installs and imports

In [None]:
!pip install -U spacy
!pip install imbalanced-learn
!pip install wordnet
!pip install pandas
!pip install tqdm
!pip install simpletransformers
!pip install nltk
!pip install python-Levenshtein
!pip install openai

In [None]:
!python -m spacy download en_core_web_lg
!python -m spacy download fr_core_news_lg
!python -m spacy download pl_core_news_lg
!python -m spacy download ru_core_news_lg
!python -m spacy download it_core_news_lg
!python -m spacy download de_core_news_lg

In [None]:
import pandas as pd
import os
import spacy
import random
import json
import glob
from ast import literal_eval
from tqdm import tqdm, trange
from simpletransformers.classification import (
    MultiLabelClassificationModel, MultiLabelClassificationArgs
)
import numpy as np
import pandas as pd
import pickle
import sklearn
from sklearn import metrics
import nltk
from nltk.corpus import wordnet as wn
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from Levenshtein import distance
import itertools

import openai
# Credentials must not be committed with the notebook: the key is read from the
# OPENAI_API_KEY environment variable (None when the variable is not set).
openai.api_key = os.environ.get('OPENAI_API_KEY')

# NLTK resources: WordNet and the Open Multilingual WordNet (wn.synsets is
# called with a `lang` argument below), plus the punkt tokenizer and the
# averaged-perceptron POS tagger.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
lemmatizer = WordNetLemmatizer()

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(cuda.is_available())

pd.set_option('display.max_colwidth',100)

# spaCy pipelines keyed by the dataset's own language codes (note that 'po'
# maps to the Polish model and 'ge' to the German one).
# Only these six codes have a pipeline; any other language code has no entry.
nlp = {
    'en': spacy.load("en_core_web_lg"),
    'fr': spacy.load("fr_core_news_lg"),
    'po': spacy.load("pl_core_news_lg"),
    'ru': spacy.load("ru_core_news_lg"),
    'it': spacy.load("it_core_news_lg"),
    'ge': spacy.load("de_core_news_lg"),
}


In [None]:
# Dataset language code -> language code handed to the WordNet lookups.
# A value of None means no WordNet language is configured for that code, so
# synonym replacement is skipped for it.
language_to_wordnet_language = dict(
    en='eng',
    fr='fra',
    it='ita',
    ge=None,
    po='pol',
    ru=None,
)

def spacy_to_wordnet_pos_tag(pos_tag):
  """Translate a POS tag string into the matching WordNet POS constant.

  The tag is matched by prefix, so both coarse tags ('ADJ', 'NOUN', ...) and
  fine-grained ones ('JJR', 'NNS', 'VBD', ...) are accepted.

  Returns wn.ADJ, wn.ADV, wn.NOUN or wn.VERB, or None when no prefix matches.
  """
  # (accepted prefixes, name of the attribute to read from `wn`)
  prefix_table = (
      (('ADJ', 'JJ'), 'ADJ'),
      (("ADV", "RB", "RP"), 'ADV'),
      (("NN", "NOUN"), 'NOUN'),
      (("VERB", "VB"), 'VERB'),
  )

  for prefixes, wordnet_attribute in prefix_table:
    if pos_tag.startswith(prefixes):
      # Resolved only on a match, so `wn` is untouched for unmapped tags.
      return getattr(wn, wordnet_attribute)

  return None

# Preprocessing

## Parse datasets

In [None]:
# Persuasion-technique label names used throughout the notebook.
# The ORDER is significant: dataframe_with_labels emits one 0/1 entry per label
# in this order, and chatgpt_to_file addresses entries of this list by numeric
# index. Reordering or inserting labels changes both.
all_labels = [
    'Appeal_to_Authority',
    'Appeal_to_Popularity',
    'Appeal_to_Values',
    'Appeal_to_Fear-Prejudice',
    'Flag_Waving',
    'Causal_Oversimplification',
    'False_Dilemma-No_Choice',
    'Consequential_Oversimplification',
    'Straw_Man',
    'Red_Herring',
    'Whataboutism',
    'Slogans',
    'Appeal_to_Time',
    'Conversation_Killer',
    'Loaded_Language',
    'Repetition',
    'Exaggeration-Minimisation',
    'Obfuscation-Vagueness-Confusion',
    'Name_Calling-Labeling',
    'Doubt',
    'Guilt_by_Association',
    'Appeal_to_Hypocrisy',
    'Questioning_the_Reputation',
]

In [None]:
def make_dataframe(input_folder, labels_fn=None, language=None):
    """Build a one-row-per-line DataFrame from a folder of article ``.txt`` files.

    Args:
        input_folder: Folder containing the article files. The article id is
            taken from the file name after dropping its first 7 characters
            (presumably the ``article`` prefix) and everything from the first
            dot onwards; it must parse as an integer.
        labels_fn: Optional path to a tab-separated file without header whose
            first three columns are article id, line number and labels.
        language: Value stored in the ``language`` column of every row.

    Returns:
        Without ``labels_fn``: a frame indexed by (id, line) with columns
        ``text`` and ``language``; blank lines are dropped.
        With ``labels_fn``: the labelled rows only (rows whose label is missing
        are dropped), joined with their text, with columns ``text``, ``labels``
        and ``language``.
    """
    # MAKE TXT DATAFRAME
    text = []
    # Sorted so the row order does not depend on the directory listing order.
    article_files = sorted(name for name in os.listdir(input_folder) if name.endswith('.txt'))
    for fil in tqdm(article_files):
        iD = fil[7:].split('.')[0]
        # os.path.join works with or without a trailing slash on input_folder;
        # the context manager closes the file handle deterministically.
        with open(os.path.join(input_folder, fil), 'r', encoding='utf-8') as article:
            article_lines = article.read().splitlines()
        text.extend((iD, line_no, line) for line_no, line in enumerate(article_lines, 1))

    df_text = pd.DataFrame(text, columns=['id', 'line', 'text'])
    df_text['id'] = df_text['id'].astype(int)
    df_text['line'] = df_text['line'].astype(int)
    df_text = df_text[df_text.text.str.strip().str.len() > 0].copy()
    df_text = df_text.set_index(['id', 'line'])

    df_text['language'] = language

    df = df_text

    if labels_fn:
        # MAKE LABEL DATAFRAME
        # on_bad_lines='skip' replaces error_bad_lines=False, which pandas
        # deprecated in 1.3 and removed in 2.0.
        labels = pd.read_csv(labels_fn, sep='\t', encoding='utf-8', on_bad_lines='skip', header=None)
        labels = labels.rename(columns={0: 'id', 1: 'line', 2: 'labels'})
        labels = labels.set_index(['id', 'line'])
        labels = labels[labels.labels.notna()].copy()

        # JOIN (left join on the labels, so unlabelled lines are not returned)
        df = labels.join(df_text)[['text', 'labels', 'language']]

    return df


def get_all_languages():
    """Return the entry names found directly inside the dataset root folder.

    Every entry of the listing is returned as-is (no filtering, no sorting);
    callers use the names as language codes.
    """
    data_root = './drive/MyDrive/Facultate/MasterAnul2/NLP/data/'
    return list(os.listdir(data_root))

def load_train_and_dev(language):
    """Load the labelled train and dev splits of one language.

    Returns a ``(train, dev)`` tuple of frames as produced by make_dataframe.
    """
    train_dataset = make_dataframe(
        f"./drive/MyDrive/Facultate/MasterAnul2/NLP/data/{language}/train-articles-subtask-3/",
        f"./drive/MyDrive/Facultate/MasterAnul2/NLP/data/{language}/train-labels-subtask-3.txt",
        language=language,
    )
    dev_dataset = make_dataframe(
        f"./drive/MyDrive/Facultate/MasterAnul2/NLP/data/{language}/dev-articles-subtask-3/",
        f"./drive/MyDrive/Facultate/MasterAnul2/NLP/data/{language}/dev-labels-subtask-3.txt",
        language=language,
    )
    return train_dataset, dev_dataset

def load_train_and_dev_all():
    """Load the labelled train and dev splits of every language into one frame.

    The train rows of all languages are shuffled together, the dev rows
    likewise, and the result is the shuffled train rows followed by the
    shuffled dev rows. No random seed is set, so the order changes per call.

    NOTE(review): every name returned by get_all_languages() is expected to
    have both a train and a dev folder - confirm this for all languages.
    """
    train_rows = pd.DataFrame()
    dev_rows = pd.DataFrame()

    for language in get_all_languages():
        print("language", language)

        train_part = make_dataframe(
            f"./drive/MyDrive/Facultate/MasterAnul2/NLP/data/{language}/train-articles-subtask-3/",
            f"./drive/MyDrive/Facultate/MasterAnul2/NLP/data/{language}/train-labels-subtask-3.txt",
            language=language,
        )
        train_rows = pd.concat([train_rows, train_part])

        dev_part = make_dataframe(
            f"./drive/MyDrive/Facultate/MasterAnul2/NLP/data/{language}/dev-articles-subtask-3/",
            f"./drive/MyDrive/Facultate/MasterAnul2/NLP/data/{language}/dev-labels-subtask-3.txt",
            language=language,
        )
        dev_rows = pd.concat([dev_rows, dev_part])

    # Shuffle each split on its own (train first, then dev).
    train_rows = train_rows.sample(frac=1)
    dev_rows = dev_rows.sample(frac=1)

    return pd.concat([train_rows, dev_rows])

def load_test_all():
    """Load the (unlabelled) test articles of every language into one frame.

    The rows of all languages are concatenated and then shuffled; no random
    seed is set, so the order changes per call.
    """
    test_rows = pd.DataFrame()

    for language in get_all_languages():
        print("language", language)
        language_part = make_dataframe(
            f"./drive/MyDrive/Facultate/MasterAnul2/NLP/data/{language}/test-articles-subtask-3/",
            language=language,
        )
        test_rows = pd.concat([test_rows, language_part])

    return test_rows.sample(frac=1)

def _synonym_or_word(token, wordnet_lang):
  """Return a WordNet synonym for a spaCy token, or the token text unchanged.

  A synonym is taken from the lemma names of the first synset found for the
  token. Lemmas within edit distance 3 of the word are discarded so the
  replacement is visibly different from the original word.
  """
  word = token.text
  pos_tag = spacy_to_wordnet_pos_tag(token.pos_)
  if pos_tag is None or wordnet_lang is None:
    return word

  synsets = wn.synsets(word, pos=pos_tag, lang=wordnet_lang)
  if not synsets:
    return word

  # Build a filtered copy: removing items from the list while iterating over it
  # skips the element after each removal and lets near-duplicates through.
  candidates = [
      lemma for lemma in synsets[0].lemma_names(wordnet_lang)
      if distance(word.lower(), lemma.lower()) > 3
  ]
  if not candidates:
    return word

  return random.choice(candidates).replace('_', ' ')


def extend_with_synonyms(df, nb_rows_to_sample):
  """Augment ``df`` with rows whose words are replaced by WordNet synonyms.

  ``nb_rows_to_sample`` rows are drawn from ``df`` with replacement. Each drawn
  text is tokenised with the spaCy pipeline of the row's language and every
  token is replaced by a synonym when one is available; the tokens are joined
  back with single spaces.

  Rows whose language has no pipeline in ``nlp`` are added with their text
  unchanged instead of raising KeyError; languages without a WordNet language
  code are tokenised but never get a synonym.

  Args:
    df: Frame with at least the columns ``text``, ``labels`` and ``language``.
    nb_rows_to_sample: Number of augmented rows to create.

  Returns:
    ``df`` followed by the augmented rows (the augmented rows carry a fresh
    0..n-1 index).
  """
  sample_df = df.sample(n=nb_rows_to_sample, replace=True)

  # Collect plain records and build the frame once; appending to a DataFrame
  # row by row is quadratic and DataFrame.append no longer exists in pandas 2.
  augmented_rows = []
  for _, row in sample_df.iterrows():
    language = row['language']
    pipeline = nlp.get(language)

    if pipeline is None:
      new_text = row['text']
    else:
      wordnet_lang = language_to_wordnet_language.get(language)
      new_text = " ".join(
          _synonym_or_word(token, wordnet_lang) for token in pipeline(row['text'])
      )

    augmented_rows.append({'text': new_text, 'labels': row['labels'], 'language': language})

  new_sample_df = pd.DataFrame(augmented_rows, columns=df.columns)

  return pd.concat([df, new_sample_df])

def extend_with_old_datasets(df):
  """Append examples from the older JSON datasets to ``df``.

  Every ``*.txt`` file in the ``old_data`` folder is parsed as a JSON list of
  items with a ``text`` and a ``labels`` field. Old label names are translated
  to the current ones; labels without a translation are dropped, and items
  left without any label are skipped. Newlines in the text become spaces and
  the language of every added row is ``'en'``.

  Returns:
    ``df`` followed by the added rows (columns ``text``, ``labels`` as a
    comma-joined string, and ``language``).
  """
  old_to_new_labels = {
      'Appeal to authority': 'Appeal_to_Authority',
      'Appeal to fear/prejudice': 'Appeal_to_Fear-Prejudice',
      'Causal Oversimplification': 'Causal_Oversimplification',
      'Doubt': 'Doubt',
      'Exaggeration/Minimisation': 'Exaggeration-Minimisation',
      'Flag-waving': 'Flag_Waving',
      'Loaded Language': 'Loaded_Language',
      "Misrepresentation of Someone's Position (Straw Man)": 'Straw_Man',
      'Name calling/Labeling': 'Name_Calling-Labeling',
      'Obfuscation, Intentional vagueness, Confusion': 'Obfuscation-Vagueness-Confusion',
      'Presenting Irrelevant Data (Red Herring)': 'Red_Herring',
      # NOTE(review): 'Reductio ad hitlerum' may correspond to
      # 'Guilt_by_Association' rather than 'Questioning_the_Reputation' -
      # confirm against the label taxonomy before retraining.
      'Reductio ad hitlerum': 'Questioning_the_Reputation',
      'Repetition': 'Repetition',
      'Slogans': 'Slogans',
      'Whataboutism': 'Whataboutism'
  }

  # Collect plain records and build the frame once instead of concatenating a
  # one-row frame per item (quadratic in the number of items).
  records = []

  for dataset_path in glob.iglob('./drive/MyDrive/Facultate/MasterAnul2/NLP/old_data/*.txt'):
    # Explicit encoding so the result does not depend on the platform default.
    with open(dataset_path, 'r', encoding='utf-8') as f:
      json_data = json.load(f)

    for item in json_data:
      labels = [old_to_new_labels[label] for label in item['labels'] if label in old_to_new_labels]
      if not labels:
        continue

      records.append({
          'text': item['text'].replace('\n', ' '),
          'labels': ",".join(labels),
          'language': 'en',
      })

  new_df = pd.DataFrame(records, columns=['text', 'labels', 'language'])

  return pd.concat([df, new_df])
 
def chatgpt_to_file(path, n_responses, language):
  """Generate labelled paragraphs with the OpenAI completion API and save them.

  For each of ``n_responses`` requests a random set of distinct labels is
  drawn from ``all_labels`` and the model is asked for a paragraph in
  ``language`` that uses them. Every returned choice is written to ``path`` as
  one line ``<text>\\t<comma-joined labels>``.

  Args:
    path: Output file; it is overwritten.
    n_responses: Number of API requests to make.
    language: Language name inserted verbatim into the prompt.
  """
  # Labels treated as over-represented share 10% of the sampling mass, all
  # other labels share the remaining 90%. The labels are named explicitly so
  # the weights stay correct if `all_labels` is reordered.
  over_represented = {
      'Appeal_to_Fear-Prejudice',
      'Loaded_Language',
      'Exaggeration-Minimisation',
      'Name_Calling-Labeling',
      'Doubt',
      'Questioning_the_Reputation',
  }
  n_over = sum(label in over_represented for label in all_labels)
  n_under = len(all_labels) - n_over
  if n_over and n_under:
    weights = np.array([
        0.10 / n_over if label in over_represented else 0.90 / n_under
        for label in all_labels
    ])
  else:
    weights = np.ones(len(all_labels))
  probabilities = weights / weights.sum()

  # Explicit encoding: the generated text is not restricted to ASCII.
  with open(path, 'w', encoding='utf-8') as f:
    for _ in range(n_responses):
      n_labels = np.random.randint(low=1, high=(len(all_labels) // 4))

      # replace=False so the same label cannot be drawn twice for one paragraph.
      drawn = np.random.choice(all_labels, size=n_labels, replace=False, p=probabilities)
      original_labels = [str(label) for label in drawn]

      prompt_labels = ', '.join(
          label.replace('_', ' ').replace('-', '/').lower() for label in original_labels
      )
      prompt = f'Generate a paragraph containing {prompt_labels} as persuasion techniques in {language}'

      response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        max_tokens=500
      )

      labels_field = ','.join(original_labels)
      for choice in response['choices']:
        # Collapse every whitespace run (newlines AND tabs) to one space so the
        # text can never introduce an extra column or an extra row.
        text = ' '.join(choice['text'].split())
        if text:
          f.write(f"{text}\t{labels_field}\n")

def _extend_with_tsv_folder(df, folder_name):
  """Append the rows of every ``<folder_name>/<language>/*.txt`` file to ``df``.

  Each line is expected to be ``<text>\\t<labels>``; surrounding whitespace is
  stripped and any further tab-separated fields are ignored. Lines without a
  tab (including blank lines) are skipped instead of raising IndexError.
  """
  # Collect plain records and build the frame once instead of concatenating a
  # one-row frame per line (quadratic in the number of lines).
  records = []

  for language in get_all_languages():
    pattern = f'./drive/MyDrive/Facultate/MasterAnul2/NLP/{folder_name}/{language}/*.txt'
    for dataset_path in glob.iglob(pattern):
      # Explicit encoding: the files hold text in several languages.
      with open(dataset_path, 'r', encoding='utf-8') as f:
        for line in f:
          fields = line.strip().split('\t')
          if len(fields) < 2:
            continue
          records.append({'text': fields[0], 'labels': fields[1], 'language': language})

  new_df = pd.DataFrame(records, columns=['text', 'labels', 'language'])

  return pd.concat([df, new_df])


def extend_with_chatgpt(df):
  """Return ``df`` followed by the generated examples stored under ``chatgpt_data``."""
  return _extend_with_tsv_folder(df, 'chatgpt_data')


def extend_with_translated_data(df):
  """Return ``df`` followed by the translated examples stored under ``translated_data``."""
  return _extend_with_tsv_folder(df, 'translated_data')

def print_and_return_freq_labels(df1):
  """Count label occurrences per language and overall, print and return them.

  ``df1`` needs a ``labels`` column (comma-joined label names) and a
  ``language`` column. Counters exist only for the languages reported by
  get_all_languages() and the labels in ``all_labels``; any other language or
  label in ``df1`` raises KeyError.

  Returns a dict ``{'all': {label: count}, <language>: {label: count}, ...}``.
  """
  languages = get_all_languages()

  # 'all' comes first; its counters exist only when there is at least one language.
  frecv_lang_label = {'all': dict.fromkeys(all_labels, 0) if languages else {}}
  for lang in languages:
    frecv_lang_label[lang] = dict.fromkeys(all_labels, 0)

  for _, row in df1.iterrows():
    row_labels = row['labels'].split(',')
    for label in row_labels:
      frecv_lang_label[row['language']][label] += 1
      frecv_lang_label['all'][label] += 1

  # One section per language, then the overall totals.
  for section in languages + ['all']:
    print("\n", section)
    for label in all_labels:
      print(label, ' ', frecv_lang_label[section][label])

  return frecv_lang_label

def labels_small_freq_sorted(labels_freq_dict, threshold=500):
  """Return the labels whose frequency is below ``threshold``, rarest first.

  Args:
    labels_freq_dict: Mapping of label name to occurrence count.
    threshold: Exclusive upper bound on the count (default 500).

  Returns:
    A dict of the selected ``label: count`` pairs in ascending count order.
  """
  ascending = sorted(labels_freq_dict.items(), key=lambda item: item[1])
  return {label: freq for label, freq in ascending if freq < threshold}

def dataframe_with_labels(df, labels):
  """Convert comma-joined label strings into fixed-order 0/1 vectors.

  Args:
    df: Frame with a ``text`` column and a ``labels`` column holding
      comma-joined label names.
    labels: Ordered label vocabulary; position ``i`` of every output vector
      says whether ``labels[i]`` is present in the row.

  Returns:
    A new frame with a fresh 0..n-1 index and the columns ``text`` and
    ``labels`` (a list of ints per row). An empty ``df`` gives an empty frame
    with those two columns.
  """
  # Materialised once: the vocabulary is the `labels` argument, which the row
  # loop must not overwrite.
  vocabulary = list(labels)
  dataset = []

  for _, row in df.iterrows():
    row_labels = {name.strip() for name in row['labels'].split(',')}
    labels_positions = [1 if label in row_labels else 0 for label in vocabulary]
    dataset.append([row['text'], labels_positions])

  # Passing `columns` here also works when `dataset` is empty.
  return pd.DataFrame(dataset, columns=["text", "labels"])

def get_bad_labels(labels_freq_dict, threshold=1600):
    """Return the labels whose frequency is strictly above ``threshold``.

    Args:
        labels_freq_dict: Mapping of label name to occurrence count.
        threshold: Exclusive lower bound on the count (default 1600).

    Returns:
        The selected label names, in the mapping's iteration order.
    """
    return [label for label, freq in labels_freq_dict.items() if freq > threshold]

def is_extension_ok(row_labels, label, bad_labels):
  """Tell whether a row may be used to augment ``label``.

  A row qualifies when it carries ``label`` and none of ``bad_labels``.

  Args:
    row_labels: The row's labels, either as a comma-joined string or as an
      iterable of label names.
    label: The label being augmented.
    bad_labels: Labels that must not be present in the row.

  Returns:
    True when the row qualifies, False otherwise.
  """
  # Compare whole label names: a substring test on the joined string would also
  # match a label that merely contains another label's name.
  if isinstance(row_labels, str):
    present = {name.strip() for name in row_labels.split(',')}
  else:
    present = set(row_labels)

  if label not in present:
    return False
  return present.isdisjoint(bad_labels)

def extend_with_synonyms_balanced(df):
  """Augment the rare labels of ``df`` with synonym-replaced copies.

  For every label that labels_small_freq_sorted() reports as rare (rarest
  first), the rows carrying that label and none of the labels returned by
  get_bad_labels() are collected and passed to extend_with_synonyms(), asking
  for 1.5 times as many augmented rows as there are collected rows.

  Returns:
    The per-label results (collected rows followed by their augmented copies),
    followed by the original ``df``.
  """
  frecv_lang_label = print_and_return_freq_labels(df)
  sorted_labels_with_small_freq = labels_small_freq_sorted(frecv_lang_label['all'])
  bad_labels = get_bad_labels(frecv_lang_label['all'])

  augmented_frames = []
  for label in sorted_labels_with_small_freq:
    # Select from the `df` argument (not from a notebook-level global) with one
    # boolean mask instead of copying the rows one at a time.
    eligible = df['labels'].apply(
        lambda row_labels: is_extension_ok(row_labels, label, bad_labels)
    ).astype(bool)
    data_for_label = df[eligible].reset_index(drop=True)

    # Nothing to sample from for this label.
    if data_for_label.empty:
      continue

    augmented_frames.append(
        extend_with_synonyms(data_for_label, int(data_for_label.shape[0] * 1.5))
    )

  return pd.concat(augmented_frames + [df])

def create_dataframe_with_extension(df, extension):
  """Apply one named augmentation to ``df`` and return the result.

  Args:
    df: Frame to extend.
    extension: One of 'chatgpt', 'old_data', 'translated_data', 'synonyms',
      or None for no augmentation.

  Returns:
    The extended frame; ``df`` itself when ``extension`` is None.

  Raises:
    ValueError: If ``extension`` is not a supported name.
  """
  if extension is None:
    # No augmentation requested: hand back the input rather than None so the
    # caller can keep chaining extensions.
    return df
  elif extension == 'chatgpt':
    return extend_with_chatgpt(df)
  elif extension == 'old_data':
    return extend_with_old_datasets(df)
  elif extension == 'translated_data':
    return extend_with_translated_data(df)
  elif extension == 'synonyms':
    return extend_with_synonyms_balanced(df)
  else:
    raise ValueError("Unsupported extension")

def create_dataframe_combinations(df):
  """Build and pickle one dataset per subset of the augmentation extensions.

  For every subset of ('chatgpt', 'old_data', 'translated_data', 'synonyms'),
  including the empty one, the extensions are applied in that order to a deep
  copy of ``df``, the labels are encoded with dataframe_with_labels() and the
  result is pickled in the working directory as
  ``<extension names joined by '_'>.pkl``. The empty subset is saved as
  ``standard.pkl``.
  """
  extensions = ['chatgpt', 'old_data', 'translated_data', 'synonyms']

  for length in range(len(extensions) + 1):
    for subset in itertools.combinations(extensions, length):
      new_df = df.copy(deep=True)

      for extension in subset:
        new_df = create_dataframe_with_extension(new_df, extension)

      new_df = dataframe_with_labels(new_df, all_labels)
      # An empty subset has no name parts; without the fallback the file would
      # be written as the hidden file ".pkl".
      dataframe_name = ('_'.join(subset) or 'standard') + ".pkl"
      new_df.to_pickle(dataframe_name)

In [None]:
data = load_train_and_dev_all()
#test_data = load_test_all()

#data = data.reset_index(drop=True)
#test_data = test_data.reset_index(drop=True)

# `test_data` only exists when the load_test_all() line above is enabled, so
# displaying it fails with NameError on a fresh kernel. Show the frame this
# cell actually builds instead.
display(data)

language en


54it [00:12,  4.21it/s]


language fr


50it [00:11,  4.45it/s]


language ru


72it [00:15,  4.51it/s]


language po


47it [00:10,  4.51it/s]


language it


61it [00:14,  4.19it/s]


language ge


50it [00:12,  4.15it/s]


language es


30it [00:07,  4.17it/s]


language gr


64it [00:15,  4.11it/s]


language ka


29it [00:05,  5.34it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,text,language
id,line,Unnamed: 2_level_1,Unnamed: 3_level_1
3636,17,"Alla domanda se fosse stato preso in considerazione un tetto al prezzo del gas russo, il diploma...",it
3713,23,La ley rusa establece que todos los ciudadanos rusos de entre 18 y 27 años están obligados a cum...,es
3149,11,"Critics of the president, as well as ethics officials, have criticized the move citing federal e...",en
3544,39,"Można by nieśmiało założyć, że to w jego domu rodzinnym odbywało się zapoczątkowane kluskowanie ...",po
3132,35,"""I don't remember the precedent in world history when territories we don't even control were abs...",en
...,...,...,...
3636,5,"“Abbiamo ripetuto più volte che non rifiutiamo mai incontri. Se c’è una proposta, la prenderemo ...",it
3148,25,"When reached for comment, Director Arbogast replied via phone message: “Sadly, this is now a leg...",en
3819,29,"Δείτε όλες τις τελευταίες Ειδήσεις από την Ελλάδα και τον Κόσμο, τη στιγμή που συμβαίνουν, στο P...",gr
319,57,"“I’m going to get emotional about this issue, because it’s horrible. But it’s like kids who are ...",en


In [None]:
# Encode the loaded data (no extension applied) as 0/1 label vectors and pickle
# it in the working directory.
dataframe_with_labels(data, all_labels).to_pickle('standard.pkl')
#test_data.to_pickle('test.pkl')

In [None]:
# Build and pickle one dataset per subset of the augmentation extensions.
# The label-frequency tables printed below come from the 'synonyms' extension.
create_dataframe_combinations(data)


 en
Appeal_to_Authority   182
Appeal_to_Popularity   49
Appeal_to_Values   0
Appeal_to_Fear-Prejudice   447
Flag_Waving   383
Causal_Oversimplification   237
False_Dilemma-No_Choice   185
Consequential_Oversimplification   0
Straw_Man   24
Red_Herring   63
Whataboutism   18
Slogans   181
Appeal_to_Time   0
Conversation_Killer   116
Loaded_Language   2292
Repetition   685
Exaggeration-Minimisation   581
Obfuscation-Vagueness-Confusion   31
Name_Calling-Labeling   1229
Doubt   705
Guilt_by_Association   63
Appeal_to_Hypocrisy   48
Questioning_the_Reputation   0

 fr
Appeal_to_Authority   116
Appeal_to_Popularity   99
Appeal_to_Values   144
Appeal_to_Fear-Prejudice   272
Flag_Waving   47
Causal_Oversimplification   169
False_Dilemma-No_Choice   102
Consequential_Oversimplification   165
Straw_Man   158
Red_Herring   64
Whataboutism   74
Slogans   176
Appeal_to_Time   55
Conversation_Killer   222
Loaded_Language   1194
Repetition   113
Exaggeration-Minimisation   332
Obfuscation-Vagueness