# User Input

In [None]:
# Folder holding the lexicon files of each source, keyed by source name.
# Every regular file found directly in a folder is read as a CSV lexicon, so keep only lexicon files there.
lexicons_loc_map = {
    'nasdaq': '/content/drive/MyDrive/nasdaq/lexicons',
    'fpb': '/content/drive/MyDrive/fpb/lexicons',
    'sentfin': '/content/drive/MyDrive/sentfin/lexicons'
}

# Folder in which the CSV with the measured execution times is saved.
time_evaluation_df_loc = '/content/drive/MyDrive/'


# Location of the saved tokenizer object (it is loaded with torch.load further down).
tokenizer_loc = '/content/drive/MyDrive/roberta_tokenizer'

# Folder holding all evaluation datasets.
# Every regular file found directly in it is read as a CSV dataset, so keep only evaluation files there.
eval_datasets_folder_loc = '/content/drive/MyDrive/datasets/evaluation datasets'

# ShapDictModel

In [None]:
!pip install transformers==4.31.0

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, precision_score, f1_score, recall_score, matthews_corrcoef
import re
from sklearn.metrics import confusion_matrix
import torch
import nltk
import torch
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

class ShapDictModel:
  """Lexicon-based sentence classifier that combines two word dictionaries.

  Both dictionaries map a word to a sequence laid out as
  (category, score, opposite-category score). A sentence is tokenized, every
  token is cleaned and lemmatized, and the weighted scores of all tokens that
  occur in BOTH dictionaries are summed. The sign of that sum is the label.
  """

  def __init__(self, tm_words, lm_words, tokenizer, word_column, category_column, decision_makers, count_column):
    """Store the dictionaries, the tokenizer and the naming constants.

    Args:
      tm_words: mapping word -> (category, score, opposite score) for the 'TM' source.
      lm_words: mapping word -> (category, score, opposite score) for the 'LM' source.
      tokenizer: object exposing tokenize(sentence) that returns a sequence of tokens.
      word_column: name of the word column (stored only).
      category_column: name of the category column (stored only).
      decision_makers: names of the columns used for the decision (stored only).
      count_column: name of the count column (stored only).
    """
    # the two dictionaries the scores are looked up in
    self.tm_words = tm_words
    self.lm_words = lm_words

    # text processing helpers
    self.tokenizer = tokenizer
    self.lemmatizer = WordNetLemmatizer()

    # column names handed in by the caller
    self.word_column = word_column
    self.count_column = count_column
    self.category = category_column
    self.decision_makers = decision_makers

    # both sources take part in the decision
    self.dataset_source = 'both'
    self.calculate_score = self.calculate_score_both_dataset_sources

    # values identifying each source and the matching column prefixes
    self.lm_source = 'LM'
    self.tm_source = 'OUR_WORDS'
    self.lm_prefix = 'LM_'
    self.tm_prefix = 'TM_'

    # values of the category entry that carry a sign
    self.positive_category_value = 'positive'
    self.negative_category_value = 'negative'

    # remaining naming constants
    self.opposite_prefix = 'opposite_'
    self.prefix = ''
    self.source_column = 'src'

  def calculate_dataset_source_score(self, words):
    """Return the signed (category score, opposite score) pair of one dictionary entry.

    A positive entry keeps its own score and negates the opposite one, a
    negative entry does the reverse, and any other category yields (0, 0).
    """
    category = words[0]

    if category == self.positive_category_value:
      own_sign, other_sign = 1, -1
    elif category == self.negative_category_value:
      own_sign, other_sign = -1, 1
    else:
      # neither positive nor negative: the entry does not contribute
      return 0, 0

    return words[1] * own_sign, words[2] * other_sign

  def calculate_score_both_dataset_sources(self, word, coefficients):
    """Return the weighted sum of the four signed scores of `word`.

    `coefficients` must unpack into four weights, applied in the order
    TM score, TM opposite score, LM score, LM opposite score.
    """
    tm_own, tm_other = self.calculate_dataset_source_score(self.tm_words[word])
    lm_own, lm_other = self.calculate_dataset_source_score(self.lm_words[word])

    w_tm_own, w_tm_other, w_lm_own, w_lm_other = coefficients

    return w_tm_own * tm_own + w_tm_other * tm_other + w_lm_own * lm_own + w_lm_other * lm_other

  def decide(self, word, coefficients):
    """Score a single word; words missing from either dictionary score 0."""
    in_both_dictionaries = word in self.tm_words and word in self.lm_words

    return self.calculate_score(word, coefficients) if in_both_dictionaries else 0

  def predict_sentence_label(self, sentence, label_t, coefficients):
    """Return 1 for a positive total score, 0 for a negative one and -1 for a zero total.

    `label_t` is accepted but not used by the prediction.
    """
    tokens = self.tokenizer.tokenize(sentence)
    lemmas = self.clean_and_lemmatize_words(tokens)
    total_score = sum(self.decide(lemma, coefficients) for lemma in lemmas)

    if total_score > 0:
      return 1
    if total_score < 0:
      return 0
    return -1

  def __get_wordnet_pos(self, word):
    """Tag the single word and translate the tag's first letter to a WordNet POS (noun by default)."""
    _, pos_tag = nltk.pos_tag([word])[0]
    first_letter = pos_tag[0].upper()

    wordnet_pos_by_letter = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }

    return wordnet_pos_by_letter.get(first_letter, wordnet.NOUN)

  def clean_and_lemmatize_words(self, words):
    """Lower-case every token, drop the 'ġ' marker and surrounding whitespace, then lemmatize it."""
    lemmas = []
    for raw_token in words:
      # 'ġ' is the lower-cased form of 'Ġ' - presumably the tokenizer's word-boundary marker, TODO confirm
      # NOTE(review): a token made only of that marker becomes '' and is still tagged - confirm the installed NLTK accepts empty strings
      token = str(raw_token).lower().replace('ġ', '').strip()
      lemmas.append(self.lemmatizer.lemmatize(token, self.__get_wordnet_pos(token)))

    return lemmas

  def predict_and_evaluate(self, sentences, true_labels, coefficients):
    """Return the predicted label of every sentence, in input order.

    The true labels are only forwarded to predict_sentence_label; no metric is computed here.
    """
    predicted_labels = []
    for sentence, true_label in zip(sentences, true_labels):
      predicted_labels.append(self.predict_sentence_label(sentence, true_label, coefficients))

    return predicted_labels

In [None]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd

def extract_file_name(file_loc):
  """Return the last '/'-separated component of a location, cut at its first dot."""
  last_component = file_loc.rsplit('/', 1)[-1]
  name, _, _ = last_component.partition('.')
  return name

def extract_datasets_map(datasets_location):
  """Read every regular file directly inside a folder as a CSV dataset.

  Args:
    datasets_location: path of the folder; a trailing '/' is optional.

  Returns:
    dict mapping each file's name (as produced by extract_file_name) to the
    DataFrame read from it. Sub-folders are ignored.

  Raises:
    AssertionError: if the folder contains no regular files.
  """
  location = datasets_location if datasets_location[-1] == '/' else f'{datasets_location}/'
  files_locations = [join(location, f) for f in listdir(location) if isfile(join(location, f))]

  print(f'Reading datasets from: {location} ...')

  # The list itself has to be tested: a list never equals the integer 0,
  # so comparing against 0 would let an empty folder pass silently.
  assert files_locations, 'No files found in the provided location'

  datasets_map = {}
  for f in files_locations:
    print(f'Reading dataset: {f} ...')
    dataset = pd.read_csv(f)
    datasets_map[extract_file_name(f)] = dataset

  print(f'Reading datasets successfully finished ...')

  return datasets_map


def create_results_folder(loc):
  """Make sure a 'results' folder exists next to `loc` and return its path.

  The folder is placed in the absolute parent directory of `loc` and is only
  created when nothing exists at that path yet.
  """
  parent_dir = os.path.abspath(os.path.join(loc, os.pardir))
  if not parent_dir.endswith('/'):
    parent_dir = f'{parent_dir}/'

  results_location = parent_dir + 'results'

  already_present = os.path.exists(results_location)
  if not already_present:
    os.makedirs(results_location)

  # the message is printed whether or not the folder had to be created
  print(f'Created results dataset on location: {results_location} ...')

  return results_location

# Time Evaluation

In [None]:
def create_dicts(prefix, dataset):
  """Build the word lookup of one source from a lexicon DataFrame.

  Each word maps to the remaining values of its row, in the order
  `{prefix}_category`, `{prefix}_average_shap_values`,
  `{prefix}_opposite_average_shap_values`. When a word occurs more than once
  the last row wins.
  """
  # only the columns that take part in the decision making process
  needed_columns = [
      'word',
      f'{prefix}_category',
      f'{prefix}_average_shap_values',
      f'{prefix}_opposite_average_shap_values',
  ]

  return {row[0]: row[1:] for row in dataset[needed_columns].values}

In [None]:
# Evaluating lexicons execution time

import time
import timeit

def run_model():
  """Run one full prediction pass; this is the callable that gets timed.

  Reads `shap_dict_model`, `sentences` and `labels` from the global namespace,
  so all three have to be assigned before the call.
  """
  # fixed weights, in the order TM score, TM opposite score, LM score, LM opposite score
  coefficients = [0.8, 0.2, 0.9, 0.5]
  return shap_dict_model.predict_and_evaluate(sentences, labels, coefficients)

# Load all evaluation datasets and the tokenizer once, before any timing starts.
eval_datasets_map = extract_datasets_map(eval_datasets_folder_loc)
# NOTE(review): torch.load unpickles the file and can therefore execute arbitrary code -
# only point tokenizer_loc at a file you trust.
tokenizer = torch.load(tokenizer_loc)

# number of repetitions every measurement is averaged over
num_times = 10

# one entry per (lexicon, evaluation dataset) pair
rows = []

for src, lexicon_loc in lexicons_loc_map.items():
  lexicon_map = extract_datasets_map(lexicon_loc)

  for lex, df in lexicon_map.items():
    # the same lexicon file provides the word lookup of both sources
    tm_dict = create_dicts('TM', df)
    lm_dict = create_dicts('LM', df)

    for eval_name, eval_df in eval_datasets_map.items():
      # lexicons of the 'fpb' source are not timed on the financial_phrase_bank dataset
      # (presumably because they were built from it - TODO confirm)
      if src == 'fpb' and eval_name == 'financial_phrase_bank':
        continue

      # run_model() reads sentences, labels and shap_dict_model from the global namespace,
      # so they have to be (re)assigned here before timing
      sentences = eval_df.text.values
      labels = eval_df.sentiment.values

      shap_dict_model = ShapDictModel(tm_dict, lm_dict, tokenizer, 'word', 'category', ['average_shap_values'], 'count')

      # timeit returns the total time of all repetitions; store the average of a single run
      execution_time = timeit.timeit(run_model, number=num_times)
      duration = execution_time / num_times

      lex_num = len(df)
      sent_num = len(sentences)
      # lexicons are recognised as normalized by their file name
      normalized = 'normalized' in lex
      new_row = [src, normalized, lex_num, eval_name, sent_num, duration]
      rows.append(new_row)

cols = ['Source Lexicon', 'Normalized', 'Lexicon No. Words', 'Eval Dataset', 'Sentences No.', 'Time in s']

time_df = pd.DataFrame(rows, columns = cols)

# os.path.join copes with a folder given with or without a trailing '/',
# where plain concatenation produced a doubled separator (or a root-level path for an empty folder)
time_df_loc = os.path.join(time_evaluation_df_loc, 'roberta_lexicons_time_evaluation_average_10_times.csv')
time_df.to_csv(time_df_loc, index=False)