In [1]:
# Libraries
import pandas as pd
import datasets
import os

from sklearn.model_selection import train_test_split
from ml_things import plot_confusion_matrix
from transformers import logging
from tqdm import tqdm

# Custom functions/classes/variables
from utils import preprocessing, models_handler
from utils.multiagent import WeightedAverage, ProbabilitiesSum, Plurality, Borda
from utils.constants import TEST_SAMPLES, RANDOM_STATE, TEST_SIZE


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fdavi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fdavi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fdavi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Keep the notebook output readable: only messages at ERROR level or above
# from the transformers library are shown.
logging.set_verbosity_error()

In [3]:
# Build (first run) or reload (later runs) the preprocessed dataset.
# The result is a DataFrame `data` with exactly two columns: 'text' and 'class'.
# The cache file is resolved relative to the current working directory.
CLEANED_DATA_PATH = 'cleaned_data.csv'

if not os.path.isfile(CLEANED_DATA_PATH):

    print("Cleaned dataset not found, proceeding to create it.")

    # Load the dataset
    data = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech')['train'].to_pandas()

    # Bucket the continuous hate_speech_score into three classes:
    #   2 -> score < -1
    #   1 -> -1 <= score < 0.5
    #   0 -> score >= 0.5
    # The lower bound of class 1 is inclusive on purpose: with a strict `> -1`
    # a score of exactly -1 matched neither of the first two tests and was
    # silently put into class 0.
    # NOTE(review): the dataset card reportedly describes > 0.5 as hate speech,
    # < -1 as counter/supportive speech and the range in between as neutral or
    # ambiguous - confirm against the card before relying on these meanings.
    data['class'] = [2 if c < -1 else (1 if c < 0.5 else 0) for c in data['hate_speech_score']]

    # Preprocess the text column
    data['text'] = preprocessing.apply_all(data['text'])

    # Drop rows without usable text. Empty strings are removed as well because
    # pd.read_csv turns empty fields into NaN, so keeping them here would make
    # this branch and the reload branch below disagree on the row set.
    data = data[data['text'].notna() & (data['text'] != '')]

    # Keeping only the wanted columns
    data = data[['text', 'class']]
    data.to_csv(CLEANED_DATA_PATH, index = False)
else:
    print("Cleaned dataset found, loading it.")

    data = pd.read_csv(CLEANED_DATA_PATH)

    # A cache written by an earlier version may contain empty texts, which come
    # back as NaN; filter them so no NaN reaches the vectorizer / tokenizers.
    data = data[data['text'].notna()]


# Train/test split over plain Python lists of texts and class labels.
# TEST_SIZE and RANDOM_STATE come from utils.constants.
print("Splitting the dataset.")
all_texts = list(data["text"])
all_labels = list(data["class"])
X_train, X_test, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Obtain every model together with its tokenizer (or vectorizer for logreg).
# NOTE(review): the meaning of the `True` flag is defined in
# utils.models_handler and is not visible here - confirm before changing it.
print("Initializing models.")
model_gpt2, tokenizer_gpt2 = models_handler.get_gpt2(True)
model_bert, tokenizer_bert = models_handler.get_bert(True)
model_xlnet, tokenizer_xlnet = models_handler.get_xlnet(True)
model_logreg, vectorizer_logreg = models_handler.get_logreg(True)

# Prepare the evaluation inputs
print("Testing the models.")

# Evaluate on the first TEST_SAMPLES items from the test set
input_sentences = X_test[:TEST_SAMPLES]
output_labels = y_test[:TEST_SAMPLES]

# The logistic regression model consumes the vectorized sentences,
# the neural models receive the raw sentences.
input_lr = vectorizer_logreg.transform(input_sentences)

# Saved per-model accuracies, handed to every voting rule on each call
accuracies = models_handler.get_accuracies()

# One instance per multiagent voting rule; each is called once per test sample
rules = [
    WeightedAverage(),
    ProbabilitiesSum(),
    Plurality(),
    Borda(),
]

# Run every model on each test sample and pass the predictions to the voting rules.
#   lr_features -> input for the logreg model (one item of input_lr)
#   sentence    -> input for the NN models: gpt2, xlnet and bert
#   label       -> expected class of the sample
#
# NOTE(review): the output recorded for this cell stops on the first sample with
# "TypeError: Borda.get_borda_scores() takes 1 positional argument but 2 were given".
# Borda is imported from utils.multiagent, so the fix presumably belongs there.
for lr_features, sentence, label in tqdm(
    zip(input_lr, input_sentences, output_labels),
    # zip() has no length, so tqdm needs an explicit total. Use the real number
    # of selected samples: it is smaller than TEST_SAMPLES whenever the test
    # split holds fewer items, and the progress bar would otherwise never fill.
    total=len(input_sentences),
):

    # Make predictions (insertion order kept: lr, xlnet, bert, gpt2)
    predictions = {
        'lr': models_handler.predict_lr(model_logreg, lr_features),
        'xlnet': models_handler.predict_nn(model_xlnet, tokenizer_xlnet, sentence),
        'bert': models_handler.predict_nn(model_bert, tokenizer_bert, sentence),
        'gpt2': models_handler.predict_nn(model_gpt2, tokenizer_gpt2, sentence),
    }

    # Calling each multiagent voting rule and updating the internal results
    for rule in rules:
        rule(accuracies, predictions, label)


# For every voting rule: print its name and hit count, then draw the
# normalized confusion matrix over the three classes.
for rule in rules:
    print("Multiagent rule: ", rule.__class__.__name__)

    summary = f"Predicted correctly {rule.correct_counter} out of {len(input_sentences)} sentences (accuracy {rule.correct_counter / len(input_sentences)})."
    print(summary)

    plot_confusion_matrix(
        y_true=output_labels,
        y_pred=rule.predicted_classes,
        classes=[0, 1, 2],
        normalize=True,
        use_title=f"Confusion Matrix - Multiagent ({rule.__class__.__name__})",
    )


Cleaned dataset found, loading it.
Splitting the dataset.
Initializing models.
Testing the models.


  0%|          | 0/1000 [00:00<?, ?it/s]

[1.91735801e-04 2.94838786e-07 9.99807969e-01]





TypeError: Borda.get_borda_scores() takes 1 positional argument but 2 were given

In [None]:
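# NOTE: scratch cell, intentionally left commented out. When enabled it refits a
# LogisticRegression on the training split, prints its accuracy on the test split
# and dumps the fitted model to models/LogisticRegression_model.joblib.
# It also rebinds input_sentences, output_labels and input_lr to the training
# data, so re-run the evaluation cell afterwards if this cell is ever executed.
# from sklearn.linear_model import LogisticRegression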
# from sklearn.metrics import accuracy_score

# input_sentences = X_train
# output_labels = y_train
# print(data[data['text'].isnull()].index.tolist())
# print(data[data['text'].isnull()]['class'].tolist())
# input_lr = vectorizer_logreg.transform(input_sentences)

# model = LogisticRegression(C=1000, max_iter=300, solver='newton-cg')
# model.fit(input_lr, y_train)

# X_test_vect = vectorizer_logreg.transform(X_test)
# logreg_pred = model.predict(X_test_vect)
# print(f"Accuracy: {accuracy_score(logreg_pred, y_test):.3f}")

# from joblib import dump

# dump(model, os.path.join(os.getcwd(), "models", 'LogisticRegression_model.joblib'))

[]
[]
Accuracy: 0.897


['c:\\Users\\fdavi\\Desktop\\HateSpeechDetection\\models\\LogisticRegression_model.joblib']