In [88]:
import json
import re

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [None]:
# Load training data: parse the JSON file, then keep only the text and the
# label list of every record as a (text, labels) pair
with open("./data/train.json", encoding='utf-8') as train_file:
    records = json.load(train_file)
train_data = [(entry["text"], entry["labels"]) for entry in records]

In [109]:
def preprocess_data(data):
    r"""Clean line-break artefacts and stray backslashes out of each record's text.

    Three substitutions are applied to the text, in this order:

    1. a literal backslash followed by ``n`` (two characters) becomes a space;
    2. a real newline character becomes a backslash followed by a dot
       (``re.sub`` emits the replacement ``\.`` verbatim);
    3. every remaining backslash becomes a space, so a newline handled in
       step 2 ends up as ``" ."``.

    Args:
        data: iterable of records where ``record[0]`` is the text string and
            ``record[1]`` is the associated labels object.

    Returns:
        A new list of ``(text, labels)`` tuples. The labels objects are passed
        through unchanged (same object, not a copy).
    """
    cleaned = []
    for record in data:
        text = re.sub(r'\\n', ' ', record[0])
        text = re.sub(r'\n', r'\.', text)
        text = re.sub(r'\\', r' ', text)
        cleaned.append((text, record[1]))
    return cleaned

In [110]:
# Preprocess data
train_data = preprocess_data(train_data)
# Collect every distinct label that occurs in any of the loaded records
labels = {label for record in records for label in record["labels"]}

In [69]:
# Preview the first few preprocessed (text, labels) pairs instead of dumping
# the entire dataset into the cell output
train_data[:10]

[('THIS IS WHY YOU NEEDA SHARPIE WITH YOU AT ALL TIMES',
  ['Black-and-white Fallacy/Dictatorship']),
 ('GOOD NEWS!NAZANIN ZAGHARI-RATCLIFFE AND ANOOSHEH ASHOORI HAVE BEEN RELEASEDAfter years of being unjustly detained in Iran, they are making their way safely back to the UK',
  ['Loaded Language', 'Glittering generalities (Virtue)']),
 ('PAING PHYO MIN IS FREE!', []),
 ('Move your ships away!oooookMove your ships away!No, and I just added 10 more',
  []),
 ("WHEN YOU'RE THE FBI, THEY LET YOU DO IT", ['Thought-terminating cliché']),
 ("PUTIN'S SECRET CAMOUFLAGE ARMY", []),
 ("JANUARY 2021YOU DON'T BUY A 12 MILLION DOLLAR WATERFRONT HOUSE ON MARTHA'S VINEYARD IF YOU REALLY BELIEVE THE OCEANS ARE RISING",
  ['Whataboutism']),
 ("Term Limits Are Everywhere & Politicians Can't Ignore ItTERM LIMITS AHEAD",
  ['Slogans']),
 ("Nicola SturgeonWE'RE SCOTTISH GETUSOUTOFHERE!Will his bushtucker trial involve skydiving into the jungle without the appropriate PPE",
  []),
 ("I saw a movie once wher

In [70]:
# Tokenize and tag each sentence: the lower-cased text is split into word
# tokens and the record's own label list is used as the document tags
tagged_data = [
    TaggedDocument(words=word_tokenize(text.lower()), tags=doc_labels)
    for text, doc_labels in train_data
]

In [147]:
# Train the Doc2Vec model
model = Doc2Vec(
    vector_size=200,
    window=12,
    min_count=3,
    workers=12,
    epochs=20,
)
# Build the vocabulary from the tagged corpus first, then train using the
# corpus size and epoch count stored on the model itself
model.build_vocab(tagged_data)
model.train(
    tagged_data,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [155]:
# Poor man's validation: score the model on every 20th training example.
# These texts are taken from train_data itself, so treat the result as a
# sanity check rather than a held-out score.
threshold = 0.33  # Adjust as needed (loop-invariant, so set once up front)
sampled_train_data = train_data[::20]
iou = 0
# The loop variable is `true_labels` rather than `labels` so that this cell
# does not overwrite the module-level `labels` set built from the records.
for index, (sentence, true_labels) in enumerate(sampled_train_data):
    # Infer a vector for the text and fetch the most similar tag vectors
    new_vector = model.infer_vector(word_tokenize(sentence.lower()), epochs=500, alpha=0.75)
    similarities = model.dv.most_similar([new_vector])
    assigned_labels = [label for label, similarity in similarities if similarity > threshold]
    if not index % 100:
        print(f"Sentence {index}: {sentence}, labels: {true_labels}, assigned: {assigned_labels}")
    expected, predicted = set(true_labels), set(assigned_labels)
    union = expected | predicted
    # When both sets are empty the prediction is counted as a perfect match (1)
    iou += len(expected & predicted) / len(union) if union else 1
# Average over the number of examples actually scored. `len(train_data) / 20`
# is smaller than that count whenever len(train_data) is not a multiple of 20,
# which inflated the reported average.
iou = iou / len(sampled_train_data)
print(f"Average intersection over union: {iou}")

Sentence 0: THIS IS WHY YOU NEEDA SHARPIE WITH YOU AT ALL TIMES, labels: ['Black-and-white Fallacy/Dictatorship'], assigned: ['Doubt', 'Presenting Irrelevant Data (Red Herring)', 'Obfuscation, Intentional vagueness, Confusion']
Sentence 100: D-DAYJUNE 6, 1944ALL GAVE SOME, SOME GAVE ALL, labels: ['Repetition', 'Flag-waving', 'Slogans'], assigned: ['Flag-waving']
Sentence 200: THINK YOUR JOB IS TOUGHHOW'D YOU LIKE TO BE THE SIGN LANGUAGE INTERPRETER AT BIDEN SPEECH, labels: ['Smears'], assigned: ['Smears']
Sentence 300: WHEN ONE IS DEPRIVED OF ONES LIBERTY, ONE IS RIGHT IN BLAMING NOT SO MUCH THE MAN WHO PUTS THE SHACKLES ON AS THE ONE WHO HAD THE POWER TO PREVENT HIM, BUT DID NOT USE ITThucydides, History of the Peloponnesian War, labels: ['Appeal to authority'], assigned: ['Appeal to authority', 'Thought-terminating cliché']
Average intersection over union: 0.3306326530612244


In [116]:
# # Example using Gaussian Mixture Model (GMM) clustering
# num_clusters = len(labels)
# gmm = GaussianMixture(n_components=num_clusters)
# gmm.fit(model.dv.vectors)

In [115]:
# # Get cluster assignments for each data point
# cluster_labels = gmm.predict(model.dv.vectors)
# 
# # Count the occurrences of each cluster label
# cluster_sizes = {label: np.sum(cluster_labels == label) for label in np.unique(cluster_labels)}
# 
# # Print cluster sizes
# for label, size in cluster_sizes.items():
#     print(f"Cluster {label}: Size {size}")

In [119]:
# Map each position in the model's tag index to the tag stored there
index_to_key = dict(enumerate(model.dv.index_to_key))
# Load validation data. The parsed JSON is kept in its own variable so the
# training `records` are not overwritten; re-running the cell that derives
# `labels` from `records` would otherwise silently use validation data.
with open("./data/validation.json", encoding='utf-8') as validation_file:
    validation_records = json.load(validation_file)
# Build (text, labels) pairs and apply the same clean-up used for training
validation_data = preprocess_data(
    [(record["text"], record["labels"]) for record in validation_records]
)

In [164]:
# Score the model on the validation set with mean intersection-over-union
# between the true label set and the labels assigned by similarity.
threshold = 0.5  # Adjust as needed (loop-invariant, so set once up front)
iou = 0
# The loop variable is `true_labels` rather than `labels` so that this cell
# does not overwrite the module-level `labels` set built from the records.
for index, (sentence, true_labels) in enumerate(validation_data):
    # Infer a vector for the text and fetch the most similar tag vectors
    new_vector = model.infer_vector(word_tokenize(sentence.lower()), epochs=70, alpha=0.75)
    similarities = model.dv.most_similar([new_vector])
    assigned_labels = [label for label, similarity in similarities if similarity > threshold]
    print(f"Sentence {index}: {sentence}, labels: {true_labels}, assigned: {assigned_labels}")
    expected, predicted = set(true_labels), set(assigned_labels)
    union = expected | predicted
    # When both sets are empty the prediction is counted as a perfect match (1)
    iou += len(expected & predicted) / len(union) if union else 1
iou = iou / len(validation_data)
print(f"Average intersection over union: {iou}")

Sentence 0: Critical Thinking Essentials  Are my biases affecting how I examine the issue?    Am I using information that can be verified with reliable data?    Am I basing my position on what I KNOW to be the truth, or what I WANT to be the truth?    I might be wrong.  (A little humility goes a long way.) , labels: ['Doubt', 'Slogans'], assigned: []
Sentence 1: Trying to think of a single accomplishment..., labels: ['Exaggeration/Minimisation', 'Smears'], assigned: []
Sentence 2: Corporate needs you to find the difference between this picture and this picture  They're the same picture , labels: ['Thought-terminating cliché'], assigned: ['Presenting Irrelevant Data (Red Herring)']
Sentence 3: KYLE RITTENHOUSE ALL CHARGES NOT GUILTY, labels: ['Glittering generalities (Virtue)'], assigned: []
Sentence 4: Al Franken explains why America should tax the rich and build back better!   You know when a bridge collapses, a Mercedes falls just as fast as a Hyundai.  , labels: ['Appeal to authorit

In [114]:
# # Get probabilities for each cluster for the new sentence
# average_accuracy = 0
# for sentence, labels in validation_data:
#     new_vector = model.infer_vector(word_tokenize(sentence.lower()))
#     cluster_probabilities = gmm.predict_proba([new_vector])[0]
#     
#     # Threshold for considering a cluster
#     threshold = 0.9
#     
#     # Identify clusters above the threshold
#     selected_clusters = [i for i, prob in enumerate(cluster_probabilities) if prob > threshold]
#     cluster_names = [index_to_key[cluster] for cluster in selected_clusters]
#     average_accuracy += len(set(labels) & set(cluster_names))
#     print(cluster_names, labels, selected_clusters)
# average_accuracy /= len(validation_data)
# print(f"Average accuracy: {average_accuracy}")

In [18]:
# List every tag known to the model next to its position in the tag index
[(position, tag) for position, tag in enumerate(model.dv.index_to_key)]

[(0, 'Black-and-white Fallacy/Dictatorship'),
 (1, 'Loaded Language'),
 (2, 'Glittering generalities (Virtue)'),
 (3, 'Thought-terminating cliché'),
 (4, 'Whataboutism'),
 (5, 'Slogans'),
 (6, 'Causal Oversimplification'),
 (7, 'Smears'),
 (8, 'Name calling/Labeling'),
 (9, 'Appeal to authority'),
 (10, 'Exaggeration/Minimisation'),
 (11, 'Repetition'),
 (12, 'Flag-waving'),
 (13, 'Appeal to fear/prejudice'),
 (14, 'Reductio ad hitlerum'),
 (15, 'Doubt'),
 (16, "Misrepresentation of Someone's Position (Straw Man)"),
 (17, 'Obfuscation, Intentional vagueness, Confusion'),
 (18, 'Bandwagon'),
 (19, 'Presenting Irrelevant Data (Red Herring)')]