In [24]:
import math
import os
import glob
import random
import nltk
import numpy as np
import math
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

lemmatizer = nltk.WordNetLemmatizer()

In [2]:
def load_datasets(parent_folder_name):
    dataset = []
    # Get a list of all items in the folder
    items = os.listdir(parent_folder_name)

    # Filter out folders from the list
    folder_list = [item for item in items if os.path.isdir(os.path.join(parent_folder_name, item))]
    for folder_name in folder_list:
        file_list = glob.glob(os.path.join(os.path.join(parent_folder_name, folder_name), "*"))

        # Iterate over each file
        for file_path in file_list:
            with open(file_path, "r") as file:
                content = file.read()
                dataset.append([content, folder_name])
    return dataset


def get_embedding(key, glove):
    if key in glove:
        return glove.get_vector(key)
    else:
        return np.zeros(300)  # Return a zero vector if the key is not found


def embed_dataset(dataset, glove):
    for tuple in dataset:
        tuple[0] = embed_document(tuple[0], glove)
    return dataset


def embed_document(document, glove):
    # Tokenize the sentence into words
    words = nltk.word_tokenize(document.lower())

    # Embed each word and calculate the average embedding
    embeddings = [get_embedding(word, glove) for word in words]
    return np.mean(embeddings, axis=0)


def get_dataset_elements_with_label(dataset, label):
    elements = []
    for elem in dataset:
        if (elem[1] == label):
            elements.append(elem)
    return elements

In [None]:
dataset = load_datasets("data-es4/data/20_NGs_400")
glove_file = "glove.6B/glove.6B.300d.txt"
glove = KeyedVectors.load_word2vec_format(glove_file, no_header=True)

In [11]:
dataset = embed_dataset(dataset, glove)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [112]:
random.seed(22)
labels_set = set([t[1] for t in dataset])
x_train_data = []
y_train_data = []
x_test_data = []
y_test_data = []
for label in labels_set:
    label_dataset = get_dataset_elements_with_label(dataset, label)
    train_set, test_set = train_test_split(label_dataset, test_size=0.1, random_state=19)
    for elem in train_set:
        if(isinstance(elem[0], np.ndarray)):
            x_train_data.append(elem[0])
            y_train_data.append(elem[1])
    for elem in test_set:
         if(isinstance(elem[0], np.ndarray)):
            x_test_data.append(elem[0])
            y_test_data.append(elem[1])

In [113]:
def get_features_with_same_label(features, labels, label):
    correct_features = []
    for i, elem in enumerate(labels):
        if (elem == label):
            correct_features.append(features[i])
    return correct_features


def get_features_with_different_label(features, labels, label):
    correct_features = []
    for i, elem in enumerate(labels):
        if (elem != label):
            correct_features.append(features[i])
    return correct_features


def train_rocchio(features, labels, beta, gamma):
    possible_labels = set(labels)
    model = []
    for label in possible_labels:
        positive_examples = get_features_with_same_label(features, labels, label)
        negative_examples = get_features_with_different_label(features, labels, label)
        positive_examples = beta * np.mean(positive_examples, axis=0)
        negative_examples = gamma * np.mean(negative_examples, axis=0)
        model.append(((positive_examples - negative_examples), label))
    return model


def predict(elem, model):
    best_predict = None
    best_similarity = -1
    for label in model:
        similarity = cosine_similarity(elem.reshape(1, -1), label[0].reshape(1, -1))[0]
        if (best_similarity == -1):
            best_similarity = similarity
            best_predict = label[1]
        elif (similarity > best_similarity):
            best_similarity = similarity
            best_predict = label[1]
    return best_predict, best_similarity

In [127]:
model = train_rocchio(x_train_data, y_train_data, 16, 4)
hit = 0

for i, elem in enumerate(x_test_data):
    predicted_value, sim = predict(elem, model)
    if (predicted_value == y_test_data[i]):
        hit += 1
    else:
        print("ERROR, EXPECTED", y_test_data[i], "GOT", predicted_value, "WITH SIM: ", sim)
if(hit==0):
    prec=0
else:
    perc=(hit / len(x_test_data))*100
print("Accuracy:", perc, "%", "N_tests:",len(x_test_data),"Hits:",hit)

ERROR, EXPECTED misc.forsale GOT comp.windows.x WITH SIM:  [0.91607043]
ERROR, EXPECTED misc.forsale GOT talk.politics.guns WITH SIM:  [0.97130805]
ERROR, EXPECTED comp.windows.x GOT sci.crypt WITH SIM:  [0.98133192]
ERROR, EXPECTED comp.windows.x GOT comp.os.ms-windows.misc WITH SIM:  [0.98372821]
ERROR, EXPECTED talk.religion.misc GOT sci.crypt WITH SIM:  [0.97989356]
ERROR, EXPECTED rec.sport.hockey GOT rec.sport.baseball WITH SIM:  [0.9009234]
ERROR, EXPECTED rec.sport.hockey GOT soc.religion.christian WITH SIM:  [0.97183792]
ERROR, EXPECTED soc.religion.christian GOT talk.politics.misc WITH SIM:  [0.9747526]
ERROR, EXPECTED talk.politics.guns GOT talk.politics.mideast WITH SIM:  [0.97036089]
ERROR, EXPECTED comp.os.ms-windows.misc GOT comp.windows.x WITH SIM:  [0.93806999]
ERROR, EXPECTED comp.os.ms-windows.misc GOT sci.crypt WITH SIM:  [0.97973088]
ERROR, EXPECTED sci.electronics GOT comp.windows.x WITH SIM:  [0.95299052]
ERROR, EXPECTED rec.sport.baseball GOT sci.med WITH SIM:  

In [131]:
SIMILARITY_TRESHOLD = 0.5


def get_features_with_same_label(features, labels, label):
    correct_features = []
    for i, elem in enumerate(labels):
        if (elem == label):
            correct_features.append(features[i])
    return correct_features


def calc_avg_similarity(positives, feature):
    sims = []
    for elem in positives:
        sims.append(cosine_similarity(elem.reshape(1, -1), feature.reshape(1, -1))[0])
    return np.mean(sims)


def get_features_with_nears_positives(features, positives, labels, label):
    correct_features = []
    for i, elem in enumerate(labels):
        if (elem != label):
            sim = calc_avg_similarity(positives,features[i])
            if (sim >= SIMILARITY_TRESHOLD):
                correct_features.append(features[i])
    return correct_features


def train_rocchio_with_near_positives(features, labels, beta, gamma):
    possible_labels = set(labels)
    model = []
    for label in possible_labels:
        positive_examples = get_features_with_same_label(features, labels, label)
        near_positives_examples = get_features_with_nears_positives(features, positive_examples, labels, label)
        positive_examples = beta * np.mean(positive_examples, axis=0)
        near_positives_examples = gamma * np.mean(near_positives_examples, axis=0)
        model.append(((positive_examples - near_positives_examples), label))
    return model

In [132]:
model = train_rocchio_with_near_positives(x_train_data, y_train_data, 16, 4)
hit = 0
for i, elem in enumerate(x_test_data):
    predicted_value, sim = predict(elem, model)
    if (predicted_value == y_test_data[i]):
        hit += 1
    else:
        print("ERROR, EXPECTED", y_test_data[i], "GOT", predicted_value, "WITH SIM: ", sim)

if(hit==0):
    prec=0
else:
    perc=(hit / len(x_test_data))*100
print("Accuracy:", perc, "%", "N_tests:",len(x_test_data),"Hits:",hit)

ERROR, EXPECTED misc.forsale GOT comp.windows.x WITH SIM:  [0.91604151]
ERROR, EXPECTED misc.forsale GOT talk.politics.guns WITH SIM:  [0.97126263]
ERROR, EXPECTED comp.windows.x GOT sci.crypt WITH SIM:  [0.98127539]
ERROR, EXPECTED comp.windows.x GOT comp.os.ms-windows.misc WITH SIM:  [0.9837652]
ERROR, EXPECTED talk.religion.misc GOT sci.crypt WITH SIM:  [0.97990536]
ERROR, EXPECTED rec.sport.hockey GOT rec.sport.baseball WITH SIM:  [0.90095107]
ERROR, EXPECTED rec.sport.hockey GOT soc.religion.christian WITH SIM:  [0.97187181]
ERROR, EXPECTED soc.religion.christian GOT talk.politics.misc WITH SIM:  [0.97473513]
ERROR, EXPECTED talk.politics.guns GOT talk.politics.mideast WITH SIM:  [0.9703178]
ERROR, EXPECTED comp.os.ms-windows.misc GOT comp.windows.x WITH SIM:  [0.93814874]
ERROR, EXPECTED comp.os.ms-windows.misc GOT sci.crypt WITH SIM:  [0.97969467]
ERROR, EXPECTED sci.electronics GOT comp.windows.x WITH SIM:  [0.95306575]
ERROR, EXPECTED rec.sport.baseball GOT sci.med WITH SIM:  