In [78]:
import os
import glob
import random
import nltk
import numpy as np
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

lemmatizer = nltk.WordNetLemmatizer()

In [50]:
def load_datasets(parent_folder_name):
    dataset=[]
    # Get a list of all items in the folder
    items = os.listdir(parent_folder_name)

    # Filter out folders from the list
    folder_list = [item for item in items if os.path.isdir(os.path.join(parent_folder_name, item))]
    for folder_name in folder_list:
        file_list = glob.glob(os.path.join(os.path.join(parent_folder_name,folder_name), "*"))

        # Iterate over each file
        for file_path in file_list:
            with open(file_path, "r") as file:
                content = file.read()
                dataset.append([content,folder_name])
    return dataset


def get_embedding(key,glove):
    if key in glove:
        return glove.get_vector(key)
    else:
        return np.zeros(300)  # Return a zero vector if the key is not found

def embed_dataset(dataset,glove):
    for tuple in dataset:
        tuple[0]=embed_document(tuple[0],glove)
    return dataset

def embed_document(document,glove):
    # Tokenize the sentence into words
    words = nltk.word_tokenize(document.lower())

    # Embed each word and calculate the average embedding
    embeddings = [get_embedding(word, glove) for word in words]
    return np.mean(embeddings, axis=0)
def get_dataset_elements_with_label(dataset,label):
    elements=[]
    for elem in dataset:
        if(elem[1]==label):
            elements.append(elem)
    return elements

In [42]:
dataset=load_datasets("data-es4/data/20_NGs_400")
glove_file = "glove.6B/glove.6B.300d.txt"
glove = KeyedVectors.load_word2vec_format(glove_file, no_header=True)

In [43]:
dataset=embed_dataset(dataset,glove)

  return glove.word_vec(key)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [125]:
random.seed(22)
labels_set=set([t[1] for t in dataset])
x_train_data= []
y_train_data= []
x_test_data= []
y_test_data= []
for label in labels_set:
    label_dataset=get_dataset_elements_with_label(dataset,label)
    train_set, test_set = train_test_split(label_dataset, test_size=0.1, random_state=21)
    for elem in train_set:
        x_train_data.append(elem[0])
        y_train_data.append(elem[1])
    for elem in test_set:
        x_test_data.append(elem[0])
        y_test_data.append(elem[1])

In [126]:
def get_features_with_same_label(features,labels,label):
    correct_features=[]
    for i,elem in enumerate(labels):
        if(elem==label):
            correct_features.append(features[i])
    return correct_features

def get_features_with_different_label(features,labels,label):
    correct_features=[]
    for i,elem in enumerate(labels):
        if(elem!=label):
            correct_features.append(features[i])
    return correct_features

def train_rocchio(features,labels,beta,gamma):
    possible_labels=set(labels)
    model=[]
    for label in possible_labels:
        positive_examples=get_features_with_same_label(features,labels,label)
        negative_examples=get_features_with_different_label(features,labels,label)
        positive_examples=beta*np.mean(positive_examples,axis=0)
        negative_examples=gamma*np.mean(negative_examples,axis=0)
        model.append(((positive_examples - negative_examples),label))
    return model

def predict(elem,model):
    best_predict=None
    best_similarity=-1
    for label in model:
        similarity = cosine_similarity(elem.reshape(1, -1), label[0].reshape(1, -1))[0]
        if(best_similarity==-1):
            best_similarity=similarity
            best_predict=label[1]
        elif(similarity>best_similarity):
            best_similarity=similarity
            best_predict=label[1]
    return best_predict,best_similarity

In [127]:
model=train_rocchio(x_train_data,y_test_data,16,4)
hit=0
for i,elem in enumerate(x_test_data):
    predicted_value,sim=predict(elem,model)
    if(predicted_value==y_test_data[i]):
        hit+=1
    else:
        print("ERROR, EXPECTED",y_test_data[i],"GOT",predicted_value,"WITH SIM: ",sim)

print("Accuracy:",len(x_test_data)/hit,"%")

ERROR, EXPECTED rec.sport.hockey GOT talk.politics.guns WITH SIM:  [0.90316334]
ERROR, EXPECTED rec.sport.hockey GOT comp.sys.mac.hardware WITH SIM:  [0.96967356]
ERROR, EXPECTED talk.politics.guns GOT alt.atheism WITH SIM:  [0.98339126]
ERROR, EXPECTED talk.politics.guns GOT sci.electronics WITH SIM:  [0.97862446]
ERROR, EXPECTED sci.space GOT soc.religion.christian WITH SIM:  [0.81760462]
ERROR, EXPECTED sci.space GOT comp.windows.x WITH SIM:  [0.97825692]
ERROR, EXPECTED talk.politics.mideast GOT sci.med WITH SIM:  [0.98463395]
ERROR, EXPECTED talk.politics.mideast GOT comp.os.ms-windows.misc WITH SIM:  [0.80014708]
ERROR, EXPECTED talk.politics.misc GOT talk.religion.misc WITH SIM:  [0.97013493]
ERROR, EXPECTED talk.politics.misc GOT sci.crypt WITH SIM:  [0.96538702]
ERROR, EXPECTED rec.autos GOT comp.sys.mac.hardware WITH SIM:  [0.9674662]
ERROR, EXPECTED rec.autos GOT rec.motorcycles WITH SIM:  [0.89145358]
ERROR, EXPECTED soc.religion.christian GOT talk.politics.mideast WITH SIM