In [33]:
%load_ext autoreload
%autoreload 2
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.neighbors import NearestNeighbors 
from collections import defaultdict

from model import SpecialDataLoader

# Load the `dataset_subset10000.csv` subset, requesting testing_split=0.2.
# To run on the full dataset instead, use:
# special_dataloader = SpecialDataLoader(filepath="../dataset/dataset.csv")
special_dataloader = SpecialDataLoader(
    filepath="../dataset/dataset_subset10000.csv",
    testing_split=0.2,
)
def c_to_d(c):
    """Translate a corpus index `c` into its document id via `corpus_to_id`."""
    doc_id = special_dataloader.corpus_to_id[c]
    return doc_id

def d_to_c(d):
    """Translate a document id `d` into its corpus index via `id_to_corpus`."""
    corpus_index = special_dataloader.id_to_corpus[d]
    return corpus_index

# split the data into train, test, and validation 
from sklearn.model_selection import train_test_split

# split train as 50%, val as 25%, test as 25% 
def get_training_splits(data, test_size=0.25, val_size=0.33, random_state=487):
    """Split ``data.data`` into train, validation and test subsets.

    The split happens in two passes: ``test_size`` of all examples is held out
    for testing, then ``val_size`` of the *remaining* examples is held out for
    validation. With the defaults this yields roughly 50% train, 25%
    validation and 25% test.

    Args:
        data: Object exposing the examples to split as its ``data`` attribute.
        test_size: Share of all examples reserved for the test set.
        val_size: Share of the non-test examples reserved for validation
            (relative to what is left after the test split, not to the whole).
        random_state: Seed passed to both shuffled splits for reproducibility.

    Returns:
        A ``(X_train, X_val, X_test)`` tuple.
    """
    split_options = {"random_state": random_state, "shuffle": True}

    # First pass: carve the test set out of everything.
    X_remaining, X_test = train_test_split(
        data.data, test_size=test_size, **split_options
    )

    # Second pass: carve the validation set out of what is left.
    X_train, X_val = train_test_split(
        X_remaining, test_size=val_size, **split_options
    )
    return X_train, X_val, X_test

# X_train, X_val, X_test = get_training_splits(data.data)

# Train / validation / test portions of the loader's data.
(
    special_X_train,
    special_X_val,
    special_X_test,
) = get_training_splits(special_dataloader)


['/home/alex/class/eecs487/eecs487-finalproject/model', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/alex/class/eecs487/eecs487-finalproject/.venv/lib/python3.10/site-packages', '..', '..', '..', '..', '..']
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
There are 9886 training and 2398 testing examples. Testing ratio = 0.19521328557473136 vs 0.2
Performed tf-idf
Special dataloader done processing.
Keep in mind the data processed is in order, so you might want to shuffle them.


In [34]:
# Display the loader's matrix so its shape and sparsity show up in the cell output.
# NOTE(review): presumably the TF-IDF document-term matrix (one row per corpus entry) - confirm in SpecialDataLoader.
special_dataloader.matrix

<12284x8233 sparse matrix of type '<class 'numpy.float64'>'
	with 93811 stored elements in Compressed Sparse Row format>

In [35]:
# Show a sample question by document id ...
my_id = 1
corpus_id = d_to_c(my_id)
print(my_id, special_dataloader.corpus[corpus_id])

# ... followed by the first question listed as its duplicate.
dup_id = special_dataloader.duplicate[my_id][0]
print(dup_id, special_dataloader.corpus[d_to_c(dup_id)])


1 When do I use "can" or "could"?
4294 Correct usage of "Could" and "Can"


In [36]:
# Find similar ones
from sklearn.metrics.pairwise import cosine_similarity
matrix = special_dataloader.matrix

# Cosine similarity between the sample question's row and every row of the matrix.
cos_similarities = cosine_similarity(matrix[d_to_c(my_id)], matrix)
print(cos_similarities)

# argsort is ascending, so reverse it to list the most similar rows first.
similar_doc_indices = cos_similarities[0].argsort()[::-1]
def get_rank(corpus_id):
    """Return the position of a document inside `similar_doc_indices`.

    Despite its name, `corpus_id` is looked up in `id_to_corpus`, so callers
    pass a document id. Returns None when the document is not in the ranking.
    """
    target_index = special_dataloader.id_to_corpus[corpus_id]
    positions = (
        position
        for position, doc_index in enumerate(similar_doc_indices)
        if doc_index == target_index
    )
    return next(positions, None)

# Report where the known duplicate landed in the ranking.
print(f"MY target of {dup_id} is rank {get_rank(dup_id)} of {len(similar_doc_indices)}")

print(similar_doc_indices)
# List ranks 1..9; rank 0 is skipped because it is normally the question itself.
for i in range(1, 10):
    near_corpus_id = similar_doc_indices[i]
    near_id = c_to_d(near_corpus_id)
    print(near_id, special_dataloader.corpus[near_corpus_id])

[[0.         0.12317498 0.18075553 ... 0.         0.         0.        ]]
MY target of 4294 is rank 6 of 12284
[  68 3184  228 ... 7655 7653    0]
125481 Is it could or can?
5902 When should we use "can", "could", "will", "would"?
253426 "Empty use" of can and could
182383 could be or could have been stolen?
270478 How do I choose between "can" and "could"?
4294 Correct usage of "Could" and "Can"
58756 When do you use "talked" and "spoke"?
5552 When do I use "me" and when "I"?
12458 How to use would or could in English?


In [37]:
# Get the MRR and accuracy


def _first_duplicate_rank(ranked_corpus_indices, query_corpus_id, duplicate_doc_ids):
    """Return the 1-based rank of the best-ranked duplicate, or None if none is found.

    The query's own row is skipped wherever it appears in the ranking, so it
    never occupies a rank (it is not guaranteed to sort first, e.g. on ties).

    Args:
        ranked_corpus_indices: Corpus indices ordered from most to least similar.
        query_corpus_id: Corpus index of the query document itself.
        duplicate_doc_ids: Container of document ids that count as a hit.
    """
    rank = 0
    for corpus_index in ranked_corpus_indices:
        if corpus_index == query_corpus_id:
            continue
        rank += 1
        if c_to_d(corpus_index) in duplicate_doc_ids:
            return rank
    return None


# Each section pairs a label with a predicate selecting the document ids it covers.
sections = [
    ("Training", lambda x: not special_dataloader.is_doc_testing[x]),
    ("Testing", lambda x: special_dataloader.is_doc_testing[x]),
]

for name, criteria in sections:
    mrr = 0.0
    num_correct = 0
    n = 0
    for this_doc_id, duplicates in special_dataloader.duplicate.items():
        if not criteria(this_doc_id):  # only iterate over training or testing IDs
            continue
        this_corpus_id = d_to_c(this_doc_id)

        # Rank every corpus row by similarity to this document, best first.
        cos_similarities = cosine_similarity(matrix[this_corpus_id], matrix)
        similar_corpus_indices = cos_similarities.argsort()[0][::-1]

        # Rank of the closest duplicate, not counting the query itself.
        rank = _first_duplicate_rank(
            similar_corpus_indices, this_corpus_id, set(duplicates)
        )

        n += 1
        if rank is None:
            # No duplicate was retrieved: reciprocal rank is 0 and it is not a hit.
            continue
        if rank == 1:
            num_correct += 1
        mrr += 1.0 / rank

    if n:
        mrr /= n
        acc = num_correct / n
    else:
        # Nothing matched this section; report NaN instead of dividing by zero.
        mrr = acc = float("nan")
    print(f"{name}: For {n} entries, MRR = {mrr}, Acc = {acc}")

Training: For 5966 entries, MRR = 0.20549472943479663, Acc = 0.15085484411666109
Testing: For 1429 entries, MRR = 0.19005409475919066, Acc = 0.13226032190342898


In [11]:
# Find the duplicate question to this one
# my_id = 1
# this_text = special_dataloader.corpus[d_to_c(my_id)]
# similar_text = "When should I use can or could?"

# text = similar_text
# Rank the corpus questions against a free-text query (the vectorizer is only applied here, not re-fit)
def get_nearest_questions(text, n=5):
    """Return the `n` corpus questions most similar to `text`.

    The text is vectorized with the loader's vectorizer and compared to every
    row of the module-level `matrix` by cosine similarity.

    Returns:
        A list of `(document id, question text)` pairs, most similar first.
    """
    query_row = special_dataloader.vectorizer.transform([text])
    similarities = cosine_similarity(query_row, matrix)

    # argsort is ascending; reverse it and keep the first n corpus indices.
    best_first = similarities.argsort()[0][::-1]
    nearest = []
    for corpus_index in best_first[:n]:
        nearest.append((c_to_d(corpus_index), special_dataloader.corpus[corpus_index]))
    return nearest

# Try the lookup on a question that is not taken from the corpus.
my_text = "When do I use they instead of he or she?"
answers = get_nearest_questions(my_text)

print("My question", my_text)
# One line per match: document id followed by the question text.
for doc, text in answers:
    print(doc, text)

My question When do I use they instead of he or she?
38250 User: She, He, She or He, or They?
313 When do I use "I" instead of "me?"
3127 When to use & instead of "and"
154665 What to use instead of "discardation"?
26721 Anyone: ("they" or "he/she") why is it sometimes plural?
