In [1]:
%matplotlib inline
import numpy as np
import seaborn
import nltk
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report

In [2]:
# prepare corpus
corpus = []
for d in range(1400):
    f = open("./d/"+str(d+1)+".txt")
    corpus.append(f.read())
    f.close()
    
queries = []
for q in range(225):
    f = open("./q/"+str(q+1)+".txt")
    queries.append(f.read())
    f.close()
    
reference = []
for r in range(225):
    f = open("./r/"+str(r+1)+".txt")
    reference.append(list(map(int, f.read().replace('\n', ' ')[:-1].split(' '))))
    f.close()
    
q_len = []
for r in reference:
    q_len.append(len(r))

min_q = int(np.average(q_len)) #  overwrite min_q here shortest lenght of reference is 2
print("Lenght of query set to {} as thats the average lenght of reference".format(min_q))

Lenght of query set to 8 as thats the average lenght of reference


# BINARY REPRESENTATION

In [3]:
binary_vectorizer = CountVectorizer(binary=True)
binary_matrix = binary_vectorizer.fit_transform(corpus)
binary_queries_matrix = binary_vectorizer.transform(queries)

## cosine similarity

In [4]:
precisions = []
recalls = []
f_measures = []

for r in range(len(reference)-1):
    sim = np.array(cosine_similarity(binary_queries_matrix[r], binary_matrix)[0])
    retrieved = sim.argsort()[-min_q:][::-1]+1
    tp = 0
    fp = 0
    for doc in retrieved:
        if doc in reference[r]:
            tp += 1
        else:
            fp += 1
    fn = len(reference[r]) - tp
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    precisions.append(precision)
    recalls.append(recall)
    if tp == 0:
        f_measures.append(0)
    else:
        f_measures.append(2*(precision*recall)/(precision+recall))
        
print("            min   max   avg   mean")        
print("Precision: {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(precisions), np.max(precisions), np.average(precisions),  np.median(precisions)))
print("Recalls  : {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(recalls), np.max(recalls), np.average(recalls),  np.median(recalls)))
print("F-Measure: {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(f_measures), np.max(f_measures), np.average(f_measures),  np.median(f_measures)))

f = open("./bin_cos.csv", 'w')
for l in range(len(reference)-1):
    f.write("{},{},{}\n".format(precisions[l], recalls[l], f_measures[l]))
f.close()

            min   max   avg   mean
Precision: 0.000 0.750 0.172 0.125
Recalls  : 0.000 1.000 0.206 0.167
F-Measure: 0.000 0.800 0.174 0.154


## euclidean distance

In [5]:
precisions = []
recalls = []
f_measures = []

for r in range(len(reference)-1):
    sim = np.array(euclidean_distances(binary_queries_matrix[r], binary_matrix)[0])
    retrieved = sim.argsort()[:min_q]+1
    tp = 0
    fp = 0
    for doc in retrieved:
        if doc in reference[r]:
            tp += 1
        else:
            fp += 1
    fn = len(reference[r]) - tp
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    precisions.append(precision)
    recalls.append(recall)
    if tp == 0:
        f_measures.append(0)
    else:
        f_measures.append(2*(precision*recall)/(precision+recall))

        
print("            min   max   avg   mean")        
print("Precision: {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(precisions), np.max(precisions), np.average(precisions),  np.median(precisions)))
print("Recalls  : {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(recalls), np.max(recalls), np.average(recalls),  np.median(recalls)))
print("F-Measure: {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(f_measures), np.max(f_measures), np.average(f_measures),  np.median(f_measures)))

f = open("./bin_euc.csv", 'w')
for l in range(len(reference)-1):
    f.write("{},{},{}\n".format(precisions[l], recalls[l], f_measures[l]))
f.close()

            min   max   avg   mean
Precision: 0.000 0.250 0.011 0.000
Recalls  : 0.000 0.333 0.013 0.000
F-Measure: 0.000 0.286 0.011 0.000


# TERM FRENQUENCY

In [6]:
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(corpus)
count_queries_matrix = count_vectorizer.transform(queries)

## cosine similarity

In [13]:
precisions = []
recalls = []
f_measures = []

for r in range(len(reference)-1):
    sim = np.array(cosine_similarity(count_queries_matrix[r], count_matrix)[0])
    retrieved = sim.argsort()[-min_q:][::-1]+1
    tp = 0
    fp = 0
    for doc in retrieved:
        if doc in reference[r]:
            tp += 1
        else:
            fp += 1
    fn = len(reference[r]) - tp
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    precisions.append(precision)
    recalls.append(recall)
    if tp == 0:
        f_measures.append(0)
    else:
        f_measures.append(2*(precision*recall)/(precision+recall))

print("            min   max   avg   mean")        
print("Precision: {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(precisions), np.max(precisions), np.average(precisions),  np.median(precisions)))
print("Recalls  : {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(recalls), np.max(recalls), np.average(recalls),  np.median(recalls)))
print("F-Measure: {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(f_measures), np.max(f_measures), np.average(f_measures),  np.median(f_measures)))

f = open("./term_cos.csv", 'w')
for l in range(len(reference)-1):
    f.write("{},{},{}\n".format(precisions[l], recalls[l], f_measures[l]))
f.close()

            min   max   avg   mean
Precision: 0.000 0.875 0.145 0.125
Recalls  : 0.000 1.000 0.162 0.111
F-Measure: 0.000 0.778 0.143 0.118


## euclidean distance

In [8]:
precisions = []
recalls = []
f_measures = []

for r in range(len(reference)-1):
    sim = np.array(euclidean_distances(count_queries_matrix[r], count_matrix)[0])
    retrieved = sim.argsort()[:min_q]+1
    tp = 0
    fp = 0
    for doc in retrieved:
        if doc in reference[r]:
            tp += 1
        else:
            fp += 1
    fn = len(reference[r]) - tp
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    precisions.append(precision)
    recalls.append(recall)
    if tp == 0:
        f_measures.append(0)
    else:
        f_measures.append(2*(precision*recall)/(precision+recall))

print("            min   max   avg   mean")        
print("Precision: {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(precisions), np.max(precisions), np.average(precisions),  np.median(precisions)))
print("Recalls  : {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(recalls), np.max(recalls), np.average(recalls),  np.median(recalls)))
print("F-Measure: {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(f_measures), np.max(f_measures), np.average(f_measures),  np.median(f_measures)))

f = open("./term_euc.csv", 'w')
for l in range(len(reference)-1):
    f.write("{},{},{}\n".format(precisions[l], recalls[l], f_measures[l]))
f.close()

            min   max   avg   mean
Precision: 0.000 0.250 0.012 0.000
Recalls  : 0.000 0.333 0.013 0.000
F-Measure: 0.000 0.182 0.011 0.000


# TF-IDF

In [9]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
tfidf_queries_matrix = tfidf_vectorizer.transform(queries)

## cosine similarity

In [10]:
precisions = []
recalls = []
f_measures = []

for r in range(len(reference)-1):
    sim = np.array(cosine_similarity(tfidf_queries_matrix[r], tfidf_matrix)[0])
    retrieved = sim.argsort()[-min_q:][::-1]+1
    tp = 0
    fp = 0
    for doc in retrieved:
        if doc in reference[r]:
            tp += 1
        else:
            fp += 1
    fn = len(reference[r]) - tp
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    precisions.append(precision)
    recalls.append(recall)
    if tp == 0:
        f_measures.append(0)
    else:
        f_measures.append(2*(precision*recall)/(precision+recall))

print("            min   max   avg   mean")        
print("Precision: {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(precisions), np.max(precisions), np.average(precisions),  np.median(precisions)))
print("Recalls  : {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(recalls), np.max(recalls), np.average(recalls),  np.median(recalls)))
print("F-Measure: {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(f_measures), np.max(f_measures), np.average(f_measures),  np.median(f_measures)))

f = open("./tfidf_cos.csv", 'w')
for l in range(len(reference)-1):
    f.write("{},{},{}\n".format(precisions[l], recalls[l], f_measures[l]))
f.close()

            min   max   avg   mean
Precision: 0.000 0.875 0.273 0.250
Recalls  : 0.000 1.000 0.322 0.286
F-Measure: 0.000 0.824 0.274 0.267


## euclidean distance

In [11]:
precisions = []
recalls = []
f_measures = []

for r in range(len(reference)-1):
    sim = np.array(euclidean_distances(tfidf_queries_matrix[r], tfidf_matrix)[0])
    retrieved = sim.argsort()[:min_q]+1
    tp = 0
    fp = 0
    for doc in retrieved:
        if doc in reference[r]:
            tp += 1
        else:
            fp += 1
    fn = len(reference[r]) - tp
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    precisions.append(precision)
    recalls.append(recall)
    if tp == 0:
        f_measures.append(0)
    else:
        f_measures.append(2*(precision*recall)/(precision+recall))

print("            min   max   avg   mean")        
print("Precision: {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(precisions), np.max(precisions), np.average(precisions),  np.median(precisions)))
print("Recalls  : {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(recalls), np.max(recalls), np.average(recalls),  np.median(recalls)))
print("F-Measure: {:.3f} {:.3f} {:.3f} {:.3f}".format(np.min(f_measures), np.max(f_measures), np.average(f_measures),  np.median(f_measures)))

f = open("./tfidf_euc.csv", 'w')
for l in range(len(reference)-1):
    f.write("{},{},{}\n".format(precisions[l], recalls[l], f_measures[l]))
f.close()

            min   max   avg   mean
Precision: 0.000 0.750 0.229 0.250
Recalls  : 0.000 1.000 0.271 0.200
F-Measure: 0.000 0.769 0.230 0.200
