In [1]:
# import random
# import nltk
import glob
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
#from string import punctuation
#from nltk.corpus import stopwords
#from nltk.stem.porter import PorterStemmer
#from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import lsi_main as lsi


The first 5 of 101 ['./data\\1878_vfa_season.txt', './data\\195556_scottish_cup.txt', './data\\2008_new_york_yankees_season.txt', './data\\6th_south_african_armoured_division.txt', './data\\alan_taylor_historian.txt']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AnthonyWynne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data_path = "./data/"
files = glob.glob(f"{data_path}*.txt")
assert files, "No files found in data folder"
print(f"The first 5 of {len(files)}", files[:5])
# TODO load a web based dataset instead of local files

The first 5 of 101 ['./data\\1878_vfa_season.txt', './data\\195556_scottish_cup.txt', './data\\2008_new_york_yankees_season.txt', './data\\6th_south_african_armoured_division.txt', './data\\alan_taylor_historian.txt']


In [4]:
train_lines = [lsi.load_doc_lines(file)[0] for file in files]
print(f"First line of first file: {train_lines[0]}")

First 5 lines of first file: The 1878 Victorian Football Association season was the second season of the Australian rules football competition. The premiership was won by the Geelong Football Club, after it defeated Melbourne in a playoff match on 5 October. It was the club's first VFA premiership, and the first in a sequence of three consecutive premierships won from 1878 to 1880. Geelong was unbeaten during the year.


In [5]:
train_docs = lsi.process_docs(train_lines)
print(f"First lines  after processing: {train_docs[0]}")

First lines  after processing: victorian footbal associ season wa second season australian rule footbal competition premiership wa geelong footbal club defeat melbourn playoff match october wa club first vfa premiership first sequenc three consecut premiership geelong wa unbeaten dure year


In [6]:
vocab = []
for ll in train_docs:
    tt = ll.split()
    for ww in tt:
        if ww not in vocab:
            vocab.append(ww)
vocab[:10]

['victorian',
 'footbal',
 'associ',
 'season',
 'wa',
 'second',
 'australian',
 'rule',
 'competition',
 'premiership']

In [7]:
Xtrain = lsi.prepare_data(train_docs, vocab)
Xtrain[:10]

<10x1992 sparse matrix of type '<class 'numpy.float64'>'
	with 403 stored elements in Compressed Sparse Row format>

In [8]:
# 25 topics - this is the main parameter to tune
trunc_SVD_model = TruncatedSVD(n_components=25) 
approx_Xtrain = trunc_SVD_model.fit_transform(Xtrain)
print(f"Approximated Xtrain shape: {str(approx_Xtrain.shape)}")

Approximated Xtrain shape: (101, 25)


In [13]:
# These are the topics to search for. Adjusted from the original notebook to fit with the data
queries = ['Former American football defensive end in the National Football.', 'Bulgarian long-distance runner']
Top_n_documents = 10


In [14]:
# find the most similar documents to the query
for query in queries:
    encoded_query = lsi.preprocess_query(query, vocab)
    transformed_query = trunc_SVD_model.transform(encoded_query)
    similarities = cosine_similarity(approx_Xtrain, transformed_query)
    indexes = np.argsort(similarities.flat)[::-1]

    print('\n' + 'Query: ' + query)
    for i in range(Top_n_documents):
        print(f"Top {str(i + 1)} result:")
        print(f"Document ID: {str(indexes[i])}")
        print(train_lines[indexes[i]])



Query: Former American football defensive end in the National Football.
Top 1 result:
Document ID: 26
David Alan Lindstrom (born November 16, 1954) is a former American football defensive end in the National Football League, playing eight seasons (1978–1986) for the Kansas City Chiefs. Drafted by the San Diego Chargers in 1977, Lindstrom was inducted into both the Boston University Hall of Fame and Massachusetts High School Hall of Fame in 1993.
Top 2 result:
Document ID: 30
Douglas Alan Berry (born June 3, 1957) is a Canadian former professional ice hockey player, a centreman in the World Hockey Association (WHA) and the National Hockey League (NHL).
Top 3 result:
Document ID: 62
Manuel Jorge Aranda da Silva, a national of Mozambique, is a former minister in the Mozambican Government, and a former senior United Nations official with a background in the World Food Programme.
Top 4 result:
Document ID: 0
The 1878 Victorian Football Association season was the second season of the Austra

In [12]:
re_ID = [[]]
AllRecall = []
AllPrecision = []
AllF1measure = []

for j, query in enumerate(queries):
    # retrieval
    encoded_query = lsi.preprocess_query(query, vocab)
    transformed_query = trunc_SVD_model.transform(encoded_query)
    similarities = cosine_similarity(approx_Xtrain, transformed_query)

    # rank the index
    indexes = np.argsort(similarities.flat)[::-1]

    # Mark the relevant index
    re_mark = []
    for i in range(len(indexes)):
        if (indexes[i] + 1) in re_ID[j]:
            re_mark.append(1)
        else:
            re_mark.append(0)
    print(re_mark)

    # compute Recall, Precision, F1-measure
    Recall, Precision, F1measure = lsi.compute_R_P_F1(re_mark=re_mark,
                                                    QuRe_ID=re_ID[j])

    print('\n' + 'Query%d: ' % (j + 1) + query)
    for i in range(10):
        print(
            f"Top {str(i + 1)}" + ' result: ID%d ' % (indexes[i] + 1),
            train_lines[indexes[i]],
        )
    Recall = np.array(Recall)
    Precision = np.array(Precision)
    F1measure = np.array(F1measure)
    print(re_mark)
    print("Recall@1~10: ", np.around(Recall[:10], 2))
    print("Precision@1~10: ", np.around(Precision[:10], 2))
    print("F1measure@1~10: ", np.around(F1measure[:10], 2))

    # save
    AllRecall.append(Recall)
    AllPrecision.append(Precision)
    AllF1measure.append(F1measure)

    # plot R/P curve
    x_axis = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    y_axis = lsi.compute_RP_yaxis(Precision=Precision, Recall=Recall)
    plt.plot(x_axis,
                y_axis,
                '-bo',
                color="purple",
                label="Query%d" % (j + 1))
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Standard Recall/Precision Curves')
    plt.legend()
    plt.show()


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


IndexError: list index out of range

In [None]:

# compute average Recall, average Precision, average F1-measure
AllRecall = np.array(AllRecall)
AllPrecision = np.array(AllPrecision)
AllF1measure = np.array(AllF1measure)
AveRecall = (AllRecall[0] + AllRecall[1]) / 2
AvePrecision = (AllPrecision[0] + AllPrecision[1]) / 2
AveF1measure = (AllF1measure[0] + AllF1measure[1]) / 2

print("\nAverage Recall, average Precision, average F1-measure: ")
print("average Recall@1~10: ", np.around(AveRecall[:10], 2))
print("average Precision@1~10: ", np.around(AvePrecision[:10], 2))
print("average F1measure@1~10: ", np.around(AveF1measure[:10], 2))

# plot average R/P curve
x_axis = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
y_axis = lsi.compute_RP_yaxis(Precision=AvePrecision, Recall=AveRecall)
plt.plot(x_axis, y_axis, '-bo', color="blue", label="Average")
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel('average Recall')
plt.ylabel('average Precision')
plt.title('Standard Average Recall/Precision Curves')
plt.legend()
plt.show()
