In [7]:
# Import Required Libraries 
import pandas as pd
import csv
from nltk.stem import PorterStemmer
import os
import re
import numpy as np
from collections import Counter
from nltk.tokenize import word_tokenize
from pprint import pprint
import nltk
from collections import defaultdict

In [8]:
# Import stopwords for this corpus fron the file TIME.STP
stopwords_file = open("TIME.STP", "r")
stopwords = stopwords_file.read().splitlines()
stop_words = list(filter(None, stopwords))
stopwords_file.close()
print(stop_words)



In [9]:
def import_dataset():
    """
    This function import all the articles in the TIME corpus,
    returning list of lists where each sub-list contains all the
    terms present in the document as a string.
    """
    articles = []
    with open('TIME.ALL', 'r') as f:
        tmp = []
        for row in f:
            if row.startswith("*TEXT"):
                if tmp != []:
                    articles.append(' '.join(tmp))
                tmp = []
            else:
                row = re.sub(r'[^a-zA-Z\s]+', '', row)
                tmp += row.split()
        
    return articles

def preprocessing(corpus):
    """
    This function preprocess the dataset by firstly tokenizing the terms, and then drop any term 
    that belongs to the stopwords set, and finally apply stemming to reduce the variations of terms
    """
    ps = PorterStemmer()
    preprocessed_docs = []
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    for doc in corpus :
        tokens = word_tokenize(doc)
        cleaned_text = []
        for token in tokens :
            if token not in stop_words and token not in punctuations:
                cleaned_text.append(ps.stem(token))
        preprocessed_docs.append(' '.join(cleaned_text))
    
    return preprocessed_docs

In [10]:
def make_inverted_index(corpus):
    """
    This function builds an inverted index as an hash table (dictionary)
    where the keys are the terms and the values are ordered lists of
    docIDs containing the term.
    """
    index = defaultdict(set)
    for docid, doc in enumerate(corpus):
        for term in doc.split():
            index[term].add(docid)
    return index

In [11]:
def build_term_document_matrix(cleaned_corpus): 
    """
    This function builds the term-document matrix, where rows are the terms
    and columns are the documents, and each element in the matrix represents 
    the tf-idf weight of the corresponding term and document
    """
    
    index = make_inverted_index(cleaned_corpus)
    terms = list(index.keys())
    M = pd.DataFrame(np.zeros((len(terms), len(cleaned_corpus))), index = terms)
    for i in range(len(cleaned_corpus)): 
        tokens = cleaned_corpus[i]
        counter = Counter(tokens.split()) #Counter returns number of  at ith document
        for token in np.unique(tokens.split()):
            tf = counter[token]/len(cleaned_corpus[i])
            dft = len(index[token])
            idf = np.log(len(cleaned_corpus)/(dft+1))
            M.loc[token][i] = tf*idf
    return M, terms

In [12]:
def LSI(Matrix, k=30):
    """
    This function decompose the term-document matrix into 3 matrices : 
    U, S, VT, and then select the the kth concepts from each matrix
    """
    U, S, VT = np.linalg.svd(M)
    #print(f"U shape : {U.shape}, S shape : {S.shape}, V shape : {VT.shape}")
    U_k = U[:,0:k]
    S_k = np.diag(S)[0:k,0:k]
    VT_k = VT[0:k,:]
    
    return [U_k, S_k, VT_k]

In [13]:
def query_to_doc_vector(query, terms):
    """
    This function represnts the provides query as a vector in terms dimensions, 
    where the component of query vector is 1 if query contains that term and 
    0 otherwise 
    """
    q = np.zeros(len(terms))
    for words in query:
        for term in words.split():
            try:
                q[terms.index(term)] += 1
            except:
                pass 
    return q

In [14]:
def cosine_similarity(v1, v2):
    """
    This function computes the cosine similarity between two vectors
    """
    len_v1 = np.linalg.norm(v1)
    len_v2 = np.linalg.norm(v2)
    return np.dot(v1, v2)/(len_v1*len_v2)

In [15]:
def answer_query(query,Mat,terms, top_k = 10):
    """
    This function answers to the given query by computing the similarity 
    between the embeded query and each document vector, and returns the 
    top_k ranked documents after sorting the scores
    """
    query = preprocessing(query)
    query_vec = query_to_doc_vector(query,terms)
    qhat = np.dot(np.linalg.inv(Mat[1]), Mat[0].transpose()).dot(query_vec)
    N = Mat[2].shape[1]
    scores = [[d+1,cosine_similarity(qhat, Mat[2][:,d])] for d in range(N)]
    ranked_docs = sorted(scores, key=lambda x : x[1], reverse=True)
    #for i in range(top_k):
    #    print(f"doc_id : {ranked_docs[i][0]}, score : {ranked_docs[i][1]} \n {corpus[ranked_docs[i][0]]}")
    return ranked_docs[:top_k]

In [16]:
corpus = import_dataset()
cleaned_corpus = preprocessing(corpus)
M, terms = build_term_document_matrix(cleaned_corpus)

In [21]:
M.shape

(14952, 422)

In [11]:
#69  70 100 115 121 139 159 194 210 224 234 309 379 388
query_1 = ["THE BAATH (RENAISSANCE) PARTY FOUNDED BY MICHEL AFLAK, WHICH HAS GAINED CONTROL OF SYRIA AND IRAQ AND AIMS TO UNITE ALL ARAB COUNTRIES"]
for k in [50,80,100,120,150,180,200,215,250,275,300,325,350,380,400]:
    M_decomposition = LSI(M, k)
    relevant_docs_1 = answer_query(query_1,M_decomposition,terms,13)
    print(f"for k = {k} : {relevant_docs_1}")

for k = 50 : [[377, 0.9777421168969819], [386, 0.9720338207071022], [70, 0.9561281422922197], [210, 0.9529532404578928], [309, 0.9177081018135], [86, 0.8974405998190969], [55, 0.8919839489732329], [194, 0.8914135287894625], [224, 0.8895380762585576], [35, 0.836226567228647], [139, 0.8295159506200646], [115, 0.8120284326034406], [233, 0.8050506699891491]]
for k = 80 : [[377, 0.9766773134545117], [70, 0.9549281922100904], [386, 0.9441228459339486], [309, 0.906988309503039], [55, 0.8922797280368308], [86, 0.8881257374519174], [210, 0.8755884278511044], [224, 0.8166179138839602], [115, 0.7631892676229806], [233, 0.7407728459469918], [35, 0.7347276541705084], [194, 0.696428520089847], [159, 0.6613950982658903]]
for k = 100 : [[377, 0.9657972171010308], [386, 0.9323428188380355], [309, 0.8550554279977609], [70, 0.8296994438280864], [210, 0.8009404822748215], [224, 0.7265008737849402], [86, 0.723402463789674], [115, 0.7101862015426305], [194, 0.6840208178289314], [159, 0.6408053590763798], [5

In [22]:
#69  70 100 115 121 139 159 194 210 224 234 309 379 388
query_1 = ["THE BAATH (RENAISSANCE) PARTY FOUNDED BY MICHEL AFLAK, WHICH HAS GAINED CONTROL OF SYRIA AND IRAQ AND AIMS TO UNITE ALL ARAB COUNTRIES"]
M_decomposition = LSI(M, 215)
relevant_docs_1 = answer_query(query_1,M_decomposition,terms,13)
print(relevant_docs_1)

[[377, 0.9074748457722953], [386, 0.7648667535593023], [309, 0.5648373281418607], [194, 0.5316742024010941], [115, 0.49270821957710237], [159, 0.4702934871295355], [210, 0.3849730137900559], [100, 0.35748794415113494], [121, 0.3377839712475548], [70, 0.33242266716334884], [234, 0.326790886761082], [139, 0.291839556073581], [224, 0.21395117397533003]]


In [12]:
#6  257 268 288 304 308 323 324 326 334
query_2 = ["CEREMONIAL SUICIDES COMMITTED BY SOME BUDDHIST MONKS IN SOUTH VIET NAM AND WHAT THEY ARE SEEKING TO GAIN BY SUCH ACTS ."]
for k in [50,80,100,120,150,180,200,215,250,275,300,325,350,380,400]:
    M_decomposition = LSI(M, k)
    relevant_docs_2 = answer_query(query_2,M_decomposition,terms,9)
    print(f"for k = {k} : {relevant_docs_2}")

for k = 50 : [[334, 0.9428985592961341], [376, 0.9172049009453587], [349, 0.9171867588286923], [395, 0.9095365419865311], [304, 0.900595080959327], [383, 0.9003214538549599], [171, 0.8994639581138159], [359, 0.8899437748862911], [370, 0.8875063584816165]]
for k = 80 : [[304, 0.8625473488384942], [334, 0.8540204752060048], [326, 0.8315526773937816], [257, 0.8250944094280134], [308, 0.8097684231128645], [268, 0.7732627744935038], [323, 0.766370586076009], [349, 0.7579070207392442], [370, 0.7443446599106083]]
for k = 100 : [[304, 0.8483076788714345], [326, 0.8156722154316772], [257, 0.8135264656345508], [308, 0.8113400127977787], [268, 0.7764195853340662], [323, 0.7658300355835644], [334, 0.748791984459699], [288, 0.7025232725975324], [324, 0.6991147010777615]]
for k = 120 : [[304, 0.8071370242435357], [308, 0.7947552650477477], [257, 0.7823255844752252], [268, 0.7750759842456743], [326, 0.7649853943184325], [323, 0.7613039877881206], [288, 0.7088496909304034], [324, 0.6979555151375095], 

In [13]:
#39  22  73 173 189 219 265 277 360 396
query_3 = ["COALITION GOVERNMENT TO BE FORMED IN ITALY BY THE LEFT-WING SOCIALISTS, THE REPUBLICANS, SOCIAL DEMOCRATS, AND CHRISTIAN DEMOCRATS ."]
for k in [50,80,100,120,150,180,200,215,250,275,300,325,350,380,400]:
    M_decomposition = LSI(M, k)
    relevant_docs_3 = answer_query(query_3,M_decomposition,terms,8)
    print(f"for k = {k} : {relevant_docs_3}")

for k = 50 : [[277, 0.9690738561171122], [360, 0.9677185518900955], [219, 0.9562090753927571], [189, 0.9498111883977528], [394, 0.9488746312717283], [22, 0.946052582445059], [173, 0.9443364868500361], [73, 0.9296586946639684]]
for k = 80 : [[360, 0.947027677143237], [277, 0.9426860988633118], [394, 0.926732113481363], [189, 0.9260552192214182], [219, 0.9248988078538886], [173, 0.9162194784719762], [22, 0.9119076867100143], [73, 0.9040247268171012]]
for k = 100 : [[360, 0.927733274712625], [277, 0.9160257748702039], [189, 0.8926305592261845], [394, 0.8880625010680397], [219, 0.8851388713174745], [173, 0.8718061673527524], [22, 0.8547849641364949], [73, 0.8389855094784652]]
for k = 120 : [[360, 0.8900695277081917], [277, 0.8752089101728688], [189, 0.8352155436861798], [394, 0.8194077280400051], [219, 0.8136854563414126], [173, 0.7896068666715427], [22, 0.7559398031962784], [265, 0.7482251756373854]]
for k = 150 : [[277, 0.8244424024429657], [360, 0.7904256619513845], [394, 0.763624054297

In [23]:
#39  22  73 173 189 219 265 277 360 396
query_3 = ["COALITION GOVERNMENT TO BE FORMED IN ITALY BY THE LEFT-WING SOCIALISTS, THE REPUBLICANS, SOCIAL DEMOCRATS, AND CHRISTIAN DEMOCRATS ."]
M_decomposition = LSI(M, 215)
relevant_docs_3 = answer_query(query_3,M_decomposition,terms,8)
print(relevant_docs_3)

[[277, 0.7656766098060409], [360, 0.7491591189751454], [394, 0.6779197006972426], [265, 0.5678370396247605], [219, 0.4940555543654217], [189, 0.44341632087842253], [134, 0.40568774969918425], [22, 0.39216211767542447]]


In [14]:
#58   1  47  54  89 135 157 247 254
query_4 = ["OTHER NATIONS POSSESSING U.S . POLARIS MISSILES FOR THEIR NUCLEAR SUBMARINE FLEETS ."]
for k in [50,80,100,120,150,180,200,215,250,275,300,325,350,380,400]:
    M_decomposition = LSI(M, k)
    relevant_docs_4 = answer_query(query_4,M_decomposition,terms,8)
    print(f"for k = {k} : {relevant_docs_4}")

for k = 50 : [[89, 0.923037855947384], [135, 0.9066010208697378], [402, 0.8517563843042012], [247, 0.8439454352568363], [254, 0.8378371050563296], [1, 0.8096410053567077], [228, 0.7746236254489764], [157, 0.756843875915618]]
for k = 80 : [[89, 0.9096319553185308], [135, 0.905221778776799], [247, 0.8522000245453968], [254, 0.8484324516733808], [402, 0.8257708396099401], [1, 0.7942639147141299], [157, 0.7344619223234198], [228, 0.6885085254169833]]
for k = 100 : [[135, 0.934848831505392], [89, 0.9125662617572239], [254, 0.9002673083226214], [247, 0.8953930749346178], [402, 0.8086208271766215], [1, 0.8024297980290744], [157, 0.7529637776929307], [228, 0.7088447923864942]]
for k = 120 : [[135, 0.9474573608153324], [89, 0.9294467398834597], [254, 0.9235577020520589], [247, 0.9116755682901437], [402, 0.8371135430307706], [1, 0.820160405637537], [228, 0.7795539261501739], [157, 0.7758310173482269]]
for k = 150 : [[135, 0.9405219893150767], [89, 0.92385229012786], [254, 0.8983648309271522], [2

**We can deduce that k = 215 gives better results, and as an evaluation metric , R-precision can be computed.** 


In [15]:
#1  268 288 304 308 323 326 334
query_5 = ["KENNEDY ADMINISTRATION PRESSURE ON NGO DINH DIEM TO STOP SUPPRESSING THE BUDDHISTS ."]
M_decomposition = LSI(M, 215)
relevant_docs_5 = answer_query(query_5,M_decomposition,terms,7)
print(relevant_docs_5)

[[257, 0.7820043358784996], [326, 0.7796756233534996], [304, 0.7441730631143993], [308, 0.7372272764039408], [323, 0.6900881507169259], [334, 0.6862077338609285], [370, 0.6227771382102607]]


In [16]:
#12  61 155 156 242 269 339 358
query_6 = ["CONTROVERSY BETWEEN INDONESIA AND MALAYA ON THE PROPOSED FEDERATION OF MALAYSIA, WHICH WOULD UNITE FIVE TERRITORIES ."]
M_decomposition = LSI(M, 215)
relevant_docs_6 = answer_query(query_6,M_decomposition,terms,7)
print(relevant_docs_6)

[[61, 0.6785103101149405], [358, 0.6234520083076952], [156, 0.60534689432005], [155, 0.5143008007463842], [303, 0.43702222628418297], [269, 0.36305323227580283], [339, 0.3591404003504791]]


In [18]:
#45  58  71 148 365 381
query_7 = ["BACKGROUND OF THE NEW PRIME MINISTER OF GREAT BRITAIN ."]
M_decomposition = LSI(M, 215)
relevant_docs_7 = answer_query(query_7,M_decomposition,terms,5)
print(relevant_docs_7)

[[20, 0.5263970914610074], [32, 0.5046019131011285], [229, 0.4907789595744149], [379, 0.45312899344430735], [262, 0.4444543443893466]]


In [19]:
#27 272 295 306
query_8 = ["BRITISH PROPOSAL FOR NEW HIGH LEVEL NEGOTIATIONS WITH RUSSIA OR A FOUR-POWER SUMMIT MEETING ."]
M_decomposition = LSI(M, 215)
relevant_docs_8 = answer_query(query_8,M_decomposition,terms,3)
print(relevant_docs_8)

[[151, 0.4218138316235267], [111, 0.41488162259210043], [306, 0.31681114968467844]]


**Let's compute R-Precision for each query :**

query_1 : $RP_1 = \frac{r}{R} = \frac{11}{13} = 0.84615$ 

query_2 : $RP_2 = \frac{r}{R} = \frac{8}{9} = 0.88888$ 

query_3 : $RP_3 = \frac{r}{R} = \frac{6}{8} = 0.75$ 

query_4 : $RP_4 = \frac{r}{R} = \frac{6}{8} = 0.75$ 

query_5 : $RP_5 = \frac{r}{R} = \frac{5}{7} = 0.71428$ 

query_6 : $RP_6 = \frac{r}{R} = \frac{6}{7} = 0.85714$ 

query_7 : $RP_7 = \frac{r}{R} = \frac{0}{5} = 0.0$ 

query_8 : $RP_8 = \frac{r}{R} = \frac{1}{3} = 0.33333$ 

And the average R-precision : 

$$ARP = \frac{1}{n} \sum_n RP_n$$ 

$$\Rightarrow ARP = \frac{1}{8} \sum_n RP_n = 0.64247$$ 
