## DDW - Task 2: Index & Document Retrieval

První část kódu je věnována nevyužité snaze o úpravu dat pro získání lepších výsledků (čištění dat od zbytečných syntaktických chyb, lematizace, odstranění čísel atp.). Věřím, že přes zahušťování matice VSM (vector space model) vede cesta k lepším výsledkům.

V druhé části reprezentuji texty ze složky .d (dokumenty) a .q (dotazy) pomocí VSM matic.
Na každém typu matice počítám nejrelevantnějších 10 dokumentů. 
Veškerá finální měření jsou v CSV souboru task2.csv. (Případně úplně dole v 'tab' proměnné.)

Komentář k výsledkům: 
Nepřekvapivě hůř dopadla euklidovská vzdálenost - příznakový prostor má pro tuto vzdálenost příliš mnoho dimenzí (vzdálenosti rostou i pro málo rozdílná data).
Z typů reprezentací dokumentů dopadla nejlíp nejsofistikovanější metoda s TF-IDF, v průměru skoro 3x lépe než pouhá pure term frequency reprezentace. TF-IDF na rozdíl od pure term frequency bere v potaz data ze všech dostupných dokumentů, tedy lze posoudit i relativní relevanci dokumentů k dotazu v rámci datasetu.

In [None]:
import glob
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Collect the file names of all documents, queries and reference results
all_documents = glob.glob("./d/*.txt")
all_queries = glob.glob("./q/*.txt")
all_res = glob.glob("./r/*.txt")

# Report how many files of each kind were found
print("Number of documents: " + str(len(all_documents)))
print("Number of queries: " + str(len(all_queries)))
print("Number of res: " + str(len(all_res)))

In [None]:
def remove_problem_chars(text):
    """Normalise punctuation in ``text`` with a fixed chain of substring replacements.

    The substitutions are applied strictly in the listed order, each one on
    the output of the previous one, so later rules see the effect of earlier
    ones (for example the spacing added around "." is tightened again by the
    following three rules).

    Args:
        text - string to clean

    Returns:
        the cleaned string
    """
    substitutions = (
        (".", " . "),
        (" .", ". "),
        (".  ", ". "),
        (".   ", ". "),
        ("+", ""),
        ("-", ""),
        ("/", ""),
        ("\'", " "),
        (",", " , "),
        ("=", " = "),
        # apostrophes were already turned into spaces above, so this rule
        # finds nothing; it is kept to mirror the original rule list
        ("'", ""),
        ("aa", "a"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

# Reading all documents to strings and storing them in list "documents"
documents = []
for d in all_documents:
    # "with" closes the file handle as soon as its content has been read.
    # The temporary is called "content" (not "text") so that re-running this
    # cell cannot overwrite the sklearn "text" module imported further below.
    with open(d, "r") as doc:
        content = remove_problem_chars(doc.read())
    documents.append(content)

print("First document (content): \n"+documents[0]+"\n")
print("First document (length in words): "+str(len(documents[0].split(' '))))

# Reading all queries to strings and storing them in list "queries"
queries = []
for q in all_queries:
    with open(q, "r") as doc:
        content = remove_problem_chars(doc.read())
    queries.append(content)

# The corpus holds the documents first, followed by the queries
corpus = documents + queries

## Creating sparse matrix of documents:
- each row is one text from the corpus (documents first, then queries)
- each column is a word from the corpus (combined vocabulary of all documents and queries in this case)

In [None]:
# BINARY representation
# English stop words are left out in an effort to reduce the matrix size
vect = CountVectorizer(stop_words = 'english',binary = True)
X_bin = vect.fit_transform(corpus)

# Writing the matrix into a pandas data frame.
# toarray() replaces the deprecated ".A" shortcut of sparse matrices and
# get_feature_names_out() replaces get_feature_names(), which recent
# scikit-learn releases no longer provide.
df_freq = pd.DataFrame(X_bin.toarray(), columns=vect.get_feature_names_out())

display(df_freq.iloc[:5,:100])
# info() prints its report itself and returns None, so wrapping it in print()
# only added a stray "None" line
df_freq.info()

Even from such a small sample it is obvious that I should further
reduce the feature space (number of columns) by lemmatization and even
by eliminating numbers, as they very probably won't carry much value.
Let's try it:

In [None]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction import text 
import re

class LemmaTokenizer(object):
    """Callable tokenizer: splits a text with NLTK and lemmatizes every token."""

    def __init__(self):
        # One WordNet lemmatizer is created up front and reused for every call
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmatized tokens of the string ``articles``."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(articles)))


In [None]:
# Punctuation tokens that carry no meaning are added to the stop words
my_added_stopwords = ["'s", "-", "+", ",", ".", "..", "(", ")", "/", "'", '"', "`", "*", ":", "=", "?", "$"]
# The vectorizers document stop_words as 'english', a list or None, so the
# combined frozenset is converted to a list (sorted to keep it reproducible).
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(my_added_stopwords))

#removing numbers
def no_numbers(tokens):
    """Lower-case ``tokens`` and delete every run of digits from it.

    Args:
        tokens - the raw text (a single string) to preprocess

    Returns:
        the lower-cased string without any digit characters
    """
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    return re.sub(r'\d+', '', tokens.lower())

# Binary model again, this time with the lemmatizing tokenizer, the extended
# stop-word list and the digit-stripping preprocessor
vect = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words = stop_words, binary = True, preprocessor=no_numbers)
X_bin = vect.fit_transform(corpus)

# Writing the matrix into a pandas data frame.
# toarray() replaces the deprecated ".A" shortcut of sparse matrices and
# get_feature_names_out() replaces get_feature_names(), which recent
# scikit-learn releases no longer provide.
df_freq = pd.DataFrame(X_bin.toarray(), columns=vect.get_feature_names_out())


k = 5
fe = 20
print("*BINARY REPRESENTATION*\nFirst %d documents and first %d words from corpus:" % (k, fe))
display(df_freq.iloc[:k,:fe])
# info() prints its report itself and returns None, so it is not wrapped in print()
df_freq.info()

In [None]:
# Pure term frequency model (binary = False keeps the raw occurrence counts)
vect = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words = stop_words, binary = False, preprocessor=no_numbers)
X_count = vect.fit_transform(corpus)

# Writing the matrix into a pandas data frame.
# toarray() replaces the deprecated ".A" shortcut of sparse matrices and
# get_feature_names_out() replaces get_feature_names(), which recent
# scikit-learn releases no longer provide.
df_freq_term = pd.DataFrame(X_count.toarray(), columns=vect.get_feature_names_out())

k = 5
fe = 100
print("*PURE TERM FREQUENCY*\nFirst %d documents and first %d words from corpus:" % (k, fe))
display(df_freq_term.iloc[:k,:fe])
# info() prints its report itself and returns None, so it is not wrapped in print()
df_freq_term.info()

In [None]:
import numpy as np
def getTfIdf(col):
    """Return ``col.isnull()``, i.e. the missing-value mask of ``col``.

    NOTE(review): despite its name this helper computes no TF-IDF weights,
    and nothing in the visible code calls it - confirm whether it is still
    needed.
    """
    missing_mask = col.isnull()
    return missing_mask

# TF-IDF model with the lemmatizing tokenizer, the extended stop-word list and
# the digit-stripping preprocessor
vect = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words = stop_words, preprocessor=no_numbers)
X_tfidf = vect.fit_transform(corpus)

# toarray() replaces the deprecated ".A" shortcut of sparse matrices and
# get_feature_names_out() replaces get_feature_names(), which recent
# scikit-learn releases no longer provide.
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=vect.get_feature_names_out())

k = 5
fe = 30
print("*TF-IDF FREQUENCY*\nFirst %d documents and first %d words from corpus:" % (k, fe))
display(df_tfidf.iloc[:k,:fe])
# info() prints its report itself and returns None, so it is not wrapped in print()
df_tfidf.info()

# Original data for homework task:

In [None]:
# Raw (uncleaned) corpus: documents 1..1400 first, then queries 1..225.
# The files are read by number, so corpus position i holds document i + 1.
corpus = []
for d in range(1400):
    # "with" closes each file handle right after reading instead of leaking it
    with open("./d/"+str(d+1)+".txt") as f:
        corpus.append(f.read())
# add query to corpus
for q in range(225):
    with open("./q/"+str(q+1)+".txt") as f:
        corpus.append(f.read())

In [None]:

# All three models below are fitted on the same "corpus" list with the
# vectorizers' default settings. "vect" is rebound for each model, so after
# this cell it refers to the last (term-frequency) vectorizer.

# Vector space model with TF-IDF weighting schema {matrix: #texts in corpus x #tokens}
vect = TfidfVectorizer()
X_tfidf = vect.fit_transform(corpus)

# Vector space model - boolean (binary=True: 1 if the token occurs in the text, else 0)
vect = CountVectorizer(binary=True)
X_bin = vect.fit_transform(corpus)

# Vector space model - pure term frequency -> returns frequency of word in document regardless of other documents
vect = CountVectorizer(binary=False)
X_count = vect.fit_transform(corpus)


# Template of the per-query result record: the query index followed by
# Precision, Recall and F-Measure for every combination of proximity measure
# (cos, eucl) and term vector type (tf_idf, binary, puretm). All metrics start
# out as NaN.
result = {"query": 0}
result.update(
    ("%s_%s_%s" % (model, measure, metric), np.nan)
    for measure in ("cos", "eucl")
    for model in ("tf_idf", "binary", "puretm")
    for metric in ("precision", "recall", "fmeasure")
)

In [None]:
"""
Returns 10 most relevant documents [indexes] from d matrix for each query from q matrix
Args:
        d - document matrix, each row is one term vector 'identifying' one document
        q - query matrix, each row is one query term vector

Returns:
        res - list of vectors of most relevant documents. 
            There is as many vectors as there are queries in the input q matrix.
            Each vector consists of document indexes.
"""

# Cosine Based Proximity Measure - 1 if the term vectors point in the same direction (phi = 0, cos(phi) = 1)
def get_cosine_similarity(d, q):
    """Return, for every query, the 10 documents with the highest cosine similarity.

    Args:
        d - document matrix, each row is the term vector of one document
        q - query matrix, each row is the term vector of one query

    Returns:
        list with one array per row of q; each array holds 10 document
        numbers (row index in d plus 1), most similar first
    """
    sim = np.array(cosine_similarity(q, d))
    # argsort() is ascending: the last 10 positions are the largest
    # similarities, reversing puts the best match first, and "+ 1" converts
    # 0-based row indexes to 1-based document numbers.
    return [sim[row].argsort()[-10:][::-1] + 1 for row in range(q.shape[0])]

# Euclidean Based Proximity Measure - computes distance in multidim. eucl. space
def get_euclidean_similarity(d, q):
    """Return, for every query, the 10 documents with the smallest Euclidean distance.

    Args:
        d - document matrix, each row is the term vector of one document
        q - query matrix, each row is the term vector of one query

    Returns:
        list with one array per row of q; each array holds 10 document
        numbers (row index in d plus 1), closest document first
    """
    euc = np.array(euclidean_distances(q, d))
    res = []
    for i in range(q.shape[0]):
        # A distance is a dissimilarity: the most relevant documents are the
        # ones with the SMALLEST values, i.e. the first entries of the
        # ascending argsort. (Taking the last 10, as one does for a
        # similarity, returned the 10 farthest documents.)
        # "+ 1" converts 0-based row indexes to 1-based document numbers.
        res.append(euc[i].argsort()[:10] + 1)
    return res

In [None]:
### Reading reference solutions for comparison
ref_res = []
for q in range(225):
    # "with" closes each file handle right after reading
    with open("./r/"+str(q+1)+".txt") as f:
        # splitlines() yields one entry per line whether or not the file ends
        # with a newline; split("\n")[:-1] dropped the last id whenever the
        # trailing newline was missing. Blank lines are skipped so that the
        # later int() conversion cannot fail on them.
        ref_res.append([line for line in f.read().splitlines() if line.strip()])
    
"""
Returns 10 most relevant documents [indexes] from d matrix for each query from q matrix
Args:
        q - index of query vector {0 to len(all_queries)}
        matrix - matrix with term vectors of documents and queries
        docs - number of documents -> starting index for queries in matrix

Returns:
        cos[q], euclid[q] - two vectors of most relevant document indexes for each similarity measure, q is the query index
"""
def get_similarities(q, matrix, docs):
    """Return the most relevant documents for one query under both proximity measures.

    Args:
        q - index of the query (0-based, counted among the query rows)
        matrix - matrix with the term vectors of the documents followed by
                 the term vectors of the queries
        docs - number of documents -> starting row of the queries in matrix

    Returns:
        cos, euclid - the document rankings of query q according to
        get_cosine_similarity and get_euclidean_similarity

    Raises:
        IndexError - if q is not a valid query index
    """
    documents = matrix[0:docs]
    n_queries = matrix.shape[0] - docs
    # range(...)[q] resolves negative indexes and raises IndexError for an
    # out-of-range q, exactly like indexing the per-query result list did.
    row = docs + range(n_queries)[q]
    # Only the requested query row is compared with the documents. Ranking
    # every query on each call and then discarding all but one result repeated
    # the whole computation once per query.
    query = matrix[row:row + 1]
    #cosine
    cos = get_cosine_similarity(documents, query)
    #euclidean
    euclid = get_euclidean_similarity(documents, query)

    return cos[0], euclid[0]
    
    
q = 0

# Flat result lists per term vector type: for query i the cosine ranking is
# stored at position 2*i and the Euclidean ranking at position 2*i + 1
tfidf_r = []
pure_r = []
bin_r = []
# list of references (relevant document numbers as ints, one list per query)
reference = []

# get most relevant documents for each query, term vector type and measure:
for q in range(len(all_res)):
    # an empty reference list simply yields an empty list of ints
    ref2 = [int(doc_id) for doc_id in ref_res[q]]
    reference.append(ref2)

    cos, euclid = get_similarities(q, X_tfidf, len(documents))
    tfidf_r.extend((cos, euclid))

    cos_p, euclid_p = get_similarities(q, X_count, len(documents))
    pure_r.extend((cos_p, euclid_p))

    cos_b, euclid_b = get_similarities(q, X_bin, len(documents))
    bin_r.extend((cos_b, euclid_b))

# each list should now hold two entries per processed query
for ranking in (tfidf_r, pure_r, bin_r):
    print(len(ranking))

In [None]:
"""
Returns number of true positives and false negatives for vector of found solutions 
Args:
        res - vector of found relevant documents
        ref - vector of true relevant documents

Returns:
        true_pos, false_neg - number of true positives and false negatives
"""
def getDiagonal(res, ref):
    """Count the true positives and false negatives of one retrieval result.

    Args:
        res - vector of found relevant documents
        ref - vector of true relevant documents

    Returns:
        true_pos, false_neg - number of reference documents that are /
        are not contained in res
    """
    # one flag per reference document: was it retrieved?
    found = [doc in res for doc in ref]
    true_pos = sum(found)
    false_neg = len(found) - true_pos
    return true_pos, false_neg
            
"""
Returns number of false positives for vector of found solutions 
Args:
        res - vector of found relevant documents
        ref - vector of true relevant documents

Returns:
        false_pos - number of falsely found relevant documents
"""
def getFalsePos(res, ref):
    """Count the retrieved documents that are not in the reference solution.

    Args:
        res - vector of found relevant documents
        ref - vector of true relevant documents

    Returns:
        false_pos - number of entries of res that do not occur in ref
    """
    return sum(1 for doc in res if doc not in ref)

"""
Returns precision for given query retrieval: - ratio of relevant documents to retrieved
"""
def getPrecision(true_pos, false_pos):
    """Return the precision: ratio of relevant retrieved documents to all retrieved ones.

    Args:
        true_pos - number of retrieved documents that are relevant
        false_pos - number of retrieved documents that are not relevant

    Returns:
        precision as a float; 0.0 when nothing was retrieved at all
        (true_pos + false_pos == 0) instead of raising ZeroDivisionError
    """
    retrieved = true_pos + false_pos
    if retrieved == 0:
        return 0.0
    return float(true_pos) / float(retrieved)

"""
Returns recall for given query retrieval: - ratio of retrieved relevant documents to all relevant documents {what should have been found}
"""
def getRecall(true_pos, false_neg):
    """Return the recall: ratio of relevant retrieved documents to all relevant ones.

    Args:
        true_pos - number of relevant documents that were retrieved
        false_neg - number of relevant documents that were missed

    Returns:
        recall as a float; 0.0 when the query has no relevant documents
        (true_pos + false_neg == 0, e.g. an empty reference list) instead of
        raising ZeroDivisionError
    """
    relevant = true_pos + false_neg
    if relevant == 0:
        return 0.0
    return float(true_pos) / float(relevant)

"""
Returns F-score
"""
def getFMeasure(precision, recall):
    """Return the F-score: 2 * (precision * recall) / (precision + recall).

    The score is 0.0 as soon as either input is zero, which also avoids the
    division by zero for precision == recall == 0.
    """
    if precision == 0.0 or recall == 0.0:
        return 0.0
    product = float(precision * recall)
    total = float(precision + recall)
    return 2. * (product / total)

In [None]:

"""
Returns Precision, Recall and F-score for each solution vector
Args:
        ref - vector of true relevant documents
        cos - vector of found relevant documents (cosine sim.)
        euc - vector of found relevant documents (eucl. dist)

Returns:
        result - list of 6 numbers: [ precision_cos, recall_cos, f1_cos, precision_eucl, recall_eucl, f1_eucl]
"""
def getRes(ref, cos, euc):
    """Return Precision, Recall and F-score of both rankings of one query.

    Args:
        ref - vector of true relevant documents
        cos - vector of found relevant documents (cosine sim.)
        euc - vector of found relevant documents (eucl. dist)

    Returns:
        list of 6 numbers:
        [precision_cos, recall_cos, f1_cos, precision_eucl, recall_eucl, f1_eucl]
    """
    result = []
    for my_res in (cos, euc):
        true_pos, false_neg = getDiagonal(my_res, ref)
        false_pos = getFalsePos(my_res, ref)
        precision = getPrecision(true_pos, false_pos)
        recall = getRecall(true_pos, false_neg)
        result.extend((precision, recall, getFMeasure(precision, recall)))
    return result


# Key suffixes in the order in which getRes() returns its six numbers
metric_suffixes = (
    "cos_precision", "cos_recall", "cos_fmeasure",
    "eucl_precision", "eucl_recall", "eucl_fmeasure",
)

all_results = []
q_count = len(all_res)
for q in range(q_count):
    ref_sol = reference[q]
    res = {'query': q}

    # cosine ranking at position 2*q, Euclidean ranking at position 2*q + 1
    tfidf_res = getRes(ref_sol, tfidf_r[q*2], tfidf_r[q*2 + 1])    # TF-IDF results
    puretm_res = getRes(ref_sol, pure_r[q*2], pure_r[q*2 + 1])     # Pure term frequency results
    bin_res = getRes(ref_sol, bin_r[q*2], bin_r[q*2 + 1])          # Boolean model results

    # store the six scores of every model under "<model>_<measure>_<metric>"
    for prefix, scores in (("tf_idf", tfidf_res), ("puretm", puretm_res), ("binary", bin_res)):
        for suffix, score in zip(metric_suffixes, scores):
            res[prefix + "_" + suffix] = score

    all_results.append(res)
    


In [None]:
# Saving results to pandas data frame (built from the list of per-query result dicts)
tab = pd.DataFrame(all_results)

"""
Saving results to task2.csv:
    In each row are results for each query.
"""
# The CSV file uses ';' as the field separator
tab.to_csv('task2.csv', sep=';')
print("Saved: task2.csv")

In [None]:
# Overview of the measurements: first rows, column summary, then mean and
# maximum of every column
display(tab.head())
# info() prints its report itself and returns None, so passing its result to
# display() only showed an extra "None"
tab.info()
print('Mean {columns}:')
display(tab.mean())

print('Max {columns}:')
display(tab.max())

In [None]:
# Queries for which TF-IDF with cosine similarity reached a recall of exactly 1
max_q = tab.loc[(tab['tf_idf_cos_recall'] == 1)]
# Hide all three binary/Euclidean columns. The list named
# 'binary_eucl_fmeasure' twice and never 'binary_eucl_recall', so the recall
# column was left in.
max_q = max_q.drop(columns=['binary_eucl_fmeasure', 'binary_eucl_precision', 'binary_eucl_recall'])

In [None]:
# Show the rows selected into max_q
display(max_q)

In [None]:
# Check on 1.0 value on recall for the query at index 32.
# A notebook only echoes a bare expression when it is the last statement of
# the cell, so the result record is printed explicitly.
print(all_results[32])
print(reference[32])

In [None]:
# TF-IDF cosine ranking of the query at index 32 (cosine rankings sit at the even positions of tfidf_r)
print(tfidf_r[32*2]) 

In [None]:
# Manual sanity check of the recall of 1 for the query at index 32:
#   true positives: 4
#   false negatives: 0
# -> the data looks OK
# Recompute the six scores (cosine first, then Euclidean) for that query:
print(getRes(reference[32], tfidf_r[32*2], tfidf_r[32*2+1]))