In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import nltk.stem.porter as porter
from collections import Counter

In [3]:
data = pd.read_csv('tf-df.csv')

In [4]:
doc_headers = [column for column in data.columns if 'Document' in column]

In [5]:
def ntc_normalize(tf, df):
    vecs = tf.copy()
    norm_doc_freq = np.log(len(vecs.columns)/df)
    # tf.df
    vecs = vecs.multiply(norm_doc_freq, axis='rows')
    # Cosine normalization:
    sum_sq = np.sum(vecs**2, axis=0)
    vecs = vecs/sum_sq
    return vecs

In [6]:
def nnc_normalize(tf, df):
    vecs = tf.copy()
    norm_doc_freq = df
    # tf.df
    vecs = vecs.multiply(norm_doc_freq, axis='rows')
    # Cosine normalization:
    sum_sq = np.sum(vecs**2, axis=0)
    vecs = vecs/sum_sq
    return vecs

In [7]:
def get_stemmed_tokenized_query(query):
    stemmer = porter.PorterStemmer(mode='ORIGINAL_ALGORITHM')
    #get tokenized words from query
    words = word_tokenize(query)
    #get stemmed words from query
    words = list(map(stemmer.stem, words))
    return words

In [8]:
# word_series is the series of words in our data, i.e data['word']
def get_query_tf(query, word_series):
    tf = word_series.copy()
    words = get_stemmed_tokenized_query(query)
    #count the occurences
    freq = Counter(words)
    #set the value as the count if it exists, else 0
    tf = tf.transform(lambda word: freq[word] if word in freq else 0)
    return tf

In [9]:
query_tf = get_query_tf("I enjoy the fall content", data['word'])

In [10]:
norm_query = nnc_normalize(query_tf, data['df'])

In [11]:
norm_vecs = ntc_normalize(data[doc_headers], data['df'])

In [12]:
def get_similarity_scores(norm_docs, norm_query):
    cosine_prod = norm_vecs.multiply(norm_query, axis=0)
    return cosine_prod.sum(axis=0)

In [13]:
scores = get_similarity_scores(norm_vecs, norm_query)

In [14]:
#Index of maximum, which is our most similar document
scores.idxmax()

'Document 16'

In [15]:
scores

Document 1     0.000089
Document 2     0.000006
Document 3     0.000000
Document 4     0.000000
Document 5     0.000000
Document 6     0.000042
Document 7     0.000029
Document 8     0.000012
Document 9     0.000136
Document 10    0.000054
Document 11    0.000189
Document 12    0.000094
Document 13    0.000102
Document 14    0.000000
Document 15    0.000000
Document 16    0.000220
Document 17    0.000072
Document 18    0.000058
Document 19    0.000057
Document 20    0.000081
Document 21    0.000097
Document 22    0.000073
Document 23    0.000075
dtype: float64

In [122]:
titles_previews = pd.read_csv('title_preview.csv')

In [17]:
titles_previews = titles_previews.fillna("")

In [18]:
titles_previews['title'] = titles_previews['title'].apply(lambda word: word.lower())

In [19]:
def get_score(string, words):
    intersection = [word for word in words if word in string]
    return 0.25 if len(intersection)>0 else 0

In [20]:
def get_score_by_title(scores, titles_previews, query_words):
    modif = scores.copy()
    titles = titles_previews['title'].copy()
    titles.index = modif.index
    titles = titles.transform(lambda string: get_score(string, words))
    return titles+modif

In [21]:
query_words = get_stemmed_tokenized_query("I enjoy the fall content")

In [22]:
query_words

['i', 'enjoi', 'the', 'fall', 'content']

In [120]:
scores[scores.values > 0]

Document 1     0.000089
Document 2     0.000006
Document 6     0.000042
Document 7     0.000029
Document 8     0.000012
Document 9     0.000136
Document 10    0.000054
Document 11    0.000189
Document 12    0.000094
Document 13    0.000102
Document 16    0.000220
Document 17    0.000072
Document 18    0.000058
Document 19    0.000057
Document 20    0.000081
Document 21    0.000097
Document 22    0.000073
Document 23    0.000075
dtype: float64

In [139]:
tp = titles_previews.copy()

In [140]:
tp['scores'] = scores.values

ValueError: Length of values does not match length of index

In [150]:
tp.loc[np.arange(23), 'scores'] = scores.values

In [151]:
tp

Unnamed: 0,Document,title,preview,url,scores
0,Document 1,Freeman Moore - SMU Spring 2018,"Spring 2018 Freeman L. Moore , PhD email : fmo...",https://s2.smu.edu/~fmoore/,8.9e-05
1,Document 2,SMU CSE 5337/7337 Spring 2018 Schedule,SMU CSE 5337/7337 2018 Preliminary Schedule Th...,https://s2.smu.edu/~fmoore/schedule.htm,6e-06
2,Document 3,SMU CSE 5/7337 Spring 2018 Textfiles,Textfiles for clustering golf 1 golf 2 golf 3 ...,https://s2.smu.edu/~fmoore/textfiles/index.html,0.0
3,Document 4,"SMU CSE 5/7337 Spring 2018 text files""",additional text files to support query impleme...,https://s2.smu.edu/~fmoore/misc/text/index.php,0.0
4,Document 5,CSE 7337 Spring 2018 distance students exam 1 ...,CSE 7337 Distance student exam 1 location ( in...,https://s2.smu.edu/~fmoore/misc/exam1.html,0.0
5,Document 6,CSE 5337/7337 User-Agent,This is the user-agent information received va...,https://s2.smu.edu/~fmoore/misc/useragent.php,4.2e-05
6,Document 7,Levenshtein Distance demo,Levenshtein Distance calculator For example : ...,https://s2.smu.edu/~fmoore/misc/levenshtein.html,2.9e-05
7,Document 8,Porter Stemmer Online,Javascript Porter Stemmer Online Find out more...,https://s2.smu.edu/~fmoore/misc/porter_stemmer...,1.2e-05
8,Document 9,baseball5,"Terence Mann : Ray , people will come Ray . Th...",https://s2.smu.edu/~fmoore/textfiles/baseball5...,0.000136
9,Document 10,baseball4,The oddsmakers at BetDSI Sportsbook have Ohtan...,https://s2.smu.edu/~fmoore/textfiles/baseball4...,5.4e-05
