In [23]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import nltk.stem.porter as porter
from collections import Counter

In [24]:
data = pd.read_csv('tf-df.csv')

In [25]:
doc_headers = [column for column in data.columns if 'Document' in column]

In [26]:
def ntc_normalize(tf, df):
    vecs = tf.copy()
    norm_doc_freq = np.log(len(vecs.columns)/df)
    # tf.df
    vecs = vecs.multiply(norm_doc_freq, axis='rows')
    # Cosine normalization:
    sum_sq = np.sum(vecs**2, axis=0)
    vecs = vecs/np.sqrt(sum_sq)
    return vecs

In [27]:
def nnc_normalize(tf, df):
    vecs = tf.copy()
    norm_doc_freq = 1
    # tf.df
    #vecs = vecs.multiply(norm_doc_freq, axis='rows')
    # Cosine normalization:
    sum_sq = np.sum(vecs**2, axis=0)
    vecs = vecs/np.sqrt(sum_sq)
    return vecs

In [28]:
def get_stemmed_tokenized_query(query):
    stemmer = porter.PorterStemmer(mode='ORIGINAL_ALGORITHM')
    #get tokenized words from query
    words = word_tokenize(query)
    #get stemmed words from query
    words = list(map(stemmer.stem, words))
    return words

In [29]:
# word_series is the series of words in our data, i.e data['word']
def get_query_tf(query, word_series):
    tf = word_series.copy()
    words = get_stemmed_tokenized_query(query)
    #count the occurences
    freq = Counter(words)
    #set the value as the count if it exists, else 0
    tf = tf.transform(lambda word: freq[word] if word in freq else 0)
    return tf

In [30]:
query_tf = get_query_tf("I enjoy the fall content", data['word'])

In [31]:
norm_query = nnc_normalize(query_tf, data['df'])

In [32]:
norm_vecs = ntc_normalize(data[doc_headers], data['df'])

In [33]:
norm_vecs.multiply(norm_query, axis=0).sum(axis=0).idxmax()

'Document 1'

In [34]:
doc_headers.index('Document 6')

5

In [35]:
dc = doc_headers.copy()

In [36]:
def get_similarity_scores(norm_docs, norm_query):
    cosine_prod = norm_vecs.multiply(norm_query, axis=0)
    return cosine_prod.sum(axis=0)

In [37]:
scores = get_similarity_scores(norm_vecs, norm_query)

In [38]:
#Index of maximum, which is our most similar document
scores.idxmax()

'Document 1'

In [39]:
scores['Document 1']

0.09445785191765765

In [40]:
titles_previews = pd.read_csv('title_preview.csv')

In [41]:
titles_previews.query('Document in '+ str(doc_headers[:3]))

Unnamed: 0,Document,title,preview,url
0,Document 1,Freeman Moore - SMU Spring 2018,"Spring 2018 Freeman L. Moore , PhD email : fmo...",https://s2.smu.edu/~fmoore/
1,Document 2,SMU CSE 5337/7337 Spring 2018 Schedule,SMU CSE 5337/7337 2018 Preliminary Schedule Th...,https://s2.smu.edu/~fmoore/schedule.htm
2,Document 3,SMU CSE 5/7337 Spring 2018 Textfiles,Textfiles for clustering golf 1 golf 2 golf 3 ...,https://s2.smu.edu/~fmoore/textfiles/index.html


In [42]:
titles_previews = pd.read_csv('title_preview.csv')

In [43]:
titles_previews = titles_previews.fillna("")

In [44]:
titles_previews['title'] = titles_previews['title'].apply(lambda word: word.lower())

In [45]:
def get_score(string, words):
    intersection = [word for word in words if word in string]
    return 0.25 if len(intersection)>0 else 0

In [46]:
def get_score_by_title(scores, titles_previews, query_words):
    modif = scores.copy()
    titles = titles_previews['title'].copy()
    titles.index = modif.index
    titles = titles.transform(lambda string: get_score(string, words))
    return titles+modif

In [47]:
query_words = get_stemmed_tokenized_query("I enjoy the fall content")

In [48]:
query_words

['i', 'enjoi', 'the', 'fall', 'content']

In [49]:
scores[scores.values > 0]

Document 1     0.094458
Document 2     0.023847
Document 5     0.012996
Document 9     0.010158
Document 10    0.023992
Document 12    0.009642
Document 13    0.050919
Document 14    0.010168
Document 15    0.048138
Document 16    0.020010
Document 17    0.026115
Document 18    0.028178
Document 19    0.012300
Document 21    0.045527
Document 22    0.014946
Document 24    0.012926
Document 26    0.014952
Document 27    0.059223
Document 31    0.027590
dtype: float64

In [50]:
tp = titles_previews.copy()

In [51]:
import scorer

In [52]:
query_tf = scorer.get_query_tf("buildingone buildingtwo buildingthree", data['word'])

In [53]:
norm_query = scorer.nnc_normalize(query_tf, data['df'])

In [54]:
norm_query[norm_query!=0]

233    0.57735
234    0.57735
235    0.57735
Name: word, dtype: float64

In [55]:
doc_tf = data[doc_headers]

In [56]:
norm_doc = scorer.nnc_normalize(doc_tf, data['df'])

In [57]:
import scorer

In [59]:
scorer.data[doc_headers]

Unnamed: 0,Document 1,Document 2,Document 3,Document 4,Document 5,Document 6,Document 7,Document 8,Document 9,Document 10,...,Document 22,Document 23,Document 24,Document 25,Document 26,Document 27,Document 28,Document 29,Document 30,Document 31
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,0,0,2,0,0,0,2,4,...,1,0,1,0,1,1,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
4,1,2,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,4,2,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
scorer.norm_doc_vec

Unnamed: 0,Document 1,Document 2,Document 3,Document 4,Document 5,Document 6,Document 7,Document 8,Document 9,Document 10,...,Document 22,Document 23,Document 24,Document 25,Document 26,Document 27,Document 28,Document 29,Document 30,Document 31
0,0.098533,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
1,0.197066,0.036131,0.000000,0.000000,0.131590,0.0,0.0,0.0,0.095238,0.214115,...,0.133631,0.000000,0.133631,0.0,0.160128,0.096225,0.000000,0.000000,0.0,0.000000
2,0.098533,0.036131,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.048002
3,0.098533,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.095238,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,0.098533,0.072263,0.109764,0.111803,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
5,0.098533,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
6,0.098533,0.000000,0.000000,0.000000,0.065795,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.081111,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
7,0.098533,0.036131,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.152499,0.0,0.000000
8,0.394132,0.072263,0.109764,0.111803,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.160128,0.000000,0.0,0.000000
9,0.098533,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [61]:
norm_doc

Unnamed: 0,Document 1,Document 2,Document 3,Document 4,Document 5,Document 6,Document 7,Document 8,Document 9,Document 10,...,Document 22,Document 23,Document 24,Document 25,Document 26,Document 27,Document 28,Document 29,Document 30,Document 31
0,0.098533,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
1,0.197066,0.036131,0.000000,0.000000,0.131590,0.0,0.0,0.0,0.095238,0.214115,...,0.133631,0.000000,0.133631,0.0,0.160128,0.096225,0.000000,0.000000,0.0,0.000000
2,0.098533,0.036131,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.048002
3,0.098533,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.095238,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,0.098533,0.072263,0.109764,0.111803,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
5,0.098533,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
6,0.098533,0.000000,0.000000,0.000000,0.065795,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.081111,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
7,0.098533,0.036131,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.152499,0.0,0.000000
8,0.394132,0.072263,0.109764,0.111803,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.160128,0.000000,0.0,0.000000
9,0.098533,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [62]:
demo = data.copy()

In [63]:
demo = demo.loc[:8, doc_headers[:3]]

In [64]:
demo['Document 1'] = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1])
demo['Document 2'] = np.array([1, 1, 1, 0, 1, 1, 1, 0, 0])
demo['Document 3'] = np.array([0, 0, 0, 1, 1, 2, 0, 0, 1])

In [65]:
demo['df'] = np.sum(demo, axis=1)

In [66]:
demo

Unnamed: 0,Document 1,Document 2,Document 3,df
0,0,1,0,1
1,1,1,0,2
2,0,1,0,1
3,0,0,1,1
4,0,1,1,2
5,1,1,2,4
6,1,1,0,2
7,1,0,0,1
8,1,0,1,2


In [67]:
demo['word'] = ['is', 'made', 'of', 'or', 'steel', 'table', 'the', 'they', 'wood']

In [68]:
demo

Unnamed: 0,Document 1,Document 2,Document 3,df,word
0,0,1,0,1,is
1,1,1,0,2,made
2,0,1,0,1,of
3,0,0,1,1,or
4,0,1,1,2,steel
5,1,1,2,4,table
6,1,1,0,2,the
7,1,0,0,1,they
8,1,0,1,2,wood


In [69]:
query = "the wood table"

In [70]:
def get_query_tf(query, word_series):
    tf = word_series.copy()
    words = word_tokenize(query)
    
    # count the occurrences
    freq = Counter(words)
    # set the value as the count if it exists, else 0
    tf = tf.transform(lambda word: freq[word] if word in freq else 0)
    return tf

In [71]:
q = get_query_tf("the wood table", demo['word'])

In [72]:
vq = nnc_normalize(q, demo['df'])

In [73]:
dd = nnc_normalize(demo[doc_headers[:3]], demo['df'])

In [74]:
scorer.get_similarity_scores(dd, vq)

Document 1    0.774597
Document 2    0.471405
Document 3    0.654654
dtype: float64

In [75]:
np.arange(0, 25, 5)

array([ 0,  5, 10, 15, 20])

In [76]:
data2 = data.copy()

In [77]:
import scorer

In [78]:
scorer.lnc_normalize(data2[doc_headers], data2['df'])

Unnamed: 0,Document 1,Document 2,Document 3,Document 4,Document 5,Document 6,Document 7,Document 8,Document 9,Document 10,...,Document 22,Document 23,Document 24,Document 25,Document 26,Document 27,Document 28,Document 29,Document 30,Document 31
0,0.124853,,,,,,,,,,...,,,,,,,,,,
1,0.211395,0.065301,,,0.144176,,,,0.114941,0.167692,...,0.160474,,0.144331,,0.1676,0.110557,,,,
2,0.124853,0.065301,,,,,,,,,...,,,,,,,,,,0.074402
3,0.124853,,,,,,,,0.114941,,...,,,,,,,,,,
4,0.124853,0.110564,0.19141,0.155835,,,,,,,...,,,,,,,,,,
5,0.124853,,,,,,,,,,...,,,,,,,,,,
6,0.124853,,,,0.085153,,,,,,...,,0.091707,,,,,,,,
7,0.124853,0.065301,,,,,,,,,...,,,,,,,,0.1638,,
8,0.297937,0.110564,0.19141,0.155835,,,,,,,...,,,,,,,0.176226,,,
9,0.124853,,,,,,,,,,...,,,,,,,,,,


In [79]:
def ltc_normalize(tf, df):
    vecs = tf.copy()
    # log normalization
    vecs = vecs[vecs!=0]
    vecs = np.log(vecs) + 1
    vecs = vecs.fillna(1)
    # df normalization
    norm_doc_freq = np.log(len(doc_headers) / df)
    # tf.df
    vecs = vecs.multiply(norm_doc_freq, axis='rows')
    # Cosine normalization:
    sum_sq = np.sum(vecs**2, axis=0)
    vecs = vecs/np.sqrt(sum_sq)
    return vecs

In [80]:
ltc_normalize(data2[doc_headers], data2['df'])

Unnamed: 0,Document 1,Document 2,Document 3,Document 4,Document 5,Document 6,Document 7,Document 8,Document 9,Document 10,...,Document 22,Document 23,Document 24,Document 25,Document 26,Document 27,Document 28,Document 29,Document 30,Document 31
0,0.031497,0.030219,0.031682,0.031664,0.031395,0.031797,0.031797,0.031778,0.031143,0.031366,...,0.031729,0.031402,0.031697,0.031621,0.031734,0.031651,0.031679,0.031712,0.030676,0.030756
1,0.009330,0.005287,0.005543,0.005540,0.009300,0.005563,0.005563,0.005559,0.009225,0.013095,...,0.005551,0.005494,0.005545,0.005532,0.005552,0.005537,0.005542,0.005548,0.005367,0.005381
2,0.021420,0.020551,0.021546,0.021534,0.021351,0.021624,0.021624,0.021611,0.021180,0.021331,...,0.021578,0.021356,0.021557,0.021505,0.021582,0.021525,0.021544,0.021567,0.020862,0.020916
3,0.025139,0.024119,0.025287,0.025273,0.025058,0.025379,0.025379,0.025363,0.042087,0.025035,...,0.025324,0.025063,0.025299,0.025238,0.025329,0.025262,0.025284,0.025311,0.024484,0.024548
4,0.018782,0.030510,0.018892,0.018882,0.018721,0.018961,0.018961,0.018949,0.018571,0.018704,...,0.018920,0.018725,0.018901,0.018856,0.018923,0.018873,0.018890,0.018910,0.018292,0.018340
5,0.025139,0.024119,0.025287,0.025273,0.025058,0.025379,0.025379,0.025363,0.024857,0.025035,...,0.025324,0.025063,0.025299,0.025238,0.025329,0.025262,0.025284,0.025311,0.024484,0.024548
6,0.021420,0.020551,0.021546,0.021534,0.021351,0.021624,0.021624,0.021611,0.021180,0.021331,...,0.021578,0.021356,0.021557,0.021505,0.021582,0.021525,0.021544,0.021567,0.020862,0.020916
7,0.021420,0.020551,0.021546,0.021534,0.021351,0.021624,0.021624,0.021611,0.021180,0.021331,...,0.021578,0.021356,0.021557,0.021505,0.021582,0.021525,0.021544,0.021567,0.020862,0.020916
8,0.035944,0.024469,0.015151,0.015143,0.015014,0.015206,0.015206,0.015197,0.014894,0.015000,...,0.015174,0.015017,0.015158,0.015122,0.015176,0.015136,0.015150,0.015166,0.014670,0.014708
9,0.025139,0.024119,0.025287,0.025273,0.025058,0.025379,0.025379,0.025363,0.024857,0.025035,...,0.025324,0.025063,0.025299,0.025238,0.025329,0.025262,0.025284,0.025311,0.024484,0.024548


In [84]:
scores.loc[scores>0]

Document 1     0.094458
Document 2     0.023847
Document 5     0.012996
Document 9     0.010158
Document 10    0.023992
Document 12    0.009642
Document 13    0.050919
Document 14    0.010168
Document 15    0.048138
Document 16    0.020010
Document 17    0.026115
Document 18    0.028178
Document 19    0.012300
Document 21    0.045527
Document 22    0.014946
Document 24    0.012926
Document 26    0.014952
Document 27    0.059223
Document 31    0.027590
dtype: float64

In [None]:
def log_norm(value):
    new_val = 0 if value==0 else np.log(value)
    return new_val+1

In [None]:
vecs.transform(lambda value: 0 if 0 else np.log(value)+1)

In [86]:
i

array([ 0.,  6., 12., 18., 24.])