In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import time
import re
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import NMF

In [2]:
# Data
sentences = [
    'seven triple six triple four triple',
    'one double three double five six seven',
    'seven two triple six triple five triple',
    'seven four three double five double',
    'seven two double six double four',
    'seven double four five double six three',
    'seven triple six triple five triple'
]

In [3]:
# Instantiate count vectorizer
counter = CountVectorizer()

# Fit and transform sentences
counts = counter.fit_transform(sentences)
#print(counts)

# Get words from sentences
words = counter.get_feature_names()

# Convert matrix to dense matrix and label columns
counts = pd.DataFrame(counts.todense(),columns=words)
display(counts)

# Dataframe to hold vectorized sentences
dfcf = pd.DataFrame(index=words)

# Get document freqs
dfcf['df'] = counts.astype(bool).sum(axis=0)

# Get count freqs for words in each sentence
dfcf['cf'] = counts.sum(axis=0)

# Calculate idf -> log2(num docs/df)
dfcf['idf'] = np.log2(7/(dfcf.df))

display(dfcf)

# Concat sentence data to df, cf, and idf
dfcf = pd.concat([counts.T,dfcf],axis=1)
display(dfcf)

Unnamed: 0,double,five,four,one,seven,six,three,triple,two
0,0,0,1,0,1,1,0,3,0
1,2,1,0,1,1,1,1,0,0
2,0,1,0,0,1,1,0,3,1
3,2,1,1,0,1,0,1,0,0
4,2,0,1,0,1,1,0,0,1
5,2,1,1,0,1,1,1,0,0
6,0,1,0,0,1,1,0,3,0


Unnamed: 0,df,cf,idf
double,4,8,0.807355
five,5,5,0.485427
four,4,4,0.807355
one,1,1,2.807355
seven,7,7,0.0
six,6,6,0.222392
three,3,3,1.222392
triple,3,9,1.222392
two,2,2,1.807355


Unnamed: 0,0,1,2,3,4,5,6,df,cf,idf
double,0,2,0,2,2,2,0,4,8,0.807355
five,0,1,1,1,0,1,1,5,5,0.485427
four,1,0,0,1,1,1,0,4,4,0.807355
one,0,1,0,0,0,0,0,1,1,2.807355
seven,1,1,1,1,1,1,1,7,7,0.0
six,1,1,1,0,1,1,1,6,6,0.222392
three,0,1,0,1,0,1,0,3,3,1.222392
triple,3,0,3,0,0,0,3,3,9,1.222392
two,0,0,1,0,1,0,0,2,2,1.807355


In [4]:
sent_col = [i for i in range(0,7)]
for col in dfcf.iloc[:, :7]:
    #print(dfcf[col])
    dfcf['tfidf'+str(col)] = round(dfcf[col] * dfcf.idf,4)

In [5]:
display(dfcf)

Unnamed: 0,0,1,2,3,4,5,6,df,cf,idf,tfidf0,tfidf1,tfidf2,tfidf3,tfidf4,tfidf5,tfidf6
double,0,2,0,2,2,2,0,4,8,0.807355,0.0,1.6147,0.0,1.6147,1.6147,1.6147,0.0
five,0,1,1,1,0,1,1,5,5,0.485427,0.0,0.4854,0.4854,0.4854,0.0,0.4854,0.4854
four,1,0,0,1,1,1,0,4,4,0.807355,0.8074,0.0,0.0,0.8074,0.8074,0.8074,0.0
one,0,1,0,0,0,0,0,1,1,2.807355,0.0,2.8074,0.0,0.0,0.0,0.0,0.0
seven,1,1,1,1,1,1,1,7,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
six,1,1,1,0,1,1,1,6,6,0.222392,0.2224,0.2224,0.2224,0.0,0.2224,0.2224,0.2224
three,0,1,0,1,0,1,0,3,3,1.222392,0.0,1.2224,0.0,1.2224,0.0,1.2224,0.0
triple,3,0,3,0,0,0,3,3,9,1.222392,3.6672,0.0,3.6672,0.0,0.0,0.0,3.6672
two,0,0,1,0,1,0,0,2,2,1.807355,0.0,0.0,1.8074,0.0,1.8074,0.0,0.0


In [6]:
vectorizer = TfidfVectorizer(norm=None, use_idf=True, smooth_idf=False, binary=False)
sent_tfidf = vectorizer.fit_transform(sentences)


tf_words = vectorizer.get_feature_names()
cols = ['tfidf'+str(x) for x in range(7)]
display(pd.DataFrame(sent_tfidf.todense().T, index=tf_words, columns = cols))
print('original counts')
display(counts.T)

print(vectorizer.vocabulary_,vectorizer.get_feature_names())

Unnamed: 0,tfidf0,tfidf1,tfidf2,tfidf3,tfidf4,tfidf5,tfidf6
double,0.0,3.119232,0.0,3.119232,3.119232,3.119232,0.0
five,0.0,1.336472,1.336472,1.336472,0.0,1.336472,1.336472
four,1.559616,0.0,0.0,1.559616,1.559616,1.559616,0.0
one,0.0,2.94591,0.0,0.0,0.0,0.0,0.0
seven,1.0,1.0,1.0,1.0,1.0,1.0,1.0
six,1.154151,1.154151,1.154151,0.0,1.154151,1.154151,1.154151
three,0.0,1.847298,0.0,1.847298,0.0,1.847298,0.0
triple,5.541894,0.0,5.541894,0.0,0.0,0.0,5.541894
two,0.0,0.0,2.252763,0.0,2.252763,0.0,0.0


original counts


Unnamed: 0,0,1,2,3,4,5,6
double,0,2,0,2,2,2,0
five,0,1,1,1,0,1,1
four,1,0,0,1,1,1,0
one,0,1,0,0,0,0,0
seven,1,1,1,1,1,1,1
six,1,1,1,0,1,1,1
three,0,1,0,1,0,1,0
triple,3,0,3,0,0,0,3
two,0,0,1,0,1,0,0


{'seven': 4, 'triple': 7, 'six': 5, 'four': 2, 'one': 3, 'double': 0, 'three': 6, 'five': 1, 'two': 8} ['double', 'five', 'four', 'one', 'seven', 'six', 'three', 'triple', 'two']


In [32]:
temp = dict(zip(vectorizer.get_feature_names(),vectorizer.idf_))
print(temp)
display(dfcf.idf)
temp['double']
'doubles' in temp

{'double': 1.5596157879354227, 'five': 1.336472236621213, 'four': 1.5596157879354227, 'one': 2.9459101490553135, 'seven': 1.0, 'six': 1.1541506798272583, 'three': 1.8472978603872037, 'triple': 1.8472978603872037, 'two': 2.252762968495368}


double    0.807355
five      0.485427
four      0.807355
one       2.807355
seven     0.000000
six       0.222392
three     1.222392
triple    1.222392
two       1.807355
Name: idf, dtype: float64

False

In [88]:
temp1 = pd.DataFrame(sent_tfidf.todense(), columns=tf_words, index = range(7))
totals = temp1.values
rows = ['tfidf0','tfidf1','tfidf2','tfidf3','tfidf4','tfidf5','tfidf6']
t = list(zip(rows,totals))
print(t[0])
#for x in totals:
    #print(x)
    #print('--')

('tfidf0', array([0.        , 0.        , 1.55961579, 0.        , 1.        ,
       1.15415068, 0.        , 5.54189358, 0.        ]))


In [126]:
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

totals = temp1.values
def brain_food(input_list):
    """Function that takes up to 10 ingredients and 
    returns recipes with keywords that are the most similar to the input"""
        
    # Get idf score for each word entered.  If word is not in corpus throw error.
    idf_dict = dict(zip(vectorizer.get_feature_names(),vectorizer.idf_))
    
    input_vect = []
    for word in input_list:
        if word in idf_dict:
            input_vect.append(idf_dict[word])
            
        else:
            input_vect.append(0)
            print('{} not in corpus - no idf available'.format(word))

    # Make sure array has same number of elements as keywords
    num_keywords = (9)
    ingredients = np.zeros(num_keywords)
    
    ingredients = np.pad(input_vect, (0,(9-len(input_vect))), 'constant', constant_values=(0))
    print('Your vector:\n ',ingredients)
    
    # Get cosine similarity between input and rows
    sims = {}
    totals = temp1.values
    rows = ['tfidf0','tfidf1','tfidf2','tfidf3','tfidf4','tfidf5','tfidf6']
    t = list(zip(rows,totals))
    for label,array in t:
        sims[label] = cosine_similarity(ingredients.reshape(1, -1),array.reshape(1, -1))[0]
        sims_df = pd.Series(sims).sort_values(ascending=False)
        
        
        
    similarity = np.dot(totals, ingredients.T)
    
    square_mag = np.diag(similarity)
    # inverse squared magnitude
    inv_square_mag = 1 / square_mag
    # if it doesn't occur, set it's inverse magnitude to zero (instead of inf)
    inv_square_mag[numpy.isinf(inv_square_mag)] = 0
    # inverse of the magnitude
    inv_mag = numpy.sqrt(inv_square_mag)
    # cosine similarity (elementwise multiply by inverse magnitudes)
    cosine = similarity * inv_mag
    cosine = cosine.T * inv_mag
    
    lin_ker = linear_kernel(ingredients.reshape(1, -1),array.reshape(1, -1))
    return sims_df, similarity, lin_ker, 
    
brain_food(['double', 'seven', 'three', 'one', 'second', 'triple'])

second not in corpus - no idf available
Your vector:
  [1.55961579 1.         1.84729786 2.94591015 0.         1.84729786
 0.         0.         0.        ]


(tfidf1     [0.7675948709422709]
 tfidf5     [0.5807396392619847]
 tfidf4     [0.5131448005361537]
 tfidf3     [0.4870931363874374]
 tfidf0    [0.19341659985313778]
 tfidf6    [0.13505854746597593]
 tfidf2    [0.12617871918953572]
 dtype: object,
 array([ 5.01313499, 17.01172174,  3.46853232,  9.08234996,  9.8779378 ,
        11.21441004,  3.46853232]),
 array([[3.46853232]]))

Not sure why the idf vector calculated in SKLEARN doesn't match the hand calculated values.  The IDF should equal the log2 of the total number of documents divided by the number of times the terms appears in the document.  What I am not understanding is how the IDF for the term "seven" is equal to 1 in the SKLEARN IDF vector, when the log2(7/7) = 0.  In order for the IDF for "seven" to be 1.0 the ratio of documents to number of times a term appears in a document would have equal 2.  Since the number of documents isn't changing that would imply that the number of times "seven" appears in the document is 

In [127]:
totals[0].reshape(1,len(totals[0]))
0%10000

0

In [9]:
7/(2**1.4700035)

2.5268699619321677

In [10]:
np.log2(3)*3

4.754887502163468

In [11]:
temp1 = pd.DataFrame(sent_tfidf.todense(), columns=tf_words, index = cols)

In [12]:
temp.apply(lambda x: x.max(),axis=1)

tfidf0    5.541894
tfidf1    3.119232
tfidf2    5.541894
tfidf3    3.119232
tfidf4    3.119232
tfidf5    3.119232
tfidf6    5.541894
dtype: float64

In [13]:
def top_n_idx_sparse(matrix, n=3):
    '''Return index of top n values in each row of a sparse matrix'''
    top_n_idx = []
    for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        n_row_pick = min(n, ri - le)
        top_n_idx.append(matrix.indices[le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]])
    return top_n_idx

In [14]:
rr = top_n_idx_sparse(sent_tfidf)
print(len(rr[0]))
print(rr)
rr_flat = [l for subl in rr for l in subl]
print(rr_flat)

3
[array([5, 7, 2], dtype=int32), array([6, 3, 0], dtype=int32), array([1, 7, 8], dtype=int32), array([2, 6, 0], dtype=int32), array([2, 0, 8], dtype=int32), array([2, 6, 0], dtype=int32), array([5, 7, 1], dtype=int32)]
[5, 7, 2, 6, 3, 0, 1, 7, 8, 2, 6, 0, 2, 0, 8, 2, 6, 0, 5, 7, 1]


In [15]:
vocab = vectorizer.vocabulary_

In [16]:
print(vocab)
print(vocab.values())
print(vocab.keys())
print(vocab.items())

{'seven': 4, 'triple': 7, 'six': 5, 'four': 2, 'one': 3, 'double': 0, 'three': 6, 'five': 1, 'two': 8}
dict_values([4, 7, 5, 2, 3, 0, 6, 1, 8])
dict_keys(['seven', 'triple', 'six', 'four', 'one', 'double', 'three', 'five', 'two'])
dict_items([('seven', 4), ('triple', 7), ('six', 5), ('four', 2), ('one', 3), ('double', 0), ('three', 6), ('five', 1), ('two', 8)])


In [17]:
def get_keywords(dictionary, index_list):
    '''Returns keys that match values in dictionary'''
    matching_keys = []
    dict_items = dictionary.items()
    
    for item in dict_items:
        if item[1] in index_list:
            matching_keys.append(item[0])
    return matching_keys

In [18]:
def get_keywords1(dictionary, index_list, n=3):
    '''Returns keys that match values in dictionary'''
    matching_keys = []
    dict_items = dictionary.items()
    
    # loop through flat list of indices - faster that get_keywords because allows a flatlist to be searched 
    # instead of requiring a loop through each list of indices
    for x in index_list:
        for item in dict_items:
            if x == item[1]:
                matching_keys.append(item[0])
                
    # Split the results into the n number of keywords
    i=0
    new_list=[]
    while i<len(matching_keys):
        new_list.append(matching_keys[i:i+n])
        i+=n
    return new_list

In [19]:
t0=time.time()
holder = []
for x in range(len(rr)):
    holder.append(get_keywords(vocab,rr[x]))
print('{}s'.format(time.time()-t0))
print(holder)

0.000993967056274414s
[['triple', 'six', 'four'], ['one', 'double', 'three'], ['triple', 'five', 'two'], ['four', 'double', 'three'], ['four', 'double', 'two'], ['four', 'double', 'three'], ['triple', 'six', 'five']]


In [20]:
print(rr[0])

[5 7 2]


In [21]:
t0=time.time()
print(get_keywords1(vocab,rr_flat))
print('{}s'.format(time.time()-t0))

[['six', 'triple', 'four'], ['three', 'one', 'double'], ['five', 'triple', 'two'], ['four', 'three', 'double'], ['four', 'double', 'two'], ['four', 'three', 'double'], ['six', 'triple', 'five']]
0.0s


In [22]:
temp

Unnamed: 0,double,five,four,one,seven,six,three,triple,two
tfidf0,0.0,0.0,1.559616,0.0,1.0,1.154151,0.0,5.541894,0.0
tfidf1,3.119232,1.336472,0.0,2.94591,1.0,1.154151,1.847298,0.0,0.0
tfidf2,0.0,1.336472,0.0,0.0,1.0,1.154151,0.0,5.541894,2.252763
tfidf3,3.119232,1.336472,1.559616,0.0,1.0,0.0,1.847298,0.0,0.0
tfidf4,3.119232,0.0,1.559616,0.0,1.0,1.154151,0.0,0.0,2.252763
tfidf5,3.119232,1.336472,1.559616,0.0,1.0,1.154151,1.847298,0.0,0.0
tfidf6,0.0,1.336472,0.0,0.0,1.0,1.154151,0.0,5.541894,0.0


In [23]:
3e6*35000


105000000000.0