In [34]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
def TFIDF_cosine(test_series, train_series, tokenizer):
    """
    Return the cosine similarity of every (test, train) pair of documents.

    The vectorizer is fitted on ``train_series`` only; ``test_series`` is
    projected into that same vocabulary before the similarities are computed.

    Args:
        test_series (pd.Series): new data (to compare against train data)
        train_series (pd.Series): train data (to fit the vectorizer)
        tokenizer: vectorizer object exposing ``fit_transform`` and
            ``transform`` (e.g. TfidfVectorizer, CountVectorizer). It is
            fitted on ``train_series`` by this call.
    Returns:
        pd.DataFrame: long-format frame with one row per pair and the columns
        'test_set' (label from ``test_series.index``), 'train_set' (label
        from ``train_series.index``) and 'cosine_similarity'. Rows are grouped
        by train document: all test documents for the first train document,
        then all test documents for the second one, and so on.
    """

    train_tfidf = tokenizer.fit_transform(train_series)
    test_tfidf = tokenizer.transform(test_series)

    # Shape (n_test, n_train): one row per test document, one column per train document.
    similarity = cosine_similarity(test_tfidf, train_tfidf)
    n_test, n_train = similarity.shape

    # Build the long table directly instead of adding a 'test_set' column to a
    # wide frame keyed by train labels: a train label equal to 'test_set'
    # would otherwise be silently overwritten before reshaping.
    test_positions = np.tile(np.arange(n_test), n_train)
    train_positions = np.repeat(np.arange(n_train), n_test)

    score = pd.DataFrame({
        'test_set': test_series.index.take(test_positions),
        'train_set': train_series.index.take(train_positions),
        # Column-major flattening matches the tile/repeat ordering above.
        'cosine_similarity': similarity.ravel(order='F'),
    })
    return score

In [38]:
# Toy corpora: the single test sentence is identical to the last train sentence,
# so that pair is expected to be the closest match.
train_set = pd.Series(["The sky is blue.", "The sun is bright.", "The sun in the sky is bright."])
test_set = pd.Series(["The sun in the sky is bright."])

separator = "=============================================================================================================\n"

# Show each corpus followed by a separator line.
for header, corpus in (("train_set =\n", train_set), ("test_set =\n", test_set)):
    print(header, corpus)
    print(separator)

# Swap in any vectorizer here (TfidfVectorizer, CountVectorizer, with stopwords...).
TfidfVectorizer_object = TfidfVectorizer()

score = TFIDF_cosine(
    test_series=test_set,
    train_series=train_set,
    tokenizer=TfidfVectorizer_object,
)
score

train_set =
 0                 The sky is blue.
1               The sun is bright.
2    The sun in the sky is bright.
dtype: object

test_set =
 0    The sun in the sky is bright.
dtype: object



Unnamed: 0,test_set,train_set,cosine_similarity
0,0,0,0.50889
1,0,1,0.764461
2,0,2,1.0
