In [1]:
import pandas as pd
import string
import nltk
import os

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Om\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Om\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Om\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize
lemmatizer = WordNetLemmatizer()

In [4]:
import numpy as np

In [5]:
Shakespeare_file_name = os.listdir('Shakespeare')

Consolidate Shakespeare documents in a list

In [6]:
documents_Shakespeare =[]
  
for i in Shakespeare_file_name:
    f = open('./Shakespeare/'+i,'r')
    documents_Shakespeare.append(f.read())

In [7]:
def preprocess(text):
    # Steps:
    # 1. lowercase
    # 2. Lammetize. 
    # 3. Remove stop words.
    # 4. Remove punctuations.
    # 5. Remove character with the length size of 1.

    lowered = str.lower(text)

    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(lowered)

    words = []
    for w in word_tokens:
        if w not in stop_words:
            if w not in string.punctuation:
                if len(w) > 1:
                    lemmatized = lemmatizer.lemmatize(w)
                    words.append(lemmatized)

    return words

In [8]:
# Tokenize each Shakespeare document
all_tokens_Shakespeare = []
for i, document in enumerate(documents_Shakespeare):
    tokens = preprocess(document)
    all_tokens_Shakespeare.append(tokens)

    print("making word tokens at index:", i)

making word tokens at index: 0
making word tokens at index: 1
making word tokens at index: 2
making word tokens at index: 3
making word tokens at index: 4


In [9]:
def calculate_score(word_tokens1, word_tokens2):
    # Combine both tokens to find union.
    both_tokens = word_tokens1 + word_tokens2
    union = set(both_tokens)

    # Calculate intersection.
    intersection = set()
    for w in word_tokens1:
        if w in word_tokens2:
            intersection.add(w)

    score = len(intersection)/len(union)
    return score

Finding similar documents on basis of intersection score

In [10]:
for i in range(5):
    all_scores = []
    for j in range(5):
        score = calculate_score(all_tokens_Shakespeare[i], all_tokens_Shakespeare[j])
        all_scores.append(score)
            
    highest_score = 0
    highest_score_index = 0
    for index_score, score in enumerate(all_scores):
        if score!=1: # score with 1 is not checked as they are same documents 
            if highest_score < score:
                highest_score = score
                highest_score_index = index_score

    most_similar_document = Shakespeare_file_name[highest_score_index]

    print("Most similar document to "+Shakespeare_file_name[i]+" based on intersection score is "+most_similar_document+" with score value :", round(highest_score,4))
    print()

Most similar document to Coriolanus.txt based on intersection score is Othello.txt with score value : 0.207

Most similar document to HenryV.txt based on intersection score is Othello.txt with score value : 0.1981

Most similar document to King_Lear.txt based on intersection score is Othello.txt with score value : 0.2123

Most similar document to Othello.txt based on intersection score is King_Lear.txt with score value : 0.2123

Most similar document to Tempest.txt based on intersection score is Othello.txt with score value : 0.2081



tfidf cosine similarity

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
def tfidf_similarity(to_compare_doc, all_docs,j):
    vectorizer = TfidfVectorizer()

    #Combine the base document to all documents
    all_docs.insert(0, to_compare_doc)

    embeddings = vectorizer.fit_transform(all_docs)


    cosine_similarities = cosine_similarity(embeddings[0:1], embeddings[1:]).flatten()

    highest_score = 0
    highest_score_index = 0
    for i, score in enumerate(cosine_similarities):
        if highest_score < score:
            highest_score = score
            highest_score_index = i


    most_similar_document = documents_Shakespeare.index(all_docs[highest_score_index+1])
    
    print("Very similar document to "+ Shakespeare_file_name[j]+" by TF-IDF is "+Shakespeare_file_name[most_similar_document]+" with the score ", highest_score)
    print()

In [13]:
for i in range(5):
    all_docs = []
    for j in range(5):
        if(i!=j):
            all_docs.append(documents_Shakespeare[j])    
    tfidf_similarity(documents_Shakespeare[i],all_docs,i)

Very similar document to Coriolanus.txt by TF-IDF is HenryV.txt with the score  0.7451584674974362

Very similar document to HenryV.txt by TF-IDF is Othello.txt with the score  0.801686387273153

Very similar document to King_Lear.txt by TF-IDF is HenryV.txt with the score  0.7799949163411519

Very similar document to Othello.txt by TF-IDF is HenryV.txt with the score  0.8016863872731527

Very similar document to Tempest.txt by TF-IDF is Othello.txt with the score  0.7460030234385655



In [14]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [19]:
def bert_similar(to_compare_doc, all_docs,j):
    vectorizer = TfidfVectorizer()

    #to_compare document
    sentences = sent_tokenize(to_compare_doc)
    to_compare_embeddings_sentences = model.encode(sentences)
    to_compare_embeddings = np.mean(np.array(to_compare_embeddings_sentences), axis=0)
    
    vectors = []
    for i, doc in enumerate(all_docs):

        sentences = sent_tokenize(doc)
        embeddings_sentences = model.encode(sentences)
        embeddings = np.mean(np.array(embeddings_sentences), axis=0)

        vectors.append(embeddings)

    cosine_similarities = cosine_similarity([to_compare_embeddings], vectors).flatten()
    print(cosine_similarities)

    highest_score = 0
    highest_score_index = 0
    for i, score in enumerate(cosine_similarities):
        if highest_score < score:
            highest_score = score
            highest_score_index = i


    most_similar_document = documents_Shakespeare.index(all_docs[highest_score_index])
    
    print("Very similar document to "+ Shakespeare_file_name[j]+" by bert is "+Shakespeare_file_name[most_similar_document]+" with the score ", highest_score)


In [20]:
for i in range(5):
    all_docs = []
    for j in range(5):
        if(i!=j):
            all_docs.append(documents_Shakespeare[j])    
    bert_similar(documents_Shakespeare[i],all_docs,i)

[0.95681614 0.9730467  0.9779606  0.9694467 ]
Very similar document to Coriolanus.txt by bert is Othello.txt with the score  0.9779606
[0.9568161  0.9651735  0.9625256  0.93385017]
Very similar document to HenryV.txt by bert is King_Lear.txt with the score  0.9651735
[0.97304666 0.9651735  0.9752283  0.97447664]
Very similar document to King_Lear.txt by bert is Othello.txt with the score  0.9752283
[0.9779606  0.9625256  0.97522825 0.9748869 ]
Very similar document to Othello.txt by bert is Coriolanus.txt with the score  0.9779606
[0.9694467 0.9338502 0.9744766 0.9748869]
Very similar document to Tempest.txt by bert is Othello.txt with the score  0.9748869


Recommended content: 

who viewed Coriolanus should be recommended- Othello

who viewed HenryV should be recommended- King_Lear

who viewed King_Lear should be recommended- Othello, Coriolanus

who viewed Othello should be recommended- Coriolanus

who viewed Tempest should be recommended- Othello, Coriolanus