## import required package to clean the text

In [1]:
import pandas as pd
import string
import nltk

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Om\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Om\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Om\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize
lemmatizer = WordNetLemmatizer()

In [7]:
import numpy as np
import os

## getting input text files

In [8]:
fables_files_name =  os.listdir('Fables')

Consolidate fables documents in a list

In [9]:
documents_fable =[]

for i in fables_files_name:
    f = open('./Fables/'+i,'r')
    documents_fable.append(f.read())  

### preprocessing text using nltk library

In [10]:
def preprocess(text):
    # Steps:
    # 1. lowercase
    # 2. Lammetize. 
    # 3. Remove stop words.
    # 4. Remove punctuations.
    # 5. Remove character with the length size of 1.

    lowered = str.lower(text)

    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(lowered)

    words = []
    for w in word_tokens:
        if w not in stop_words:
            if w not in string.punctuation:
                if len(w) > 1:
                    lemmatized = lemmatizer.lemmatize(w)
                    words.append(lemmatized)

    return words

In [11]:
# Tokenize each Fable document
all_tokens_fable = []
for i, document in enumerate(documents_fable):
    tokens = preprocess(document)
    all_tokens_fable.append(tokens)

    print("making word tokens at index:", i)

making word tokens at index: 0
making word tokens at index: 1
making word tokens at index: 2
making word tokens at index: 3
making word tokens at index: 4
making word tokens at index: 5
making word tokens at index: 6
making word tokens at index: 7
making word tokens at index: 8
making word tokens at index: 9
making word tokens at index: 10
making word tokens at index: 11


# approach 1: using the intersection score- how many words are matching within each document 

In [12]:
def calculate_score(word_tokens1, word_tokens2):
    # Combine both tokens to find union.
    both_tokens = word_tokens1 + word_tokens2
    union = set(both_tokens)

    # Calculate intersection.
    intersection = set()
    for w in word_tokens1:
        if w in word_tokens2:
            intersection.add(w)

    score = len(intersection)/len(union)
    return score

Finding similar documents on basis of intersection score

In [15]:
for i in range(12):
    all_scores = []
    for j in range(12):
        score = calculate_score(all_tokens_fable[i], all_tokens_fable[j])
        all_scores.append(score)
            
    highest_score = 0
    highest_score_index = 0
    for index_score, score in enumerate(all_scores):
        if score!=1: # score with 1 is not checked as they are same documents 
            if highest_score < score:
                highest_score = score
                highest_score_index = index_score

    most_similar_document = fables_files_name[highest_score_index]

    print("Most similar document to "+fables_files_name[i]+" based on intersection score is "+most_similar_document+" with score value :", round(highest_score,4))
    print()

Most similar document to The_Ass_and_the_Lapdog.txt based on intersection score is The_Frogs_Desiring_a_King.txt with score value : 0.0952

Most similar document to The_Cock_and_the_Pearl.txt based on intersection score is The_Wolf_and_the_Crane.txt with score value : 0.0707

Most similar document to The_Dog_and_the_Shadow.txt based on intersection score is The_Fox_and_the_Crow.txt with score value : 0.0526

Most similar document to The_Fox_and_the_Crow.txt based on intersection score is The_Man_and_the_Serpent.txt with score value : 0.0909

Most similar document to The_Frogs_Desiring_a_King.txt based on intersection score is The_Lion_and_the_Mouse.txt with score value : 0.0966

Most similar document to The_Lions_Share.txt based on intersection score is The_Lion_and_the_Mouse.txt with score value : 0.1043

Most similar document to The_Lion_and_the_Mouse.txt based on intersection score is The_Man_and_the_Serpent.txt with score value : 0.1182

Most similar document to The_Man_and_the_Ser

# approach 2: using tfidf cosine similarity

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
def tfidf_similarity(to_compare_doc, all_docs,j):
    vectorizer = TfidfVectorizer()

    #Combine the documents.
    all_docs.insert(0, to_compare_doc)
    embeddings = vectorizer.fit_transform(all_docs)

    cosine_similarities = cosine_similarity(embeddings[0:1], embeddings[1:]).flatten()

    highest_score = 0
    highest_score_index = 0
    for i, score in enumerate(cosine_similarities):
        if highest_score < score:
            highest_score = score
            highest_score_index = i


    most_similar_document = documents_fable.index(all_docs[highest_score_index+1])
    
    print("Very similar document to "+ fables_files_name[j]+" by TF-IDF is "+fables_files_name[most_similar_document]+" with the score ", highest_score)


In [18]:
for i in range(12):
    all_docs = []
    for j in range(12):
        if(i!=j):
            all_docs.append(documents_fable[j])    
    tfidf_similarity(documents_fable[i],all_docs,i)

Very similar document to The_Ass_and_the_Lapdog.txt by TF-IDF is The_Sick_Lion.txt with the score  0.38349051245492927
Very similar document to The_Cock_and_the_Pearl.txt by TF-IDF is The_Wolf_and_the_Crane.txt with the score  0.2467043574020293
Very similar document to The_Dog_and_the_Shadow.txt by TF-IDF is The_Wolf_and_the_Crane.txt with the score  0.30592873783842783
Very similar document to The_Fox_and_the_Crow.txt by TF-IDF is The_Lions_Share.txt with the score  0.31079713181798
Very similar document to The_Frogs_Desiring_a_King.txt by TF-IDF is The_Lions_Share.txt with the score  0.30538689538476904
Very similar document to The_Lions_Share.txt by TF-IDF is The_Sick_Lion.txt with the score  0.3751967072875698
Very similar document to The_Lion_and_the_Mouse.txt by TF-IDF is The_Sick_Lion.txt with the score  0.3873801296708266
Very similar document to The_Man_and_the_Serpent.txt by TF-IDF is The_Lion_and_the_Mouse.txt with the score  0.3127439040993012
Very similar document to The_

# approach 3: using bert algorith of sentence transformers

In [19]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [30]:
def bert_similar(to_compare_doc, all_docs,j):
    vectorizer = TfidfVectorizer()

    #to_compare document
    sentences = sent_tokenize(to_compare_doc)
    to_compare_embeddings_sentences = model.encode(sentences)
    to_compare_embeddings = np.mean(np.array(to_compare_embeddings_sentences), axis=0)
    
    vectors = []
    for i, doc in enumerate(all_docs):

        sentences = sent_tokenize(doc)
        embeddings_sentences = model.encode(sentences)
        embeddings = np.mean(np.array(embeddings_sentences), axis=0)

        vectors.append(embeddings)

    cosine_similarities = cosine_similarity([to_compare_embeddings], vectors).flatten()

    highest_score = 0
    highest_score_index = 0
    for i, score in enumerate(cosine_similarities):
        if highest_score < score:
            highest_score = score
            highest_score_index = i


    most_similar_document = documents_fable.index(all_docs[highest_score_index])
    
    print("Very similar document to "+ fables_files_name[j]+" by bert is "+fables_files_name[most_similar_document]+" with the score ", highest_score)


In [31]:
for i in range(12):
    all_docs = []
    for j in range(12):
        if(i!=j):
            all_docs.append(documents_fable[j])    
    bert_similar(documents_fable[i],all_docs,i)

Very similar document to The_Ass_and_the_Lapdog.txt by bert is The_Wolf_and_the_Crane.txt with the score  0.6624953
Very similar document to The_Cock_and_the_Pearl.txt by bert is The_Fox_and_the_Crow.txt with the score  0.7921164
Very similar document to The_Dog_and_the_Shadow.txt by bert is The_Wolf_and_the_Crane.txt with the score  0.7620533
Very similar document to The_Fox_and_the_Crow.txt by bert is The_Town_Mouse_and_the_Country_Mouse.txt with the score  0.8256444
Very similar document to The_Frogs_Desiring_a_King.txt by bert is The_Sick_Lion.txt with the score  0.79758346
Very similar document to The_Lions_Share.txt by bert is The_Sick_Lion.txt with the score  0.84790856
Very similar document to The_Lion_and_the_Mouse.txt by bert is The_Town_Mouse_and_the_Country_Mouse.txt with the score  0.8127598
Very similar document to The_Man_and_the_Serpent.txt by bert is The_Sick_Lion.txt with the score  0.82803726
Very similar document to The_Sick_Lion.txt by bert is The_Lions_Share.txt w

# using keras tensorflow

In [27]:
from tensorflow.keras.preprocessing.text import one_hot

In [28]:
voc_size = 10000

I thought to try tensorflow, but there is no label available to classify the text

# Conclusion

## I have tried 3 different approaches:
### Approach 1: to check the jaccard score: It is score range from 0 to1 using below formula: intersection(a,b)/union(a,b) where a and b are 2 different documents

Limitation of this approach is it is just checking common words, semantic meaning is missing

### Approach 2: to check the td idf cosine similarity score: check angle between 2 vectors. 

cosine similarity is good when duplication in data matters.

### Approach 3 is the recommended one: used bert : its best method in NLP to understand context heavy texts, Domain knowledge is also considered while selecting this approach

Recommended content: 

who viewed The_Ass_and_the_Lapdog should be recommended- The_Wolf_and_the_Crane, The_Lions_Share, The_Sick_Lion

who viewed The_Cock_and_the_Pearl should be recommended- The_Fox_and_the_Crow, The_Town_Mouse_and_the_Country_Mouse

who viewed The_Dog_and_the_Shadow should be recommended- The_Wolf_and_the_Crane, The_Lions_Share, The_Sick_Lion

who viewed The_Fox_and_the_Crow should be recommended- The_Town_Mouse_and_the_Country_Mouse

who viewed The_Frogs_Desiring_a_King should be recommended- The_Sick_Lion, The_Lions_Share

who viewed The_Lions_Share should be recommended- The_Sick_Lion

who viewed The_Lion_and_the_Mouse should be recommended- The_Town_Mouse_and_the_Country_Mouse, The_Fox_and_the_Crow

who viewed The_Man_and_the_Serpent should be recommended- The_Sick_Lion, The_Lions_Share

who viewed The_Sick_Lion should be recommended- The_Lions_Share

who viewed The_Town_Mouse_and_the_Country_Mouse should be recommended- The_Fox_and_the_Crow

who viewed The_Wolf_and_the_Crane should be recommended- The_Lions_Share, The_Sick_Lion

who viewed The_Wolf_and_the_Lamb should be recommended- The_Sick_Lion, The_Lions_Share