In [1]:
#import libraries
import math
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from num2words import num2words
from nltk.tokenize import word_tokenize

In [2]:
# Import Module
import os

# Folder Path
# NOTE(review): absolute, machine-specific Windows path -- the notebook only
# runs on a machine where this folder exists; consider making it configurable.
path = r"C:\Users\dkish\OneDrive\Desktop\Text Similarity Check"

# Change the directory
# Makes `path` the process working directory for the rest of the notebook.
os.chdir(path)

# Read text File
# Module-level accumulator: read_text_file() appends each file's contents here.
doc_list = []

def read_text_file(file_path):
    """Read the whole file at ``file_path`` and append its text to ``doc_list``.

    The file is opened in text mode with the platform's default encoding.
    Nothing is returned; the module-level ``doc_list`` is mutated instead.
    """
    with open(file_path, mode='r') as handle:
        contents = handle.read()
        doc_list.append(contents)
        
# Iterate over the text files in the documents folder.
# column_name holds one label per document that is actually loaded, so it
# stays aligned with doc_list (it is used later as DataFrame column labels).
column_name = []

# os.listdir() returns entries in arbitrary order; sort them so the document
# order (and therefore every matrix below) is reproducible. Listing `path`
# explicitly avoids depending on the current working directory.
for file in sorted(os.listdir(path)):
    # Check whether file is in text format or not; skip anything else so
    # stray files in the folder do not add bogus labels.
    if not file.endswith(".txt"):
        continue

    # Strip only the final extension ("my.notes.txt" -> "my.notes").
    column_name.append(os.path.splitext(file)[0])

    # os.path.join replaces the f"{path}\{file}" form, which relied on the
    # invalid escape sequence "\{".
    file_path = os.path.join(path, file)
    print(file_path)

    # call read text file function
    read_text_file(file_path)

C:\Users\dkish\OneDrive\Desktop\Text Similarity Check\doc1.txt
C:\Users\dkish\OneDrive\Desktop\Text Similarity Check\doc2.txt
C:\Users\dkish\OneDrive\Desktop\Text Similarity Check\doc3.txt
C:\Users\dkish\OneDrive\Desktop\Text Similarity Check\doc4.txt


In [3]:
# Display the raw text of every document that was loaded
doc_list

['The best Italian restaurant enjoy the best pasta.',
 'American restaurant enjoy the best hamburger.',
 'Korean restaurant enjoy the best bibimbap.',
 'The best the best American restaurant.']

In [4]:
# Display the labels derived from the file names
column_name

['doc1', 'doc2', 'doc3', 'doc4']

# PREPROCESSING

LOWERCASE

In [5]:
def convert_lower_case(data_list):
    """Return a new list in which every document has been lower-cased.

    Each element of ``data_list`` is passed through ``np.char.lower`` and
    converted back to a plain ``str``; the input list is not modified.
    """
    return [str(np.char.lower(document)) for document in data_list]

PUNCTUATION

In [6]:
def remove_punctuation(data_list):
    """Return a new list with punctuation in every document replaced by a space.

    Each punctuation character (and newline) becomes a single space, so runs
    of punctuation produce runs of spaces. Apostrophes are deliberately left
    alone; they are handled separately by ``remove_apostrophe``.

    Parameters
    ----------
    data_list : iterable of str
        Documents to clean.

    Returns
    -------
    list of str
        One cleaned string per input document.
    """
    # The comma was missing from the original symbol list, so commas survived
    # preprocessing. The backslash is written as "\\" because "\]" is an
    # invalid escape sequence.
    symbols = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\n"

    # One translation table gives a single pass per document instead of one
    # full-string replace per symbol.
    to_space = str.maketrans(symbols, " " * len(symbols))

    return [str(doc).translate(to_space) for doc in data_list]

APOSTROPHE

In [7]:
def remove_apostrophe(data_list):
    """Return a new list with every apostrophe deleted from each document.

    Apostrophes are removed outright (not replaced by a space), so
    contractions collapse into a single token, e.g. "don't" -> "dont".
    """
    return [str(np.char.replace(document, "'", "")) for document in data_list]

SINGLE CHARACTERS

In [8]:
def remove_single_characters(data_list):
    """Return a new list with all tokens shorter than two characters removed.

    Documents are split on single spaces, so consecutive spaces produce empty
    tokens; those are dropped as well, which collapses runs of spaces into
    one.

    Parameters
    ----------
    data_list : iterable of str
        Documents to filter.

    Returns
    -------
    list of str
        One filtered, space-joined string per input document.
    """
    doc_list = []
    for doc in data_list:
        # Filter on length directly. The previous version collected short
        # tokens into a list shared across documents and then tested list
        # membership for every token, which gives the same result at a much
        # higher cost.
        kept_tokens = [token for token in str(doc).split(" ") if len(token) >= 2]
        doc_list.append(" ".join(kept_tokens))
    return doc_list

CONVERT NUMBERS

In [9]:
def convert_numbers(data_list):
    """Return a new list with whole-number tokens spelled out in words.

    Each document is split on single spaces and every token made up only of
    decimal digits is replaced by ``num2words`` of its integer value. All
    other tokens are kept unchanged.

    Parameters
    ----------
    data_list : iterable of str
        Documents to convert.

    Returns
    -------
    list of str
        One converted, space-joined string per input document.
    """
    doc_list = []
    for doc in data_list:
        # Convert token by token. The previous whole-document substring
        # replace corrupted other tokens that merely contained the digits
        # (e.g. "1 10" became "one one0").
        converted = [
            # isdecimal() accepts only characters int() can parse; isdigit()
            # also accepts characters such as the superscript two.
            num2words(int(token)) if token.isdecimal() else token
            for token in str(doc).split(" ")
        ]
        doc_list.append(" ".join(converted))
    return doc_list

STOP WORDS

In [10]:
def remove_stop_words(data_list):
    """Return a new list with NLTK's English stop words removed from each document.

    Documents are split on single spaces; tokens found in
    ``stopwords.words('english')`` are dropped and the remainder is re-joined
    with single spaces. Matching is exact, so it is case-sensitive.
    """
    english_stop_words = stopwords.words('english')

    def _drop_stop_words(document):
        # Keep only the tokens that are not in the stop-word list.
        kept = [
            token
            for token in str(document).split(" ")
            if token not in english_stop_words
        ]
        return " ".join(kept)

    return [_drop_stop_words(document) for document in data_list]

STEMMING

In [11]:
def stemming(data_list):
    """Return a new list with every token reduced to its Porter stem.

    Each document is split on single spaces, every token is stemmed
    independently with NLTK's ``PorterStemmer``, and the stems are re-joined
    with single spaces.

    Parameters
    ----------
    data_list : iterable of str
        Documents to stem.

    Returns
    -------
    list of str
        One stemmed string per input document.
    """
    porter = PorterStemmer()

    doc_list = []
    for doc in data_list:
        # Stem token by token. The previous whole-document substring replace
        # also rewrote longer words containing a shorter token
        # (e.g. "cats catsup" became "cat catup").
        stems = [porter.stem(word) for word in str(doc).split(" ")]
        doc_list.append(" ".join(stems))
    return doc_list

In [12]:
def preprocess(data):
    """Run the full text-cleaning pipeline over a list of documents.

    The steps are applied in order, each receiving the previous step's
    output. Punctuation removal and number conversion run a second time at
    the end, after stemming.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,
        remove_apostrophe,
        remove_single_characters,
        convert_numbers,
        remove_stop_words,
        stemming,
        remove_punctuation,
        convert_numbers,
    )

    for step in pipeline:
        data = step(data)

    return data

In [13]:
# Run the cleaning pipeline on the raw documents; data1 is the corpus that
# both vectorizers below are fitted on.
data1 = preprocess(doc_list)
print(data1)

['best italian restaur enjoy best pasta', 'american restaur enjoy best hamburg', 'korean restaur enjoy best bibimbap', 'best best american restaur']


# TF-IDF CALCULATION

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words representation: fit the vocabulary on the preprocessed
# documents and get one row of raw term counts per document.
count_vectorizer = CountVectorizer()
vector_matrix = count_vectorizer.fit_transform(data1)
# Display the (sparse) document-term matrix
vector_matrix

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [15]:
# Vocabulary terms, in the same order as the columns of vector_matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(count_vectorizer, "get_feature_names_out"):
    # .tolist() gives a plain list of str, matching the old return type
    tokens = count_vectorizer.get_feature_names_out().tolist()
else:
    tokens = count_vectorizer.get_feature_names()
tokens

['american',
 'best',
 'bibimbap',
 'enjoy',
 'hamburg',
 'italian',
 'korean',
 'pasta',
 'restaur']

In [16]:
# Densify the sparse count matrix to inspect it (rows = documents, columns = tokens)
vector_matrix.toarray()

array([[0, 2, 0, 1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 1, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1],
       [1, 2, 0, 0, 0, 0, 0, 0, 1]], dtype=int64)

In [17]:
def create_dataframe(matrix, tokens):
    """Wrap ``matrix`` in a DataFrame with labelled rows and columns.

    Rows are labelled ``doc_1``, ``doc_2``, ... (one label per row yielded by
    iterating over ``matrix``); columns are labelled with ``tokens``.
    """
    row_labels = []
    for position, _row in enumerate(matrix, start=1):
        row_labels.append(f'doc_{position}')

    frame = pd.DataFrame(data=matrix, index=row_labels, columns=tokens)
    return frame

In [18]:
# Show the term counts as a labelled table (one row per document, one column per token)
create_dataframe(vector_matrix.toarray(),tokens)

Unnamed: 0,american,best,bibimbap,enjoy,hamburg,italian,korean,pasta,restaur
doc_1,0,2,0,1,0,1,0,1,1
doc_2,1,1,0,1,1,0,0,0,1
doc_3,0,1,1,1,0,0,1,0,1
doc_4,1,2,0,0,0,0,0,0,1


In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the documents' raw term-count vectors
cosine_similarity_matrix = cosine_similarity(vector_matrix)

# Columns are labelled with the file-derived names, rows with create_dataframe's
# doc_N labels. column_name must contain exactly one entry per document.
create_dataframe(cosine_similarity_matrix,column_name)

Unnamed: 0,doc1,doc2,doc3,doc4
doc_1,1.0,0.632456,0.632456,0.721688
doc_2,0.632456,1.0,0.6,0.730297
doc_3,0.632456,0.6,1.0,0.547723
doc_4,0.721688,0.730297,0.547723,1.0


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-vectorize the same preprocessed corpus with TF-IDF weights.
# Note: this rebinds vector_matrix and tokens from the count-based cells above.
Tfidf_vect = TfidfVectorizer()
vector_matrix = Tfidf_vect.fit_transform(data1)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(Tfidf_vect, "get_feature_names_out"):
    # .tolist() gives a plain list of str, matching the old return type
    tokens = Tfidf_vect.get_feature_names_out().tolist()
else:
    tokens = Tfidf_vect.get_feature_names()
create_dataframe(vector_matrix.toarray(),tokens)

Unnamed: 0,american,best,bibimbap,enjoy,hamburg,italian,korean,pasta,restaur
doc_1,0.0,0.537595,0.0,0.328778,0.0,0.515094,0.0,0.515094,0.268798
doc_2,0.49145,0.325285,0.0,0.397871,0.623342,0.0,0.0,0.0,0.325285
doc_3,0.0,0.303722,0.582021,0.371496,0.0,0.0,0.582021,0.0,0.303722
doc_4,0.559849,0.741117,0.0,0.0,0.0,0.0,0.0,0.0,0.370559


In [21]:
# Pairwise cosine similarity between the documents' TF-IDF vectors
cosine_similarity_matrix = cosine_similarity(vector_matrix)
# Keep the labelled similarity table in df; it is styled in a later cell
df = create_dataframe(cosine_similarity_matrix,column_name)
df

Unnamed: 0,doc1,doc2,doc3,doc4
doc_1,1.0,0.393119,0.367059,0.498026
doc_2,0.393119,1.0,0.345401,0.63675
doc_3,0.367059,0.345401,1.0,0.337641
doc_4,0.498026,0.63675,0.337641,1.0


In [22]:
def color(value):
    """Return a CSS ``color`` declaration for a similarity score.

    Scores above 0.70 are red, scores below 0.40 are green, and everything
    else (both thresholds included) is grey.
    """
    if value > 0.70:
        return 'color: %s' % 'red'
    if value < .40:
        return 'color: %s' % 'green'
    return 'color: %s' % 'grey'

In [23]:
# Colour every similarity score with color().
# Styler.applymap was deprecated in pandas 2.1 in favour of Styler.map;
# use whichever method this pandas version provides.
styler = df.style
colorize = styler.map if hasattr(styler, "map") else styler.applymap
colorize(color, subset=df.columns)

Unnamed: 0,doc1,doc2,doc3,doc4
doc_1,1.0,0.393119,0.367059,0.498026
doc_2,0.393119,1.0,0.345401,0.63675
doc_3,0.367059,0.345401,1.0,0.337641
doc_4,0.498026,0.63675,0.337641,1.0
