# Cosine Similarity

## Dependencies

In [663]:
import re
import os 
import sys
from urllib.request import urlopen
import requests
import pprint
import math


In [664]:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.grammar import DependencyGrammar
from nltk.parse import CoreNLPParser
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer



In [665]:
from bs4 import BeautifulSoup
import pandas as pd
import seaborn as sns

In [666]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Functions

In [667]:
def get_text(stemmer_on=False, path="../data/t1.txt", encoding=None):
    """Read a text file and return its lower-cased sentences.

    The file is split into sentences with NLTK's ``sent_tokenize`` after
    lower-casing. When stemming is requested, every sentence is word-tokenized,
    each token is reduced with ``LancasterStemmer``, and the stems are joined
    back together with single spaces.

    Args:
        stemmer_on: When truthy, stem every token of every sentence.
        path: File to read. Defaults to the sample corpus used so far.
        encoding: Passed to ``open``; ``None`` keeps the platform default.

    Returns:
        A list of sentence strings (stemmed if ``stemmer_on`` is truthy).

    Side effects:
        Prints the number of sentences found.
    """
    with open(path, encoding=encoding) as f:
        text = f.read()

    sentences = sent_tokenize(text.lower())
    if stemmer_on:
        stemmer = LancasterStemmer()
        # Build the stemmed list in one pass and rebind only afterwards; the
        # previous version rebound `sentences` inside the loop iterating it.
        # NOTE: a WordNetLemmatizer (pos='v') variant was tried here earlier
        # as an alternative to stemming.
        sentences = [
            ' '.join(stemmer.stem(token) for token in word_tokenize(sentence))
            for sentence in sentences
        ]

    print(f'Number of sentences:{len(sentences)}')

    return sentences

In [668]:
def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two vectors.

    Args:
        vector1: Sequence or array of numbers.
        vector2: Sequence or array of numbers, same length as ``vector1``.

    Returns:
        The cosine similarity, guaranteed to lie in ``[-1, 1]`` so it can be
        passed straight to ``math.acos``. If either vector has zero length the
        similarity is undefined and ``0.0`` is returned instead of NaN.
    """
    # Work in float so integer inputs cannot overflow when squared.
    a = np.asarray(vector1, dtype=float)
    b = np.asarray(vector2, dtype=float)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # A zero vector has no direction; avoid 0/0 -> NaN and the warning.
        return np.float64(0.0)

    # Rounding can yield e.g. 1.0000000000000002 for parallel vectors, which
    # would make acos() raise; clip back into the valid cosine range.
    return np.clip(np.dot(a, b) / denominator, -1.0, 1.0)

In [669]:
def get_dataframe(data, headers):
    """Build a DataFrame from row data and label its columns.

    Args:
        data: Anything ``pd.DataFrame`` accepts, typically a list of tuples.
        headers: Column labels, one per column of ``data``.

    Returns:
        A DataFrame whose columns are named by ``headers``. When ``data``
        produces no columns at all (e.g. an empty list), an empty frame
        carrying the requested headers is returned.

    Raises:
        ValueError: If ``data`` has columns but their count differs from
            ``len(headers)``.
    """
    df = pd.DataFrame(data)
    if df.shape[1] == 0:
        # Assigning N labels to a zero-column frame raises a length mismatch,
        # so build the empty, correctly-labelled frame directly.
        return pd.DataFrame(index=df.index, columns=headers)

    df.columns = headers
    return df



## Cosine similarity of 2 vectors

In [670]:
# Two sample 10-dimensional vectors used to demonstrate cosine_similarity.
d1 = np.array((5, 0, 5, 6, 3, 9, 8, 7, 5, 6))
d2 = np.array((3, 0, 2, 4, 6, 9, 8, 5, 2, 1))
print(d1, d2, sep='\n')


[5 0 5 6 3 9 8 7 5 6]
[3 0 2 4 6 9 8 5 2 1]


In [671]:
# Cosine similarity of the two sample vectors d1 and d2.
cosine_similarity(d1, d2)

0.9074362105351957

## Finding cosine similarity between documents in a corpus

In [672]:
# Load the corpus sentences from the data file, with stemming enabled.
text = get_text(stemmer_on=True)

Number of sentences:5


In [673]:
# Wrap the sentence list in a pandas Series; one entry per sentence.
corpus = pd.Series(text)
# Bare name as the last expression so the notebook displays the Series.
corpus

0            what is press of nitrog tank ?
1    what is max press in the hydrog tank ?
2           what is max press of pump h-6 ?
3                cur press of nitrog tank .
4                cur press of hydrog tank .
dtype: object

In [674]:
# vectorizer = CountVectorizer(stop_words='english')
# bow_matrix = vectorizer.fit_transform(corpus)

In [675]:
# TF-IDF weighting with L2-normalised rows and English stop words removed.
vectorizer=TfidfVectorizer(norm='l2',stop_words='english')
# NOTE: despite its name, bow_matrix holds TF-IDF weights, not raw
# bag-of-words counts (the CountVectorizer variant is commented out above).
bow_matrix=vectorizer.fit_transform(corpus)

In [676]:
# This bare expression is not the last statement of the cell, so it is not
# displayed; only the print() below produces output.
bow_matrix
# Show the concrete type returned by fit_transform.
print(type(bow_matrix))

<class 'scipy.sparse.csr.csr_matrix'>


In [677]:
# Vocabulary terms, in the same order as the columns of bow_matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when available and fall back on old releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() keeps the result a plain list of str, as before.
    feature_names_count = vectorizer.get_feature_names_out().tolist()
else:
    feature_names_count = vectorizer.get_feature_names()
feature_names_count



['cur', 'hydrog', 'max', 'nitrog', 'press', 'pump', 'tank']

In [678]:
# Convert the sparse document-term matrix to a dense array for inspection;
# each row is a sentence, each column a vocabulary term.
features_array_count = bow_matrix.toarray()
features_array_count

array([[0.        , 0.        , 0.        , 0.73792244, 0.43582888,
        0.        , 0.51528988],
       [0.        , 0.59376229, 0.59376229, 0.        , 0.35068557,
        0.        , 0.41462311],
       [0.        , 0.        , 0.58873218, 0.        , 0.34771471,
        0.72971837, 0.        ],
       [0.59376229, 0.        , 0.        , 0.59376229, 0.35068557,
        0.        , 0.41462311],
       [0.59376229, 0.59376229, 0.        , 0.        , 0.35068557,
        0.        , 0.41462311]])

In [679]:
# (number of sentences, number of vocabulary terms)
bow_matrix.shape

(5, 7)

In [680]:
# Collect (i, j, angle_in_degrees) for every unordered pair of sentences i < j.
cs_data = []

# Densify the sparse matrix once; doing it inside the loop rebuilt the whole
# array twice for every pair.
dense_rows = bow_matrix.toarray()
n_docs = dense_rows.shape[0]

for i in range(n_docs):
    for j in range(i + 1, n_docs):
        cs = cosine_similarity(dense_rows[i], dense_rows[j])
        # Floating-point rounding can push the cosine marginally outside
        # [-1, 1], which would make math.acos raise ValueError. np.clip
        # leaves NaN untouched, so undefined similarities stay visible.
        cs = float(np.clip(cs, -1.0, 1.0))
        cs_angle = round(math.degrees(math.acos(cs)), 3)

        cs_data.append((i, j, cs_angle))

        # print(f"The cosine similarity between the documents {i}, and {j} is: {cs} (angle: {cs_angle} degrees)")





In [681]:
# Tabulate the pairwise angles, then order them per first sentence from the
# most similar (smallest angle) to the least similar.
cs_df = get_dataframe(cs_data, ['sentence_a', 'sentence_b', 'cs_angle'])
cs_df.sort_values(by=['sentence_a', 'cs_angle'], inplace=True)

# Row filter on the sentence indices; the result is the cell's displayed output.
pair_mask = (cs_df['sentence_b'] > 0) & (cs_df['sentence_a'] >= 0)
cs_df[pair_mask]
# Alternative filters kept for experimentation:
# cs_df[(cs_df.cs_angle<90.00) & (cs_df.sentence_b>0) & (cs_df.sentence_a==0)]
# cs_df[(cs_df.cs_angle<90.00)]

Unnamed: 0,sentence_a,sentence_b,cs_angle
2,0,3,36.424
0,0,1,68.501
3,0,4,68.501
1,0,2,81.284
6,1,4,49.651
4,1,2,61.868
5,1,3,72.849
7,2,3,82.996
8,2,4,82.996
9,3,4,49.651
