# Cosine Similarity

## Dependencies

In [103]:
import re
import os 
import sys
from urllib.request import urlopen
import requests
import pprint
import math


In [104]:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.grammar import DependencyGrammar
from nltk.parse import CoreNLPParser
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer



In [105]:
from bs4 import BeautifulSoup
import pandas as pd
import seaborn as sns

In [106]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Functions

In [107]:
def get_text(stemmer_on=False):
    """Read ``../data/t1.txt`` and return its lower-cased sentences.

    The file content is lower-cased and split into sentences with
    ``sent_tokenize``. When ``stemmer_on`` is truthy, each sentence is
    word-tokenized, every token is stemmed with ``LancasterStemmer`` and the
    stems are joined back together with single spaces.

    Args:
        stemmer_on: Stem every token of every sentence when truthy.

    Returns:
        One string per sentence, stemmed if requested.

    Side effects:
        Prints the number of sentences found.
    """
    # NOTE(review): the file is opened with the platform default encoding.
    with open("../data/t1.txt") as f:
        text = f.read()

    sentences = sent_tokenize(text.lower())
    if stemmer_on:
        stemmer = LancasterStemmer()
        # Build the stemmed list in one pass and rebind once, instead of
        # rebinding ``sentences`` on every loop iteration.
        sentences = [
            ' '.join(stemmer.stem(word) for word in word_tokenize(sentence))
            for sentence in sentences
        ]

    print(f'Number of sentences:{len(sentences)}')

    return sentences

In [108]:
def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two equal-length vectors.

    Computed as ``dot(v1, v2) / (||v1|| * ||v2||)`` with Euclidean norms.

    Args:
        vector1: Array-like of numbers.
        vector2: Array-like of numbers with the same length as ``vector1``.

    Returns:
        The similarity as a NumPy float in ``[-1, 1]``. If either vector has
        zero norm the angle is undefined and ``0.0`` is returned instead of
        ``nan``.

    Raises:
        ValueError: If the shapes are incompatible for ``np.dot``.
    """
    # Work in floating point so large integer inputs cannot overflow when
    # squared inside the norm.
    vector1 = np.asarray(vector1, dtype=float)
    vector2 = np.asarray(vector2, dtype=float)

    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        # A zero vector has no direction; avoid the 0/0 division.
        return np.float64(0.0)

    # Rounding can push the ratio marginally outside [-1, 1], which would
    # break a later math.acos call, so clamp it to the valid range.
    return np.clip(np.dot(vector1, vector2) / norm_product, -1.0, 1.0)

In [109]:
def get_dataframe(data, headers):
    """Build a DataFrame from ``data`` and label its columns with ``headers``.

    Args:
        data: Anything accepted by ``pd.DataFrame`` (e.g. a list of tuples).
        headers: Column labels, one per column of ``data``.

    Returns:
        A DataFrame whose columns are ``headers``. When ``data`` is empty the
        result is an empty frame that still carries the requested columns.

    Raises:
        ValueError: If ``data`` is non-empty and its column count does not
            match ``len(headers)``.
    """
    df = pd.DataFrame(data)
    if df.shape == (0, 0):
        # An empty input yields a frame with zero columns; assigning headers
        # to it would raise a length-mismatch error, so build it directly.
        return pd.DataFrame(columns=list(headers))

    df.columns = headers
    return df



## Cosine similarity of 2 vectors

In [110]:
# Two sample 10-element integer vectors, shown one per line.
d1 = np.array([5, 0, 5, 6, 3, 9, 8, 7, 5, 6])
d2 = np.array([3, 0, 2, 4, 6, 9, 8, 5, 2, 1])
print(d1, d2, sep="\n")


[5 0 5 6 3 9 8 7 5 6]
[3 0 2 4 6 9 8 5 2 1]


In [111]:
# Cosine similarity of the two sample vectors d1 and d2.
cosine_similarity(d1, d2)

0.9074362105351957

## Finding cosine similarity between documents in a corpus

In [112]:
# Load the sentences from the data file without stemming.
text = get_text(stemmer_on=False)

Number of sentences:8


In [113]:
# Wrap the sentences in a Series so each one is addressable by its index.
corpus = pd.Series(text)
corpus

0     assy5.
1    cell21.
2      pit2.
3       dc2.
4        sg.
5    cell20.
6      pit5.
7      pit4.
dtype: object

In [114]:
# vectorizer = CountVectorizer(stop_words='english')
# bow_matrix = vectorizer.fit_transform(corpus)

In [115]:
# TF-IDF features over single words (unigrams) with English stop words
# removed and every document row L2-normalised.
# NOTE: despite its name, bow_matrix holds TF-IDF weights, not raw counts.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words='english',
    norm='l2',
)
bow_matrix = vectorizer.fit_transform(corpus)

In [116]:
# The bare expression is not the last statement of the cell, so only the
# printed type shows up in the cell output.
bow_matrix
print(type(bow_matrix))

<class 'scipy.sparse.csr.csr_matrix'>


In [117]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    # .tolist() turns the returned ndarray into a plain list of str.
    feature_names_count = vectorizer.get_feature_names_out().tolist()
else:
    feature_names_count = vectorizer.get_feature_names()
feature_names_count



['assy5', 'cell20', 'cell21', 'dc2', 'pit2', 'pit4', 'pit5', 'sg']

In [118]:
# Dense copy of the sparse document-term matrix, for inspection.
features_array_count = bow_matrix.toarray()
features_array_count

array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.]])

In [119]:
# Shape of the document-term matrix.
bow_matrix.shape

(8, 8)

In [120]:
# Densify the sparse matrix once; converting it inside the inner loop repeats
# the same work twice for every pair of documents.
dense_rows = bow_matrix.toarray()
n_docs = dense_rows.shape[0]

# One (i, j, angle_in_degrees) tuple per unordered pair of documents i < j.
cs_data = []
for i in range(n_docs):
    for j in range(i + 1, n_docs):
        cs = cosine_similarity(dense_rows[i], dense_rows[j])
        # Rounding can leave the cosine marginally outside [-1, 1], which
        # makes math.acos raise ValueError; clamp it first (nan is kept).
        cs = float(np.clip(cs, -1.0, 1.0))
        cs_angle = round(math.degrees(math.acos(cs)), 3)

        cs_data.append((i, j, cs_angle))





In [121]:
# Table of pairwise angles, ordered by the first sentence index and then by
# increasing angle.
cs_df = get_dataframe(cs_data, ['sentence_a', 'sentence_b', 'cs_angle'])
cs_df = cs_df.sort_values(by=['sentence_a', 'cs_angle'])

# Example filters kept for interactive exploration:
# cs_df[(cs_df.sentence_b>2) & (cs_df.sentence_a<3)]
# cs_df[(cs_df.cs_angle<90.00) & (cs_df.sentence_b>17) & (cs_df.sentence_a<=17)]
# cs_df[(cs_df.cs_angle<90.00)]

cs_df

Unnamed: 0,sentence_a,sentence_b,cs_angle
0,0,1,90.0
1,0,2,90.0
2,0,3,90.0
3,0,4,90.0
4,0,5,90.0
5,0,6,90.0
6,0,7,90.0
7,1,2,90.0
8,1,3,90.0
9,1,4,90.0


In [122]:
# Look up the text of the second sentence of every pair by its index label.
corpus.loc[cs_df['sentence_b']]


1    cell21.
2      pit2.
3       dc2.
4        sg.
5    cell20.
6      pit5.
7      pit4.
2      pit2.
3       dc2.
4        sg.
5    cell20.
6      pit5.
7      pit4.
3       dc2.
4        sg.
5    cell20.
6      pit5.
7      pit4.
4        sg.
5    cell20.
6      pit5.
7      pit4.
5    cell20.
6      pit5.
7      pit4.
6      pit5.
7      pit4.
7      pit4.
dtype: object