# Cosine Similarity

## Dependencies

In [283]:
import re
import os 
import sys
from urllib.request import urlopen
import requests
import pprint
import math


In [284]:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.grammar import DependencyGrammar
from nltk.parse import CoreNLPParser



In [285]:
from bs4 import BeautifulSoup
import pandas as pd
import seaborn as sns

In [286]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Functions

In [287]:
def get_text(path="../data/t1.txt", encoding="utf-8"):
    """Read a text file and split its contents into sentences.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. Defaults to "../data/t1.txt" (relative to the current
        working directory), which is the location this notebook originally
        hard-coded.
    encoding : str, optional
        Encoding used to decode the file. Passed explicitly so the result
        does not depend on the platform's default locale encoding.

    Returns
    -------
    list
        The sentences produced by ``sent_tokenize`` for the file's text.

    Notes
    -----
    Prints the number of sentences found as a side effect.
    """
    with open(path, encoding=encoding) as f:
        text = f.read()

    sentences = sent_tokenize(text)
    print(f'Number of sentences:{len(sentences)}')
    return sentences

In [288]:
def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two vectors.

    Parameters
    ----------
    vector1, vector2 : array-like of numbers
        Vectors of equal length. They are converted to float arrays so that
        narrow integer dtypes cannot overflow when squared or multiplied.

    Returns
    -------
    numpy.float64
        ``dot(v1, v2) / (|v1| * |v2|)``, clamped to [-1, 1]. Returns 0.0 when
        either vector has zero length, because the angle is undefined there
        and the unguarded division would yield nan with a RuntimeWarning.
    """
    v1 = np.asarray(vector1, dtype=float)
    v2 = np.asarray(vector2, dtype=float)

    norm_product = np.sqrt(np.sum(v1**2)) * np.sqrt(np.sum(v2**2))
    if norm_product == 0:
        return np.float64(0.0)

    # Floating-point rounding can leave the ratio a hair outside [-1, 1],
    # which would make a later math.acos() call raise ValueError.
    return np.clip(np.dot(v1, v2) / norm_product, -1.0, 1.0)

In [289]:
def get_dataframe(data, headers):
    """Build a DataFrame from ``data`` and label its columns with ``headers``.

    Parameters
    ----------
    data : any input accepted by ``pd.DataFrame``
        Typically a list of row tuples.
    headers : sequence
        Column labels, one per column, assigned positionally.

    Returns
    -------
    pandas.DataFrame
        The frame with ``headers`` as its columns. When ``data`` is empty the
        result is an empty frame that still carries the requested columns.
    """
    headers = list(headers)
    df = pd.DataFrame(data)

    # An empty input yields a 0x0 frame, and assigning N labels to zero
    # columns raises a length-mismatch ValueError. Create N placeholder
    # columns first so the positional assignment below still works.
    if df.shape == (0, 0):
        df = pd.DataFrame(columns=range(len(headers)))

    df.columns = headers

    return df



## Cosine similarity of 2 vectors

In [290]:
# Two hand-written 10-element vectors used to sanity-check cosine_similarity.
d1 = np.array((5, 0, 5, 6, 3, 9, 8, 7, 5, 6))
d2 = np.array((3, 0, 2, 4, 6, 9, 8, 5, 2, 1))
print(d1, d2, sep="\n")


[5 0 5 6 3 9 8 7 5 6]
[3 0 2 4 6 9 8 5 2 1]


In [291]:
# Sanity check: cosine similarity of the two hand-written vectors d1 and d2.
cosine_similarity(d1, d2)

0.9074362105351957

## Finding cosine similarity between documents in a corpus

In [292]:
# NOTE: despite its name, `text` holds the list of sentences returned by
# get_text(), not the raw file contents.
text = get_text()

Number of sentences:10


In [293]:
# One sentence per row; each row is handed to the vectorizer as a document.
corpus = pd.Series(text)
corpus

0    The oxygen tanks are made of lnconel (a nickel...
1    The cryogenic (ultra low temperature) tanks su...
2    The fuel sump tank occupies almost all of the ...
3    The Eagles are not going to the super bowl thi...
4           How do I purge the oxygen fuel cell lines?
5    Which breaker do I use to shutoff the fuel cel...
6    The Marine Corps has been part of the U.S. Dep...
7    From their inception during the American Revol...
8    The U.S. Armed Forces are considered the world...
9    The hydrogen tanks are made of brass and are a...
dtype: object

In [294]:
# Bag-of-words term counts per sentence, using CountVectorizer's built-in
# English stop-word list (stop_words='english').
vectorizer = CountVectorizer(stop_words='english')
bow_matrix = vectorizer.fit_transform(corpus)

In [295]:
# Show the concrete matrix type returned by fit_transform. The bare
# `bow_matrix` expression that used to precede this line was a no-op:
# only the last expression of a cell is displayed.
print(type(bow_matrix))

<class 'scipy.sparse.csr.csr_matrix'>


In [296]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older
# releases; .tolist() keeps the result a plain list, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names_count = vectorizer.get_feature_names_out().tolist()
else:
    feature_names_count = vectorizer.get_feature_names()
feature_names_count



['1834',
 '26',
 '30',
 'alloy',
 'american',
 'armed',
 'bowl',
 'brass',
 'breaker',
 'cell',
 'considered',
 'control',
 'corps',
 'cryogenic',
 'decisive',
 'department',
 'diameter',
 'eagles',
 'environmental',
 'forces',
 'fuel',
 'going',
 'history',
 'hydrogen',
 'inception',
 'inches',
 'june',
 'lines',
 'little',
 'lnconel',
 'low',
 'marine',
 'military',
 'navy',
 'nickelsteel',
 'occupies',
 'oxygen',
 'played',
 'powerful',
 'powerplant',
 'powerplants',
 'purge',
 'revolutionary',
 'role',
 'sector',
 'service',
 'shutoff',
 'sister',
 'space',
 'states',
 'subsystem',
 'sump',
 'super',
 'supply',
 'tank',
 'tanks',
 'temperature',
 'ultra',
 'united',
 'use',
 'war',
 'world',
 'year']

In [297]:
# Dense copy of the sparse count matrix: one row per sentence, one column
# per vocabulary term.
features_array_count = bow_matrix.toarray()
features_array_count

array([[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,

In [298]:
# (number of sentences, number of vocabulary terms)
bow_matrix.shape

(10, 63)

In [299]:
# Pairwise cosine similarity (and the corresponding angle in degrees) for
# every unordered pair of documents i < j.

# Densify once; the sparse -> dense conversion was previously repeated twice
# for every pair of documents.
doc_vectors = bow_matrix.toarray()
n_docs = doc_vectors.shape[0]

cs_data = []
for i in range(n_docs):
    for j in range(i + 1, n_docs):
        cs = cosine_similarity(doc_vectors[i], doc_vectors[j])
        # Rounding can leave cs marginally outside [-1, 1], where math.acos
        # raises ValueError. Clamp only the value passed to acos so the
        # printed similarity is unchanged (np.clip also propagates nan).
        cs_angle = round(math.degrees(math.acos(np.clip(cs, -1.0, 1.0))), 3)

        cs_data.append((i, j, cs_angle))

        print(f"The cosine similarity between the documents {i}, and {j} is: {cs} (angle: {cs_angle} degrees)")





The cosine similarity between the documents 0, and 1 is: 0.242535625036333 (angle: 75.964 degrees)
The cosine similarity between the documents 0, and 2 is: 0.0 (angle: 90.0 degrees)
The cosine similarity between the documents 0, and 3 is: 0.0 (angle: 90.0 degrees)
The cosine similarity between the documents 0, and 4 is: 0.14907119849998599 (angle: 81.427 degrees)
The cosine similarity between the documents 0, and 5 is: 0.0 (angle: 90.0 degrees)
The cosine similarity between the documents 0, and 6 is: 0.0 (angle: 90.0 degrees)
The cosine similarity between the documents 0, and 7 is: 0.0 (angle: 90.0 degrees)
The cosine similarity between the documents 0, and 8 is: 0.0 (angle: 90.0 degrees)
The cosine similarity between the documents 0, and 9 is: 0.629940788348712 (angle: 50.954 degrees)
The cosine similarity between the documents 1, and 2 is: 0.09901475429766744 (angle: 84.318 degrees)
The cosine similarity between the documents 1, and 3 is: 0.0 (angle: 90.0 degrees)
The cosine similari

In [300]:
# Give the two sentence-index columns distinct labels: with the same label
# used twice, cs_df['sentence'] returns both columns and is ambiguous.
cs_df = get_dataframe(cs_data, ['sentence_i', 'sentence_j', 'cs_angle'])
# Smallest angle first, i.e. the most similar sentence pairs at the top.
# Reassigning instead of sorting in place keeps the cell safe to re-run.
cs_df = cs_df.sort_values(by='cs_angle')
cs_df

Unnamed: 0,sentence,sentence.1,cs_angle
8,0,9,50.954
11,1,4,64.287
30,4,5,68.583
0,0,1,75.964
42,7,8,76.367
12,1,5,78.578
16,1,9,79.436
18,2,4,79.48
19,2,5,80.406
39,6,7,81.124
