# Chapter 10: Sample Notebook

This notebook contains all code from Chapter 10: _Measuring Text Similarity_.

## 10.2 Text Similarity Measure for Long Text: Cosine Similarity

### 10.2.2 Representing text as a vector in Python

In [1]:
import nltk
# download NLTK's stopwords corpus (used to filter out 
# stop words)
nltk.download('stopwords')
# download NLTK's punkt tokenizer models (used by 
# word_tokenize)
nltk.download('punkt')
# recent NLTK releases load the tokenizer models from 
# the 'punkt_tab' resource instead of 'punkt'; download 
# it as well so that word_tokenize works regardless of 
# the installed NLTK version
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /home/roman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/roman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# string.punctuation holds the ASCII punctuation 
# characters
from string import punctuation

# extend that collection with the typographic (curly) 
# apostrophe so it can be filtered out as well
punctuation_w_apostrophe = "".join((punctuation, "’"))

# display every character in the extended collection
print(punctuation_w_apostrophe)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~’


In [3]:
# imports word tokenizer from NLTK
from nltk import word_tokenize
# imports list of stop words from NLTK
from nltk.corpus import stopwords
# imports Porter Stemmer module from NLTK
from nltk.stem import PorterStemmer

# instantiates the Porter stemmer used to reduce words 
# to their stems
stemmer = PorterStemmer()
# collects the English stop words into a set (not a 
# list) so that membership tests are fast
set_stopwords = {*stopwords.words('english')}

# creates a custom tokenizer that removes stop words, 
# punctuation, and stems the remaining words
def custom_tokenizer(text:str) -> list:
    """Split text into lower-cased, stemmed word tokens.

    The text is lower-cased and tokenized with NLTK's 
    word_tokenize; tokens found in set_stopwords and 
    tokens consisting solely of characters from 
    punctuation_w_apostrophe are dropped; the remaining 
    tokens are stemmed with the module-level stemmer.

    Args:
        text: the document to tokenize.

    Returns:
        A list of stemmed tokens, in document order.
    """
    # gets all tokens (words) from the lower-cased 
    # input text
    tokens = word_tokenize(text.lower())
    # filters out stop words
    no_sw_tokens = [t for t in tokens 
                    if t not in set_stopwords]
    # filters out punctuation tokens. Each character is 
    # checked individually: testing the whole token with 
    # `t in punctuation_w_apostrophe` is a substring test 
    # that lets multi-character punctuation tokens such 
    # as '--', '...' or the quote tokens `` and '' through
    no_sw_punct_tokens = [
        t for t in no_sw_tokens 
        if not all(ch in punctuation_w_apostrophe 
                   for ch in t)
    ]
    # stems the remaining words and returns them
    return [stemmer.stem(t) for t in no_sw_punct_tokens]

In [4]:
# business description excerpts taken from each 
# company's 2018 10-K filing
# Verizon Communications Inc.
doc_verizon = """Verizon Communications Inc. (Verizon or the Company) is a holding company that, acting through its subsidiaries, is one of the world’s leading providers of communications, information and entertainment products and services to consumers, businesses and governmental agencies."""
# AT&T Inc.
doc_att = """We are a leading provider of communications and digital entertainment services in the United States and the world. We offer our services and products to consumers in the U.S., Mexico and Latin America and to businesses and other providers of telecommunications services worldwide."""
# Sprint Corporation
doc_sprint = """Sprint Corporation, including its consolidated subsidiaries, is a communications company offering a comprehensive range of wireless and wireline communications products and services that are designed to meet the needs of individual consumers, businesses, government subscribers and resellers."""

# runs the custom tokenizer on each excerpt
tokens_verizon = custom_tokenizer(doc_verizon)
tokens_att = custom_tokenizer(doc_att)
tokens_sprint = custom_tokenizer(doc_sprint)

# prints the resulting token lists, one company per line
for company_tokens in (tokens_verizon, tokens_att, tokens_sprint):
    print(company_tokens)

['verizon', 'commun', 'inc.', 'verizon', 'compani', 'hold', 'compani', 'act', 'subsidiari', 'one', 'world', 'lead', 'provid', 'commun', 'inform', 'entertain', 'product', 'servic', 'consum', 'busi', 'government', 'agenc']
['lead', 'provid', 'commun', 'digit', 'entertain', 'servic', 'unit', 'state', 'world', 'offer', 'servic', 'product', 'consum', 'u.s.', 'mexico', 'latin', 'america', 'busi', 'provid', 'telecommun', 'servic', 'worldwid']
['sprint', 'corpor', 'includ', 'consolid', 'subsidiari', 'commun', 'compani', 'offer', 'comprehens', 'rang', 'wireless', 'wirelin', 'commun', 'product', 'servic', 'design', 'meet', 'need', 'individu', 'consum', 'busi', 'govern', 'subscrib', 'resel']


In [5]:
# CountVectorizer converts text to bag-of-words vectors
from sklearn.feature_extraction.text import CountVectorizer

# creates a list of three documents; one for each 
# company
documents = [doc_verizon,doc_att,doc_sprint]

# creates a CountVectorizer object with the custom 
# tokenizer
count_vectorizer = CountVectorizer(tokenizer=custom_tokenizer)

# converts text documents to bag-of-words vectors
count_vecs = count_vectorizer.fit_transform(documents)

# prints first ten bag-of-words features (words).
# get_feature_names() was deprecated in scikit-learn 1.0 
# and removed in 1.2; get_feature_names_out() replaces 
# it (requires scikit-learn 1.0+). It returns a NumPy 
# array, so .tolist() keeps the output a plain Python 
# list of strings
print(count_vectorizer.get_feature_names_out()[:10].tolist())

# prints first ten bag-of-words elements (counts) for 
# each vector. The output is a matrix where each row 
# represents a document vector; the element (count) 
# order in each vector corresponds to the order of 
# the bag-of-words features
print(count_vecs.toarray()[:,:10])

['act', 'agenc', 'america', 'busi', 'commun', 'compani', 'comprehens', 'consolid', 'consum', 'corpor']
[[1 1 0 1 2 2 0 0 1 0]
 [0 0 1 1 1 0 0 0 1 0]
 [0 0 0 1 2 1 1 1 1 1]]


### 10.2.3 Calculating Cosine Similarity

In [6]:
# cosine_similarity computes the cosine similarity 
# between vectors
from sklearn.metrics.pairwise import cosine_similarity

# computes the cosine similarity of every pair of 
# document vectors. The result is a square matrix of 
# pairwise scores, laid out like a covariance matrix: 
# entry (i, j) is the score for documents i and j
cosine_sim_matrix = cosine_similarity(X=count_vecs)

# displays the pairwise similarity matrix
print(cosine_sim_matrix)

[[1.         0.44854261 0.40768712]
 [0.44854261 1.         0.32225169]
 [0.40768712 0.32225169 1.        ]]


In [7]:
# TfidfVectorizer converts text to TF-IDF bag-of-words 
# vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# creates a TfidfVectorizer object with the custom 
# tokenizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)

# converts text documents to TF-IDF vectors
tfidf_vecs = tfidf_vectorizer.fit_transform(documents)

# prints first four bag-of-words features (words).
# get_feature_names() was deprecated in scikit-learn 1.0 
# and removed in 1.2; get_feature_names_out() replaces 
# it (requires scikit-learn 1.0+). It returns a NumPy 
# array, so .tolist() keeps the output a plain Python 
# list of strings
print(tfidf_vectorizer.get_feature_names_out()[:4].tolist())

# prints first four TF-IDF weights of each vector. The 
# output is a matrix where each row represents a 
# document vector
print(tfidf_vecs.toarray()[:,:4])

['act', 'agenc', 'america', 'busi']
[[0.22943859 0.22943859 0.         0.13551013]
 [0.         0.         0.23464902 0.13858749]
 [0.         0.         0.         0.13365976]]


In [8]:
# pairwise cosine similarity between the TF-IDF 
# document vectors
tfidf_cosine_sim_matrix = cosine_similarity(X=tfidf_vecs)
# displays the pairwise similarity matrix
print(tfidf_cosine_sim_matrix)

[[1.         0.30593809 0.23499515]
 [0.30593809 1.         0.17890296]
 [0.23499515 0.17890296 1.        ]]


## 10.3 Text Similarity Measure for Short Text: Levenshtein Distance

### 10.3.1 Introducing the Levenshtein Distance

In [9]:
# edit_distance returns the Levenshtein distance between 
# two pieces of text
from nltk import edit_distance

# examples: compares "account" with "accounts", "count" 
# and "access", printing one distance per line
for other_word in ("accounts", "count", "access"):
    print(edit_distance("account", other_word))

1
2
4


### 10.3.2 Creating a Similarity Measure using the Levenshtein Distance

In [10]:
# Levenshtein-based similarity score; the higher the 
# value, the more alike the two texts are
def edit_similarity(t1,t2):
    """Return a case-insensitive similarity score for two strings.

    The score is 1 - (Levenshtein distance / length of 
    the longer string), computed on the lower-cased 
    inputs. Two empty strings score 1.0.
    """
    # compares the strings case-insensitively
    s1 = t1.lower()
    s2 = t2.lower()
    # Levenshtein distance between the lower-cased strings
    n_edits = edit_distance(s1, s2)
    # length of the longer of the two strings
    max_len = max(len(s1), len(s2))
    # two empty strings are identical; returning early 
    # also avoids dividing by zero below
    if max_len == 0:
        return 1.0
    # scales the distance by the longer length and 
    # turns it into a similarity
    return 1.0 - float(n_edits) / max_len

In [11]:
# full company name
orig_name = "Fidelity National Information Services"
# abbreviated form of the same company name
comp_name = "Fidelity National Info Svcs"

# computes the raw Levenshtein distance and the 
# similarity score derived from it
levenshtein_distance = edit_distance(orig_name,comp_name)
levenshtein_similarity = edit_similarity(orig_name,comp_name)

# outputs both measures
print("Levenshtein distance:",levenshtein_distance)
print("Levenshtein similarity score:",levenshtein_similarity)

Levenshtein distance: 11
Levenshtein similarity score: 0.7105263157894737
