# Measuring the Similarity of Texts using TF-IDF

This notebook is modeled on the *Programming Historian* lesson [Understanding and Using Common Similarity Measures for Text Analysis](https://programminghistorian.org/en/lessons/common-similarity-measures) by John Ladd. Please visit this webpage for more explanation.



## I. Setup

### Ia. Import necessary libraries

In [None]:
import pathlib
from pathlib import Path
import glob 
import pandas as pd, numpy as np
from scipy.spatial.distance import pdist, squareform
import nltk
from nltk import RegexpTokenizer  
# Shared tokenizer: every maximal run of word characters (\w+) becomes one token,
# so punctuation and whitespace never appear as tokens.
tokenizer = RegexpTokenizer(r'\w+')
from nltk.corpus import stopwords
# NLTK's English stopword list, sorted alphabetically and kept as a list.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available locally - confirm.
stop = sorted(stopwords.words('english'))



### Ib. Read in text files and create a dataframe

In [None]:
# Directory holding the State of the Union .txt files ("~" is expanded to the user's home).
textdir = Path("~/shared/RR-workshop-data/state-of-the-union-dataset/txt").expanduser()

# Materialise the glob results as a list and sort it so the file order is reproducible.
pathlist = list(textdir.glob('*.txt'))
pathlist.sort()

In [None]:
tokenizer = RegexpTokenizer(r'\w+')     # tokens are runs of word characters; punctuation is dropped
stop_set = set(stop)                    # set membership is O(1); the `stop` list itself is left untouched

txtList = []
# sorted() already returns a reusable list; it is rebuilt here only so this cell
# can be run on its own.
pathlist = sorted(textdir.glob('*.txt'))
for path in pathlist:
    fn = path.stem                       # filename minus the ".txt" extension
    # fn = "Lincoln_1862" becomes pres = "Lincoln" and year = "1862" (year stays a string).
    # rsplit on the LAST underscore so a name that itself contains "_" still unpacks.
    pres, year = fn.rsplit("_", 1)
    with open(path, 'r') as f:
        text1 = f.read()                 # full text of one address
    tokens = tokenizer.tokenize(text1)   # word tokens, original casing
    numtoks = len(tokens)                # token count before stopword removal
    # Lowercase BEFORE the stopword test: the stop list is lowercase, so checking
    # the original-case token would let "The", "And", ... through.
    ltokens_ns = [low for low in (tok.lower() for tok in tokens) if low not in stop_set]
    txtList.append([pres, year, numtoks, tokens, ltokens_ns, text1])   # one row per address
       

In [None]:
# Column labels, listed in the same order as the fields appended to each txtList row.
colnames = [
    'pres',
    'year',
    'numtoks',
    'tokens',
    'ltoks_ns',
    'fulltext',
]

# One row per address.
textdf = pd.DataFrame(data=txtList, columns=colnames)

# Preview the first 10 rows (head() shows 5 when no count is given).
textdf.head(n=10)

In [None]:
# Show the 10 rows with the largest "year" values.
# "year" holds strings, so the ordering is lexicographic.
(
    textdf
    .sort_values("year", ascending=False)
    .head(10)
)

## II. Create a TF-IDF matrix

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer   ###

# Interface lemma tokenizer from nltk with sklearn
class LemmaTokenizer:
    """Callable that tokenizes a document and lemmatizes each token.

    Instances can be passed as the ``tokenizer`` argument of a scikit-learn
    vectorizer: calling the instance with a string returns a list of lemmas.

    Parameters
    ----------
    tokenizer : object with a ``tokenize(str) -> list[str]`` method, optional
        Tokenizer used to split the document. Defaults to
        ``RegexpTokenizer(r'\\w+')``, so the class no longer depends on a
        notebook-global ``tokenizer`` variable that other cells rebind.
    """

    # Tokens that are discarded after tokenization.
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`']

    def __init__(self, tokenizer=None):
        self.wnl = WordNetLemmatizer()
        # Own the tokenizer instead of reading a global at call time.
        self.tokenizer = tokenizer if tokenizer is not None else RegexpTokenizer(r'\w+')

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc``, skipping ``ignore_tokens``."""
        return [
            self.wnl.lemmatize(t)
            for t in self.tokenizer.tokenize(doc)
            if t not in self.ignore_tokens
        ]
    
lemma_tokenizer = LemmaTokenizer()
eng_stops = set(stopwords.words('english'))
# Run the stop words through the same tokenizer/lemmatizer as the documents so
# the stop list matches the tokens the vectorizer will actually see.
# sorted() makes the resulting list deterministic across runs.
lemma_stop = lemma_tokenizer(' '.join(sorted(eng_stops)))
# token_pattern=None: a custom tokenizer is supplied, so the default pattern is
# unused; passing None avoids scikit-learn's "token_pattern will not be used" warning.
tfidf_vectorizer3 = TfidfVectorizer(
    input="filename",
    stop_words=lemma_stop,
    tokenizer=lemma_tokenizer,
    token_pattern=None,
)
# pathlist supplies the files to read; row i of tfidf_matrix corresponds to pathlist[i].
tfidf_matrix = tfidf_vectorizer3.fit_transform(pathlist)


In [None]:
#cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
#print(cosine_sim)

## III. Measuring similarity



In [None]:
# Convert the TF-IDF matrix with .toarray(); the distance computations below operate on this array.
tfidf_array = tfidf_matrix.toarray()

In [None]:
# Row/column labels: the file stems, in the same order the files were vectorized.
textnamelist = [p.stem for p in pathlist]

# pdist yields the condensed pairwise distances (Euclidean is its default metric);
# squareform expands them into a full symmetric matrix.
pairwise_euclid = squareform(pdist(tfidf_array))
euclidean_distances = pd.DataFrame(
    pairwise_euclid,
    index=textnamelist,
    columns=textnamelist,
)
print(euclidean_distances)

In [None]:
tgt = "Lincoln_1862"      # try other addresses; run `textnamelist` in a new code cell to see the valid names
# Drop the target by LABEL (its distance to itself is 0) and then take the 5
# closest texts. The previous positional form asked for 10 rows and sliced off
# the first, which returned 9 results and could remove the wrong row when
# another text tied with the target at distance 0.
top5_euclidean = euclidean_distances[tgt].drop(tgt).nsmallest(5)
print(top5_euclidean)

In [None]:
# Pairwise cosine distances between all texts, labelled by file stem on both axes.
cosine_distances = pd.DataFrame(
    squareform(pdist(tfidf_array, metric='cosine')),
    index=textnamelist,
    columns=textnamelist,
)

# Remove the target by label rather than assuming it is the first row of the
# sorted result, then keep the 5 nearest texts.
top5_cosine = cosine_distances[tgt].drop(tgt).nsmallest(5)
print(top5_cosine)