# Measuring the Similarity of Texts using TF-IDF

This notebook is modeled on the *Programming Historian* lesson [Understanding and Using Common Similarity Measures for Text Analysis](https://programminghistorian.org/en/lessons/common-similarity-measures) by John Ladd. Please visit this webpage for more explanation.



## I. Setup

### Ia. Import necessary libraries

In [1]:
import pathlib
from pathlib import Path
import glob 
import pandas as pd, numpy as np
from scipy.spatial.distance import pdist, squareform
import nltk
from nltk import RegexpTokenizer  
tokenizer = RegexpTokenizer(r'\w+')
from nltk.corpus import stopwords
stop = sorted(stopwords.words('english'))



## Ib. Read in text files and create a dataframe

In [None]:
textdir = Path("~/shared/RR-workshop-data/state-of-the-union-dataset/txt").expanduser() 
pathlist = sorted(textdir.glob('*.txt')) 

In [2]:
tokenizer = RegexpTokenizer(r'\w+')
#n=50

txtList=[]
pathlist = sorted(textdir.glob('*.txt'))      # .glob only stores the pathlist temporarily (for some reason), so you need to call it again!2
for path in pathlist:
    fn=path.stem                       #stem returns the filename minus the ".txt" (file extension). 
    pres,year=fn.split("_")            # fn = "1794_Washington" becomes year = "1794" and pres = "Washington"
    with open(path,'r') as f:  
        text1 = f.read()                #opens each file and reads it in as "sotu"
    tokens=tokenizer.tokenize(text1)    # tokenizes "sotu"
    numtoks = len(tokens)             # counts the number of tokens in "sotu"
    ltokens_ns = [tok.lower() for tok in tokens if tok not in stop]
    txtList.append([pres, year, numtoks, tokens, ltokens_ns, text1])   #add this info for "sotu" to a running list for all sotu addresses
       

In [3]:
colnames=['pres','year','numtoks','tokens', 'ltoks_ns', 'fulltext']
textdf=pd.DataFrame(txtList, columns=colnames)  #places our completed list of SOTU info in a dataframe
textdf.head(10)                                #prints out the first 10 rows of this dataframe (the default value for head() is 5 rows)

Unnamed: 0,pres,year,numtoks,tokens,ltoks_ns,fulltext
0,Adams,1797,2060,"[Gentlemen, of, the, Senate, and, Gentlemen, o...","[gentlemen, senate, gentlemen, house, represen...",Gentlemen of the Senate and Gentlemen of the H...
1,Adams,1798,2218,"[Gentlemen, of, the, Senate, and, Gentlemen, o...","[gentlemen, senate, gentlemen, house, represen...",Gentlemen of the Senate and Gentlemen of the H...
2,Adams,1799,1505,"[Gentlemen, of, the, Senate, and, Gentlemen, o...","[gentlemen, senate, gentlemen, house, represen...",Gentlemen of the Senate and Gentlemen of the H...
3,Adams,1800,1374,"[Gentlemen, of, the, Senate, and, Gentlemen, o...","[gentlemen, senate, gentlemen, house, represen...",Gentlemen of the Senate and Gentlemen of the H...
4,Adams,1825,9091,"[Fellow, Citizens, of, the, Senate, and, of, t...","[fellow, citizens, senate, house, representati...",Fellow Citizens of the Senate and of the House...
5,Adams,1826,7852,"[Fellow, Citizens, of, the, Senate, and, of, t...","[fellow, citizens, senate, house, representati...",Fellow Citizens of the Senate and of the House...
6,Adams,1827,7064,"[Fellow, Citizens, of, the, Senate, and, of, t...","[fellow, citizens, senate, house, representati...",Fellow Citizens of the Senate and of the House...
7,Adams,1828,7398,"[Fellow, Citizens, of, the, Senate, and, of, t...","[fellow, citizens, senate, house, representati...",Fellow Citizens of the Senate and of the House...
8,Arthur,1881,3903,"[To, the, Senate, and, House, of, Representati...","[to, senate, house, representatives, united, s...",To the Senate and House of Representatives of ...
9,Arthur,1882,3157,"[To, the, Senate, and, House, of, Representati...","[to, senate, house, representatives, united, s...",To the Senate and House of Representatives of ...


In [11]:
textdf.sort_values(by = "year", ascending = False).head(10)

Unnamed: 0,pres,year,numtoks,tokens,ltoks_ns,fulltext
14,Biden,2023,9624,"[Mr, Speaker, Thank, you, You, can, smile, it,...","[mr, speaker, thank, you, smile, ok, thank, th...","Mr. Speaker. Thank you. You can smile, it's OK..."
13,Biden,2022,8137,"[Thank, you, all, very, very, much, Thank, you...","[thank, much, thank, please, thank, much, mada...","Thank you all very, very much. Thank you, plea..."
12,Biden,2021,8351,"[Thank, you, Thank, you, Thank, you, Good, to,...","[thank, thank, thank, good, back, as, mitch, c...",Thank you. Thank you. Thank you. Good to be ba...
213,Trump,2020,6474,"[Thank, you, very, much, Thank, you, Thank, yo...","[thank, much, thank, thank, much, madam, speak...",Thank you very much. Thank you. Thank you very...
212,Trump,2019,5777,"[Madam, Speaker, Mr, Vice, President, Members,...","[madam, speaker, mr, vice, president, members,...","Madam Speaker, Mr. Vice President, Members of ..."
211,Trump,2018,5204,"[Mr, Speaker, Mr, Vice, President, Members, of...","[mr, speaker, mr, vice, president, members, co...","Mr. Speaker, Mr. Vice President, Members of Co..."
210,Trump,2017,5095,"[Thank, you, very, much, Mr, Speaker, Mr, Vice...","[thank, much, mr, speaker, mr, vice, president...","Thank you very much. Mr. Speaker, Mr. Vice Pre..."
161,Obama,2016,5628,"[Mr, Speaker, Mr, Vice, President, Members, of...","[mr, speaker, mr, vice, president, members, co...","Mr. Speaker, Mr. Vice President, Members of Co..."
160,Obama,2015,6961,"[Mr, Speaker, Mr, Vice, President, Members, of...","[mr, speaker, mr, vice, president, members, co...","Mr. Speaker, Mr. Vice President, Members of Co..."
159,Obama,2014,7017,"[Mr, Speaker, Mr, Vice, President, Members, of...","[mr, speaker, mr, vice, president, members, co...","Mr. Speaker, Mr. Vice President, Members of Co..."


## II. Create a TF-IDF matrix

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer   ###

# Interface lemma tokenizer from nltk with sklearn
class LemmaTokenizer:                                               ###
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`']      ###
    def __init__(self):                                             ###
        self.wnl = WordNetLemmatizer()                              ###
    def __call__(self, doc):                                        ###
        #return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if t not in self.ignore_tokens]
        return [self.wnl.lemmatize(t) for t in tokenizer.tokenize(doc) if t not in self.ignore_tokens]    ###
    
lemma_tokenizer = LemmaTokenizer()                                 ###
eng_stops = set(stopwords.words('english'))                        ###
lemma_stop = lemma_tokenizer(' '.join(eng_stops))   
tfidf_vectorizer3 = TfidfVectorizer(input = "filename", stop_words = lemma_stop, tokenizer = lemma_tokenizer)
tfidf_matrix = tfidf_vectorizer3.fit_transform(pathlist)




[[1.         0.43991929 0.403336   ... 0.19580329 0.18803809 0.20870311]
 [0.43991929 1.         0.36556119 ... 0.18971342 0.18305033 0.18292755]
 [0.403336   0.36556119 1.         ... 0.20212655 0.19737592 0.20642286]
 ...
 [0.19580329 0.18971342 0.20212655 ... 1.         0.40592595 0.31038614]
 [0.18803809 0.18305033 0.19737592 ... 0.40592595 1.         0.38413319]
 [0.20870311 0.18292755 0.20642286 ... 0.31038614 0.38413319 1.        ]]


In [None]:
#cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
#print(cosine_sim)

## III. Measuring similarity



In [6]:
tfidf_array = tfidf_matrix.toarray()

In [7]:
textnamelist = [path.stem for path in pathlist]
euclidean_distances = pd.DataFrame(squareform(pdist(tfidf_array)), index=textnamelist, columns=textnamelist)
print(euclidean_distances)

             Adams_1797  Adams_1798  Adams_1799  Adams_1800  Adams_1825  \
Adams_1797     0.000000    1.058377    1.092396    1.139157    1.140426   
Adams_1798     1.058377    0.000000    1.126445    1.171412    1.168071   
Adams_1799     1.092396    1.126445    0.000000    1.136175    1.197776   
Adams_1800     1.139157    1.171412    1.136175    0.000000    1.175189   
Adams_1825     1.140426    1.168071    1.197776    1.175189    0.000000   
...                 ...         ...         ...         ...         ...   
Wilson_1916    1.269018    1.277528    1.279998    1.255958    1.196267   
Wilson_1917    1.299209    1.288659    1.280381    1.286824    1.228714   
Wilson_1918    1.268225    1.273017    1.263229    1.265470    1.179491   
Wilson_1919    1.274333    1.278241    1.266984    1.273079    1.173514   
Wilson_1920    1.258012    1.278337    1.259823    1.260161    1.135219   

             Adams_1826  Adams_1827  Adams_1828  Arthur_1881  Arthur_1882  \
Adams_1797     1.14249

In [27]:
tgt = "Lincoln_1862"      #try plugging in the names of different SOTU addresses, to view possible choices, enter the following in a new code cell: `textnamelist`
top5_euclidean = euclidean_distances.nsmallest(10, tgt)[tgt][1:]
print(top5_euclidean)

Buchanan_1858    0.982063
Polk_1848        0.992604
Grant_1875       0.992755
Grant_1873       0.995843
Lincoln_1863     0.995844
Johnson_1867     0.998771
Buchanan_1860    0.999609
Buchanan_1857    1.001766
Johnson_1865     1.002945
Name: Lincoln_1862, dtype: float64


In [9]:
cosine_distances = pd.DataFrame(squareform(pdist(tfidf_array, metric='cosine')), index=textnamelist, columns=textnamelist)

top5_cosine = cosine_distances.nsmallest(6, tgt)[tgt][1:]
print(top5_cosine)

Reagan_1982     0.414717
Clinton_1993    0.426629
Carter_1978     0.426772
Reagan_1984     0.433686
Reagan_1985     0.447053
Name: Reagan_1983, dtype: float64


https://towardsdatascience.com/visualising-similarity-clusters-with-interactive-graphs-20a4b2a18534

In [41]:
import plotly.graph_objects as go
import networkx as nx
G = nx.to_networkx_graph(cosine_sim)

In [42]:

def create_node_trace(G):
    # collect node information from G to plot
    node_x = []
    node_y = []
    node_text = []
    node_color = []

    for i, node in enumerate(G.nodes(data=True)):
        # get node x,y position and store
        x, y = node[1]['pos']
        node_x.append(x)
        node_y.append(y)

        node_text.append(node[1]['text'])
        node_color.append(node[1]['color'])

    # create node trace (i.e., scatter plot)
    # make it invisible by default
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        marker=dict(
            showscale=False,
            color=node_color,
            size=16,
            line_width=0.5,
        ),
        text=node_text,
        visible=False
    )

    return node_trace



In [43]:
def create_edge_trace(G):
    # collect edges information from G to plot
    edge_weight = []
    edge_text = []
    edge_pos = []
    edge_color = []
    
    for edge in G.edges(data=True):
        
        # edge is line connecting two points
        x0, y0 = G.nodes[edge[0]]['pos']
        x1, y1 = G.nodes[edge[1]]['pos']
        edge_pos.append([[x0, x1, None], [y0, y1, None]])
        
        # edge line color when drawn
        edge_color.append("black")

    # there is a trace for each edge
    edge_traces = []
    for i in range(len(edge_pos)):
        
        # edge line width
        line_width = 1

        # is scatter because it is line connecting two points
        trace = go.Scatter(
            x=edge_pos[i][0], y=edge_pos[i][1],
            line=dict(width=line_width, color=edge_color[i]),
            mode='lines',
            visible=False
        )
        edge_traces.append(trace)

    return edge_traces

In [44]:
def filter_similarity_matrix_at_step(square_matrix, step_value):
    # copy matrix
    aux = square_matrix.copy()
    
    # set as NaN all values equal to or below threshold value
    aux[aux <= step_value] = np.nan
    
    # return filtered matrix
    return aux

In [45]:
def get_interactive_slider_similarity_graph(square_matrix, slider_values, node_text=None, yaxisrange=None, xaxisrange=None):
    
    # Create figure with plotly
    fig = go.Figure()

    # key: slider value
    # value: list of traces to display for that slider value
    slider_dict = {}
    
    # total number of traces
    total_n_traces = 0
    
    # node positions on plot
    #node_pos = None

    # for each possible value in the slider, create and store traces (i.e., plots)
    for i, step_value in enumerate(slider_values):

        # update similarity matrix for the current step
        aux = filter_similarity_matrix_at_step(square_matrix, step_value)

        # create nx graph from sim matrix
        G = nx.to_networkx_graph(aux)
        
        # remove edges for 0 weight (NaN)
        G.remove_edges_from([(a, b) for a, b, attrs in G.edges(data=True) if np.isnan(attrs["weight"])])

        # assign node positions if None
        #node_pos = nx.nx_pydot.graphviz_layout(G)   #caused DeprecationWarning and error!
        node_pos = nx.nx_agraph.graphviz_layout(G)

        # populate nodes with meta information
        for node in G.nodes(data=True):
            
            # node position
            node[1]['pos'] = node_pos[node[0]]

            # node color
            node[1]['color'] = "orange"

            # node text on hover if any is specified else is empty
            if node_text is not None:
                node[1]['text'] = node_text[node[0]]
            else:
                node[1]['text'] = ""

        # create edge taces (each edge is a trace, thus this is a list)
        edge_traces = create_edge_trace(G)
        
        # create node trace (a single trace for all nodes, thus it is not a list)
        node_trace = create_node_trace(G) 

        # store edge+node traces as single list for the current step value
        slider_dict[step_value] = edge_traces + [node_trace]
        
        # keep count of the total number of traces
        total_n_traces += len(slider_dict[step_value])

        # make sure that the first slider value is active for visualization
        if i == 0:
            for trace in slider_dict[step_value]:
                # make visible
                trace.visible = True

                
    # Create steps objects (one step per step_value)
    steps = []
    for step_value in slider_values:
        
        # count traces before adding new traces
        n_traces_before_adding_new = len(fig.data)
        
        # add new traces
        fig.add_traces(slider_dict[step_value])

        step = dict(
            # update figure when this step is active
            method="update",
            # make all traces invisible
            args=[{"visible": [False] * total_n_traces}],
            # label on the slider
            label=str(round(step_value, 3)),
        )

        # only toggle this step's traces visible, others remain invisible
        n_traces_for_step_value = len(slider_dict[step_value])
        for i in range(n_traces_before_adding_new, n_traces_before_adding_new + n_traces_for_step_value):
            step["args"][0]["visible"][i] = True
        
        # store step object in list of many steps
        steps.append(step)

    # create slider with list of step objects
    slider = [dict(
        active=0,
        steps=steps
    )]

    # add slider to figure and create layout
    fig.update_layout(
        sliders=slider,
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(range=xaxisrange, showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(range=yaxisrange, showgrid=False, zeroline=False, showticklabels=False),
        width=700, height=700,
    )

    return fig

In [46]:
textnamelist = [pres + "_" + year for pres, yr in zip(textdf['pres'], textdf['year'])]

In [47]:

# define slider steps (i.e., threshold values)
slider_steps = np.arange(0.4, 0.85, 0.05)
    
# get the slider figure
fig = get_interactive_slider_similarity_graph(
    cosine_sim,
    slider_steps,
    node_text = textnamelist
)

# plot it
fig.show()

ImportError: requires pygraphviz http://pygraphviz.github.io/


https://pygraphviz.github.io/documentation/stable/install.html
```
python -m pip install --use-pep517 `
>>               --config-setting="--build-option=build_ext" `
>>               --config-setting="--build-option=-IC:\Program Files\Graphviz\include" `
>>               --config-setting="--build-option=-LC:\Program Files\Graphviz\lib" `
>>               pygraphviz
```