# 4. Analysis of SOU similarities

In [1]:
import json 
import numpy as np
import matplotlib.pyplot as plt
import string
import re
from math import log, pow
from sklearn.cluster import KMeans

In [2]:
# Read the speeches file and parse its JSON content into `speeches`.
with open('speeches.json', 'r') as speech_file:
    speeches = json.load(speech_file)

#### (a) Compute the tf-idf vectors for each SOU address. You should lower case all of the text, and remove punctuation. You will have to make choices about the size of the term vocabulary to use—for example throwing out the 20 most common words, and words that appear fewer than, say, 50 times.

In [3]:
def clean_and_split(s):
    """Normalize a piece of text and split it into word tokens.

    The text is lower-cased, hyphens are turned into spaces (so hyphenated
    compounds become separate words), and every character listed in
    ``string.punctuation`` is removed.

    Parameters
    ----------
    s : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance. Text that contains no words
        yields an empty list.
    """
    s = s.lower().replace('-', ' ')
    s = s.translate(str.maketrans('', '', string.punctuation))
    # split() without an argument breaks on any run of whitespace, so words
    # separated by "\r\n", a bare "\n", tabs or repeated spaces are all
    # handled, and no empty-string tokens are produced.
    return s.split()

# Tokenize every speech. The token lists have different lengths, and NumPy
# (1.24+) refuses to build an array from such ragged input unless the result
# is an object array, so a 1-D object array is filled explicitly: one token
# list per speech.
tokenized_speeches = [clean_and_split(s['text']) for s in speeches]
text = np.empty(len(tokenized_speeches), dtype=object)
for doc_idx, doc_tokens in enumerate(tokenized_speeches):
    text[doc_idx] = doc_tokens

In [4]:
# Tally word occurrences per document and document frequencies per word:
#   dcount[i] maps each word of document i to how often it occurs there;
#   vcount maps each word to the number of documents that contain it.
vcount = {}
dcount = []

for doc_tokens in text:
    term_counts = {}
    for token in doc_tokens:
        term_counts[token] = term_counts.get(token, 0) + 1
    dcount.append(term_counts)
    # Each distinct word of this document adds one to its document count.
    for token in term_counts:
        vcount[token] = vcount.get(token, 0) + 1
        

In [5]:
# Build the vocabulary: keep words that occur in at least 50 documents and
# drop the 20 words with the highest document counts.
most_common_words = {
    word
    for word, _ in sorted(vcount.items(), key=lambda item: item[1], reverse=True)[:20]
}
# Filtering against a set keeps the vocabulary in vcount's order, avoids a
# linear list.remove() scan per excluded word, and cannot raise ValueError
# when a top-20 word was already excluded by the 50-document threshold.
vocab = [w for w in vcount if vcount[w] >= 50 and w not in most_common_words]

In [6]:
# Total number of documents, used in the inverse-document-frequency term.
D = len(dcount)


def getScore(v, d):
    """Return the tf-idf weight of word ``v`` for one document.

    ``d`` is that document's word-count dict. The weight is the number of
    times ``v`` occurs in the document (0 when absent) multiplied by
    ``log(D / vcount[v])``.
    """
    occurrences = d.get(v, 0)
    return occurrences * log(D / vcount[v])


# One row per document, one column per vocabulary word.
score_rows = []
for doc_counts in dcount:
    score_rows.append([getScore(term, doc_counts) for term in vocab])
scores = np.array(score_rows)

In [7]:
# Display the tf-idf matrix and name the variable that holds it.
print('The TF-IDF vectors are:')
print(scores)
print('They are found in the variable "scores".')

The TF-IDF vectors are:
[[ 1.03608355  0.36713734  0.36818607 ...  0.          0.
   0.        ]
 [ 0.34536118  0.22028241  0.55227911 ...  0.          0.
   0.        ]
 [ 0.34536118  0.51399228  0.36818607 ...  0.          0.
   0.        ]
 ...
 [ 0.34536118  0.22028241  0.         ...  1.15785512 31.5412979
   7.15775476]
 [ 0.69072237  0.36713734  0.36818607 ...  3.47356537 42.89616514
   4.29465286]
 [ 0.69072237  0.36713734  0.36818607 ...  3.47356537 35.32625365
   4.29465286]]
They are found in the variable "scores".


#### (b)  In terms of this similarity measure, find the 
• 50 most similar pairs of SOUs given by different Presidents. <br>
• 50 most similar pairs of SOUs given by the same President. <br>
• 25 most similar pairs of Presidents, averaging the cosine similarity over all pairs of their SOUs. 

When you read the above speeches, do they indeed seem similar to you? Comment on what you find, and describe what is needed to construct a better similarity measure between documents.

In [8]:
def calcSim(u,v):
    """Return the cosine similarity of two vectors.

    Parameters
    ----------
    u, v : numpy.ndarray
        1-D vectors of equal length.

    Returns
    -------
    float
        ``u.dot(v) / (||u|| * ||v||)``. When either vector has zero norm
        the similarity is defined here as 0.0.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0:
        # A zero vector has no direction; dividing would give 0/0 = nan
        # (plus a RuntimeWarning), which would then poison any sorting or
        # averaging of similarities.
        return 0.0
    return u.dot(v) / norm_product
# Pairwise cosine similarities between all rows of `scores`:
# sim[i][j] compares row i with row j.
sim = []
for row_vec in scores:
    sim.append([calcSim(row_vec, other_vec) for other_vec in scores])

In [9]:
# List every unordered pair of speeches (i < j) with its similarity, then
# rank the pairs from most to least similar.
n_speeches = len(speeches)
pairs = [
    (i, j, sim[i][j])
    for i in range(n_speeches)
    for j in range(i + 1, n_speeches)
]
sorted_pairs = sorted(pairs, key=lambda pair: pair[2], reverse=True)

In [10]:
# Split the ranked pairs by whether both speeches come from the same
# president, keeping the 50 highest-ranked pairs of each kind.
sorted_pairs_diff_pres = [
    pair for pair in sorted_pairs
    if speeches[pair[0]]['president'] != speeches[pair[1]]['president']
][:50]
sorted_pairs_same_pres = [
    pair for pair in sorted_pairs
    if speeches[pair[0]]['president'] == speeches[pair[1]]['president']
][:50]

In [11]:
# Map each president to the indices of their speeches in `speeches`.
pres_speeches = {}
for (i, s) in enumerate(speeches):
    pres_speeches.setdefault(s['president'], []).append(i)


def getPresSim(p1,p2):
    """Return the mean of ``sim`` over every (speech of p1, speech of p2) pair."""
    first_idxs = pres_speeches[p1]
    second_idxs = pres_speeches[p2]
    total = sum(sim[a][b] for a in first_idxs for b in second_idxs)
    return total / (len(first_idxs) * len(second_idxs))


# Score every unordered pair of distinct presidents and keep the 25 with the
# highest average similarity.
presidents = list(pres_speeches.keys())
pres_sim = [
    (p1, p2, getPresSim(p1, p2))
    for pos, p1 in enumerate(presidents)
    for p2 in presidents[pos + 1:]
]
pres_sim = sorted(pres_sim, key=lambda entry: entry[2], reverse=True)[:25]

In [12]:
# Report the three rankings. Each speech-pair tuple is (index_a, index_b,
# similarity); each president-pair tuple is (name_a, name_b, similarity).
print("The 50 most similar pairs of SOUs given by different Presidents are:")
for p in sorted_pairs_diff_pres:
    first, second = speeches[p[0]], speeches[p[1]]
    print(first['president'] + " in " + first['year']
          + " and "
          + second['president'] + " in " + second['year']
          + " score:", p[2])

print(" ")
print("The 50 most similar pairs of SOUs given by the same Presidents are:")
for p in sorted_pairs_same_pres:
    first, second = speeches[p[0]], speeches[p[1]]
    # Both speeches share one president, so the name is printed only once.
    # Single spaces around "and in" and after "score:" keep this listing
    # formatted like the other two.
    print(first['president'] + " in " + first['year']
          + " and in " + second['year'] + " score:", p[2])
print(" ")
print("The 25 most similar pairs of Presidents, averaging the cosine similarity over all pairs of their SOUs are:")
for p in pres_sim:
    print(p[0] + " and " + p[1] + " score:", p[2])



The 50 most similar pairs of SOUs given by different Presidents are:
Dwight D. Eisenhower in 1961 and Jimmy Carter in 1981 score: 0.6949597630564998
Grover Cleveland in 1885 and Benjamin Harrison in 1889 score: 0.6765190387882957
John Tyler in 1844 and James K. Polk in 1846 score: 0.6737679650974868
Dwight D. Eisenhower in 1956 and Jimmy Carter in 1981 score: 0.6642398303049182
William J. Clinton in 1994 and Barack Obama in 2010 score: 0.6617426735099406
Rutherford B. Hayes in 1877 and Grover Cleveland in 1885 score: 0.6535790336141586
Dwight D. Eisenhower in 1955 and Jimmy Carter in 1981 score: 0.6530775835598227
John Tyler in 1844 and James K. Polk in 1845 score: 0.6504169118490623
Andrew Jackson in 1836 and Martin Van Buren in 1839 score: 0.6493025320093996
Theodore Roosevelt in 1907 and William Howard Taft in 1912 score: 0.6446578474869404
William J. Clinton in 1998 and George W. Bush in 2004 score: 0.6429521710912769
George Bush in 1992 and William J. Clinton in 1994 score: 0.6421

When I read them, they do indeed seem similar. They are not super similar, but they definitely have similar styles of rhetoric and even portray the same ideas in some cases. However, the sentence structure is not too similar. This is what I would expect given the limitations of the similarity scores, as the scores do not reflect the ordering of words at all.

#### (c) Using this vector representation, cluster the speeches using k-means.
The options here limit the number of k-means iterations to 50, set the number of clusters to 10, and initialize the clusters randomly.<br>
Experiment with different numbers of clusters, and display the clusters obtained (in some manner that you choose). Comment on the clustering results, and whether or not the results are interpretable.

In [13]:
# Cluster the tf-idf vectors with k-means and list the speeches per cluster.
c = 10
# init='random' matches the random initialization described for this
# experiment, and a fixed random_state makes the printed clusters
# reproducible across Restart & Run All.
model = KMeans(n_clusters=c, max_iter=50, init='random', random_state=0)
sou_clust = model.fit(scores)
labels = model.predict(scores)

# clen[i] ends up holding the number of speeches assigned to cluster i.
clen = []
for i in range(c):
    print("Cluster", i)
    member_idxs = np.where(labels == i)[0]
    # The loop variable gets its own name so it does not overwrite the
    # cluster count `c`.
    for speech_idx in member_idxs:
        print(speeches[speech_idx]['president'], "in", speeches[speech_idx]['year'])
    clen.append(len(member_idxs))
        
        


Cluster 0
William J. Clinton in 1993
William J. Clinton in 1994
William J. Clinton in 1995
William J. Clinton in 1996
William J. Clinton in 1997
William J. Clinton in 1998
William J. Clinton in 1999
William J. Clinton in 2000
Barack Obama in 2009
Barack Obama in 2010
Barack Obama in 2011
Barack Obama in 2012
Barack Obama in 2013
Cluster 1
Franklin D. Roosevelt in 1945
Harry S Truman in 1947
Harry S Truman in 1948
Harry S Truman in 1949
Harry S Truman in 1950
Harry S Truman in 1951
Harry S Truman in 1952
Harry S Truman in 1953
Dwight D. Eisenhower in 1953
Dwight D. Eisenhower in 1954
Dwight D. Eisenhower in 1955
Dwight D. Eisenhower in 1956
Dwight D. Eisenhower in 1957
Dwight D. Eisenhower in 1958
Dwight D. Eisenhower in 1959
Dwight D. Eisenhower in 1960
Dwight D. Eisenhower in 1961
John F. Kennedy in 1961
John F. Kennedy in 1962
John F. Kennedy in 1963
Lyndon B. Johnson in 1964
Lyndon B. Johnson in 1965
Lyndon B. Johnson in 1966
Lyndon B. Johnson in 1967
Lyndon B. Johnson in 1968
Lyndo

It's clear that the k-means algorithm tends to cluster speeches with high mutual similarity scores together. After experimenting with different numbers of clusters, I found that the speeches with the higher similarity scores that we determined earlier stay clustered together when more clusters are added, while the less similar speeches tend to split off. This is what I would expect to be the case, but it is cool to see how this occurs and how we can use k-means to group the speeches in this sense. I found that 10 clusters was a good middle ground, where the speeches in each cluster have sufficiently high similarity scores with one another.