In [1]:
import scraper.guardian_dataset as corpus

In [2]:
import torch
from matplotlib import pyplot as plt
import numpy

In [3]:
# Fetch a single Guardian article purely as a source of assorted words that
# might fall into natural groups (the article's topic itself is irrelevant).
ARTICLE_URL = 'https://www.theguardian.com/world/2020/sep/13/us-west-coast-choked-by-smoke-as-firefighters-tackle-deadly-wildfires'
article = corpus.getArticle(ARTICLE_URL)

In [5]:
# Swap every word of the article for its embedding from `corpus.w2v`;
# words that have no embedding (KeyError on lookup) are silently skipped.
vec_article = []
for token in article:
    try:
        embedding = corpus.w2v[token]
    except KeyError:
        continue
    vec_article.append(torch.from_numpy(embedding))

# From here on `article` is the list of word vectors, not the raw words.
# NOTE(review): this rebinding makes the cell non-idempotent -- re-running it
# would look the vectors themselves up in `corpus.w2v`.
article = vec_article
del vec_article

  vec_article.append(torch.from_numpy(word))


In [6]:
# For every word vector, order all word vectors of the article by cosine
# similarity to it, ascending (least similar first, most similar last).
# Each entry of `ranked` is {"word": <vector>, "rankings": <sorted vectors>}.
ranked = []
for anchor in article:
    by_similarity = list(article)
    by_similarity.sort(key=lambda candidate: torch.cosine_similarity(candidate, anchor, dim=0))
    ranked.append({"word": anchor, "rankings": by_similarity})

In [7]:
#Let's prove the algorithm on a single word first: use the first entry of `ranked`
test = ranked[0]

In [8]:
# Cosine similarity of every ranked vector to the test word. Despite the
# variable name these are similarities, not distances; they keep the order of
# test['rankings'].
anchor = test['word']
similarities = [torch.cosine_similarity(vec, anchor, -1).item() for vec in test['rankings']]

# Get rid of the word itself (similarity 1), because it messes up the scale.
# Compare with a tolerance rather than `!= 1`: floating-point rounding can
# leave a self-similarity a hair above or below 1.0, which an exact test
# would let through.
distances = [sim for sim in similarities if abs(sim - 1) > 1e-6]

In [9]:
#taking a quick look to prove that clear groupings exist
%matplotlib qt
#Kinda hard to see inline, need a larger output
plt.figure()
# plt.hlines(1, 1, max(distances)/2)
plt.eventplot(distances, orientation="horizontal")
plt.axis('off')
plt.show()

Cool, so it's clear that there are groupings. Now how do we actually separate them? It is very easy to do visually; however, it is very difficult to write a general solution that will always work on a computer, as we have to assume imperfection.

In order to break up the vectors, we can turn the above graph into a two-dimensional graph and use some hack calculus to figure out where the groups break off.

# Making frequency graph 2-D

In [10]:
# First difference of the sorted similarities. The leading 0 keeps
# `delta_distance` the same length as `distances`.
delta_distance = [0] + [later - earlier for earlier, later in zip(distances, distances[1:])]

plt.figure()
plt.plot(delta_distance)
plt.ylabel('Change in distance')
plt.xlabel('Distance from word')

Text(0.5, 0, 'Distance from word')

Looking at the two above graphs right next to each other, one can see that there are obvious peaks in the 2-d graph between groups! 
Let's try to use these peaks to make groups

In [11]:
# Cut the ranking into groups wherever the change-in-distance curve is falling:
# keep the last six second differences in a sliding window and start a new
# group when both the window mean and the newest value are negative.
groups = []
start_idx = 0
running_avg = [0] * 6
for item in range(len(delta_distance) - 2):
    newest = delta_distance[item + 2] - delta_distance[item + 1]
    running_avg = running_avg[1:] + [newest]

    window_mean = sum(running_avg) / len(running_avg)
    if window_mean < 0 and newest < 0:
        # NOTE(review): the slice stops before `item` and the next group starts
        # at `item + 1`, so the element at `item` lands in neither group; the
        # tail after the last cut is also never appended. Confirm this is intended.
        groups.append(test['rankings'][start_idx:item])
        start_idx = item + 1

In [12]:
# Number of word vectors in each group
lengths = list(map(len, groups))

In [13]:
# Mean group size. Falls back to 0.0 when no groups were found instead of
# raising ZeroDivisionError.
avg_len = sum(lengths) / len(lengths) if lengths else 0.0
avg_len

4.6

# That's a pretty good average size for groups. Keep in mind that some words are going to be extraneous, and others are going to have more than the average number of pairings

# Let's see how good these pairings are

In [14]:
# Accumulate the cosine similarity of every ordered pair of vectors that share
# a group, skipping pairs whose two vectors are element-wise identical.
sum_dist_total = torch.zeros(1)
counter_total = 0
check = torch.zeros(300) == torch.ones(300)  # NOTE(review): not read anywhere in this cell
same_group_pairs = (
    (first, second)
    for members in groups
    for first in members
    for second in members
)
for first, second in same_group_pairs:
    if torch.eq(first, second).all():
        continue
    sum_dist_total += torch.cosine_similarity(first, second, -1)
    counter_total += 1
                

In [15]:
# Mean cosine similarity over all counted same-group pairs
sum_dist_total/counter_total

tensor([0.2256])

# So the pairings work out pretty well


Now, let's look at the code to execute this on the entire dataset

In [16]:
#Flashing back to above, remember that "ranked" is a list of dictionaries with the keys 'word' and 'rankings'
#Note: for most uses of this algo, you will NOT need to do all of this, only some of the grouping stuff. Therefore it will not take as long to run

def split_into_groups(rankings, similarities, window=6):
    """Cut `rankings` into groups where the similarity curve breaks off.

    Uses the same heuristic as the single-word demo: take the first difference
    of `similarities`, keep the last `window` second differences in a sliding
    window, and start a new group whenever both the window mean and the newest
    second difference are negative.

    Args:
        rankings: vectors sorted by ascending similarity to the anchor word.
        similarities: similarity of each ranked vector to the anchor, in the
            same order as `rankings` (may be shorter if the tail was dropped).
        window: number of second differences kept in the sliding window.

    Returns:
        A list of slices of `rankings`, one per detected group.
    """
    deltas = [0] + [later - earlier for earlier, later in zip(similarities, similarities[1:])]
    found = []
    start_idx = 0
    recent = [0] * window
    for item in range(len(deltas) - 2):
        newest = deltas[item + 2] - deltas[item + 1]
        recent = recent[1:] + [newest]
        if sum(recent) / len(recent) < 0 and newest < 0:
            # NOTE(review): kept identical to the demo cell -- the element at
            # `item` is excluded from both neighbouring groups.
            found.append(rankings[start_idx:item])
            start_idx = item + 1
    return found


output = []
for word in ranked:
    # Similarities must be computed per word; reusing the `distances` list left
    # over from the single-word demo would give every word the same groups.
    similarities = [
        torch.cosine_similarity(word['word'], other, -1).item()
        for other in word['rankings']
    ]
    # Drop the word itself (similarity ~1). Those entries sit at the end of the
    # ascending ranking, so the remaining indices still line up with `rankings`.
    similarities = [sim for sim in similarities if abs(sim - 1) > 1e-6]

    # dict.update() returns None, so attach the groups first and then append
    # the (now enriched) dictionary itself.
    word['groups'] = split_into_groups(word['rankings'], similarities)
    output.append(word)

# Using Grouping for text summarization:

In [17]:
# Rank every word vector of the article against one fixed reference direction
# (all components -1) instead of against a particular word.
# The reference is built as a float tensor matching the word vectors, so the
# cosine similarity never mixes integer and float dtypes; 300 is only the
# fallback size for an empty article.
min_vec = -torch.ones_like(article[0]) if article else -torch.ones(300)

# Compute each similarity once and reuse it for both the sort and the deltas.
reference_sims = [torch.cosine_similarity(vec, min_vec, dim=0).item() for vec in article]
order = sorted(range(len(article)), key=reference_sims.__getitem__)
article_ranked = [article[i] for i in order]
distances = [reference_sims[i] for i in order]

# First difference of the sorted similarities (leading 0 keeps the length).
delta_distance = [0] + [later - earlier for earlier, later in zip(distances, distances[1:])]

# Single pass over the finished delta list. Running this inside the delta loop
# would regroup every prefix and fill `group_list` with duplicates.
groups = []
start_idx = 0
running_avg = [0] * 6
for item in range(len(delta_distance) - 2):
    newest = delta_distance[item + 2] - delta_distance[item + 1]
    running_avg = running_avg[1:] + [newest]

    if sum(running_avg) / len(running_avg) < 0 and newest < 0:
        # Slice the ranking built in this cell, not the single-word demo's
        # test['rankings'].
        groups.append(article_ranked[start_idx:item])
        start_idx = item + 1

group_list = list(groups)

We now have a list of groups of related words.
We can now just take the words from each legit (len>1) group 
that have the highest similarity to all the other words in order to come up with a quick
summary of what happened, which can then be fed into some type of analysis or generator algorithm