In [35]:
# Clustering requires a pre trained language model
#!pip install  https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz

In [2]:
import requests
from bs4 import BeautifulSoup
import spacy
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
from sklearn.decomposition import PCA
import plotly.express as px
import os

# Download text from a website

In [3]:
r = requests.get('https://www.bbc.co.uk/news')

In [4]:
r.status_code

200

In [5]:
r.encoding

'utf-8'

In [6]:
soup = BeautifulSoup(r.content, 'html.parser')

In [7]:
text = soup.get_text()

In [8]:
def get_text(url):
    r = requests.get(url)
    if r.status_code != 200:
        return('Status code: ' + str(r.status_code))
    text = BeautifulSoup(r.content, 'html.parser').get_text()
    return(text)

In [9]:
# Test some common sites
urls = ['https://www.bbc.co.uk/news',
       'https://en.wikipedia.org/wiki/2019%E2%80%9320_coronavirus_pandemic',
       'https://www.amazon.co.uk/',
       'https://towardsdatascience.com/',
       'https://stackoverflow.com/',
       'https://www.reddit.com/',
       'https://www.hotukdeals.com/',
       'https://www.gov.uk/',
       'https://www.fast.ai/',
       'https://www.crummy.com/software/BeautifulSoup/bs4/doc/',
       'https://spacy.io/'
       ]

texts = [get_text(url) for url in urls]

In [10]:
for ix, text in enumerate(texts):
    print(urls[ix], len(text))

https://www.bbc.co.uk/news 19792
https://en.wikipedia.org/wiki/2019%E2%80%9320_coronavirus_pandemic 269550
https://www.amazon.co.uk/ 16
https://towardsdatascience.com/ 2510
https://stackoverflow.com/ 10342
https://www.reddit.com/ 16
https://www.hotukdeals.com/ 13181
https://www.gov.uk/ 6769
https://www.fast.ai/ 96422
https://www.crummy.com/software/BeautifulSoup/bs4/doc/ 105211
https://spacy.io/ 4938


# Word Count using Spacy

In [11]:
nlp = spacy.load('en_core_web_md')

In [12]:
def word_count(text):
    doc = nlp(text)
    words = [token.text for token in doc if token.is_punct != True]
    counts = pd.Series(words).value_counts()
    return(counts)

In [13]:
counts = [word_count(t) for t in texts]

In [14]:
for ix, c in enumerate(counts):
    print(urls[ix])
    print(c.head(10), '\n')

https://www.bbc.co.uk/news
the        110
in          69
to          62
for         59
hours       58
of          50
are         37
minutes     35
is          29
a           25
dtype: int64 

https://en.wikipedia.org/wiki/2019%E2%80%9320_coronavirus_pandemic
\n\n         2004
2020         1456
the          1200
^            1003
of            947
and           741
in            717
Retrieved     677
to            654
\n            535
dtype: int64 

https://www.amazon.co.uk/
503       1
Status    1
code      1
dtype: int64 

https://towardsdatascience.com/
and     11
Data    11
        11
in       9
of       8
the      7
to       6
a        6
for      6
I        4
dtype: int64 

https://stackoverflow.com/
\n          128
and          35
your         28
Stack        28
Overflow     23
\n\n         21
for          19
the          18
with         17
to           17
dtype: int64 

https://www.reddit.com/
401       1
Status    1
code      1
dtype: int64 

https://www.hotukdeals.com/
£      

# Cluster words using vector representation after passing through a language model embedding

In [15]:
bbc = nlp(texts[0])

In [16]:
words = [w for w in bbc if  w.is_stop != True and w.is_punct != True and w.text not in ['\n', '\n\n' ]]
words_str = np.array([w.text for w in words])
words_vec = np.array([w.vector for w in words])
words_vec.shape, words_str.shape

((1799, 300), (1799,))

In [17]:
zero_wv = words_vec.sum(axis=1)==0.0

# unknown words
words_str_u = words_str[zero_wv]
# words for cluster
words_str = words_str[zero_wv==False]
words_vec = words_vec[zero_wv==False]
len(words_str), len(words_str_u)

(1310, 489)

In [18]:
n_clusters = 5
# Leave default settings for now
kmeans = KMeans(n_clusters=n_clusters).fit(words_vec)
centroids = kmeans.cluster_centers_
prediction = kmeans.predict(words_vec)
centroids.shape, prediction.shape


((5, 300), (1310,))

In [19]:
# Calculate distance to centoid
dists = []
for ix, wv in enumerate(words_vec):
    p = prediction[ix]
    centroid = centroids[p]
    distance = euclidean(wv, centroid)
    dists.append(distance)
    

In [20]:
results = pd.DataFrame({
    'word': words_str,
    'cluster': prediction,
    'distance_from_cluster_centre': dists
})
results.head()

Unnamed: 0,word,cluster,distance_from_cluster_centre
0,Home,4,4.782667
1,BBC,0,4.846796
2,News,4,5.184119
3,BBC,0,4.846796
4,BBC,0,4.846796


In [21]:
results = results.groupby(['word','cluster', 'distance_from_cluster_centre']).size().reset_index(name='word_count').sort_values(['word_count'], ascending=False)

In [22]:
unknowns = pd.DataFrame({
    'word': words_str_u,
    'cluster': -1,
    'distance_from_cluster_centre': 0.0
})
unknowns = unknowns.c.sort_values(['word_count'], ascending=False)

In [23]:
output = pd.concat([results, unknowns], axis=0)
output.shape

(588, 4)

# Visualise the clusters

Use PCA to reduce the vector down to 2 dimensions from 300 so that they can be visualised

In [24]:
pca = PCA(n_components=2).fit_transform(words_vec)
pca.shape

(1310, 2)

In [29]:
pca_df = pd.DataFrame({
    'x': pca[:,0],
    'y': pca[:,1],
    'cluster': prediction.astype(str),
    'word': words_str
}).drop_duplicates().sort_values('cluster')
pca_df.shape

(359, 4)

In [34]:
fig = px.scatter(pca_df, x="x", y="y", color="cluster",
                 hover_data=['word'],
                 color_discrete_sequence=px.colors.qualitative.Dark24)
fig.show()

# Refactor

In [None]:
def get_text(url):
    print('Downloading text')
    r = requests.get(url)
    if r.status_code != 200:
        return('Status code: ' + str(r.status_code))
    text = BeautifulSoup(r.content, 'html.parser').get_text()
    return(text)

def nlp_process(text):
    print('Processing text')
    nlp = spacy.load('en_core_web_md', 
        disable=['tagger', 'parser', 'ner']) # We only need tokeniser and the emdeddings
    doc = nlp(text)
    words = [w for w in doc if  w.is_stop != True and w.is_punct != True and w.text not in ['\n', '\n\n' ]]
    words_str = np.array([w.text for w in words])
    words_vec = np.array([w.vector for w in words]) 
    return(words_str, words_vec)

def clustering(X, n):
    print('Clustering words')
    # Handle words not in the vocab that have all zero word vector
    zero_wv = X.sum(axis=1)==0.0
    valid_vectors = X[zero_wv==False]
    kmeans = KMeans(n_clusters=n).fit(valid_vectors)
    centroids = kmeans.cluster_centers_
    clusters = kmeans.predict(valid_vectors)
    distances = []
    for ix, v in enumerate(valid_vectors):
        c = clusters[ix]
        centroid = centroids[c]
        distance = euclidean(v, centroid)
        distances.append(distance)
    # Fill in unkown words as their own cluster -1
    all_clusters = np.full((len(zero_wv)),-1)
    all_clusters[zero_wv==False] = clusters
    all_distances = np.full((len(zero_wv)), 0.0)
    all_distances[zero_wv==False] = distances
    return(all_clusters, all_distances)

def construct_df(url, n_clusters):
    text = get_text(url)
    words_str, words_vec = nlp_process(text)
    clusters, distances = clustering(words_vec, n_clusters)
    pca = PCA(n_components=2).fit_transform(words_vec)
    print('Finalising output')
    df = pd.DataFrame({
        'word': words_str,
        'cluster': clusters,
        'distance_to_cluster_centroid': distances,
        'pca_x': pca[:,0],
        'pca_y': pca[:,1]
        })
    grp = df.groupby(['word','cluster', 'distance_to_cluster_centroid', 'pca_x', 'pca_y'])
    agg = grp.size().reset_index(name='word_count').sort_values('word_count', ascending=False)
    # re-order columns
    agg = agg[['word', 'word_count', 'cluster', 'distance_to_cluster_centroid', 'pca_x', 'pca_y']]
    return(agg)

def cluster_plot(df, path):
    df = df.sort_values('cluster')
    df['cluster'] = df['cluster'].astype(str)
    fig = px.scatter(df, x="pca_x", y="pca_y", color="cluster",
                 hover_data=['word', 'word_count'],
                 color_discrete_sequence=px.colors.qualitative.Dark24)
    fig.write_html(path + 'plot.html')


In [None]:
url = 'https://www.bbc.co.uk/news'
n = 10
path = 'output/'

if path[-1] != '/':
    path = path + '/'
if os.path.exists(path) != True:
    os.mkdir(path)

df = construct_df(url, n)
df.to_csv(path+'data.csv')
cluster_plot(df, path)