# Extract text from articles for clustering

In [1]:
##### CLUSTERING ON ALL ARTICLES ####
import os
from bs4 import BeautifulSoup
import pandas as pd

# get all html files from folder
html_files = [file for file in os.listdir("project_htmls") if file.endswith(".html")]

# extract text from each article
articles = []
count = 0
for article in html_files:
    # Open read-only ("rb"): the pages are only parsed, never written back, so
    # the previous "r+b" mode needlessly required write permission on each file.
    with open(os.path.join("project_htmls", article), "rb") as fr:
        soup = BeautifulSoup(fr.read(), "html.parser")

    # get text of each article
    body = soup.find("div", {"class": "c-article-body u-clearfix"})
    # body will be None if article is in a different html format than usual;
    # such pages are skipped and are not counted
    if body is None:
        continue

    text = [p.text.strip() for p in body.find_all("p")]  # one string per <p> element
    text = " ".join(text)  # turn text into 1 string instead of list of paragraphs
    articles.append(text)

    # progress message every 1000 extracted articles; it is printed before the
    # increment, so the first message reads "Extracted 0 articles."
    if count % 1000 == 0:
        print(f"Extracted {count} articles.")
    count += 1

print(f"Extracted {count} total articles.")

# store in df: one row per successfully extracted article
data = pd.DataFrame({"articles": articles})

Extracted 0 articles.
Extracted 1000 articles.
Extracted 2000 articles.
Extracted 3000 articles.
Extracted 3152 total articles.


In [2]:
# Show the first rows of the frame followed by its (rows, columns) shape.
print(data.head(), data.shape, sep="\n")

                                            articles
0  Coronaviruses take their name from their crown...
1  India has the world's second largest COVID-19 ...
2  This week, an antibiotic that targets hard-to-...
3  Voters in the Golden State passed Proposition ...
4  Ten years ago, dead fish began washing ashore ...
(3152, 1)


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the article texts into a sparse TF-IDF document-term matrix.
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.7,
    use_idf=True,
    norm="l2",
)
X = vectorizer.fit_transform(data["articles"])
X

<3152x58894 sparse matrix of type '<class 'numpy.float64'>'
	with 1065615 stored elements in Compressed Sparse Row format>

In [4]:
# Dimensions of the TF-IDF matrix: one row per article, one column per vocabulary term.
X.shape

(3152, 58894)

## K-Means Clustering 

In [5]:
# initializing a model with parameters
from sklearn.cluster import KMeans

# try k-clusters
k = 5
# n_init is pinned explicitly: its library default is not the same in every
# scikit-learn release, so leaving it implicit can change the clustering.
# random_state fixes the centroid initialisation for repeatable runs.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
kmeans

KMeans(n_clusters=5, random_state=0)

In [6]:
# Fit k-means on the TF-IDF matrix; %time reports how long the fit takes.
%time kmeans.fit(X)

CPU times: user 50.8 s, sys: 676 ms, total: 51.5 s
Wall time: 6.71 s


KMeans(n_clusters=5, random_state=0)

## Adding Clusters to Dataframe

In [7]:
# Label every article with the cluster k-means assigns to its TF-IDF row,
# then display the frame with the new column.
cluster_labels = kmeans.predict(X)
data["cluster"] = cluster_labels
data

Unnamed: 0,articles,cluster
0,Coronaviruses take their name from their crown...,1
1,India has the world's second largest COVID-19 ...,1
2,"This week, an antibiotic that targets hard-to-...",4
3,Voters in the Golden State passed Proposition ...,0
4,"Ten years ago, dead fish began washing ashore ...",0
...,...,...
3147,The US National Institutes of Health says it i...,0
3148,A resident of an assisted-living facility in I...,2
3149,The risk of dying from COVID-19 increases sign...,1
3150,A freestyle skier completes a practice run ahe...,0


In [8]:
# Number of articles assigned to each cluster, largest cluster first.
data["cluster"].value_counts()

0    1981
1     406
3     314
4     298
2     153
Name: cluster, dtype: int64

# Cluster Interpretations

In [9]:
import nltk
%time data["words"] = data.articles.apply(lambda x: nltk.word_tokenize(x))
data["tagged_words"] = data.words.apply(lambda x: nltk.pos_tag(x))

CPU times: user 17.1 s, sys: 200 ms, total: 17.3 s
Wall time: 17.2 s


In [10]:
from collections import Counter

def get_counter(dataframe, stopwords=()):
    """Count in how many documents of ``dataframe`` each word occurs.

    Words are lower-cased and de-duplicated per document before counting, so a
    word contributes at most 1 per document regardless of how often it repeats.

    Parameters
    ----------
    dataframe : object exposing ``tagged_words``
        ``dataframe.tagged_words`` must be iterable; each item is one document,
        given as a sequence of pairs whose first element is the word string.
        The second element (the tag) is not used.
    stopwords : iterable of str, optional
        Words to leave out. Compared against the lower-cased word, so entries
        should be lower-case. Defaults to no exclusions.

    Returns
    -------
    collections.Counter
        Mapping ``word -> number of documents containing it``.
    """
    # Materialise once as a set: constant-time membership tests instead of a
    # list scan per token, and one-shot iterables are not exhausted mid-run.
    excluded = frozenset(stopwords)
    counter = Counter()

    for tagged_doc in dataframe.tagged_words:
        # A set keeps each word once per document.
        doc_words = {pair[0].lower() for pair in tagged_doc}
        counter.update(doc_words - excluded)

    return counter

from nltk.corpus import stopwords
import string

# Standard English stop words shipped with NLTK.
global_stopwords = stopwords.words("english")
# Extra tokens to ignore: every ASCII punctuation character, typographic
# quotes/dashes, tokeniser fragments such as "n't", and a few filler words.
local_stopwords = list(string.punctuation) + [
    '‘', '’', '“', '”', '``', '…', '...', "''", "'m", "'re", "'s", "'ve", "n't", "—",
    'amp', 'http', 'https', 'rt', "also",
]

## Cluster 0

In [11]:
# Cluster 0: its size, then the 10 words that appear in the most of its articles.
cluster_0_rows = data[data.cluster == 0]
print(len(cluster_0_rows))
counter = get_counter(cluster_0_rows, global_stopwords + local_stopwords)
counter.most_common(10)

1981


[('says', 1753),
 ('university', 1633),
 ('researchers', 1597),
 ('research', 1511),
 ('could', 1470),
 ('one', 1462),
 ('scientists', 1387),
 ('new', 1369),
 ('would', 1327),
 ('us', 1288)]

### The Research cluster

## Cluster 1

In [12]:
# Cluster 1: its size, then the 10 words that appear in the most of its articles.
cluster_1_rows = data[data.cluster == 1]
print(len(cluster_1_rows))
counter = get_counter(cluster_1_rows, global_stopwords + local_stopwords)
counter.most_common(10)

406


[('people', 389),
 ('says', 377),
 ('researchers', 370),
 ('virus', 355),
 ('university', 354),
 ('one', 350),
 ('covid-19', 346),
 ('could', 345),
 ('new', 331),
 ('health', 323)]

### Covid-19 cluster

## Cluster 2

In [13]:
# Cluster 2: its size, then the 10 words that appear in the most of its articles.
cluster_2_rows = data[data.cluster == 2]
print(len(cluster_2_rows))
counter = get_counter(cluster_2_rows, global_stopwords + local_stopwords)
counter.most_common(10)

153


[('vaccine', 152),
 ('vaccines', 151),
 ('people', 147),
 ('says', 141),
 ('could', 140),
 ('covid-19', 137),
 ('university', 133),
 ('researchers', 132),
 ('one', 130),
 ('two', 126)]

### Smaller covid-19 cluster (vaccine-focused)

## Cluster 3

In [14]:
# Cluster 3: its size, then the 10 words that appear in the most of its articles.
cluster_3_rows = data[data.cluster == 3]
print(len(cluster_3_rows))
counter = get_counter(cluster_3_rows, global_stopwords + local_stopwords)
counter.most_common(10)

314


[('first', 269),
 ('space', 261),
 ('says', 257),
 ('one', 249),
 ('earth', 242),
 ('years', 234),
 ('university', 234),
 ('could', 229),
 ('nasa', 222),
 ('scientists', 217)]

### Space cluster

## Cluster 4

In [15]:
# Cluster 4: its size, then the 10 words that appear in the most of its articles.
cluster_4_rows = data[data.cluster == 4]
print(len(cluster_4_rows))
counter = get_counter(cluster_4_rows, global_stopwords + local_stopwords)
counter.most_common(10)

298


[('episode', 298),
 ('subscribe', 298),
 ('nature', 298),
 ('rss', 297),
 ('favourite', 297),
 ('podcast', 297),
 ('miss', 297),
 ('never', 297),
 ('mp3', 296),
 ('app', 296)]

### Subscription cluster (podcast subscription boilerplate)

# Topic Modeling

In [16]:
# Choosing number of topics
# NOTE(review): the notebook does not show how 27 was selected — TODO record
# the selection criterion so the choice can be revisited.
num_of_topics = 27

In [17]:
# initializing a model with parameters
from sklearn.decomposition import LatentDirichletAllocation as LDA


# random_state is fixed so repeated runs produce the same topics.
lda = LDA(
    random_state=0,
    n_components=num_of_topics,
)
lda

LatentDirichletAllocation(n_components=27, random_state=0)

In [18]:
# fitting the model on the input data; %time reports how long the fit takes
# NOTE(review): X is the TF-IDF matrix produced by TfidfVectorizer above; LDA is
# usually fit on raw term counts (e.g. CountVectorizer) — confirm this is intended.
%time lda.fit(X)

CPU times: user 3min 2s, sys: 16min 45s, total: 19min 47s
Wall time: 2min 30s


LatentDirichletAllocation(n_components=27, random_state=0)

## Getting the topic modeling outcome

In [20]:
# One row of term weights per topic: (number of topics, vocabulary size).
lda.components_.shape

(27, 58894)

In [21]:
def show_topics(model, feature_names, num_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` a header line ``*** Topic <i>:`` is
    printed, followed by the ``num_top_words`` largest weights formatted as
    ``<weight> * <term>`` (two decimals) joined by `` + ``, then a blank line.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterable of per-topic score arrays that support ``argsort()``.
    feature_names : sequence
        Term for each column index of the score arrays.
    num_top_words : int
        How many terms to print per topic.
    """
    for topic_idx, topic_scores in enumerate(model.components_):
        # column indices ordered from largest to smallest weight, truncated
        ranked = topic_scores.argsort()[::-1][:num_top_words]
        terms = [f"{topic_scores[col]:.2f} * {feature_names[col]}" for col in ranked]

        print(f"*** Topic {topic_idx}:")
        print(" + ".join(terms))
        print()

In [22]:
# Print the 10 highest-weighted vocabulary terms of every LDA topic.
show_topics(lda, vectorizer.get_feature_names_out(), 10)

*** Topic 0:
1.04 * hunga + 0.91 * navajo + 0.87 * megaconstellations + 0.85 * tonga + 0.80 * hetz + 0.79 * starlinks + 0.79 * waltman + 0.79 * starlink + 0.79 * stimulation + 0.77 * rinderpest

*** Topic 1:
3.72 * amyloid + 2.66 * logothetis + 1.90 * mps + 1.57 * comet + 1.45 * eli + 1.37 * jordan + 1.34 * plaques + 1.27 * borisov + 1.19 * oumuamua + 1.15 * aducanumab

*** Topic 2:
5.90 * perseverance + 4.76 * rover + 4.23 * mars + 3.88 * martian + 3.65 * zhurong + 3.18 * jezero + 3.02 * bats + 2.74 * insight + 2.26 * pigs + 2.18 * cnsa

*** Topic 3:
3.14 * erc + 2.75 * hydroxychloroquine + 2.60 * cao + 2.59 * ivermectin + 2.56 * crackdown + 2.50 * gun + 2.35 * fbi + 2.13 * spinal + 2.00 * cord + 1.92 * sudan

*** Topic 4:
1.08 * jacobs + 0.82 * vannoni + 0.82 * neonicotinoids + 0.81 * furin + 0.80 * receptor + 0.78 * bulgaria + 0.75 * rhinolophus + 0.70 * brucella + 0.68 * ace2 + 0.66 * novo

*** Topic 5:
10.41 * stars + 9.45 * methane + 7.64 * quantum + 7.39 * esa + 6.95 * planets +

## Visualization of the Topic Modeling

In [23]:
import pyLDAvis
import pyLDAvis.sklearn
import warnings
# Silence FutureWarning and DeprecationWarning messages, then switch pyLDAvis
# to notebook display mode.
for noisy_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=noisy_category)

pyLDAvis.enable_notebook()

In [24]:
# Build the pyLDAvis visualisation from the fitted LDA model, the matrix it was
# fit on, and the vectorizer that produced that matrix.
pyLDAvis.sklearn.prepare(lda, X, vectorizer)

## Limitations/Challenges
- There was not enough data for the sentiment classifier, which resulted in low accuracy.
- It was hard to find an optimal number of topics for the project without ending up with irrelevant topics.