In [26]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA, TruncatedSVD, NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt

### tf-idf word-frequency array

In [2]:
documents = ['cats say meow', 'dogs say woof', 'dogs chase cats']

In [3]:
tfidf = TfidfVectorizer()
csr_mat = tfidf.fit_transform(documents)

In [4]:
csr_mat.toarray()

array([[0.51785612, 0.        , 0.        , 0.68091856, 0.51785612,
        0.        ],
       [0.        , 0.        , 0.51785612, 0.        , 0.51785612,
        0.68091856],
       [0.51785612, 0.68091856, 0.51785612, 0.        , 0.        ,
        0.        ]])

In [5]:
words = tfidf.get_feature_names_out()
words

array(['cats', 'chase', 'dogs', 'meow', 'say', 'woof'], dtype=object)

### Clustering Wikipedia

In [18]:
df = pd.read_csv('data/Wikipedia articles/wikipedia-vectors.csv', index_col=0)
df.head()

Unnamed: 0,HTTP 404,Alexa Internet,Internet Explorer,HTTP cookie,Google Search,Tumblr,Hypertext Transfer Protocol,Social search,Firefox,LinkedIn,...,Chad Kroeger,Nate Ruess,The Wanted,Stevie Nicks,Arctic Monkeys,Black Sabbath,Skrillex,Red Hot Chili Peppers,Sepsis,Adam Levine
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.008878,0.0,0.0,0.049502,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00611,0.0
2,0.0,0.029607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005646,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
articles = csr_matrix(df.transpose())
titles = list(df.columns)

In [21]:
svd = TruncatedSVD(n_components=50)
kmeans = KMeans(n_clusters=6)
pipeline = make_pipeline(svd, kmeans)

In [22]:
pipeline.fit(articles)
labels = pipeline.predict(articles)

In [24]:
df = pd.DataFrame({'label': labels, 'article': titles})
df.sort_values('label')

Unnamed: 0,label,article
29,0,Jennifer Aniston
28,0,Anne Hathaway
27,0,Dakota Fanning
26,0,Mila Kunis
25,0,Russell Crowe
24,0,Jessica Biel
23,0,Catherine Zeta-Jones
22,0,Denzel Washington
21,0,Michael Fassbender
20,0,Angelina Jolie


### Non-negative Matrix Factorisation (NMF)

In [33]:
nmf = NMF(n_components=6)
nmf.fit(articles)

In [34]:
nmf_features = nmf.transform(articles)
nmf_features

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 4.38647313e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 5.64274107e-01],
       [3.77627512e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.97000162e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.80163034e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 4.83525730e-01],
       [1.27819769e-02, 1.37156633e-02, 7.72535076e-03, 3.32158257e-02,
        0.00000000e+00, 3.33148104e-01],
       [0.00000000e+00, 0.00000000e+00, 2.05699585e-02, 0.00000000e+00,
        6.01702614e-03, 3.57576405e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 4.88963536e-01],
       [1.52487959e-02, 1.42027268e-02, 3.74781273e-03, 2.35514569e-02,
        2.61387823e-02, 

In [36]:
df = pd.DataFrame(nmf_features, index=titles)
df.head()

Unnamed: 0,0,1,2,3,4,5
HTTP 404,0.0,0.0,0.0,0.0,0.0,0.438647
Alexa Internet,0.0,0.0,0.0,0.0,0.0,0.564274
Internet Explorer,0.003776,0.0,0.0,0.0,0.0,0.397
HTTP cookie,0.0,0.0,0.0,0.0,0.0,0.380163
Google Search,0.0,0.0,0.0,0.0,0.0,0.483526


In [37]:
df.loc['Anne Hathaway']

0    0.003814
1    0.000000
2    0.000000
3    0.571847
4    0.000000
5    0.000000
Name: Anne Hathaway, dtype: float64

In [38]:
df.loc['Denzel Washington']

0    0.000000
1    0.005575
2    0.000000
3    0.419550
4    0.000000
5    0.000000
Name: Denzel Washington, dtype: float64

In [39]:
nmf.components_

array([[1.15103724e-02, 1.22401011e-03, 0.00000000e+00, ...,
        0.00000000e+00, 4.28410483e-04, 0.00000000e+00],
       [0.00000000e+00, 9.60843942e-06, 5.69507954e-03, ...,
        2.82862278e-03, 2.99007514e-04, 0.00000000e+00],
       [0.00000000e+00, 8.34978031e-06, 0.00000000e+00, ...,
        0.00000000e+00, 1.43972767e-04, 0.00000000e+00],
       [4.17542601e-03, 0.00000000e+00, 3.07658046e-03, ...,
        1.75367828e-03, 6.76509259e-03, 0.00000000e+00],
       [0.00000000e+00, 5.71088566e-04, 4.94131443e-03, ...,
        1.92554271e-04, 1.35792696e-03, 0.00000000e+00],
       [1.38892753e-04, 0.00000000e+00, 8.78468485e-03, ...,
        2.41071591e-03, 1.68926395e-03, 0.00000000e+00]])

In [61]:
with open('data/Wikipedia articles/wikipedia-vocabulary-utf8.txt', 'r') as file1:
    words = file1.readlines()

In [62]:
words

['aaron\n',
 'abandon\n',
 'abandoned\n',
 'abandoning\n',
 'abandonment\n',
 'abbas\n',
 'abbey\n',
 'abbreviated\n',
 'abbreviation\n',
 'abc\n',
 'abdomen\n',
 'abdominal\n',
 'abdul\n',
 'abel\n',
 'abilities\n',
 'ability\n',
 'able\n',
 'abnormal\n',
 'abnormalities\n',
 'abnormally\n',
 'aboard\n',
 'abolish\n',
 'abolished\n',
 'abolishing\n',
 'abolition\n',
 'aboriginal\n',
 'abortion\n',
 'abraham\n',
 'abroad\n',
 'abrupt\n',
 'abruptly\n',
 'absence\n',
 'absent\n',
 'absolute\n',
 'absolutely\n',
 'absorb\n',
 'absorbed\n',
 'absorbing\n',
 'absorbs\n',
 'absorption\n',
 'abstract\n',
 'abstraction\n',
 'absurd\n',
 'abu\n',
 'abundance\n',
 'abundant\n',
 'abuse\n',
 'abused\n',
 'abuses\n',
 'abusive\n',
 'academia\n',
 'academic\n',
 'academics\n',
 'academies\n',
 'academy\n',
 'accelerate\n',
 'accelerated\n',
 'accelerating\n',
 'acceleration\n',
 'accent\n',
 'accents\n',
 'accept\n',
 'acceptable\n',
 'acceptance\n',
 'accepted\n',
 'accepting\n',
 'accepts\n',
 '