# Finding clusters from topic keywords

This experimental notebook identifies topics by grouping them into clusters from word vectors. The words are extracted from the original texts with NLTK.
See the 

. The idea is that each cluster can represent a certain field of interest.

## Imports

In [None]:
import pandas as pd 
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt 
import numpy as np
import scipy
from transformers import AutoTokenizer
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

from sklearn.model_selection import train_test_split

#sklearn imports
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.preprocessing import StandardScaler #used for 'Feature Scaling'
from tensorflow.python.client import device_lib
 
#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Load the data
Here we load our preprocessed data

We've obtained a .csv file with keywords for each document as target. The last column shows the extracted text of the document, which we will use as input data. Text extraction was made with Google Cloud Services.

In [2]:
# Read the preprocessed keyword/text table; the first CSV column becomes the row index.
df_model = pd.read_csv(
    '../csv/NL_document_targets_deg_to_ratio_filtered.csv',
    sep=';',
    index_col=[0],
)

# All columns except two are counted as encoded keyword columns
# (presumably the two others are the document number and 'text' -- verify against the CSV).
encoded_columns = len(df_model.columns) - 2
print(f'Number of encoded columns: {encoded_columns}')

Number of encoded columns: 86


Let's take a look at the document Nr., the extracted keywords, and the extracted text

In [3]:
# Preview the first rows of the loaded table.
df_model.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,78,79,80,81,82,83,84,85,86,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,10202-2018-012766,ploegenarbeid,overuren,eindejaarspremie,tijdskrediet,feestdag,bediendenstelsel,ongeval,commercenachtarbeid,minimumuur,...,,,,,,,,,,sous-commission paritaire de l'industrie des c...
1,10202-2020-013175,werkgelegenheidsmaatregelen,ploegenarbeid,overuren,eindejaarspremie,tijdskrediet,feestdag,bediendenstelsel,ongeval,minimumuur,...,,,,,,,,,,service public fédéral emploi travail et conce...
2,10205-2018-004963,werkgelegenheidsmaatregelen,eenmalige premie,ploegenarbeid,eindejaarspremie,tijdskrediet,feestdag,bediendenstelsel,ongeval,commercenachtarbeid,...,,,,,,,,,,- service public fédéral emploi travail et con...
3,10206-2019-003872,groepsverzekeringen,vergoedingen,actieve werknemer,alle premies,aanvullende pensioenen,,,,,...,,,,,,,,,,begrippenlijst algemeen artikel doel en werkin...
4,10206-2020-000814,ploegenarbeid,werk,eindejaarspremie,tijdskrediet,feestdag,jonge werknemers,studentenlonen,ongeval,commercenachtarbeid,...,,,,,,,,,,paritair subcomité voor het bedrijf der grind-...


## Determine clusters

We one-hot encode the first n = 10 keyword columns with `get_dummies` and merge the duplicate keyword columns that result.
We print out these columns for the 5528 documents.

In [4]:
# Number of leading keyword columns ('1' .. '10') that are one-hot encoded.
N_KEYWORD_COLUMNS = 10
keyword_columns = [str(i) for i in range(1, N_KEYWORD_COLUMNS + 1)]

# One indicator frame per keyword column, placed side by side.
# dtype=int keeps a 0/1 integer encoding regardless of the pandas version
# (pandas >= 2.0 would otherwise return booleans).
dummies = pd.concat(
    [pd.get_dummies(df_model[col], dtype=int) for col in keyword_columns],
    axis=1,
    join='inner',
)

# The same keyword can appear in several positions, so `dummies` contains
# repeated column labels. Summing per label merges them into a single column
# per keyword. Grouping on the transposed frame replaces the deprecated
# `groupby(level=0, axis=1)`; the result already has unique column labels,
# so no separate de-duplication step is required.
y = dummies.T.groupby(level=0).sum().T

# Kept as an alias because the original cell exposed this name as well.
y_unique = y

y.head(-3)


Unnamed: 0_level_0,aanvullende pensioenen,aanwerving,actieve werknemer,administratieve,afscheidspremie,alle premies,alternatief voordeel,anciënniteitspremie,anciënniteitsverlof,arab,...,werk,werkgelegenheidsmaatregelen,werking onderneming,werking paritair comité,werkkledij,werkloosheid,werknemer,wijziging ressort pc,winstdeelneming,ziekte
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5523,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5524,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5525,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Optimal k-value

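With K-means we can estimate a suitable number of clusters using the elbow method: we plot the sum of squared distances (inertia) for a range of k values and look for the point where the curve flattens.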

In [5]:
def ideal_n_clusters(data=None, k_values=range(1, 100), random_state=42):
    """Plot the K-means elbow curve: inertia against the number of clusters.

    Parameters
    ----------
    data : pandas.DataFrame or array-like, optional
        Feature matrix to cluster. Defaults to the notebook-level ``y``.
        If it is a DataFrame with a ``"cluster"`` column, that column is
        excluded so previously assigned labels are not used as a feature.
    k_values : iterable of int, optional
        Cluster counts to evaluate. Defaults to 1..99.
    random_state : int, optional
        Seed passed to ``KMeans`` so the curve is reproducible between runs.

    Returns
    -------
    None
        The curve is shown with matplotlib.
    """
    features = y if data is None else data
    if isinstance(features, pd.DataFrame):
        # Re-running after the cluster assignment must not cluster on the labels.
        features = features.drop(columns=["cluster"], errors="ignore")

    k_values = list(k_values)
    sum_of_squared_distances = []
    for k in k_values:
        # n_init is set explicitly to avoid depending on the version-specific default.
        km = KMeans(n_clusters=k, n_init=10, random_state=random_state)
        km.fit(features)
        sum_of_squared_distances.append(km.inertia_)

    plt.plot(k_values, sum_of_squared_distances, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()

ideal_n_clusters()

Apply kmeans and write the clusters to the dataframe

In [6]:
# Number of K-means clusters used for the final assignment.
N_CLUSTERS = 100

# Cluster on the keyword indicators only. Dropping an existing "cluster"
# column keeps this cell idempotent: on a re-run, the labels written below
# are not fed back into K-means as an extra feature.
cluster_features = y.drop(columns=["cluster"], errors="ignore")

# Fixed seed and explicit n_init so the cluster assignment is reproducible.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
kmeans.fit(cluster_features)

# Find which cluster each document belongs to.
clusters = kmeans.predict(cluster_features)

# Store the cluster label next to the indicators and next to the source rows.
y["cluster"] = clusters
df_model["cluster"] = clusters

# plot_y is a copy of y (indicators + cluster label) with a fresh 0..n-1
# index, so it can be joined with frames built from plain NumPy arrays.
plot_y = y.reset_index(drop=True)

In [7]:
# Preview the indicator matrix, which now also carries the "cluster" column.
y.head()

Unnamed: 0_level_0,aanvullende pensioenen,aanwerving,actieve werknemer,administratieve,afscheidspremie,alle premies,alternatief voordeel,anciënniteitspremie,anciënniteitsverlof,arab,...,werkgelegenheidsmaatregelen,werking onderneming,werking paritair comité,werkkledij,werkloosheid,werknemer,wijziging ressort pc,winstdeelneming,ziekte,cluster
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,67
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,67
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,67
3,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,87
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,67


## Principal Component Analysis

In [9]:
# One PCA reducer per target dimensionality: 1, 2, 3, 4 and 5 principal components.
pca_1d, pca_2d, pca_3d, pca_4d, pca_5d = (
    PCA(n_components=n_dims) for n_dims in range(1, 6)
)

### Plotting the clusters in 2-D
This code will show the clusters in a visual plot 
 
This code has been copied from https://www.kaggle.com/code/minc33/visualizing-high-dimensional-clusters/notebook

In [10]:
def _pc_column_names(n_components):
    """Return the labels "PC1_<n>d" .. "PC<n>_<n>d" for an n-component projection."""
    return [f"PC{i}_{n_components}d" for i in range(1, n_components + 1)]


# PCA reducers ordered by their number of components (1..5).
pca_models = [pca_1d, pca_2d, pca_3d, pca_4d, pca_5d]

# Columns this cell adds to plot_y. Removing them first makes the cell safe to
# re-run: projections from a previous run are neither duplicated nor fed back
# into PCA as input features.
generated_columns = [
    name for model in pca_models for name in _pc_column_names(model.n_components)
] + ["dummy"]
plot_y = plot_y.drop(columns=generated_columns, errors="ignore")

# PCA input: every column except the cluster label.
pca_input = plot_y.drop(["cluster"], axis=1)

# One DataFrame of principal components per dimensionality. Each frame gets
# its own suffix ("_1d" .. "_5d"); the 5-component frame previously reused the
# "_4d" labels, which produced duplicate column names after the concat below.
PCs_1d, PCs_2d, PCs_3d, PCs_4d, PCs_5d = (
    pd.DataFrame(
        model.fit_transform(pca_input),
        columns=_pc_column_names(model.n_components),
    )
    for model in pca_models
)

# Attach all projections to the plotting frame (rows are matched on the index).
plot_y = pd.concat(
    [plot_y, PCs_1d, PCs_2d, PCs_3d, PCs_4d, PCs_5d], axis=1, join='inner'
)

# Constant column that can serve as the second axis of a 1-D visualization.
plot_y["dummy"] = 0

# Sub-DataFrames of plot_y holding the rows assigned to clusters 0..4;
# these are the clusters that get plotted.
cluster0, cluster1, cluster2, cluster3, cluster4 = (
    plot_y[plot_y["cluster"] == label] for label in range(5)
)

#This is needed so we can display plotly plots properly
init_notebook_mode(connected=True)

In [11]:

def _pca_scatter_2d(points, label, rgba):
    """Build a 2-D marker trace from the "PC1_2d" / "PC2_2d" columns of `points`."""
    return go.Scatter(
        x=points["PC1_2d"],
        y=points["PC2_2d"],
        mode="markers",
        name=label,
        marker=dict(color=rgba),
        text=None,
    )


# One trace per plotted cluster (clusters 0, 1 and 2).
trace1 = _pca_scatter_2d(cluster0, "Cluster 0", 'rgba(255, 128, 255, 0.8)')
trace2 = _pca_scatter_2d(cluster1, "Cluster 1", 'rgba(255, 128, 2, 0.8)')
trace3 = _pca_scatter_2d(cluster2, "Cluster 2", 'rgba(0, 255, 200, 0.8)')

data = [trace1, trace2, trace3]

title = "Visualizing Clusters in Two Dimensions Using PCA"

# Axis titles name the two principal components shown.
layout = dict(
    title=title,
    xaxis=dict(title='PC1', ticklen=5, zeroline=False),
    yaxis=dict(title='PC2', ticklen=5, zeroline=False),
)

fig = dict(data=data, layout=layout)

iplot(fig)

### Plotting the clusters in 3-D

In [12]:
#Instructions for building the 3-D plot

def _pca_scatter_3d(points, label, rgba):
    """Build a 3-D marker trace from the "PC1_3d".."PC3_3d" columns of `points`."""
    return go.Scatter3d(
        x=points["PC1_3d"],
        y=points["PC2_3d"],
        z=points["PC3_3d"],
        mode="markers",
        name=label,
        marker=dict(color=rgba),
        text=None,
    )


# One trace per plotted cluster (clusters 0 through 4).
trace1 = _pca_scatter_3d(cluster0, "Cluster 0", 'rgba(255, 128, 255, 0.8)')
trace2 = _pca_scatter_3d(cluster1, "Cluster 1", 'rgba(255, 128, 2, 0.8)')
trace3 = _pca_scatter_3d(cluster2, "Cluster 2", 'rgba(0, 255, 200, 0.8)')
trace4 = _pca_scatter_3d(cluster3, "Cluster 3", 'rgba(128, 10, 200, 0.8)')
trace5 = _pca_scatter_3d(cluster4, "Cluster 4", 'rgba(2, 128, 0, 0.8)')

data = [trace1, trace2, trace3, trace4, trace5]

title = "Visualizing Clusters in Three Dimensions Using PCA"

# Scatter3d traces are drawn in a 3-D scene, so their axes are configured
# under layout.scene; top-level xaxis/yaxis only apply to 2-D cartesian plots.
layout = dict(
    title=title,
    scene=dict(
        xaxis=dict(title='PC1', ticklen=5, zeroline=False),
        yaxis=dict(title='PC2', ticklen=5, zeroline=False),
        zaxis=dict(title='PC3', ticklen=5, zeroline=False),
    ),
)

fig = dict(data=data, layout=layout)

iplot(fig)