# The data

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import nltk
import sklearn.cluster as clustering
from sklearn.metrics.cluster import normalized_mutual_info_score as NMI

# Render floats with a thousands separator and two decimals in displayed output.
pd.set_option('display.float_format', '{:,.2f}'.format)

# One-time NLTK resource downloads; uncomment if they are missing on this machine.
# nltk.download('punkt')
# nltk.download('words')

In [3]:
# Load the '#'-separated corpus; column names are supplied explicitly and 'id' becomes the index.
data_path = 'abstractdata5.csv'
original_data = pd.read_csv(data_path, sep='#', names=['id', 'class', 'title', 'abstract'], index_col='id')

# Work on a copy so the frame read from disk stays untouched.
data = original_data.copy()
# Each document is its title followed by its abstract, joined with a single space.
data['text'] = original_data[['title', 'abstract']].agg(' '.join, axis=1)

# Tokenize every document and replace each token with its English Snowball stem.
snow = nltk.stem.SnowballStemmer('english')
data['cleaned'] = data['text'].map(
    lambda sentence: ' '.join(snow.stem(w) for w in nltk.word_tokenize(sentence))
)

# TF-IDF features over the stemmed text, with scikit-learn's English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(data['cleaned'])

# Baseline: 5-cluster KMeans on the sparse TF-IDF matrix, scored against the
# 'class' column with normalized mutual information (geometric averaging).
y_pred = clustering.KMeans(n_clusters=5, random_state=0).fit_predict(tfidf)
NMI(y_pred, data['class'], average_method='geometric')

0.7075916942964255

In [4]:
# Affinity propagation on the densified TF-IDF matrix. The estimator is seeded like
# every other estimator in this notebook so that re-running the cell is reproducible.
# NOTE(review): the recorded score is 0.0 -- presumably the fit did not converge or
# produced a single cluster with the default settings; verify before relying on it.
y_pred = clustering.AffinityPropagation(random_state=0).fit_predict(tfidf.toarray())
NMI(y_pred, data['class'], average_method='geometric')



0.0

In [5]:
# Rebuild the TF-IDF features with unigrams and bigrams, then repeat the
# 5-cluster KMeans run and score it against the 'class' column.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
tfidf = vectorizer.fit_transform(data['cleaned'])

kmeans = clustering.KMeans(n_clusters=5, random_state=0)
y_pred = kmeans.fit_predict(tfidf)
NMI(y_pred, data['class'], average_method='geometric')

0.7174609625369943

In [6]:
from sklearn import decomposition

# Project the densified TF-IDF matrix onto 100 principal components.
# random_state is fixed, as in the other PCA calls in this notebook, so the
# projection (and the clustering score computed from it) is reproducible.
reduction = decomposition.PCA(n_components=100, random_state=0).fit_transform(tfidf.toarray())

In [7]:
# Cluster the current PCA projection into 5 groups and score against 'class'.
kmeans = clustering.KMeans(n_clusters=5, random_state=0)
y_pred = kmeans.fit_predict(reduction)
NMI(y_pred, data['class'], average_method='geometric')

0.7344767423266741

In [8]:
# Same pipeline with 150 principal components: densify, project, cluster, score.
pca = decomposition.PCA(n_components=150, random_state=0)
reduction = pca.fit_transform(tfidf.toarray())

kmeans = clustering.KMeans(n_clusters=5, random_state=0)
y_pred = kmeans.fit_predict(reduction)
NMI(y_pred, data['class'], average_method='geometric')

0.7373157011200171

In [9]:
# Sweep over the number of PCA components, scoring KMeans and spectral clustering
# at each setting. The sweep is slow, so it is disabled by default; set the flag
# to True to re-run it.
RUN_COMPONENT_SWEEP = False

# NMI per number of PCA components, keyed by clustering method.
res = {'spectral': {}, 'kmeans': {}}

if RUN_COMPONENT_SWEEP:
    # Densify once instead of on every iteration.
    dense_tfidf = tfidf.toarray()
    for i in range(5, 300, 5):
        # Sweep-local names keep the notebook-level `reduction` / `y_pred` intact.
        sweep_reduction = decomposition.PCA(n_components=i, random_state=0).fit_transform(dense_tfidf)
        sweep_pred = clustering.KMeans(n_clusters=5, random_state=0).fit_predict(sweep_reduction)
        res['kmeans'][i] = NMI(sweep_pred, data['class'], average_method='geometric')
        sweep_pred = clustering.SpectralClustering(n_clusters=5, random_state=0).fit_predict(sweep_reduction)
        res['spectral'][i] = NMI(sweep_pred, data['class'], average_method='geometric')
        print(i, ':', res['kmeans'][i], ':', res['spectral'][i])

In [10]:
# Same pipeline with 240 principal components: densify, project, cluster, score.
pca = decomposition.PCA(n_components=240, random_state=0)
reduction = pca.fit_transform(tfidf.toarray())

kmeans = clustering.KMeans(n_clusters=5, random_state=0)
y_pred = kmeans.fit_predict(reduction)
NMI(y_pred, data['class'], average_method='geometric')

0.7533063134626479

In [11]:
from collections import Counter as count

# For every document, find the feature with the largest TF-IDF weight; then, per
# cluster, count how often each such feature is a document's top feature.
feature_names = vectorizer.get_feature_names_out()
# Densify and take the row-wise argmax once instead of once per cluster.
top_feature_idx = tfidf.toarray().argmax(axis=1)

# KMeans was fit with n_clusters=5, so the labels run 0..4; all five are covered.
content = {}
for i in range(5):
    content[i] = count(feature_names[top_feature_idx[y_pred == i]]).most_common()

In [12]:
# Vocabulary of the NLTK 'words' corpus, held as a set for fast membership tests.
words = set(nltk.corpus.words.words())


def _stem_known_words(sentence):
    """Keep only tokens found in `words`, stem them, and join them with spaces."""
    # NOTE(review): the membership test is case-sensitive, so a capitalised token
    # is kept only if the corpus lists it with that exact casing -- confirm intended.
    kept = (snow.stem(token) for token in nltk.wordpunct_tokenize(sentence) if token in words)
    return ' '.join(kept)


data['english'] = data['text'].map(_stem_known_words)

# Unigram + bigram TF-IDF over the filtered text, then 5-cluster KMeans and NMI.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
tfidf = vectorizer.fit_transform(data['english'])

kmeans = clustering.KMeans(n_clusters=5, random_state=0)
y_pred = kmeans.fit_predict(tfidf)
NMI(y_pred, data['class'], average_method='geometric')

0.4680857822460559

In [13]:
# Overwrite the 'english' column with the tokenized text, re-joined with single
# spaces; no stemming or dictionary filtering is applied in this variant.
data['english'] = data['text'].map(lambda sentence: ' '.join(nltk.word_tokenize(sentence)))

# Unigram TF-IDF over that text, then 5-cluster KMeans scored against 'class'.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(data['english'])

kmeans = clustering.KMeans(n_clusters=5, random_state=0)
y_pred = kmeans.fit_predict(tfidf)
NMI(y_pred, data['class'], average_method='geometric')

0.578530660049897

In [14]:
# Display the per-cluster lists of (term, count) pairs held in `content`.
content

{0: [('quantum', 8),
  ('regist', 6),
  ('schedul', 6),
  ('queue', 4),
  ('loop', 4),
  ('teach', 4),
  ('gate', 3),
  ('qualifi', 3),
  ('memori model', 3),
  ('code', 3),
  ('offload', 2),
  ('instruct', 2),
  ('ti', 2),
  ('mpi', 2),
  ('transform rule', 2),
  ('graph color', 2),
  ('simd', 2),
  ('effect', 2),
  ('proof', 2),
  ('spm', 2),
  ('phoa', 2),
  ('concurr', 2),
  ('noc', 2),
  ('circuit', 2),
  ('run tim', 2),
  ('polyhedr', 1),
  ('check', 1),
  ('flow', 1),
  ('sqram', 1),
  ('picco', 1),
  ('cobalt', 1),
  ('rpc', 1),
  ('transform', 1),
  ('hybrid flow', 1),
  ('layout', 1),
  ('parallel', 1),
  ('grammar', 1),
  ('mcds', 1),
  ('ic', 1),
  ('visual comput', 1),
  ('gel', 1),
  ('fsc', 1),
  ('lr', 1),
  ('quantum anneal', 1),
  ('defacto', 1),
  ('decompress', 1),
  ('interv graph', 1),
  ('pim', 1),
  ('focus', 1),
  ('execut', 1),
  ('languag', 1),
  ('emul', 1),
  ('reaction', 1),
  ('secur parti', 1),
  ('obfusc', 1),
  ('cuda', 1),
  ('mpc', 1),
  ('argument',

In [16]:
# Display the frame as read from the CSV (pandas elides the middle rows).
original_data

Unnamed: 0_level_0,class,title,abstract
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id1,1,Anomaly detection in wide area imagery [Geniş ...,This study is about detecting anomalies in wid...
id2,1,Person re-identification with deep kronecker-p...,Person re-identification (re-ID) aims to robus...
id3,1,Crack detection in images of masonry using cnns,While there is a significant body of research ...
id4,5,Towards an energy efficient code generator for...,Using a smartphone become the part of our ever...
id5,5,Sub-polyhedral scheduling using (Unit-)two-var...,Polyhedral compilation has been successful in ...
...,...,...,...
id1328,1,Colorimetric point-of-care paper-based sensors...,Creatinine is a clinically significant analyte...
id1329,1,Calcium identification and scoring based on ec...,"Currently, an echocardiography expert is neede..."
id1330,1,Considering filter importance and irreplaceabi...,Deep convolutional neural network (CNNs) have ...
id1331,4,Low-complexity bit-serial sequential polynomia...,GF(2m) multiplication is a complex and perform...
