In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
# Load the semicolon-separated arXiv export and print a per-column summary
# (non-null counts and dtypes) to check for missing values.
data = pd.read_csv('arxiv2017.csv', sep=';')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131565 entries, 0 to 131564
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   ID            131565 non-null  object
 1   Date          131565 non-null  object
 2   Title         131565 non-null  object
 3   Abstract      131564 non-null  object
 4   Subject_area  130671 non-null  object
dtypes: object(5)
memory usage: 5.0+ MB


In [2]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,ID,Date,Title,Abstract,Subject_area
0,0912.5014v1,26/12/2009,A User's Guide to Zot,Zot is an agile and easily extendible bounded ...,LO
1,0910.0820v2,05/10/2009,Prediction of Zoonosis Incidence in Human usin...,Zoonosis refers to the transmission of infecti...,LG
2,1505.01933v1,08/05/2015,Wireless Multicast for Zoomable Video Streaming,Zoomable video streaming refers to a new class...,NI
3,1512.02794v2,09/12/2015,On Computing the Minkowski Difference of Zonot...,Zonotopes are becoming an increasingly popular...,CG
4,cs_0701171v1,26/01/2007,The Zones Algorithm for Finding Points-Near-a-...,Zones index an N-dimensional Euclidian or metr...,DB


In [3]:
# Restrict the corpus to four subject areas (DB, NI, CR, CV) so the
# unsupervised clustering can be developed in a controlled setting and
# evaluated against these known labels.
selected_subjects = ["DB", "NI", "CR", "CV"]
# .copy() makes filtered_data an independent frame: later cells add columns to
# it, and doing that on a boolean-mask slice of `data` raises
# SettingWithCopyWarning.
filtered_data = data[data['Subject_area'].isin(selected_subjects)].copy()


In [None]:
# I was thinking about combining the title and the abstract columns,
# but I think it might be useful to give more emphasis to the words in the
# title than to those in the abstract.

In [None]:
# Preprocess the text
_ENGLISH_STOPWORDS = None  # lazily-filled cache of NLTK's English stopword list


def _english_stopwords():
    """Return the English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def preprocess_text(text: str, remove_stopwords: bool) -> str:
    """Normalise a raw string before vectorisation.

    Removes URLs, replaces every run of non-letter characters with a single
    space, optionally drops English stopwords, and returns the result
    lower-cased with leading/trailing whitespace stripped.

    Args:
        text: Raw input string (e.g. a paper title).
        remove_stopwords: When True, tokenize with ``nltk.word_tokenize`` and
            drop every token whose lower-cased form is an English stopword.

    Returns:
        The cleaned, lower-cased string.
    """
    text = re.sub(r"http\S+", "", text)  # Remove links
    text = re.sub("[^A-Za-z]+", " ", text)  # Remove special characters and numbers
    if remove_stopwords:
        # The stopword list used to be re-read from the corpus for every token
        # and searched linearly; a cached frozenset makes each lookup O(1).
        stop_set = _english_stopwords()
        tokens = nltk.word_tokenize(text)  # Tokenize
        kept = [w for w in tokens if w.lower() not in stop_set]  # Remove stopwords
        text = " ".join(kept)  # Join tokens
    return text.lower().strip()  # Convert to lowercase and remove whitespace

In [None]:
# Clean every title (with stopword removal) and store it in a 'cleaned' column.
# .assign returns a new frame, so this does not write into a slice of `data`
# (which would raise SettingWithCopyWarning) and is safe to re-run.
filtered_data = filtered_data.assign(
    cleaned=filtered_data['Title'].apply(preprocess_text, remove_stopwords=True)
)

In [None]:
# Initialize the TF-IDF vectorizer: sublinear (1 + log tf) term frequency,
# ignoring terms that appear in fewer than 2 documents or in more than 95% of them.
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=2, max_df=0.95)

# The vectorizer has not been fitted yet, so transform() alone raises
# NotFittedError. fit_transform() learns the vocabulary and IDF weights from
# the cleaned titles and returns the sparse document-term matrix.
X = vectorizer.fit_transform(filtered_data['cleaned'])

In [None]:
# Initialize KMeans with one cluster per selected subject area (4 here), so the
# clusters can be compared against the known labels.
# n_init is set explicitly because its default differs between scikit-learn
# versions; pinning it keeps the result reproducible.
kmeans = KMeans(n_clusters=len(selected_subjects), n_init=10, random_state=42)
# Fit the model
kmeans.fit(X)
# Store cluster labels in a variable
clusters = kmeans.labels_

In [None]:
# Initialize PCA with 2 components to project the TF-IDF vectors to 2-D for plotting
pca = PCA(n_components=2, random_state=42)
# The PCA has not been fitted yet, so transform() alone raises NotFittedError;
# fit_transform() learns the components and projects in one step.
# PCA needs a dense array, hence X.toarray().
pca_vecs = pca.fit_transform(X.toarray())
# Split the projection into its two coordinates
x0 = pca_vecs[:, 0]
x1 = pca_vecs[:, 1]

In [None]:
# Attach the KMeans label and the two PCA coordinates to each document.
# A single .assign call returns a new frame, so this does not write into a
# slice of `data` (which would raise SettingWithCopyWarning) and is safe to re-run.
filtered_data = filtered_data.assign(cluster=clusters, x0=x0, x1=x1)

In [None]:
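# Optional: map numeric cluster ids to readable labels (one entry per KMeans cluster).
# cluster_map = {0: "Cluster 0", 1: "Cluster 1", 2: "Cluster 2", 3: "Cluster 3"}
# filtered_data['cluster'] = filtered_data['cluster'].map(cluster_map)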