In [None]:
!pip install sentence-transformers
!pip install pandas
!pip install scikit-learn
!pip install matplotlib

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import json
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [2]:
# Load the raw dataset.
# The encoding is passed explicitly because the file holds Korean text and the
# platform default encoding is not guaranteed to be UTF-8 (e.g. on Windows).
# NOTE(review): assumes the JSON file is UTF-8 encoded - confirm.
with open('./VL_span_extraction.json', 'r', encoding='utf-8') as f:
    # keep only the collection stored under the top-level "data" key
    data = json.load(f)['data']

In [6]:
# Pull the class code and the first paragraph's context out of every document,
# then assemble them into a two-column DataFrame (label, content).

label_list = [doc["doc_class"]["code"] for doc in data]
content_list = [doc["paragraphs"][0]["context"] for doc in data]

df = pd.DataFrame({'label': label_list, 'content': content_list})

In [None]:
# Discard every row labelled "기타" and renumber the index from 0.
keep_mask = df["label"].ne("기타")
df = df.loc[keep_mask].reset_index(drop=True)
df

In [None]:
# Count the rows per label to check how the classes are distributed
# in the filtered frame.
df["label"].value_counts()

In [None]:
# Draw a reproducible sample of up to 50 rows per label.
# Each group is sampled explicitly and the pieces are concatenated instead of
# going through groupby(...).apply(...): recent pandas versions deprecate
# passing the grouping column into apply, and once it is excluded the 'label'
# column would be lost after reset_index(drop=True).
n_per_label = 50
sampled_df = pd.concat(
    [
        # cap at the group size so a label with fewer rows does not raise
        group.sample(n=min(n_per_label, len(group)), random_state=42)
        for _, group in df.groupby('label')
    ],
    ignore_index=True,
)
sampled_df

In [18]:
# Korean category label -> English display name.
label_mapping = {
    'IT과학': 'IT_Science',
    '경제': 'Economy',
    '국제': 'International',
    '문화': 'Culture',
    '사회': 'Society',
    '스포츠': 'Sports',
    '정치': 'Politics',
    '지역': 'Local'
}

# Translate the Korean labels to English.
# Values without an entry in the mapping are kept unchanged instead of turning
# into NaN; this also makes the cell safe to re-run, because labels that were
# already translated simply pass through.
sampled_df['label'] = sampled_df['label'].map(
    lambda name: label_mapping.get(name, name)
)


In [None]:
# Count the rows per label in the sampled frame.
sampled_df["label"].value_counts()

In [None]:
# Encode the article contents into sentence embeddings.

model = SentenceTransformer("nlpai-lab/KoE5", device='cpu')

# E5-family models are trained with an input prefix; per the model card,
# "query: " is the prefix to use when embeddings serve as features for
# clustering. Omitting it degrades embedding quality.
prefixed_texts = [f"query: {text}" for text in sampled_df["content"].tolist()]
embeddings = model.encode(prefixed_texts, show_progress_bar=True)



In [None]:
# Inspect the shape of the embedding matrix.
# Expected layout: (number of samples, embedding dimension)
embeddings.shape

In [None]:
# k-means clustering

# Encode the string labels as integers so they can be compared with (and
# plotted next to) the cluster assignments.
le = LabelEncoder()
label_encoded = le.fit_transform(sampled_df["label"])

# Use one cluster per distinct ground-truth label; assign a custom integer
# here instead (e.g. n_clusters = 8) to experiment with other cluster counts.
n_clusters = len(le.classes_)

# n_init is set explicitly: its default differs between scikit-learn releases
# (10 vs. 'auto'), which would otherwise make the clustering depend on the
# installed version and trigger a FutureWarning on some of them.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(embeddings)

In [None]:
# Reduce the embeddings to 2 dimensions for visualization.
# t-SNE requires perplexity to be smaller than the number of samples, so the
# preferred value of 50 is capped for small samples.
perplexity = min(50, len(embeddings) - 1)
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
embeddings_2d = tsne.fit_transform(embeddings)

In [None]:
# Scatter plot of the 2-D projection, colored by ground-truth label.

fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=label_encoded, cmap='tab10')
ax.set_title('News Articles Clustering')
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')

# Legend: num=None yields one handle per distinct encoded label (in ascending
# order), so the handles always line up one-to-one with le.classes_. The
# default num="auto" may return fewer handles when there are many classes.
handles, _ = scatter.legend_elements(num=None)
legend1 = ax.legend(handles, le.classes_, title="Categories")

plt.show()


In [None]:
# Scatter plot of the 2-D projection, colored by k-means cluster assignment.
fig, ax = plt.subplots(figsize=(12, 8))
x_coords = embeddings_2d[:, 0]
y_coords = embeddings_2d[:, 1]
scatter = ax.scatter(x_coords, y_coords, c=clusters, cmap='tab10')
ax.set_title('News Articles Clustering (K-means)')
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')

plt.show()
