In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
!pip install bertopic
!pip install datasets
!pip install -U plotly==5.3.1
!pip install -U kaleido
!pip install matplotlib
!pip install -U numpy==1.23.5

In [3]:
import pandas as pd
from bertopic import BERTopic
import os
from os.path import join as opj
import numpy as np
import pandas as pd
import time

In [4]:
news_df = pd.read_csv("/content/drive/MyDrive/ML_project/new_text_withstem.csv")
# Extract

cleaned_text = news_df['headline_cleaned_text']
original_text = news_df['headline_text']
cleaned_text = [cleaned_text[i] if isinstance(cleaned_text[i],str) else original_text[i] for i in range(len(cleaned_text))]

num_samples = 100000
data = cleaned_text[:num_samples]

# （1）get model

In [6]:
from sentence_transformers import SentenceTransformer

small_sentence_transformer_emb =  SentenceTransformer("paraphrase-albert-small-v2")

embeddings_model = small_sentence_transformer_emb

# (2) get embeddings for centers

> 添加区块引用符号



In [7]:

class_labels = ["Worlds", "Sports", "Business", "Science and Technich"]
class_sentences = [f"A news in the topic of {c}" for c in class_labels]

class_embeddings_label = embeddings_model.encode(class_labels)
class_embeddings_sentence = embeddings_model.encode(class_sentences)

In [79]:
class_embeddings_sentence[0,:10]

array([ 0.28954062,  0.1546708 ,  0.39693442, -0.55586064,  0.39332297,
       -0.03232314,  0.21324575,  0.01197499, -0.33840042,  0.3108279 ],
      dtype=float32)

In [80]:
class_embeddings_label[0,:10]

array([ 1.0392567 ,  0.55880183,  0.22931507, -0.08954945, -0.1234863 ,
        0.758897  ,  1.1228831 , -0.6514519 , -0.2523554 ,  0.72553414],
      dtype=float32)

(3) get data embeddings

In [84]:

def classification_by_distance(class_embeddings, embedding):
  min_index = 0
  min_distance = 100
  for i in range(len(class_embeddings)):
    distance = np.linalg.norm(class_embeddings[i] - embedding)
    if distance < min_distance:
      min_distance = distance
      min_index = i
  return min_index + 1

def predict_csv(input_path, class_embeddings, output_path):
  os.makedirs(os.path.dirname(output_path), exist_ok = True)

  # class_embeddings = class_embeddings_label
  df = pd.read_csv(input_path)
  datas = []
  for i in range(len(df)):
    index = df.loc[i,"index"]
    datas.append(original_text[index])
  embeddings = embeddings_model.encode(datas)

  preds = [classification_by_distance(class_embeddings, embedding) for embedding in embeddings]

  import csv
  with open(output_path, "w+") as csv_file:
    writer = csv.writer(csv_file)
    headers = ["index", "data", "label","pred"]
    output_list = []
    for i in range(len(df)):
      output_list.append(df.loc[i,:].tolist() + [preds[i]])
    writer.writerow(headers)
    writer.writerows(output_list)


In [None]:


input_path = "/content/drive/MyDrive/ML_project/input/test_dataset_no_zero.csv"
output_root = "/content/drive/MyDrive/ML_project/output/classification"

round_name = "word_distance"
class_embeddings = class_embeddings_label
output_path = opj(output_root,f"{round_name}.csv")
predict_csv(input_path, class_embeddings, output_path)

# evaluation

In [43]:
from sklearn.metrics import classification_report

def get_classification_metrics(csv_file):
  df = pd.read_csv(csv_file)
  print(df)
  labels = df.label
  preds = df.pred

  return classification_report(labels, preds)

In [44]:
label_csv_file = "/content/drive/MyDrive/ML_project/output/classification/word_distance.csv"
report_label = get_classification_metrics(label_csv_file)
print(report_label)

     index                                               data  label  pred
0    80879                  downer defends iraq troop numbers      1     4
1    92777               cyclists set for high powered worlds      2     1
2    78245  police officer charged with kidnap sexual assault      1     4
3    73654               cracked tanker enters sydney harbour      1     4
4    80161           court refuses dna bid over moran killing      1     4
..     ...                                                ...    ...   ...
595  10311        knights gain momentum with key players back      2     2
596   8727  more must be done to reduce japanese beef tariffs      3     4
597  35811   mcgregor nyangelo take city to surf line honours      2     4
598  65216                    aru in camp wallaby discussions      2     4
599  34218  fears of power blackouts in wake of utility re...      3     4

[600 rows x 4 columns]
              precision    recall  f1-score   support

           1       0.

sentence

In [85]:
input_path = "/content/drive/MyDrive/ML_project/input/test_dataset_no_zero.csv"
output_root = "/content/drive/MyDrive/ML_project/output/classification"

round_name = "sentence_distance"
class_embeddings = class_embeddings_sentence
output_path = opj(output_root,f"{round_name}.csv")
predict_csv(input_path, class_embeddings, output_path)

In [86]:
label_csv_file = "/content/drive/MyDrive/ML_project/output/classification/sentence_distance.csv"
report_label = get_classification_metrics(label_csv_file)
print(report_label)

     index                                               data  label  pred
0    80879                  downer defends iraq troop numbers      1     4
1    92777               cyclists set for high powered worlds      2     4
2    78245  police officer charged with kidnap sexual assault      1     4
3    73654               cracked tanker enters sydney harbour      1     4
4    80161           court refuses dna bid over moran killing      1     4
..     ...                                                ...    ...   ...
595  10311        knights gain momentum with key players back      2     2
596   8727  more must be done to reduce japanese beef tariffs      3     4
597  35811   mcgregor nyangelo take city to surf line honours      2     4
598  65216                    aru in camp wallaby discussions      2     4
599  34218  fears of power blackouts in wake of utility re...      3     4

[600 rows x 4 columns]
              precision    recall  f1-score   support

           1       0.

# Topic methods

In [55]:
from sentence_transformers import SentenceTransformer
small_sentence_transformer_emb =  SentenceTransformer("paraphrase-albert-small-v2")

from umap import UMAP

umap_dr = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

from sklearn.cluster import KMeans
k_means_model =KMeans(n_clusters = 8)

from sklearn.feature_extraction.text import CountVectorizer

CountVectorizer_vectorizer = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

from bertopic.representation import KeyBERTInspired

keybert_representation = KeyBERTInspired()

representation_model = {
     KeyBERTInspired()
}


topic_model = BERTopic(
  # Pipeline models
  embedding_model=small_sentence_transformer_emb,
  umap_model=umap_dr,
  hdbscan_model=k_means_model,
  vectorizer_model=CountVectorizer_vectorizer,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=30,
  # nr_topics = nr_topics,
  verbose=True
)




In [56]:

embeddings = small_sentence_transformer_emb.encode(data, show_progress_bar=True)

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

In [57]:
reduced_embeddings = umap_dr.fit_transform(embeddings)

In [None]:
topics, probs = topic_model.fit_transform(data, embeddings)

In [62]:
topics_distr, _ = topic_model.approximate_distribution(data, window=8, stride=4)

100%|██████████| 100/100 [00:02<00:00, 39.43it/s]


In [70]:
topics_distr

1.0

In [None]:
# output_root = "/content/drive/MyDrive/ML_project/output"
# topic_representation_df = topic_model.get_topic_info()
# topic_representation_path = opj(output_root, "8_small_topic_representations.csv")
# topic_representation_df.to_csv(topic_representation_path, index = False)

In [75]:
new_topics_distr = np.load("/content/drive/MyDrive/ML_project/input/topics_distr.npy")
new_topics_distr[:10,:]

array([[0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:

def classification_by_topic(, embedding):
  min_index = 0
  min_distance = 100
  for i in range(len(class_embeddings)):
    distance = np.linalg.norm(class_embeddings[i] - embedding)
    if distance < min_distance:
      min_distance = distance
      min_index = i
  return min_index + 1


