# Cài đặt môi trường

In [0]:
# For word segmentation, use one of two libraries: pyvi or underthesea
# Install underthesea
!pip install underthesea

# Install pyvi
!pip install pyvi

In [0]:
# Request write access to Google Drive (needed to save the model)
print("\n\nXin quyền ghi Google drive (ghi model)")
from google.colab import drive
drive.mount('/content/drive')

# Folder layout on the mounted drive; every path used later in the notebook is built from these.
BASE_FOLDER_NAME = "/content/drive/My Drive/Research/NLP/NLP Journalism/RESULT/"
DATA_FOLDER_NAME = BASE_FOLDER_NAME + "CSV files/"
CLASSIFICATION_FOLDER_NAME = BASE_FOLDER_NAME

In [0]:
import numpy as np
from scipy import spatial
import pandas as pd
import csv

from ast import literal_eval
from underthesea import sent_tokenize
from underthesea import word_tokenize
from pyvi import ViTokenizer

import enum
import string
import time

from sklearn.cluster import KMeans


class TOKENIZER(enum.Enum):
    """Word-segmentation backends that `tokenize` can dispatch to."""
    underthesea = 1
    pyvi = 2

def tokenize(sentence, type = TOKENIZER.pyvi):
  """Split a sentence into lower-cased word tokens.

  Args:
    sentence: text to segment.
    type: a TOKENIZER member selecting the backend (default: pyvi).

  Returns:
    A list of lower-cased tokens. Spaces inside a token produced by
    underthesea are replaced with "_". An unrecognised `type` yields [].
  """
  if type == TOKENIZER.underthesea:
    # word_tokenize output is iterated as a sequence of words; spaces inside
    # a word are joined with "_" so each word stays a single token.
    return [word.replace(" ", "_").lower() for word in word_tokenize(sentence)]
  if type == TOKENIZER.pyvi:
    # ViTokenizer.tokenize output is treated as one space-separated string.
    # split() with no argument (instead of split(" ")) drops the empty tokens
    # that empty, whitespace-only or double-spaced text would otherwise produce.
    return [word.lower() for word in ViTokenizer.tokenize(sentence).split()]
  return []

class SentenceClassification:
  """Embeds tokenized sentences as averaged word vectors and clusters them with KMeans.

  Each sentence is represented by the mean of the vectors of its known words,
  with one extra dimension holding the number of words that were averaged.
  `classify` then clusters those vectors level by level.

  Attributes:
    sentences, tokenized_sentences, word_model, stop_words, remove_stop_words:
      the constructor arguments, stored as given.
    index2word_set: set of the words known to the word model.
    feature_vectors: array of shape (n_sentences, vector_size + 1).
    clusters: result of the last `classify` call (set by `classify`).
  """

  def __init__(self, sentences, tokenized_sentences, word_model, stop_words, remove_stop_words = True):
    """Store the inputs and pre-compute one feature vector per tokenized sentence.

    Args:
      sentences: original sentences (only their count is used for clustering).
      tokenized_sentences: list of token lists, aligned with `sentences`.
      word_model: object exposing `vector_size` and either a `.wv` attribute or
        direct word lookup itself (a trained model or bare keyed vectors).
      stop_words: iterable of words to skip when `remove_stop_words` is true.
      remove_stop_words: whether stop words are excluded from the average.
    """
    self.sentences = sentences
    self.tokenized_sentences = tokenized_sentences
    self.word_model = word_model
    self.stop_words = stop_words
    self.remove_stop_words = remove_stop_words

    # Look words up through the keyed vectors rather than the model object:
    # indexing the model itself is deprecated in gensim 3.x and fails in 4.x.
    # An object without `.wv` is assumed to be the keyed vectors already.
    self._vectors = getattr(word_model, "wv", word_model)
    # gensim 4 names the vocabulary list `index_to_key`; gensim 3 uses `index2word`.
    vocabulary = getattr(self._vectors, "index_to_key", None)
    if vocabulary is None:
      vocabulary = self._vectors.index2word
    self.index2word_set = set(vocabulary)

    # Snapshot of the stop words as a set, so the per-word membership test
    # does not scan a list.
    self._stop_word_set = set(stop_words) if stop_words is not None else set()

    self.calculate_feature_vectors()

  def avg_feature_vector(self, tokenized_sentence):
    """Return the mean word vector of a sentence with the word count appended.

    Words missing from the model (and stop words, when removal is enabled) are
    ignored. If no word is left, the result is an all-zero vector.

    Returns:
      1-D array of length `vector_size + 1`; the last entry is the number of
      words that contributed to the mean.
    """
    feature_vec = np.zeros((self.word_model.vector_size, ), dtype='float32')
    n_words = 0
    for word in tokenized_sentence:
      if self.remove_stop_words and word in self._stop_word_set:
        continue
      if word in self.index2word_set:
        n_words += 1
        feature_vec = np.add(feature_vec, self._vectors[word])
    if n_words > 0:
      feature_vec = np.divide(feature_vec, n_words)
    # Extra dimension: number of words that were averaged.
    return np.append(feature_vec, n_words)

  def calculate_feature_vectors(self):
    """Compute, store and return the feature matrix for all tokenized sentences."""
    start_time = time.time()
    # +1 column for the word-count dimension added by avg_feature_vector.
    self.feature_vectors = np.zeros(shape=(len(self.tokenized_sentences), self.word_model.vector_size + 1))
    for i, tokenized_sentence in enumerate(self.tokenized_sentences):
      self.feature_vectors[i] = self.avg_feature_vector(tokenized_sentence)
    print("--- Calculating features takes %s seconds ---" % (time.time() - start_time))
    return self.feature_vectors

  def sim(self, s1, s2):
    """Cosine similarity between the feature vectors of two tokenized sentences.

    Returns 0.0 when either sentence has no usable word: its vector is then
    all zeros and the cosine is undefined (it would otherwise evaluate to NaN).
    """
    s1_afv = self.avg_feature_vector(s1)
    s2_afv = self.avg_feature_vector(s2)
    if not s1_afv.any() or not s2_afv.any():
      return 0.0
    return 1 - spatial.distance.cosine(s1_afv, s2_afv)

  def classify(self, level=[100]):
    """Cluster all sentences hierarchically.

    Args:
      level: requested number of clusters at each depth; `level[0]` splits the
        whole corpus, `level[i]` splits every non-empty cluster of depth i-1.
        The list is only read, never modified.

    Returns:
      A dict mapping cluster keys ("0", "1", ... at depth 1; "0-0", "0-1", ...
      at depth 2; and so on) to lists of sentence indexes, plus the entry
      "keys": a list holding the list of cluster keys of each depth.
      The same dict is stored in `self.clusters`.
    """
    keys = []      # keys[d] is the list of cluster keys at depth d + 1
    clusters = {}  # cluster key -> indexes into the original sentence list

    # First level: cluster the whole corpus.
    first_level_keys = []
    for i, members in enumerate(self._classify(range(len(self.sentences)), level[0])):
      key = str(i)
      first_level_keys.append(key)
      clusters[key] = members
    keys.append(first_level_keys)

    # Deeper levels: split every non-empty cluster of the previous level.
    for depth in range(1, len(level)):
      depth_keys = []
      for previous_key in keys[depth - 1]:
        print(previous_key)
        indexes = clusters[previous_key]
        if len(indexes) == 0:
          continue  # nothing left to split
        for j, members in enumerate(self._classify(indexes, level[depth])):
          key = previous_key + "-" + str(j)  # child key extends its parent's key
          depth_keys.append(key)
          clusters[key] = members
      keys.append(depth_keys)

    clusters["keys"] = keys
    self.clusters = clusters
    return clusters


  def _classify(self, indexes, n_clusters = -1):
    """Run KMeans on the sentences selected by `indexes`.

    Args:
      indexes: sequence of row numbers into `self.feature_vectors`.
      n_clusters: requested cluster count, capped at `len(indexes)`; a
        non-positive value leaves the choice to KMeans' own default.

    Returns:
      A list with one list of sentence indexes per cluster; [] when `indexes`
      is empty.
    """
    n_samples = len(indexes)
    if n_samples == 0:
      return []

    # Step 1: gather the feature rows
    if n_samples == len(self.sentences):
      X = self.feature_vectors
    else:
      X = self.feature_vectors[np.asarray(indexes, dtype=int)]

    n_clusters = min(n_samples, n_clusters)

    # Step 2: clustering with KMeans
    start_time = time.time()
    if n_clusters > 0:
      kmeans = KMeans(n_clusters = n_clusters, random_state=0).fit(X)
    else:
      kmeans = KMeans(random_state=0).fit(X)
    print("--- Kmeans takes %s seconds ---" % (time.time() - start_time))

    # Step 3: group the original indexes by predicted label
    pred_label = kmeans.predict(X)
    clusters = [[] for _ in range(kmeans.n_clusters)]
    for position, label in enumerate(pred_label):
      clusters[label].append(indexes[position])

    return clusters

class ModelTrain:
  """Trains a Word2Vec model on tokenized sentences and optionally saves its vectors."""

  def __init__(self, tokenized_sentences, file_path_to_save_model = ""):
    """Remember the training corpus and the output path ("" means: do not save)."""
    self.file_path_to_save_model = file_path_to_save_model
    self.tokenized_sentences = tokenized_sentences

  def train(self):
    """Train the model, report the embedding shape and timing, and return the model.

    When `file_path_to_save_model` is not empty, the word vectors are also
    written there in binary word2vec format.
    """
    print('\nTraining word2vec...')
    started_at = time.time()

    hyper_parameters = dict(size=200, min_count=1, window=5, iter=150)
    word_model = Word2Vec(self.tokenized_sentences, **hyper_parameters)

    embedding_matrix = word_model.wv.syn0
    # Unpacking requires the matrix to have exactly two dimensions (words x dims).
    _n_words, _n_dims = embedding_matrix.shape
    print('Result embedding shape:', embedding_matrix.shape)

    target_path = self.file_path_to_save_model
    if target_path != "":
      word_model.wv.save_word2vec_format(target_path, binary = True)

    elapsed = time.time() - started_at
    print("--- %s seconds ---" % elapsed)
    return word_model

# Prepare sentences

In [12]:
# Cache files for the extracted sentences; saveSentences writes them with pickle
# (binary content, despite the .txt extension).
SAVED_SENTENCES_FILE_PATH = CLASSIFICATION_FOLDER_NAME + "00Vnexpress.Sentences.txt"
SAVED_TOKENIZED_SENTENCES_FILE_PATH = CLASSIFICATION_FOLDER_NAME + "00Vnexpress.Sentences.Tokenized.txt"
# Name of the articles CSV that prepareSentences reads from DATA_FOLDER_NAME.
FILE_NAME = "Vnexpress.Articles.CSV"

def prepareSentences():
  """Read the articles CSV and fill the global sentence lists.

  Rows with the same (title, time) are treated as duplicates and only the
  first is kept. Each row's 'content' cell is parsed with literal_eval and
  iterated as a sequence of paragraphs; every paragraph is split into
  sentences. A sentence is kept when something other than punctuation, digits
  and whitespace remains in it.

  Side effects:
    Rebinds the globals `all_sentences` (original sentences) and
    `all_tokenized_sentences` (token lists built from the sentence with
    punctuation and digits removed); the two lists stay index-aligned.
    Rows that fail to parse are reported and skipped.
  """
  df = pd.read_csv(DATA_FOLDER_NAME + FILE_NAME, encoding='utf-8', sep=',', index_col=0)

  # Remove duplicated articles (different link, same content and time)
  print("Kích thước trước khi xử lý duplicate: ", df.shape)
  df.drop_duplicates(subset=["title", "time"], keep = "first", inplace = True)
  print("Kích thước sau khi xử lý duplicate: ", df.shape)

  # df = df.iloc[:1000]

  print(df.shape)

  global all_sentences
  all_sentences = []
  global all_tokenized_sentences
  all_tokenized_sentences = []

  # Deletes ASCII punctuation and digits; built once, it is the same for every row.
  translator = str.maketrans('', '', string.punctuation + "0123456789")

  start_time = time.time()
  for index, row in df.iterrows():
      # print(row["link"])
      try:
        paragraphs = literal_eval(row['content'])
        for paragraph in paragraphs:
            for sentence in sent_tokenize(paragraph):
              tmp = sentence.translate(translator)
              # Skip sentences that are empty or only whitespace once cleaned.
              if tmp.strip() != "":
                all_sentences.append(sentence)
                all_tokenized_sentences.append(tokenize(tmp))
      except Exception as ex:
        # Report the row label: `paragraphs` may be unbound, or left over from
        # a previous row, when literal_eval itself is what failed.
        print("\nERROR with row %s. Reason: %s" % (index, str(ex)))
  print("--- %s seconds ---" % (time.time() - start_time))


import pickle
def saveSentences():
  """Pickle the global sentence lists to their two cache files."""
  def _dump(target_path, payload):
    # Binary mode: the content is a pickle stream.
    with open(target_path, "wb") as handle:
      pickle.dump(payload, handle)

  _dump(SAVED_SENTENCES_FILE_PATH, all_sentences)
  _dump(SAVED_TOKENIZED_SENTENCES_FILE_PATH, all_tokenized_sentences)
def load_sentences(path):
  """Load a pickled list from `path`, print how many items it holds, and return it.

  NOTE(review): pickle.load can execute arbitrary code; only point this at
  files that this notebook wrote itself.
  """
  with open(path, "rb") as handle:
    loaded = pickle.load(handle)
  item_count = len(loaded)
  print('Num sentences:', item_count)
  return loaded


# Build the sentence lists from the CSV, then cache them on disk.
prepareSentences()
saveSentences()
# Alternative: instead of the two calls above, reload the cached lists.
# all_sentences = load_sentences(SAVED_SENTENCES_FILE_PATH)
# all_tokenized_sentences = load_sentences(SAVED_TOKENIZED_SENTENCES_FILE_PATH)

Kích thước trước khi xử lý duplicate:  (16100, 6)
Kích thước sau khi xử lý duplicate:  (12544, 6)
(1000, 6)
--- 3.76168155670166 seconds ---


# Sentences filtering

## Vn-index

In [0]:
import re

# Keep only the sentences that mention "Vn-index" (case-insensitive). Both
# lists are updated in place with the same positions, so they stay aligned.
keyword_pattern = re.compile("Vn-index", re.IGNORECASE)
kept_positions = [
  position
  for position, sentence in enumerate(all_sentences)
  if keyword_pattern.search(sentence)
]
all_sentences[:] = [all_sentences[position] for position in kept_positions]
all_tokenized_sentences[:] = [all_tokenized_sentences[position] for position in kept_positions]

print(len(all_sentences))

## Thanh khoản

In [0]:
import re

# Keep only the sentences that mention "Thanh khoản" (case-insensitive). Both
# lists are updated in place with the same positions, so they stay aligned.
keyword_pattern = re.compile("Thanh khoản", re.IGNORECASE)
kept_positions = [
  position
  for position, sentence in enumerate(all_sentences)
  if keyword_pattern.search(sentence)
]
all_sentences[:] = [all_sentences[position] for position in kept_positions]
all_tokenized_sentences[:] = [all_tokenized_sentences[position] for position in kept_positions]

print(len(all_sentences))

## Mã cổ phiếu

In [0]:
import re

# Keep only sentences containing a stand-alone group of three capital letters:
# preceded by whitespace and followed by whitespace, a non-word character or
# the end of the sentence (or making up the whole sentence).
# Exceptions: HCM, HNX and USD are removed from the text before matching.
# NOTE(review): a code at the very start of a longer sentence is not matched,
# because every alternative but the last requires leading whitespace - confirm
# this is intended.
# Raw string: \s and \w are regex escapes, not Python string escapes.
stock_code_pattern = re.compile(r"(\s[A-Z]{3}\s)|(\s[A-Z]{3}[^\w])|(\s[A-Z]{3}$)|(^[A-Z]{3}$)")

for i in range(len(all_sentences) - 1, -1, -1):
  candidate = (all_sentences[i].
               replace("HCM", "").
               replace("USD", "").
               replace("HNX", ""))
  if not stock_code_pattern.search(candidate):
    del all_sentences[i]
    del all_tokenized_sentences[i]

print(len(all_sentences))
# Preview at most 100 sentences; slicing is safe when fewer are left.
for sentence in all_sentences[:100]:
  print(sentence)

# Prepare word embedding model, stop word list

In [14]:
# Where the trained word vectors are saved, and where the stop-word list is read from.
WORD_MODEL = CLASSIFICATION_FOLDER_NAME + "01Vnepxress.stock.model.bin"
STOP_WORD_FILE = CLASSIFICATION_FOLDER_NAME + 'vietnamese-stopwords.txt'

from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

# Train on the filtered sentences and save the vectors to WORD_MODEL.
my_word_model = ModelTrain(all_tokenized_sentences, file_path_to_save_model=WORD_MODEL).train()
# Alternative: load previously saved vectors instead of retraining.
# my_word_model = KeyedVectors.load_word2vec_format(WORD_MODEL, binary=True)

# One stop word per line; spaces become "_" and text is lower-cased, the same
# normalisation the word tokens get. The file is read as UTF-8 explicitly
# (Vietnamese text) and closed by the `with` block; blank lines are skipped so
# no empty stop word is created.
with open(STOP_WORD_FILE, 'r', encoding='utf-8') as stop_word_file:
  stop_words = [line.strip().replace(" ", "_").lower() for line in stop_word_file if line.strip()]


Training word2vec...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Result embedding shape: (5678, 200)
--- 21.97386360168457 seconds ---


# Thực thi - Execution

In [0]:
a = SentenceClassification(all_sentences, all_tokenized_sentences, my_word_model, stop_words, remove_stop_words=False)

start_time = time.time()
# Requested cluster counts per depth: 100 at the first level, then 8 inside
# each first-level cluster (each count is capped by the number of sentences).
level = [100, 8]
clusters = a.classify(level)
print("--- %s seconds --- IN TOTAL" % (time.time() - start_time))

# Write the deepest-level clusters, one "NHÓM <key>" section per cluster.
# The `with` block closes the file even if a write fails, and the explicit
# UTF-8 encoding keeps the Vietnamese text independent of the platform default.
with open(CLASSIFICATION_FOLDER_NAME + "02Vnexpress.Sentences.Classified3.txt", "w+", encoding="utf-8") as f:
  for key in clusters["keys"][len(level) - 1]:
    f.write("\n\nNHÓM %s\n" % key)
    for i in clusters[key]:
      f.write("%d: %s\n" % (i, all_sentences[i]))