In [1]:
%pip install --upgrade gensim

Collecting gensim
  Downloading gensim-4.3.1-cp310-cp310-win_amd64.whl (24.0 MB)
     ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/24.0 MB 1.3 MB/s eta 0:00:19
     ---------------------------------------- 0.1/24.0 MB 1.3 MB/s eta 0:00:19
     ---------------------------------------- 0.2/24.0 MB 1.6 MB/s eta 0:00:16
      --------------------------------------- 0.3/24.0 MB 1.8 MB/s eta 0:00:14
      --------------------------------------- 0.4/24.0 MB 1.8 MB/s eta 0:00:14
     - -------------------------------------- 0.7/24.0 MB 2.4 MB/s eta 0:00:10
     - -------------------------------------- 0.9/24.0 MB 3.0 MB/s eta 0:00:08
     - -------------------------------------- 0.9/24.0 MB 3.0 MB/s eta 0:00:08
     - -------------------------------------- 0.9/24.0 MB 3.0 MB/s eta 0:00:08
     --- ------------------------------------ 1.9/24.0 MB 4.1 MB/s eta 0:00:06
     --- ------------------------------------ 2.1/24.0 MB

In [1]:
import logging
import sys
from pathlib import Path

from gensim.corpora import WikiCorpus

##### 導入維基語庫並生成文字檔

In [2]:
import opencc
CONVERTER_T2S = opencc.OpenCC("t2s.json")
CONVERTER_S2T = opencc.OpenCC("s2t.json")

In [3]:
def do_st_corrections(txt: str) -> str:
    simplified = CONVERTER_T2S.convert(txt)

    return CONVERTER_S2T.convert(simplified)

In [4]:
def write_wiki_text():
    doc_path = f"models/zhwiki-20221220-pages-articles-multistream.xml.bz2"

    if not Path(doc_path).exists:
        print("The file doesn't exist.")
        exit()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(doc_path, dictionary={})
    texts_num = 0

    wiki_text_path = f"data/wiki-text/wiki_texts.txt"
    with open(wiki_text_path,'w',encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            line = ' '.join(text) + '\n'
            do_st_corrections(line)
            output.write(line)
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("已處理 %d 篇文章" % texts_num)

In [6]:
write_wiki_text()

2023-05-01 17:44:04,062 : INFO : 已處理 10000 篇文章
2023-05-01 17:44:41,284 : INFO : 已處理 20000 篇文章
2023-05-01 17:45:13,860 : INFO : 已處理 30000 篇文章
2023-05-01 17:45:45,476 : INFO : 已處理 40000 篇文章
2023-05-01 17:46:15,586 : INFO : 已處理 50000 篇文章
2023-05-01 17:46:44,456 : INFO : 已處理 60000 篇文章
2023-05-01 17:47:12,635 : INFO : 已處理 70000 篇文章
2023-05-01 17:47:39,509 : INFO : 已處理 80000 篇文章
2023-05-01 17:48:05,755 : INFO : 已處理 90000 篇文章
2023-05-01 17:48:31,605 : INFO : 已處理 100000 篇文章
2023-05-01 17:48:59,297 : INFO : 已處理 110000 篇文章
2023-05-01 17:49:29,595 : INFO : 已處理 120000 篇文章
2023-05-01 17:49:55,279 : INFO : 已處理 130000 篇文章
2023-05-01 17:50:23,820 : INFO : 已處理 140000 篇文章
2023-05-01 17:50:50,636 : INFO : 已處理 150000 篇文章
2023-05-01 17:51:19,447 : INFO : 已處理 160000 篇文章
2023-05-01 17:51:46,161 : INFO : 已處理 170000 篇文章
2023-05-01 17:52:13,707 : INFO : 已處理 180000 篇文章
2023-05-01 17:52:40,762 : INFO : 已處理 190000 篇文章
2023-05-01 17:54:02,566 : INFO : 已處理 200000 篇文章
2023-05-01 17:54:32,744 : INFO : 已處理 210000 篇文章
2

處理繁簡轉換

In [8]:
import jieba

def tokenize():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    jieba.set_dictionary('data/jieba_dict/dict.txt.big')

    # load stopwords set
    stopword_set = set()
    with open('data/jieba_dict/stopwords.txt','r', encoding='utf-8') as stopwords:
        for stopword in stopwords:
            stopword_set.add(stopword.strip('\n'))

    output = open('data/wiki-text/wiki_seg.txt', 'w', encoding='utf-8')
    with open('data/wiki-text/wiki_texts.txt', 'r', encoding='utf-8') as content :
        for texts_num, line in enumerate(content):
            line = line.strip('\n')
            words = jieba.cut(line, cut_all=False)
            for word in words:
                if word not in stopword_set:
                    output.write(word + ' ')
            output.write('\n')

            if (texts_num + 1) % 10000 == 0:
                logging.info("已完成前 %d 行的斷詞" % (texts_num + 1))
    output.close()

In [9]:
tokenize()

Building prefix dict from /home/P78081057/Gaber_AICUP2023/data/jieba_dict/dict.txt.big ...
2023-05-01 18:11:07,962 : DEBUG : Building prefix dict from /home/P78081057/Gaber_AICUP2023/data/jieba_dict/dict.txt.big ...
Dumping model to file cache /tmp/jieba.ua00b00166cb119b323a586144d426557.cache
2023-05-01 18:11:08,718 : DEBUG : Dumping model to file cache /tmp/jieba.ua00b00166cb119b323a586144d426557.cache
Loading model cost 0.823 seconds.
2023-05-01 18:11:08,785 : DEBUG : Loading model cost 0.823 seconds.
Prefix dict has been built successfully.
2023-05-01 18:11:08,786 : DEBUG : Prefix dict has been built successfully.
2023-05-01 18:12:30,501 : INFO : 已完成前 10000 行的斷詞
2023-05-01 18:13:34,241 : INFO : 已完成前 20000 行的斷詞
2023-05-01 18:14:30,429 : INFO : 已完成前 30000 行的斷詞
2023-05-01 18:15:23,831 : INFO : 已完成前 40000 行的斷詞
2023-05-01 18:16:12,828 : INFO : 已完成前 50000 行的斷詞
2023-05-01 18:16:58,906 : INFO : 已完成前 60000 行的斷詞
2023-05-01 18:17:44,180 : INFO : 已完成前 70000 行的斷詞
2023-05-01 18:18:27,094 : INFO 

訓練詞向量 (要微調)

In [10]:
from gensim.models import word2vec

def w2v():
    w2v_model_path = "models/w2v.zh.300/word2vec.model"
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    sentences = word2vec.LineSentence("data/wiki-text/wiki_seg.txt")
    model = word2vec.Word2Vec(sentences, vector_size=300)

    #保存模型，供日後使用
    model.save(w2v_model_path)

In [14]:
w2v_model_path = "models/w2v.zh.300/word2vec.model"
w2v()

2023-05-01 19:47:34,322 : INFO : collecting all words and their counts
2023-05-01 19:47:34,324 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-05-01 19:47:36,803 : INFO : PROGRESS: at sentence #10000, processed 10883244 words, keeping 716066 word types
2023-05-01 19:47:38,758 : INFO : PROGRESS: at sentence #20000, processed 19261579 words, keeping 1034611 word types
2023-05-01 19:47:40,513 : INFO : PROGRESS: at sentence #30000, processed 26647223 words, keeping 1263719 word types
2023-05-01 19:47:42,239 : INFO : PROGRESS: at sentence #40000, processed 33628724 words, keeping 1462877 word types
2023-05-01 19:47:43,782 : INFO : PROGRESS: at sentence #50000, processed 40070791 words, keeping 1630575 word types
2023-05-01 19:47:45,225 : INFO : PROGRESS: at sentence #60000, processed 46030839 words, keeping 1781780 word types
2023-05-01 19:47:46,656 : INFO : PROGRESS: at sentence #70000, processed 51933476 words, keeping 1917327 word types
2023-05-01 19:47:48

In [24]:
# w2v_model.wv.most_similar(['英國'])         # Most Similar
# w2v_model.wv.similarity('英國', '法國')   # Similarity

[('蘇格蘭', 0.7412448525428772),
 ('愛爾蘭', 0.729009747505188),
 ('英格蘭', 0.6906564831733704),
 ('美國', 0.6804454922676086),
 ('法國', 0.6544962525367737),
 ('澳大利亞', 0.6538345813751221),
 ('倫敦', 0.6519836783409119),
 ('英國政府', 0.6437056660652161),
 ('北愛爾蘭', 0.6304891109466553),
 ('紐西蘭', 0.6292421221733093)]