In [1]:
# Installs or upgrades gensim in the kernel's environment. The version is not
# pinned; the run recorded in the output below resolved to gensim 4.3.1.
%pip install --upgrade gensim

Collecting gensim
  Downloading gensim-4.3.1-cp310-cp310-win_amd64.whl (24.0 MB)
     ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/24.0 MB 1.3 MB/s eta 0:00:19
     ---------------------------------------- 0.1/24.0 MB 1.3 MB/s eta 0:00:19
     ---------------------------------------- 0.2/24.0 MB 1.6 MB/s eta 0:00:16
      --------------------------------------- 0.3/24.0 MB 1.8 MB/s eta 0:00:14
      --------------------------------------- 0.4/24.0 MB 1.8 MB/s eta 0:00:14
     - -------------------------------------- 0.7/24.0 MB 2.4 MB/s eta 0:00:10
     - -------------------------------------- 0.9/24.0 MB 3.0 MB/s eta 0:00:08
     - -------------------------------------- 0.9/24.0 MB 3.0 MB/s eta 0:00:08
     - -------------------------------------- 0.9/24.0 MB 3.0 MB/s eta 0:00:08
     --- ------------------------------------ 1.9/24.0 MB 4.1 MB/s eta 0:00:06
     --- ------------------------------------ 2.1/24.0 MB

In [1]:
import logging
import sys
from pathlib import Path

from gensim.corpora import WikiCorpus

##### 導入維基語庫並生成文字檔

In [2]:
import opencc
# OpenCC converters, constructed once at module level and used by do_st_corrections.
# "t2s.json" / "s2t.json" are OpenCC configuration names; going by those names,
# T2S converts Traditional -> Simplified Chinese and S2T converts Simplified -> Traditional.
CONVERTER_T2S = opencc.OpenCC("t2s.json")
CONVERTER_S2T = opencc.OpenCC("s2t.json")

In [3]:
def do_st_corrections(txt: str) -> str:
    """Round-trip ``txt`` through both OpenCC converters.

    The text is first converted with ``CONVERTER_T2S`` and the result is then
    converted back with ``CONVERTER_S2T``.

    Args:
        txt: The text to convert.

    Returns:
        The output of ``CONVERTER_S2T`` applied to the ``CONVERTER_T2S`` result.
    """
    return CONVERTER_S2T.convert(CONVERTER_T2S.convert(txt))

In [4]:
def write_wiki_text():
    """Extract the article text of the zhwiki dump into a plain-text file.

    Reads ``models/zhwiki-20221220-pages-articles-multistream.xml.bz2`` through
    gensim's ``WikiCorpus`` and writes one article per line (tokens joined by
    single spaces) to ``data/wiki-text/wiki_texts.txt``. Every article is
    passed through ``do_st_corrections`` before it is written. Progress is
    logged every 10,000 articles.

    Raises:
        FileNotFoundError: If the dump file is not present.
    """
    doc_path = "models/zhwiki-20221220-pages-articles-multistream.xml.bz2"

    # ``exists`` has to be *called*: the bare attribute is a bound method and
    # therefore always truthy, which would make this guard unreachable.
    if not Path(doc_path).exists():
        # Raise instead of calling exit(), which would shut down the notebook kernel.
        raise FileNotFoundError(f"The file doesn't exist: {doc_path}")

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # Supplying a dictionary (even an empty one) keeps WikiCorpus from scanning
    # the whole dump up front to build its own vocabulary.
    wiki_corpus = WikiCorpus(doc_path, dictionary={})

    wiki_text_path = "data/wiki-text/wiki_texts.txt"
    # Make sure the destination directory exists before opening the file.
    Path(wiki_text_path).parent.mkdir(parents=True, exist_ok=True)
    with open(wiki_text_path, 'w', encoding='utf-8') as output:
        for texts_num, text in enumerate(wiki_corpus.get_texts(), start=1):
            # Keep the converted text; the conversion returns a new string
            # rather than modifying its argument.
            line = do_st_corrections(' '.join(text))
            output.write(line + '\n')
            if texts_num % 10000 == 0:
                logging.info("已處理 %d 篇文章", texts_num)

In [5]:
# Run the extraction; this writes data/wiki-text/wiki_texts.txt.
write_wiki_text()



處理繁簡轉換

In [29]:
import jieba

def tokenize():
    """Segment the extracted wiki text with jieba and drop stopwords.

    Reads ``data/wiki-text/wiki_texts.txt`` line by line, cuts each line with
    ``jieba.cut(..., cut_all=False)`` and writes every token that is not in
    ``data/jieba_dict/stopwords.txt`` to ``data/wiki-text/wiki_seg.txt``.
    Each kept token is followed by a single space, and each input line
    produces exactly one output line. Progress is logged every 10,000 lines.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting: use the project's dictionary file.
    jieba.set_dictionary('data/jieba_dict/dict.txt.big')

    # Load the stopword set (one stopword per line).
    with open('data/jieba_dict/stopwords.txt', 'r', encoding='utf-8') as stopwords:
        stopword_set = {stopword.strip('\n') for stopword in stopwords}

    # Both files are managed by the same ``with`` so the output handle is
    # closed even if segmentation raises. The input is opened first so that a
    # missing input does not truncate an existing output file.
    with open('data/wiki-text/wiki_texts.txt', 'r', encoding='utf-8') as content, \
            open('data/wiki-text/wiki_seg.txt', 'w', encoding='utf-8') as output:
        for texts_num, line in enumerate(content, start=1):
            words = jieba.cut(line.strip('\n'), cut_all=False)
            for word in words:
                if word not in stopword_set:
                    output.write(word + ' ')
            output.write('\n')

            if texts_num % 10000 == 0:
                logging.info("已完成前 %d 行的斷詞", texts_num)

In [30]:
# Run the segmentation; the log recorded below shows roughly one to two minutes
# per 10,000 lines.
tokenize()

Building prefix dict from d:\VSCodeProject\NCKU-AICUP2023-baseline\data\jieba_dict\dict.txt.big ...
2023-04-20 16:12:47,702 : DEBUG : Building prefix dict from d:\VSCodeProject\NCKU-AICUP2023-baseline\data\jieba_dict\dict.txt.big ...
Dumping model to file cache C:\Users\GABERI~1\AppData\Local\Temp\jieba.u155ec321d2357b8e7de4a678a68eea60.cache
2023-04-20 16:12:48,634 : DEBUG : Dumping model to file cache C:\Users\GABERI~1\AppData\Local\Temp\jieba.u155ec321d2357b8e7de4a678a68eea60.cache
Loading model cost 1.002 seconds.
2023-04-20 16:12:48,705 : DEBUG : Loading model cost 1.002 seconds.
Prefix dict has been built successfully.
2023-04-20 16:12:48,706 : DEBUG : Prefix dict has been built successfully.
2023-04-20 16:14:38,197 : INFO : 已完成前 10000 行的斷詞
2023-04-20 16:16:02,797 : INFO : 已完成前 20000 行的斷詞
2023-04-20 16:17:12,808 : INFO : 已完成前 30000 行的斷詞
2023-04-20 16:18:26,374 : INFO : 已完成前 40000 行的斷詞
2023-04-20 16:19:30,858 : INFO : 已完成前 50000 行的斷詞
2023-04-20 16:20:34,025 : INFO : 已完成前 60000 行的斷

訓練詞向量 (要微調)

In [10]:
from gensim.models import word2vec

def w2v():
    """Train a Word2Vec model on the segmented wiki text and save it.

    Sentences are streamed from ``data/wiki-text/wiki_seg.txt`` with
    ``word2vec.LineSentence``; the model is trained with ``vector_size=250``
    and saved to ``models/w2v.zh.250/word2vec.model``.

    Returns:
        The trained ``Word2Vec`` model, so the caller can use it without
        reloading it from disk.
    """
    w2v_model_path = "models/w2v.zh.250/word2vec.model"
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # Create the target directory before training so that a path problem
    # surfaces immediately rather than after the whole training run.
    Path(w2v_model_path).parent.mkdir(parents=True, exist_ok=True)

    sentences = word2vec.LineSentence("data/wiki-text/wiki_seg.txt")
    model = word2vec.Word2Vec(sentences, vector_size=250)

    # Save the model for later use.
    model.save(w2v_model_path)
    return model

In [12]:
w2v_model_path = "models/w2v.zh.250/word2vec.model"
# Train only when no saved model is present. ``exists`` must be called: the
# bare attribute is a bound method and always truthy, so without the call the
# training branch could never run.
if not Path(w2v_model_path).exists():
    w2v()
# Load from disk in both cases so ``w2v_model`` is always bound afterwards.
w2v_model = word2vec.Word2Vec.load(w2v_model_path)

2023-04-21 13:08:24,187 : INFO : loading Word2Vec object from models/w2v.zh.250/word2vec.model
2023-04-21 13:08:24,594 : INFO : loading wv recursively from models/w2v.zh.250/word2vec.model.wv.* with mmap=None
2023-04-21 13:08:24,595 : INFO : loading vectors from models/w2v.zh.250/word2vec.model.wv.vectors.npy with mmap=None
2023-04-21 13:08:25,026 : INFO : loading syn1neg from models/w2v.zh.250/word2vec.model.syn1neg.npy with mmap=None
2023-04-21 13:08:25,509 : INFO : setting ignored attribute cum_table to None
2023-04-21 13:08:32,348 : INFO : Word2Vec lifecycle event {'fname': 'models/w2v.zh.250/word2vec.model', 'datetime': '2023-04-21T13:08:32.348099', 'gensim': '4.3.1', 'python': '3.10.9 | packaged by Anaconda, Inc. | (main, Mar  8 2023, 10:42:25) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'loaded'}


In [24]:
# w2v_model.wv.most_similar(['英國'])         # Most Similar
# w2v_model.wv.similarity('英國', '法國')   # Similarity

[('蘇格蘭', 0.7412448525428772),
 ('愛爾蘭', 0.729009747505188),
 ('英格蘭', 0.6906564831733704),
 ('美國', 0.6804454922676086),
 ('法國', 0.6544962525367737),
 ('澳大利亞', 0.6538345813751221),
 ('倫敦', 0.6519836783409119),
 ('英國政府', 0.6437056660652161),
 ('北愛爾蘭', 0.6304891109466553),
 ('紐西蘭', 0.6292421221733093)]