# 詞典設定

In [1]:
import jieba
import jieba.posseg as pseg
from collections import Counter
import pandas as pd
import numpy as np

In [2]:
#繁體中文專用字典
jieba.set_dictionary('dict_txt/dict.txt.big.txt')

In [3]:
#自定義字典對應我們的文本
jieba.load_userdict('dict_txt/my_dict.txt') # file_name 为文件类对象或自定义词典的路径
with open('dict_txt/my_dict.txt', 'r', encoding='utf8') as my:
    my_value = my.read().split('\n')

Building prefix dict from /Users/Kang/Documents/碩一下學期/石百達 金融科技-文字探勘與機器學習 /Week 5,6 HW2/read_pdf_and_jieba/dict_txt/dict.txt.big.txt ...
Loading model from cache /var/folders/mk/shxltk6j1kj5l0br45b5kw2c0000gn/T/jieba.u4a8f65a48ce67e60158517fa9f401191.cache
Loading model cost 1.280 seconds.
Prefix dict has been built succesfully.


In [4]:
#自定義庭用字用以去除與文本分析不相干之關鍵字
with open('dict_txt/stop_words.txt', 'r', encoding='utf8') as w:
    stops = w.read().split('\n') 
    stops.append('\n')
    stops.append('\n\n')
    stops.append('\x0c')
    stops.append('（')
    stops.append('）')

In [5]:
#範例
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))  # 全模式
 
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # 精确模式
 
seg_list = jieba.cut("他来到了网易杭研大厦")  # 默认是精确模式
print(", ".join(seg_list))
 
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造")  # 搜索引擎模式
print(", ".join(seg_list))

Full Mode: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
Default Mode: 我/ 来到/ 北京/ 清华大学
他, 来到, 了, 网易, 杭研, 大厦
小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, ，, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造


# 主要運作函式

In [6]:
#以jieba詞性分析找出人名、地名、組織名、專有名
def get_noun(t):
    noun_list = []
    word = pseg.cut(t)
    for w in word:
        if w.flag in ["nr", "ns", "nt", "nz"]:
            if w.word not in noun_list:
                noun_list.append(w.word)
    return noun_list

In [7]:
#藉由比對文本中出現最多次的RR，判斷本基金風險測度
def find_risk(sort):
    risk_dict = {'RR1' : 0, 'RR2' : 0, 'RR3' : 0, 'RR4' : 0, 'RR5' : 0}
    for i in risk_dict.keys():
        for j in sort:
            if i == j[0]:
                risk_dict[i] = j[1]         
    return list(risk_dict.keys())[list(risk_dict.values()).index(max(risk_dict.values()))]

In [8]:
#一、將出現次數2以上且包含兩個以上字元的關鍵字留下
#二、對於出現1次者，若出現在自定義辭典也留下
#三、出現１次者，若屬人名、地名、專有名、組織名，也留下
def mining(sort, t):
    tem_list = []
    noun_list = get_noun(t)
    for i in sort:
        if (i[1] >= 2) & (len(i[0]) > 1):
            tem_list.append(i)
        elif i[0] in my_value:
            tem_list.append(i)
        elif (i[0] in noun_list) & (len(i[0]) > 1):
            tem_list.append(i)
        else:
            pass
    return tem_list

In [9]:
#處理產生出來的關鍵字，將他們丟到字典裡，之後用以生成矩陣
def extend_dict_for_df(etf_counter, important, dict_for_df):
    for i in range(len(important)):
        if important[i][0] in exist_key:
            dict_for_df[important[i][0]].append(important[i][1])
        else:
            dict_for_df[important[i][0]] = []
            for j in range(etf_counter - 1):
                dict_for_df[important[i][0]].append(0)
            dict_for_df[important[i][0]].append(important[i][1])
    if etf_counter > 1:
        for k in exist_key:
            if len(dict_for_df[k]) < etf_counter:
                dict_for_df[k].append(0)

# 迴圈運行區

In [10]:
#要讀取的文本位置，以及所有ETF名稱list
my_text_path = pd.read_csv('txt_path.csv', encoding = 'big5hkscs')
ETF_name = []
for i in range(len(my_text_path)):
    ETF_name.append(my_text_path['0'][i][91:-4])

In [11]:
risk_dict = {}
dict_for_df = {}
for etf_counter in range(1, len(my_text_path) + 1):
    path = my_text_path['0'][etf_counter - 1]
    
    #打開文本，讀取
    with open(path, 'r') as f:
        text = f.read()
        text = text.replace('\n', '')
        text = text.replace('\x0c', '')
        
        #第一步：先切詞，如不在停用字典中就列出，並將之降冪排序
        terms = (i for i in jieba.cut(text, cut_all=False) if i not in stops)
        
        ## 這個寫法很常出現在Ｃounter中，他可以排序，list每個item出現的次數。
        my_sort = sorted(Counter(terms).items(), key=lambda x:x[1], reverse=True)
        
        #紀錄本基金風險測度
        risk_dict[ETF_name[etf_counter - 1]] = find_risk(my_sort)
        
        #第二步：帶入先前函式，列出真正有用的關鍵字
        important = mining(my_sort, text)
        
        #第三步：處理產生出來的關鍵字，將他們丟到字典裡，之後用以生成矩陣
        exist_key = dict_for_df.keys()
        extend_dict_for_df(etf_counter, important, dict_for_df)

# 產生結果矩陣

In [12]:
#生成稀疏矩陣
sparse_m = pd.DataFrame(dict_for_df)
np.savetxt('csv/sparse_m.csv', sparse_m, fmt = '%d', delimiter = ',', header = ','.join(exist_key), encoding = 'big5hkscs')

In [13]:
#生成共現矩陣
cooccurance_m = np.dot(sparse_m.T, sparse_m)
np.savetxt('csv/cooccurance_m.csv', cooccurance_m, fmt = '%d', delimiter = ',', header = ','.join(exist_key), encoding = 'big5hkscs')

In [14]:
#輸出ETF風險測度
risk_output = pd.DataFrame.from_dict(risk_dict, orient='index')
risk_output.to_csv('csv/yuanta_ETF_risk.csv', encoding = 'big5hkscs')

In [15]:
#為稀疏矩陣加上row name
dg = pd.read_csv('csv/sparse_m.csv', encoding = 'big5hkscs')
dg['ETF_name'] = ETF_name
dg.set_index('ETF_name', inplace = True)
dg.to_csv('csv/sparse_m_has_row.csv', encoding = 'big5hkscs')

In [16]:
#為共現矩陣加上row name
dh = pd.read_csv('csv/cooccurance_m.csv', encoding = 'big5hkscs')
dh['keyword'] = dh.columns.tolist()
dh.set_index('keyword', inplace = True)
dh.to_csv('csv/cooccurance_m_has_row.csv', encoding = 'big5hkscs')

# 矩陣示意

In [17]:
sparse_m.head()

Unnamed: 0,投資,風險,本基金,基金,股票,國內,公司股票,上市,證券,產業,...,成立,經濟建設,外銷,營收,金鑽獎,market,中型,BBB,位於,介於
0,20,11,10,8,7,6,5,4,4,4,...,0,0,0,0,0,0,0,0,0,0
1,22,16,12,7,6,0,0,1,2,2,...,0,0,0,0,0,0,0,0,0,0
2,22,15,12,7,2,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,18,14,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
4,17,13,9,6,0,3,0,2,4,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
risk_output.head()

Unnamed: 0,0
元大多多基金,RR4
元大印度指數基金,RR5
元大全球公用能源效率基金-不配息,RR4
元大10年期以上美元投資級銀行債券ETF基金,RR2
元大得利貨幣市場基金,RR1


In [19]:
dg.head()

Unnamed: 0_level_0,# 投資,風險,本基金,基金,股票,國內,公司股票,上市,證券,產業,...,成立,經濟建設,外銷,營收,金鑽獎,market,中型,BBB,位於,介於
ETF_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
元大多多基金,20,11,10,8,7,6,5,4,4,4,...,0,0,0,0,0,0,0,0,0,0
元大印度指數基金,22,16,12,7,6,0,0,1,2,2,...,0,0,0,0,0,0,0,0,0,0
元大全球公用能源效率基金-不配息,22,15,12,7,2,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
元大10年期以上美元投資級銀行債券ETF基金,18,14,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
元大得利貨幣市場基金,17,13,9,6,0,3,0,2,4,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
dh.head()

Unnamed: 0_level_0,# 投資,風險,本基金,基金,股票,國內,公司股票,上市,證券,產業,...,成立,經濟建設,外銷,營收,金鑽獎,market,中型,BBB,位於,介於
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
# 投資,66909,42827,32902,20655,7715,1135,394,1343,4113,4589,...,175,105,105,105,35,30,72,135,30,30
風險,42827,29557,20230,12774,5129,626,201,872,3223,2952,...,65,39,39,39,13,28,68,126,28,28
本基金,32902,20230,18019,11131,3771,590,212,682,1446,2356,...,90,54,54,54,18,0,36,0,0,0
基金,20655,12774,11131,8401,2485,554,158,460,1173,1591,...,90,54,54,54,18,0,28,0,0,0
股票,7715,5129,3771,2485,2167,234,119,383,584,923,...,35,21,21,21,7,0,12,0,0,0
