In [1]:
import jieba
import jieba.posseg as pseg
from collections import Counter
import pandas as pd
import numpy as np

In [2]:
# Use the dictionary file dedicated to Traditional Chinese text.
big_dict_path = '/Users/Kang/Downloads/dict.txt.big.txt'
jieba.set_dictionary(big_dict_path)

In [3]:
# Register the custom dictionary written for our texts, and keep its entries
# in `my_value` so they can be matched against segmented tokens later on.
user_dict_path = '/Users/Kang/Desktop/my_dict.txt'
jieba.load_userdict(user_dict_path)  # accepts a file-like object or a path
with open(user_dict_path, 'r', encoding='utf8') as my:
    # Iterating the file and stripping each line copes with a trailing newline
    # and with CRLF line endings; read().split('\n') left '' and 'word\r'
    # entries behind, which can never equal a token.
    # NOTE(review): every line is kept whole. If the file uses jieba's optional
    # "word freq tag" columns, these entries will not equal bare tokens --
    # confirm the file holds one word per line.
    stripped_lines = (line.strip() for line in my)
    my_value = [entry for entry in stripped_lines if entry]

Building prefix dict from /Users/Kang/Downloads/dict.txt.big.txt ...
Loading model from cache /var/folders/mk/shxltk6j1kj5l0br45b5kw2c0000gn/T/jieba.u660098a6e58520093b1c91286d454106.cache
Loading model cost 1.050 seconds.
Prefix dict has been built succesfully.


In [4]:
# Custom stop words, used to drop keywords that are irrelevant to the text analysis.
with open('stop_words.txt', 'r', encoding='utf8') as stop_file:
    stops = stop_file.read().split('\n')

# Layout characters and full-width parentheses are treated as stop words too.
stops.extend(['\n', '\n\n', '\x0c', '（', '）'])

In [5]:
# Example: jieba's segmentation modes on a few sample sentences.
demo_sentence = "我来到北京清华大学"

# cut_all=True is full mode, cut_all=False is accurate mode.
for mode_label, use_full_mode in (("Full Mode: ", True), ("Default Mode: ", False)):
    seg_list = jieba.cut(demo_sentence, cut_all=use_full_mode)
    print(mode_label + "/ ".join(seg_list))

# Accurate mode is the default when cut_all is omitted.
seg_list = jieba.cut("他来到了网易杭研大厦")
print(", ".join(seg_list))

# Search-engine mode.
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造")
print(", ".join(seg_list))

Full Mode: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
Default Mode: 我/ 来到/ 北京/ 清华大学
他, 来到, 了, 网易, 杭研, 大厦
小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, ，, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造


In [6]:
def get_noun(t):
    """Return the distinct proper nouns found in ``t``, in first-seen order.

    The text is tagged with ``jieba.posseg`` and a word is kept when its flag
    is one of ``nr``, ``ns``, ``nt`` or ``nz`` (person, place, organisation
    and other proper names, per the original author's note).

    Args:
        t: Text to tag.

    Returns:
        list: Each matching word exactly once, ordered by first occurrence.
    """
    wanted_flags = ("nr", "ns", "nt", "nz")
    # The set gives constant-time "already collected?" checks; the list keeps
    # the first-seen order that callers receive.
    seen = set()
    noun_list = []
    for pair in pseg.cut(t):
        if pair.flag in wanted_flags and pair.word not in seen:
            seen.add(pair.word)
            noun_list.append(pair.word)
    return noun_list

In [7]:
def find_risk(sort):
    """Pick the fund's risk rating: the RR label that occurs most often.

    Args:
        sort: Sequence of entries whose item 0 is a term and item 1 its count.

    Returns:
        str: One of ``'RR1'`` .. ``'RR5'``. Ties go to the lowest-numbered
        label, so ``'RR1'`` is also what comes back when no label occurs.
    """
    labels = ('RR1', 'RR2', 'RR3', 'RR4', 'RR5')
    counts = dict.fromkeys(labels, 0)
    for label in labels:
        for entry in sort:
            if label == entry[0]:
                counts[label] = entry[1]
    # max() returns the first maximal item, which keeps the tie-break above.
    return max(labels, key=counts.get)

In [8]:
def mining(sort, t):
    """Select the keywords worth keeping from a term-frequency list.

    An entry ``(term, count)`` is kept when any of these holds:

    1. the count is at least 2 and the term is longer than one character;
    2. the term is listed in the custom dictionary (module-level ``my_value``);
    3. the term is among ``get_noun(t)`` (person / place / organisation /
       proper names) and is longer than one character.

    Args:
        sort: Iterable of entries whose item 0 is a term and item 1 its count.
        t: The full text, passed to ``get_noun``.

    Returns:
        list: The kept entries, in the same order as in ``sort``.
    """
    # Sets make the two membership tests constant-time per entry.
    custom_terms = set(my_value)
    proper_nouns = set(get_noun(t))

    kept = []
    for entry in sort:
        term, count = entry[0], entry[1]
        multi_char = len(term) > 1
        if (
            (count >= 2 and multi_char)
            or term in custom_terms
            or (term in proper_nouns and multi_char)
        ):
            kept.append(entry)
    return kept

In [9]:
def extend_dict_for_df(etf_counter, important, dict_for_df):
    """Append one document's keyword counts to the column dictionary.

    ``dict_for_df`` maps each keyword to a list of counts, one per document
    processed so far; it is later turned into a matrix. After this call every
    list has at least ``etf_counter`` entries.

    Membership is tested against ``dict_for_df`` itself rather than a
    module-level key view, so the function works wherever it is called.

    Args:
        etf_counter: 1-based position of the current document.
        important: Entries whose item 0 is a keyword and item 1 its count.
        dict_for_df: Column dictionary, modified in place.

    Returns:
        None.
    """
    for entry in important:
        keyword, count = entry[0], entry[1]
        if keyword not in dict_for_df:
            # New keyword: it did not occur in any earlier document.
            dict_for_df[keyword] = [0] * (etf_counter - 1)
        dict_for_df[keyword].append(count)

    # Keywords absent from this document get zeros, filling the column up to
    # the current document even if it was short by more than one entry.
    for column in dict_for_df.values():
        missing = etf_counter - len(column)
        if missing > 0:
            column.extend([0] * missing)

In [10]:
# Locations of the texts to read, and the list of all ETF names.
my_text_path = pd.read_csv('/Users/Kang/Desktop/output_path.csv', encoding='big5hkscs')

# Characters [34:-4] of each path in column '0' are used as the ETF name.
# NOTE(review): presumably 34 is the length of the directory prefix and -4
# drops a four-character extension -- confirm against the CSV contents.
ETF_name = [text_path[34:-4] for text_path in my_text_path['0']]

In [11]:
risk_dict = {}
dict_for_df = {}

# Live view of the keywords collected so far. It is bound once, before the
# loop, so the name exists even when there are no texts to process; other
# code in this notebook may read it.
exist_key = dict_for_df.keys()

# Set lookup keeps stop-word filtering constant-time per token.
stop_set = set(stops)

for etf_counter in range(1, len(my_text_path) + 1):
    path = my_text_path['0'][etf_counter - 1]

    # Read the text and close the file before doing any processing.
    # NOTE(review): no encoding is given, so the platform default applies --
    # confirm it matches how the text files were written.
    with open(path, 'r') as f:
        text = f.read()
    text = text.replace('\n', '').replace('\x0c', '')

    # Step 1: segment, drop stop words, and order terms by descending count.
    terms = (i for i in jieba.cut(text, cut_all=False) if i not in stop_set)
    my_sort = Counter(terms).most_common()

    # Record this fund's risk rating.
    risk_dict[ETF_name[etf_counter - 1]] = find_risk(my_sort)

    # Step 2: keep only the keywords that are actually useful.
    important = mining(my_sort, text)

    # Step 3: add the keyword counts to the dictionary that later becomes
    # the matrix.
    extend_dict_for_df(etf_counter, important, dict_for_df)

In [12]:
# Build the sparse matrix: one column per keyword, one row per processed text.
sparse_m = pd.DataFrame(dict_for_df)

# comments='' stops np.savetxt from prefixing the header line with '# ',
# which would otherwise end up inside the first column name when the CSV is
# read back. The header is taken from the frame's own columns so that it
# always matches the order of the data written below it.
np.savetxt(
    '/Users/Kang/Desktop/sparse_m.csv',
    sparse_m,
    fmt='%d',
    delimiter=',',
    header=','.join(sparse_m.columns),
    comments='',
    encoding='big5hkscs',
)

In [13]:
# Build the co-occurrence matrix: entry (i, j) is the sum over all texts of
# count(keyword i) * count(keyword j).
cooccurance_m = np.dot(sparse_m.T, sparse_m)

# comments='' stops np.savetxt from prefixing the header line with '# ',
# which would otherwise end up inside the first column name when the CSV is
# read back. The header comes from sparse_m's columns, i.e. the same keyword
# order as the rows and columns of the product.
np.savetxt(
    '/Users/Kang/Desktop/cooccurance_m.csv',
    cooccurance_m,
    fmt='%d',
    delimiter=',',
    header=','.join(sparse_m.columns),
    comments='',
    encoding='big5hkscs',
)

In [14]:
# Export each ETF's risk rating (ETF names become the row index).
risk_csv_path = '/Users/Kang/Desktop/yuanta_ETF_risk.csv'
risk_output = pd.DataFrame.from_dict(data=risk_dict, orient='index')
risk_output.to_csv(risk_csv_path, encoding='big5hkscs')

In [15]:
# Add row names to the sparse matrix: reload the exported CSV and index its
# rows by ETF name.
sparse_csv = pd.read_csv('/Users/Kang/Desktop/sparse_m.csv', encoding='big5hkscs')
dg = sparse_csv.assign(ETF_name=ETF_name).set_index('ETF_name')
dg.to_csv('/Users/Kang/Desktop/sparse_m_1.csv', encoding='big5hkscs')

In [16]:
# Add row names to the co-occurrence matrix: reload the exported CSV and
# index its rows by the column labels (the keywords).
cooccurrence_csv = pd.read_csv('/Users/Kang/Desktop/cooccurance_m.csv', encoding='big5hkscs')
keyword_labels = cooccurrence_csv.columns.tolist()
dh = cooccurrence_csv.assign(keyword=keyword_labels).set_index('keyword')
dh.to_csv('/Users/Kang/Desktop/cooccurance_m_1.csv', encoding='big5hkscs')