# Package

In [6]:
# Data handling
import numpy as np                     # arrays
import pandas as pd                    # data frames
from math import nan

# Statistics
import random                          # Python's built-in RNG
random.seed(123)                       # seeds the `random` module only; NumPy's RNG is not seeded here

# Time
import datetime

# Miscellaneous
from tqdm import tqdm, trange          # progress bars for loops
import os                              # working-directory and file handling
import warnings                        # warning control
warnings.filterwarnings("ignore")      # silences every warning for the whole session

# Text processing
import monpa
from monpa import utils
import re

# Parallelism
from multiprocessing import Pool
from multiprocessing.dummy import  Pool as ThreadPool

# Paths
# NOTE(review): absolute, machine-specific directories; the cells below os.chdir() into them,
# so they must exist on the machine running the notebook.
rawdata_path = '/Users/alexlo/Desktop/Project/Project_MLEM/rawdata'
workdata_path = '/Users/alexlo/Desktop/Project/Project_MLEM/workdata'

# Download Stock

In [None]:
# Project-local helper; imported here rather than in the top import cell.
from etl_func.stock_data import get_StockDF

# get_StockDF is called with the working directory set to rawdata_path.
# NOTE(review): presumably it reads or caches files relative to the cwd — confirm in etl_func.
os.chdir(rawdata_path)
stock_df = get_StockDF('2303')

# Persist the frame in workdata_path; a later cell reads '聯電.parquet' back from there.
os.chdir(workdata_path)
stock_df.to_parquet('聯電.parquet')

# Download Article 

## Main Function

In [23]:
def remove_nonChinese(sentence):
    """Return ``sentence`` with every character outside U+4E00..U+9FA5 removed.

    Letters, digits, punctuation and whitespace are all dropped; only the
    characters inside that CJK range survive, in their original order.
    """
    non_chinese_run = re.compile(r'[^\u4e00-\u9fa5]+')
    return non_chinese_run.sub('', sentence)

def shorten_word(word_list: list) -> list:
    """Drop numeral-led words and break over-long words into short pieces.

    Rules applied to every word:
      * a word (or leftover piece) starting with one of the Chinese numerals
        一..十 is discarded;
      * a word of five or more characters is split into its first two
        characters and the remainder, and the remainder is processed again
        with the same rules, so every returned piece is shorter than five
        characters.

    The previous implementation removed from and appended to ``word_list``
    while iterating over it, which made the iterator skip the element that
    followed every removal (so e.g. the second of two consecutive numeral
    words was kept). The result is now built in a separate list, and the
    pieces of a split word stay at the position of the original word instead
    of being moved to the end.

    Args:
        word_list: list of word strings. It is updated in place, as before.

    Returns:
        The same list object, now holding the filtered/split words.
    """
    numeral_prefixes = ('一', '二', '三', '四', '五', '六', '七', '八', '九', '十')
    kept = []
    for word in word_list:
        rest = word
        # Leaving this loop via the condition means `rest` starts with a numeral: drop it.
        while not rest.startswith(numeral_prefixes):
            if len(rest) < 5:
                kept.append(rest)
                break
            kept.append(rest[:2])
            rest = rest[2:]
    # Keep the historical in-place contract for callers that hold a reference to the list.
    word_list[:] = kept
    return word_list
    
def combine_np_files(stock_code):
    '''
    Merge every NumPy file in workdata_path whose name starts with `stock_code`.

    The merged 1-D array is saved as `<stock_code>_words.npy` and the partial
    files are deleted afterwards. A combined file left over from an earlier run
    is loaded first, so its content is kept at the front of the new array.

    Changes compared with the previous version:
      * partial files are merged in natural (numeric-aware) name order instead
        of the arbitrary order returned by os.listdir();
      * the combined file is written once, after all parts are loaded, rather
        than once per part;
      * partial files are removed only after the combined file has been saved,
        so a failure while loading or saving no longer destroys the inputs.

    Side effect: changes the process working directory to workdata_path.

    Returns:
        The merged array; an empty array (and nothing is written) when no
        matching file exists.
    '''
    os.chdir(workdata_path)
    output_name = stock_code + '_words.npy'

    def _natural_key(name):
        # re.split with a capture group alternates text / digit-run tokens, so
        # odd positions are always digit runs and compare as integers.
        tokens = re.split(r'(\d+)', name)
        return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]

    part_files = sorted(
        (name for name in os.listdir()
         if name.startswith(stock_code) and name != output_name),
        key=_natural_key,
    )
    sources = ([output_name] if os.path.exists(output_name) else []) + part_files
    if not sources:
        return np.array([])

    # np.ravel mirrors the flattening that np.append performed on every part.
    words_array = np.concatenate([np.ravel(np.load(name)) for name in sources])
    np.save(output_name, words_array)

    for name in part_files:
        os.remove(name)
    return words_array

def drop_bda2022(stock_code):
    '''
    Testing helper: delete every entry in workdata_path whose name starts with
    `stock_code`.

    Side effect: changes the process working directory to workdata_path.
    '''
    os.chdir(workdata_path)
    doomed = [name for name in os.listdir() if name.startswith(stock_code)]
    for name in doomed:
        os.remove(name)

class Words_Dataset:
    """Filter one article CSV by keyword and tokenize the matches in chunks.

    Typical use: construct, call ``select_article`` to keep keyword-matching
    rows, ``get_indexlist_for_multi`` to get chunk boundaries, then
    ``get_words_matrix_multi`` to tokenize all chunks with a thread pool. Each
    chunk is written to its own ``.npy`` file in ``workdata_path``.

    ``start_date`` and ``end_date`` are stored on the instance but are not used
    by any method of this class.
    """

    # Number of articles per chunk, i.e. per worker task and per saved file.
    CHUNK_SIZE = 50
    # Number of worker threads used by get_words_matrix_multi.
    N_THREADS = 8

    def __init__(self, data_source, stock_code, start_date, end_date):
        """Read ``data_source`` (a CSV file name inside ``rawdata_path``).

        Side effect: changes the process working directory to ``rawdata_path``.
        """
        os.chdir(rawdata_path)
        self.data_source = data_source
        self.stock_code = stock_code
        self.article_df = pd.read_csv(data_source)
        self.start_date = start_date
        self.end_date = end_date

    def select_article(self, keywords, title_times, content_times):
        """Keep the rows whose title or content mentions the keywords often enough.

        Args:
            keywords: list of strings joined with ``|`` and handed to
                ``Series.str.count``, so they are interpreted as a regular
                expression alternation.
            title_times: minimum number of matches in ``title``.
            content_times: minimum number of matches in ``content``.

        A row is kept when either threshold is met. ``post_time`` is converted
        to ``datetime.date``. ``self.article_df`` is replaced by the filtered
        frame (index reset), which is also returned.
        """
        pattern = '|'.join(keywords)
        self.article_df['post_time'] = pd.to_datetime(self.article_df['post_time']).dt.date
        title_hit = self.article_df['title'].str.count(pattern) >= title_times
        content_hit = self.article_df['content'].str.count(pattern) >= content_times
        self.article_df = self.article_df[title_hit | content_hit].reset_index(drop=True)
        return self.article_df

    def get_indexlist_for_multi(self):
        """Return ``[start, end]`` row ranges covering ``article_df`` in chunks.

        Every range holds at most ``CHUNK_SIZE`` rows and ``end`` is exclusive.
        Empty ranges are never produced: the previous version returned a
        trailing ``[n, n]`` when the row count was a multiple of the chunk size
        and ``[[0, 0]]`` for an empty frame.
        """
        num_rows = self.article_df.shape[0]
        step = self.CHUNK_SIZE
        return [[start, min(start + step, num_rows)] for start in range(0, num_rows, step)]

    def get_words_matrix(self, index):
        """Tokenize the articles in rows ``index[0]:index[1]`` and save them.

        Each article becomes one string of space-separated words. An article
        whose processing raises is stored as ``''`` and counted as an error.
        The chunk is saved in ``workdata_path`` as
        ``<stock_code>_<source tag>_words_<index[0]>.npy``, where the source tag
        is built from the 3rd and 4th ``_``-separated parts of ``data_source``.

        Side effect: changes the process working directory to ``workdata_path``.

        Returns:
            Number of articles in the chunk that failed.
        """
        print('Running:', index)
        chunk = self.article_df.iloc[index[0]:index[1]]
        article_into_words_list = []
        error_times = 0
        for article in chunk['content']:  # one pass per article
            try:
                words = []
                # Split the article into short sentences, then each sentence into words.
                for sentence in utils.short_sentence(article):
                    sentence = remove_nonChinese(sentence)
                    # Original author's note: "[0] because of batch".
                    # NOTE(review): presumably cut_batch returns one result per input — confirm.
                    word_list = monpa.cut_batch(sentence)[0]
                    if word_list is not None:
                        words.extend(shorten_word(word_list))
                # Joining once puts a space between sentences as well; the old
                # per-sentence `+=` glued the last word of one sentence onto the
                # first word of the next.
                article_into_words_list.append(' '.join(words))
            except Exception:
                # Best effort: keep one (empty) entry per article so positions stay aligned.
                # `Exception` instead of a bare except lets KeyboardInterrupt through.
                error_times += 1
                article_into_words_list.append('')
        if error_times > 0:
            print(f'Error Times in {index}:', error_times)
        # Save
        os.chdir(workdata_path)
        source_parts = self.data_source.split('_')
        source_tag = (source_parts[2] + source_parts[3])[:-4]  # [:-4] strips the 4-char '.csv' suffix
        # Use the instance's stock code; the old code read a module-level `stock_code`.
        save_name = self.stock_code + '_' + source_tag + '_words_' + str(index[0])
        np.save(save_name, np.array(article_into_words_list))
        return error_times

    def get_words_matrix_multi(self, index_list):
        """Run ``get_words_matrix`` over every range in ``index_list`` on a thread pool.

        Returns:
            List with the error count of each range, in ``index_list`` order
            (the previous version computed it but returned nothing).
        """
        # The context manager releases the pool even when a worker raises.
        with ThreadPool(self.N_THREADS) as pool:
            error_times = pool.map(self.get_words_matrix, index_list)
        return error_times


## Parameters

In [12]:
# Exploratory cell: build the date window, pack it into a tuple and unpack it again.
# NOTE(review): start_date / end_date are assigned again, with the same values, in the next cell.
start_date = datetime.date(2019,1,1) 
end_date = datetime.date(2021,1,1) 
data_time = (start_date, end_date)
a, b = data_time
# Bare expression: the cell displays the unpacked end date.
b

datetime.date(2021, 1, 1)

In [3]:
# article source
# CSV file names; Words_Dataset reads them after changing directory to rawdata_path.
bbs = 'bda2022_mid_bbs_2019-2021.csv'
forum2019 = 'bda2022_mid_forum_2019.csv' 
forum2020 = 'bda2022_mid_forum_2020.csv'
forum2021 = 'bda2022_mid_forum_2021.csv'
news2019 = 'bda2022_mid_news_2019.csv'
news2020 = 'bda2022_mid_news_2020.csv'
news2021 = 'bda2022_mid_news_2021.csv'

# parameters
start_date = datetime.date(2019,1,1) 
end_date = datetime.date(2021,1,1) 
stock_code = '2303'
keywords = ['聯電'] # company name (every likely variant) and industry names
title_times = 1     # minimum keyword matches in the title
content_times = 3   # minimum keyword matches in the content

In [26]:
# main
# Tokenize every article source in turn; each chunk is written to its own .npy file.
for data_source in [forum2019, forum2020, forum2021, news2019, news2020, news2021, bbs]:
    print(f"Data Source: {data_source}")
    words_dataset = Words_Dataset(data_source, stock_code, start_date, end_date)
    article_df = words_dataset.select_article(keywords, title_times, content_times)
    print(f"There are {article_df.shape[0]: ,} artilces after selecting.")
    index_list = words_dataset.get_indexlist_for_multi()
    words_dataset.get_words_matrix_multi(index_list)
    print(f"Finished: {data_source}")

# Merge the per-chunk files. Uses the configured stock_code instead of a hard-coded
# '2303', so changing the parameter cell is enough to process another stock.
words_array = combine_np_files(stock_code)

Data Source: bda2022_mid_forum_2019.csv
There are  165 artilces after selecting.
Running: [0, 50]
Running: [50, 100]
Running: [100, 150]
Running: [150, 165]
Error Times in [50, 100]: 1
Error Times in [100, 150]: 2
Error Times in [0, 50]: 3
Finished: bda2022_mid_forum_2019.csv
Data Source: bda2022_mid_forum_2020.csv
There are  361 artilces after selecting.
Running: [0, 50]
Running: [50, 100]
Running: [100, 150]
Running: [150, 200]
Running: [200, 250]
Running: [250, 300]
Running: [300, 350]
Running: [350, 361]
Error Times in [300, 350]: 1
Error Times in [0, 50]: 3
Error Times in [200, 250]: 2
Finished: bda2022_mid_forum_2020.csv
Data Source: bda2022_mid_forum_2021.csv
There are  600 artilces after selecting.
Running: [0, 50]
Running: [50, 100]
Running: [100, 150]
Running: [150, 200]
Running: [200, 250]
Running: [250, 300]
Running: [300, 350]
Running: [350, 400]
Running: [400, 450]
Running: [450, 500]
Error Times in [400, 450]: 1
Running: [500, 550]
Error Times in [0, 50]: 1
Running: [550

## Read Stock & Article

In [34]:
### Parameters (edit the stock code / stock file here if needed)
code = stock_code
keywords = ['聯電'] #the keyword to filter the articles
keywords_times_titles = 1
keywords_times_content = 2


### Load the article data
# NOTE(review): this reads only the bbs source and uses a content threshold of 2, while the
# tokenizing cell above processes every source with content_times = 3. The rows of article_df
# therefore may not line up one-to-one with all_words — confirm which filter is intended.
os.chdir(rawdata_path)
article_df = pd.read_csv('bda2022_mid_bbs_2019-2021.csv')
article_df = article_df.rename(columns={'post_time':'Post_Time', 'title':'Title', 'content':'Content'})
article_df['Post_Time'] = pd.to_datetime(article_df['Post_Time']).dt.date
criteria1 = (article_df['Title'].str.count('|'.join(keywords)) >= keywords_times_titles)
criteria2 = (article_df['Content'].str.count('|'.join(keywords)) >= keywords_times_content)
article_df = article_df[(criteria1) | (criteria2)].reset_index(drop = True)

### Load the tokenized articles
os.chdir(workdata_path)
all_words = np.load(code + '_words.npy').tolist()
print("There are", len(all_words), "articles in total.")

### Load the stock prices
# An explicit unit is required: pandas 2.x rejects the unit-less 'datetime64' dtype in astype.
stock_df = pd.read_parquet('聯電.parquet').astype({'Date':'datetime64[ns]'})

There are 5230 articles in total.


In [39]:
# Peek at a few tokenized articles (each entry is one string of space-separated words).
all_words[1:5]

['很多 公司 都 是 這樣 搞 的 啊 美其名 激勵 員工 但 其實 分配 到 底層 員工 能 認 個 張 就算 多 的 了 絕 大多數 都 是 高層 分配 走 了',
 '這 很 正常 啊 他 是 分批 買進 最 低 有 多元 買 所以 平均 大約 元 左右 這 個 時候 開放 員工 認購 正好 可以 激勵 員工 員工 也 不 是 無償 取得 之後 依 員工 職級 與 年資 來 計算 可 認購 張數 很多 公司 都 是 這樣 做',
 '股東 不能 認 股 真的 跟 大同 沒 兩樣 股東 喝 西北風 你們 這些 出錢 的 最 小 拿到 經營權 再 跟 公司 說',
 '站 在 非 股東 立場 來 看 為何 不 是 全體 含 基層 員工 認購 呢 只 有 主管 階層 能 套利 果然 基層 該 死 吃相 更 難 看']