In [1]:
import numpy as np
import pandas as pd
import datetime

In [12]:
# Load the market index quotes from the ^TWII CSV and replace the header row
# with the column labels used throughout the rest of this notebook.
stock_rate_table = pd.read_csv(
    'economic/stockrate/^TWII.csv',
    encoding="cp1252",
)
stock_rate_table.columns = [
    '日期',
    '開市大盤',
    '大盤最高',
    '大盤最低',
    '收市大盤',
    '收市大盤經調整',
    '成交量',
]

# Load the currency exchange-rate table and relabel its columns the same way.
exchange_table = pd.read_csv(
    'economic/exchange/200610_202210.csv',
    encoding="cp1252",
)
exchange_table.columns = [
    '日期',
    '美元／新台幣',
    '人民幣／新台幣',
    '歐元／美元',
    '美元／日幣',
    '英鎊／美元',
    '澳幣／美元',
    '美元／港幣',
    '美元／人民幣',
    '美元／南非幣',
    '紐幣／美元',
]

# Combine both tables into econimic_table: a left join on '日期' keeps every
# row of the index table and attaches the exchange rates where the date matches.
econimic_table = stock_rate_table.merge(exchange_table, how='left', on='日期')

# Load the CLIWC parse results and build a textual 'dd/mm/yyyy' date column
# ('日期') that later steps use as the merge/group key.
cliwc_parse_table = pd.read_csv('PttStock_CliwcParsed.csv')

# Day of month: cast to text, then zero-pad the single-digit values.
day_padding = {str(d): '0' + str(d) for d in range(1, 10)}
cliwc_parse_table['日'] = cliwc_parse_table['日'].astype(str).replace(day_padding)

# Month: map the English abbreviations onto two-digit month numbers.
month_numbers = {
    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
    'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
    'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
}
cliwc_parse_table['月'] = cliwc_parse_table['月'].replace(month_numbers)
cliwc_parse_table['年'] = cliwc_parse_table['年'].astype(int)

cliwc_parse_table['日期'] = (
    cliwc_parse_table['日']
    + "/"
    + cliwc_parse_table['月'].astype(str)
    + "/"
    + cliwc_parse_table['年'].astype(str)
)

# Derive the weekday from the date text (pandas dayofweek: Monday=0 ... Sunday=6).
# '星期' must stay the last column and '日期' the one before it, because the
# aggregation step selects columns by position.
cliwc_parse_table['星期'] = pd.to_datetime(
    cliwc_parse_table['日期'], format='%d/%m/%Y'
).dt.dayofweek

# Drop the columns that are not used downstream, then store month and day as
# integers for the later analysis.
cliwc_parse_table = cliwc_parse_table.drop(columns=['作者', '標題', '時'])
cliwc_parse_table = cliwc_parse_table.astype({'月': int, '日': int})


### Aggregate the CLIWC results per day (they are merged with econimic_table further below) ###

# Sum the scores of all posts/comments written on the same date.
# columns[3:-1] skips the first three columns and the trailing '星期' column;
# '日期' is inside the slice and becomes the group key.
# NOTE(review): the first three columns are presumably 年/月/日 -- confirm against the CSV layout.
content_sum = cliwc_parse_table[cliwc_parse_table.columns[3:-1]].groupby('日期').sum()

# Reduce the calendar columns to one row per date. They are selected BEFORE
# aggregating, so the mean is computed once over four columns instead of four
# times over the whole table, and unrelated non-numeric columns can no longer
# make mean() fail. The mean turns the values into floats, as before.
date_parts = cliwc_parse_table.groupby('日期')[['星期', '年', '月', '日']].mean()

# Kept under their original names in case other cells still refer to them.
content_week = date_parts[['星期']]
content_year = date_parts[['年']]
content_month = date_parts[['月']]
content_date = date_parts[['日']]

# Attach the calendar columns to the summed scores; both frames are indexed by
# '日期', so a plain (left) index join lines the rows up.
content = content_sum.join(date_parts)
# Take the columns needed from econimic_table and forward-fill missing values
# (.ffill() replaces the deprecated fillna(method='ffill') spelling).
e = econimic_table[['日期', '美元／新台幣', '收市大盤']].ffill()

# Next row's close placed on the current row; the last row has no successor and
# therefore repeats its own close.
e['隔日收市大盤'] = e['收市大盤'].shift(-1, fill_value=e['收市大盤'].iloc[-1])
# Previous row's close placed on the current row; the first row has no
# predecessor and therefore repeats its own close.
e['前日收市大盤'] = e['收市大盤'].shift(1, fill_value=e['收市大盤'].iloc[0])

# Left-merge the per-day CLIWC frame with the market columns in `e` on '日期'.
# Because this is a left merge, dates that exist only in `e` are not carried over.
content = pd.merge(content,e, how='left', on='日期')
# Use the date text as the index ...
content.index = content['日期']
# ... and drop the first column.
# NOTE(review): this assumes '日期' is the first column of the merge result -- confirm,
# otherwise a data column is dropped instead.
content = content.iloc[:,1:]
# Parse the 'dd/mm/yyyy' index text into timestamps and order the rows chronologically,
# which the forward-fills further below rely on.
content.index = pd.to_datetime(content.index, format='%d/%m/%Y')
content = content.sort_index()

# ### Prepare the data for analysis ###

# Rows whose date has no market data come out of the left merge as NaN. Fill
# the exchange rate and the three closing-price columns from the last earlier
# row that has a value (.ffill() replaces the deprecated fillna(method='ffill')).
for column in ('美元／新台幣', '收市大盤', '隔日收市大盤', '前日收市大盤'):
    content[column] = content[column].ffill()

# Change of the close versus the previous close (today's move) ...
content['今日收市大盤漲跌'] = content['收市大盤'] - content['前日收市大盤']
# ... and of the next close versus today's close, used later as the prediction label.
content['隔日收市大盤漲跌'] = content['隔日收市大盤'] - content['收市大盤']

# The helper columns are no longer needed once the differences are computed.
# NOTE(review): the original comment at this point said Friday and Saturday rows
# are removed because the market is closed the next day, but no rows are dropped
# here -- only these two columns. Confirm which behaviour is intended.
content = content.drop(['隔日收市大盤', '前日收市大盤'], axis=1)

# Re-index the rows by the integer number of days elapsed since 2007-01-01.
# The day count is read directly from the timedelta instead of being parsed
# back out of its 'N days' text representation.
start = pd.to_datetime('2007-01-01')
content.index = (content.index - start).days

# Replace columns 1..160 of every row with the sum of those columns over the
# trailing 7-day window (crit, day], where crit = day - 7. The sums are written
# into a copy so that every window is computed from the original values.
# NOTE(review): the positional slice 1:161 leaves column 0 un-summed and is
# presumably meant to end at the last CLIWC score column -- confirm both bounds.
content1 = content.copy()
idxlist = list(content.index)
window_source = content.iloc[:, 1:161]  # loop-invariant, so sliced once
for i, day in enumerate(idxlist):
    crit = day - 7
    in_window = (window_source.index > crit) & (window_source.index <= day)
    content1.iloc[i, 1:161] = window_source[in_window].sum()
content = content1

# Shuffle the rows, then use the first 1/10 for validation and the remaining
# 9/10 for training. A fixed seed makes the shuffle -- and therefore the three
# CSV files below -- identical on every run; change SPLIT_SEED for another split.
SPLIT_SEED = 42
content = content.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
n_valid = content.shape[0] // 10
content_valid = content.iloc[:n_valid, :]
content_train = content.iloc[n_valid:, :]

# Write the data sets out.
# NOTE(review): 'data1011test.csv' receives the complete shuffled frame, i.e. the
# same rows as the valid and train files combined -- confirm this is intended.
content.to_csv('data1011test.csv', encoding='UTF-8-Sig')
content_valid.to_csv('data1011valid.csv', encoding='UTF-8-Sig')
content_train.to_csv('data1011train.csv', encoding='UTF-8-Sig')

In [13]:
# Display the frame built by the previous cell (the notebook shows a truncated view).
content

Unnamed: 0_level_0,推數,噓數,→數,content_function,content_pronoun,content_ppron,content_i,content_we,content_you,content_shehe,...,message_nonflu,message_filler,星期,年,月,日,美元／新台幣,收市大盤,今日收市大盤漲跌,隔日收市大盤漲跌
日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
206,24,0,28,144,8,3,2,1,0,0,...,2,4,3.0,2007.0,7.0,26.0,32.8230,9566.419922,-173.709961,-404.139649
214,7,1,1,284,2,0,0,0,0,0,...,2,2,4.0,2007.0,8.0,3.0,32.8610,9057.820313,107.250000,-116.089844
220,26,1,47,413,2,0,0,0,0,0,...,8,9,3.0,2007.0,8.0,9.0,32.8932,9182.599609,83.139648,-251.290039
226,11,1,50,812,15,2,2,0,0,0,...,9,7,2.0,2007.0,8.0,15.0,32.9965,8593.040039,-317.950195,-391.669922
231,29,2,5,1074,19,3,2,0,0,1,...,3,2,0.0,2007.0,8.0,20.0,32.8792,8515.599609,425.309570,-36.519531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5746,2104,4822,26395,114196,4114,1951,791,136,512,403,...,9716,12189,6.0,2022.0,9.0,25.0,31.6650,14118.379880,-166.250000,-340.189450
5747,7051,5153,28054,122726,4380,2089,861,160,524,437,...,10237,12862,0.0,2022.0,9.0,26.0,31.8180,13778.190430,-340.189450,48.399410
5748,5674,5705,30051,128944,4656,2215,941,168,554,444,...,10905,13913,1.0,2022.0,9.0,27.0,31.7800,13826.589840,48.399410,-360.519530
5749,7445,6787,32681,136050,4778,2251,991,195,481,475,...,11903,15225,2.0,2022.0,9.0,28.0,31.8710,13466.070310,-360.519530,68.189460
