In [1]:
#!/usr/bin/python3
import os
import pandas as pd
from collections import OrderedDict
from sklearn.preprocessing import MinMaxScaler

In [2]:
%run common.ipynb

In [3]:
%time
pickle_df = pd.read_pickle('train.pkl')

print(pickle_df.shape, type(pickle_df))
print(pickle_df[15:20])

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.58 µs
(54947, 3) <class 'pandas.core.frame.DataFrame'>
                ari_time                                          ari_title  \
15  2006-10-30T22:48:25Z   Amerigroup Defrauded U.S., Must Pay $144 Million   
16  2006-11-01T13:45:11Z  Comstar to Spend $300 Million on Regional Purc...   
17  2006-11-01T16:28:21Z  Russian Stocks Gain the Most in Three Weeks; L...   
18  2006-11-01T21:19:28Z  ACE Must Lift Pay for Air Canada Pilots, Jazz ...   
19  2006-11-02T05:27:24Z  Coles Myer Raises Performance Targets for Bonuses   

    price  
15      1  
16      0  
17      0  
18      0  
19      0  


In [4]:
# 读取之前生成的news资料里的标题
# 并加上句号以及分割成一行一句格式
# 以便 reverb 工具来读取
file_path = "./reverb_pre.txt"
with open(file_path, "w+") as fo:
    for ari_title in pickle_df[:-1]["ari_title"]:
        fo.write(ari_title + ".\n")
print ("Saved:", fo.name)

Saved: ./reverb_pre.txt


In [5]:
# Call Reverb for OpenIE parsing
# Run command below before continuing on the next step:
#   java -Xmx512m -jar ./test-reverb/reverb.jar --quiet reverb_pre.txt > reverb_result.txt

In [6]:
%%time

# Create date-dataframe dict with all dates and empty title
date_df_dict = OrderedDict()
# [Checking] number of titles (processed by reverb) per day
date_titles_count = OrderedDict()
for ari_time, price in zip(pickle_df['ari_time'], pickle_df['price']):
    # Extract date & price
    date_t = ari_time[0:10]
    price_t = int(price)  # 1 or 0
    # Add dataframe with empty title
    date_df_dict[date_t] = [date_t, '', price_t]
    date_titles_count[date_t] = 0


# reverb tool 生成的结果文件,命令行生成的,带有额外杂讯
result_file_name = "reverb_result.txt"
file_content = ""
with open(result_file_name, 'r') as out:
    file_content = out.read()
res_content_list = file_content.split("\n")

# Remove last line (empty string)
del res_content_list[-1]

# 提取 reverb 结果文件中需要的三元组信息
src_line_numbers = []
reverb_res_list = []
for fc in range(0, len(res_content_list)):
    line_tokens = res_content_list[fc].split("\t")
    src_line_numbers.append(int(line_tokens[1]) - 1)
    reverb_res_list.append(line_tokens[-3:])

# reverb 并没有把所有news title 全部生成三元组格式
date_title_list = []  # titles within same day
last_date = ''
i = 0
for terms in log_progress(reverb_res_list):
    terms = reverb_res_list[i]
    src_line_number = src_line_numbers[i]
    
    # Extract date & price
    date_t = pickle_df["ari_time"][src_line_number][0:10]
    price_t = date_df_dict[date_t][2]
    # Join terms as a string
    ari_title_t = ' '.join(terms)
    
    # Concat titles in the same day
    if last_date != date_t:
        if len(date_title_list) > 0:
            date_titles_count[last_date] = len(date_title_list)
            title_aggr = ' '.join(date_title_list)
            date_df_dict[last_date] = [last_date, title_aggr, price_t]
            date_title_list = []
        last_date = date_t
    
    date_title_list.append(ari_title_t)
    i += 1

reverb_df_list = list(date_df_dict.values())

print('[post-reverb] Number of days: %d' % len(date_titles_count))
print('[post-reverb] Titles per day:', date_titles_count)
# print(reverb_df_list)

del date_titles_count

VBox(children=(HTML(value=''), IntProgress(value=0, max=13800)))

[post-reverb] Number of days: 792
[post-reverb] Titles per day: OrderedDict([('2006-10-20', 0), ('2006-10-23', 0), ('2006-10-24', 0), ('2006-10-25', 0), ('2006-10-27', 1), ('2006-10-30', 0), ('2006-11-01', 1), ('2006-11-02', 0), ('2006-11-03', 0), ('2006-11-07', 0), ('2006-11-08', 1), ('2006-11-09', 2), ('2006-11-10', 0), ('2006-11-13', 0), ('2006-11-14', 0), ('2006-11-15', 0), ('2006-11-16', 1), ('2006-11-17', 0), ('2006-11-20', 0), ('2006-11-21', 0), ('2006-11-22', 2), ('2006-11-24', 0), ('2006-11-27', 0), ('2006-11-29', 1), ('2006-11-30', 0), ('2006-12-01', 0), ('2006-12-04', 0), ('2006-12-05', 0), ('2006-12-06', 0), ('2006-12-07', 0), ('2006-12-08', 0), ('2006-12-11', 2), ('2006-12-12', 1), ('2006-12-13', 0), ('2006-12-14', 1), ('2006-12-15', 0), ('2006-12-18', 1), ('2006-12-19', 1), ('2006-12-20', 0), ('2006-12-21', 0), ('2006-12-26', 0), ('2006-12-28', 0), ('2007-01-03', 0), ('2007-01-04', 0), ('2007-01-08', 0), ('2007-01-09', 0), ('2007-01-10', 2), ('2007-01-11', 2), ('2007-01-1

In [7]:
# Normalize date field
date_secs = [[utcdate2timestamp(df_row[0])] for df_row in reverb_df_list]

scaler = MinMaxScaler(feature_range=(0, 1))
date_secs = scaler.fit_transform(date_secs)

normalized_df_list = []
for df_row, normalized_date in zip(reverb_df_list, date_secs):
    normalized_df_list.append([normalized_date[0], df_row[1], df_row[2]])

# print(normalized_df_list)

In [8]:
# 真正要用来训练的资料
# 提取了三元组,并组合了对应的股票价格和时间
reverb_df = pd.DataFrame(normalized_df_list, columns=["ari_time", "ari_title", "price"])
print("Shape:", reverb_df.shape)
reverb_df.to_pickle('train_reduce.pkl')

Shape: (792, 3)


In [9]:
# [Checking] 输出训练资料查验
reverb_pkl_df = pd.read_pickle('train_reduce.pkl')
print("Shape: ", reverb_pkl_df.shape)
print( reverb_pkl_df[:20])

Shape:  (792, 3)
    ari_time                                          ari_title  price
0   0.000000                                                         1
1   0.001181                                                         1
2   0.001574                                                         1
3   0.001968                                                         1
4   0.002755                            balda say investor asks      0
5   0.003935                                                         1
6   0.004723                                       ace lift pay      1
7   0.005116                                                         0
8   0.005510                                                         0
9   0.007084                                                         1
10  0.007477                     namibian investors buy absa 's      0
11  0.007871  firstrand say law may pare fees china stocks r...      1
12  0.008264                                                