In [1]:
#!/usr/bin/python3
import os
import pandas as pd
import requests
from datetime import datetime
import json
import re
import subprocess

In [2]:
# Helper that shows the progress of a for loop as a notebook widget.
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while updating a progress widget.

    A label and a progress bar are displayed once, then refreshed on the
    first item and on every ``every``-th item after that.

    Parameters
    ----------
    sequence : iterable
        The items to iterate over.
    every : int, optional
        Refresh interval in items. When the size is known and ``every`` is
        not given, it is 1 for sizes up to 200 and ``int(size / 200)``
        otherwise. Must be given when the size cannot be determined.
    size : int, optional
        Number of items; defaults to ``len(sequence)`` when that is available.
    name : str
        Prefix used in the label text.

    Yields
    ------
    The items of ``sequence``, unchanged and in order.

    Raises
    ------
    AssertionError
        If neither ``size`` nor ``len(sequence)`` nor ``every`` is available.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Work out whether the total number of items is known.
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        # Refresh on every item for short inputs, else roughly every 0.5%.
        every = 1 if size <= 200 else int(size / 200)

    if unknown_length:
        # No total to measure against: show a full bar in the 'info' style.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            if count == 1 or count % every == 0:
                if unknown_length:
                    caption.value = '{name}: {index} / ?'.format(name=name, index=count)
                else:
                    bar.value = count
                    caption.value = u'{name}: {index} / {size}'.format(
                        name=name, index=count, size=size
                    )
            yield item
    except BaseException:
        # Anything raised into or out of the loop (including the generator
        # being closed early) marks the bar as failed before propagating.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        caption.value = "{name}: {index}".format(name=name, index=str(count or '?'))

In [3]:
%time
pickle_df = pd.read_pickle('train.pkl')

print(pickle_df.shape, type(pickle_df))
print(pickle_df[15:20])

CPU times: user 5 µs, sys: 2 µs, total: 7 µs
Wall time: 12.2 µs
(55483, 3) <class 'pandas.core.frame.DataFrame'>
                ari_time                                          ari_title  \
15  2006-10-30T22:48:25Z   Amerigroup Defrauded U.S., Must Pay $144 Million   
16  2006-11-01T13:45:11Z  Comstar to Spend $300 Million on Regional Purc...   
17  2006-11-01T16:28:21Z  Russian Stocks Gain the Most in Three Weeks; L...   
18  2006-11-01T21:19:28Z  ACE Must Lift Pay for Air Canada Pilots, Jazz ...   
19  2006-11-02T05:27:24Z  Coles Myer Raises Performance Targets for Bonuses   

    price  
15      1  
16      0  
17      0  
18      0  
19      0  


In [4]:
# Take the headlines from the previously generated news data, append a period
# to each one and write them one sentence per line so that the ReVerb tool
# can read them.

file_path = "./reverb_pre.txt"
# The context manager closes (and flushes) the file even if a write fails.
with open(file_path, "w") as fo:
    print ("文件名: ", fo.name)
    # Write every headline. The earlier `pickle_df[:-1]` slice silently
    # dropped the last row, so the final headline never reached ReVerb.
    for ari_title in pickle_df["ari_title"]:
        fo.write(ari_title + ".\n")

# Read the whole file back to check what ReVerb will be given.
with open(file_path, 'r') as f:
    file_content = f.read()

pre_content_list = file_content.split("\n")
# The file ends with a newline, so the split leaves one trailing empty
# string; drop it so the count below is the real number of headlines.
if pre_content_list and pre_content_list[-1] == "":
    del pre_content_list[-1]
print("File has", len(pre_content_list), "terms.\n")

# Print the first 5 rows for checking
print(pre_content_list[:5])

文件名:  ./reverb_pre.txt
File has 55483 terms.

["Inco's Net Soars on Higher Metal Prices, Breakup Fee.", 'EU Energy Chief Backs German Plan for Price Controls.', 'Ex-Plant Worker Shuster Pleads Guilty in Trading Case.', 'Russia, Ukraine End Dispute That Cut Gas Supplies.', 'Jim Cramer: Bare Escentuals, Allergan, Medicis, Avon.']


In [5]:
# Call Reverb for OpenIE parsing
# Run command below before continuing on the next step:
#   java -Xmx512m -jar ./test-reverb/reverb.jar --quiet reverb_pre.txt > reverb_result.txt

In [None]:
%%time
#  CPU times: user 3min 56s

# reverb tool 生成的结果文件,命令行生成的,带有额外杂讯
result_file_name = "reverb_result.txt"
file_content = ""
with open(result_file_name, 'r') as out:
    file_content = out.read()
res_content_list = file_content.split("\n")

# Remove last line (empty string)
del res_content_list[-1]

# 提取 reverb 结果文件中需要的三元组信息
src_line_numbers = []
reverb_res_list = []
for fc in range(0, len(res_content_list)):
    line_tokens = res_content_list[fc].split("\t")
    src_line_numbers.append(int(line_tokens[1]) - 1)
    reverb_res_list.append(line_tokens[-3:])
# print(len(reverb_res_list), "\n", reverb_res_list[:1], "\n" )

# 因为 reverb 并没有把所有news title 全部生成三元组格式
# 有一些遗漏了.所以需要检索出 reverb 结果对应的时间和股票 price
# 把 reverb 生成结果 与原来的news data 检索比对
reverb_df_list = []
i = 0
for terms in log_progress(reverb_res_list):
    terms = reverb_res_list[i]
    src_line_number = src_line_numbers[i]
    
    # Extract date
    date_t = pickle_df["ari_time"][src_line_number][0:10]
    # Join terms as a string
    ari_title_t = ' '.join(terms)
    price_t = pickle_df["price"][src_line_number]
    
    # Print OpenIE extraction result for checking
    # line_lower = pre_content_list[src_line_number].lower()
    # print(i, date_t, line_lower, str(terms), price_t, "\n")
    
    reverb_df_list.append([date_t, ari_title_t, price_t])
    i += 1

In [18]:
# The data actually used for training: the extracted triples combined with
# the matching date and stock price label.
_reverb_columns = ["ari_time", "ari_title", "price"]
reverb_df = pd.DataFrame(data=reverb_df_list, columns=_reverb_columns)
print("Shape:", reverb_df.shape)

# Persist the frame, then load it back to confirm the round trip.
reverb_df.to_pickle('train_reduce.pkl')
reverb_pkl_df = pd.read_pickle('train_reduce.pkl')
print("Shape: ", reverb_pkl_df.shape)

Shape: (13921, 3)
Shape:  (13921, 3)


In [19]:
# Print the first rows of the training data for inspection.
print(reverb_pkl_df.head(3))

     ari_time                       ari_title  price
0  2006-10-27         balda say investor asks      0
1  2006-11-01                    ace lift pay      0
2  2006-11-08  namibian investors buy absa 's      1
