# Initialization

In [1]:
import os
import shutil
import re
import json
import pandas as pd
import datetime as dt
import numpy as np
import operator
import math
import multiprocessing
import time

from itertools import islice

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline

## Set Flags

In [2]:
# Re-filter every ready report against the vocabulary of the base run.
flag_rerun_filter_by_terms = True
# Replace the new run directory with a fresh copy of the base run directory.
flag_clone = True
# Delete the files in the new run directory that still carry the base run's prefix.
flag_remove_old_files = True
# Copy the ready reports into the new run's own 'reports' folder.
flag_copy_used_reports = False

## Set experiment dates

In [3]:
# Overall experiment window, parsed from ISO-style day strings (time is midnight).
date_start, date_end = (
    dt.datetime.strptime(day, '%Y-%m-%d')
    for day in ('2005-01-01', '2021-06-30')
)

In [4]:
# Window of the extension period, parsed from ISO-style day strings (time is midnight).
date_extension_start, date_extension_end = (
    dt.datetime.strptime(day, '%Y-%m-%d')
    for day in ('2018-01-01', '2021-06-30')
)

### Set time series

In [5]:
# range() stops before its end value, so the year of date_end / date_extension_end
# itself is not part of either series.
# NOTE(review): presumably intentional because the final year is incomplete - confirm.
year_series = [*range(date_start.year, date_end.year)]
year_extension_series = [*range(date_extension_start.year, date_extension_end.year)]

In [6]:
# Show both year lists, one per line.
print(year_series, year_extension_series, sep='\n')

[2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
[2018, 2019, 2020]


## Set paths

### Set root directories

In [7]:
# Every location is expressed relative to the parent of the current working directory.
dir_root = '..'
dir_data_raw, dir_data_processing, dir_data_runs = (
    os.path.join(dir_root, folder)
    for folder in ("data_raw", "data_processing", "data_runs")
)
# Price data: source folder under data_raw, destination folder under data_processing.
dir_ticker_prices_source = os.path.join(dir_root, "data_raw", "prices", "ready")
dir_ticker_prices_destination = os.path.join(dir_root, "data_processing", "prices")

### Set reports directories

In [8]:
# The text reports sit under dir_data_raw; all other report folders sit under
# dir_data_processing.
dir_reports_txt = os.path.join(dir_data_raw, 'reports_txt')
(dir_reports_words,
 dir_reports_terms,
 dir_reports_grams,
 dir_reports_ready) = (
    os.path.join(dir_data_processing, folder)
    for folder in ('reports_words', 'reports_terms', 'reports_gramms', 'reports_ready')
)
# NOTE(review): the folder names 'reports_gramms' and 'reports_ready_extentsion' are
# kept exactly as spelled; presumably they match folders that already exist on disk.
dir_extention_reports = os.path.join(dir_data_processing, "reports_ready_extentsion")

### Set return files 

In [9]:
# Both return tables are CSV files inside the processed prices folder.
file_return_table, file_indices_table = (
    os.path.join(dir_ticker_prices_destination, csv_name)
    for csv_name in ("all-returns.csv", "all-industries-indices.csv")
)

# Setup the base run

In [10]:
# Name of the existing (base) run: used both as its folder name under dir_data_runs
# and as the prefix of its data files.
run_to_extend_prefix = 'run_19_22'

In [11]:
# Name of the run produced by this notebook: used both as its folder name under
# dir_data_runs and as the prefix of the files written into it.
run_prefix = "run_19-ext_22"

In [12]:
# Folder of the base run that is being extended.
dir_run_to_extend = os.path.join(dir_data_runs, run_to_extend_prefix)

In [13]:
# Folder of the new run; all output files of this notebook are written here.
dir_run = os.path.join(dir_data_runs, run_prefix)

## Read tickers in the run

In [14]:
# Collect the distinct tickers of the base run: the ticker is the part of each
# document name that precedes the first '-'.
documents_file = os.path.join(dir_run_to_extend, run_to_extend_prefix + '-documents.dat')
with open(documents_file, 'r') as f_r:
    tickers = {line.strip().split('-')[0] for line in f_r}

print('Read %s tickers' % len(tickers))

for ticker_name in sorted(tickers):
    print(ticker_name)

Read 79 tickers
AAL_FTSE
ACA_CAC
AC_CAC
ADM_FTSE
AGK_FTSE
AIR_CAC
ALV_DAX
ANTO_FTSE
BAB_FTSE
BARC_FTSE
BAYN_DAX
BA_FTSE
BMW_DAX
BNP_CAC
CAP_CAC
CBK_DAX
CSCO_DJIA
CS_CAC
CVX_DJIA
DAI_DAX
DBK_DAX
DIS_DJIA
DPW_DAX
DTE_DAX
EDF_CAC
EI_CAC
EOAN_DAX
FRE_DAX
GS_DJIA
HD_DJIA
HEN3_DAX
HSBC_DJIA
IFX_DAX
JPM_DJIA
KO_DJIA
LHA_DAX
LIN_DAX
LLOY_FTSE
LR_CAC
MCD_DJIA
MC_CAC
MKS_FTSE
MRO_FTSE
MRW_FTSE
MUV2_DAX
OR_CAC
PFC_FTSE
PFE_DJIA
PG_DJIA
PRU_FTSE
RBS_FTSE
RB_FTSE
REL_FTSE
RI_CAC
RRS_FTSE
RSA_FTSE
RWE_DAX
SAP_DAX
SBRY_FTSE
SDF_DAX
SHP_FTSE
SIE_DAX
SKY_FTSE
SLA_FTSE
SNN_FTSE
SRP_FTSE
SSE_FTSE
TKA_DAX
TSCO_FTSE
TVE_DJIA
ULVR_FTSE
UTX_DJIA
VED_FTSE
VOD_FTSE
VZ_DJIA
WEIR_FTSE
WMT_DJIA
WPP_FTSE
WTB_FTSE


## Read terms from run

In [15]:
# Vocabulary file of the base run (read line by line below, one term per line).
file_terms = os.path.join(dir_run_to_extend, '%s-terms.dat' % run_to_extend_prefix)

In [16]:
# Load the vocabulary: the id of a term is the zero-based number of its line.
with open(file_terms, 'r') as f_r:
    term_list = [text_line.strip() for text_line in f_r]

dict_id2term = dict(enumerate(term_list))
# If a term occurs on several lines, the id of its last occurrence wins.
dict_term2id = {term: term_id for term_id, term in dict_id2term.items()}
terms_set = set(term_list)
id_counter = len(term_list)

In [17]:
# Number of distinct terms (repeated lines of the terms file collapse in the set).
print('Size of dictionary: %s ' % len(terms_set))

Size of dictionary: 21719 


## Filter reports by terms and store in extension folder

Create the extension reports folder if it doesn't exist

In [18]:
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race
# of testing os.path.exists() first.
os.makedirs(dir_extention_reports, exist_ok=True)

In [19]:
def filter_report_by_terms(report_file_name, report_dir, new_report_dir, dictionary):
    """Write a copy of a report that keeps only the terms found in ``dictionary``.

    The file ``report_dir/report_file_name`` is read as UTF-8. Every line is
    stripped and split on single spaces, and the tokens that are members of
    ``dictionary`` are kept in their original order (repetitions included).
    The kept tokens are written, joined by single spaces on one line without a
    trailing newline, to ``new_report_dir/report_file_name``; an existing file
    with that name is overwritten.

    Args:
        report_file_name: File name of the report (same name is used for the output).
        report_dir: Directory that holds the source report.
        new_report_dir: Directory that receives the filtered report.
        dictionary: Container supporting ``in`` that holds the allowed terms.
    """
    source_path = os.path.join(report_dir, report_file_name)
    with open(source_path, 'r', encoding='utf-8') as f_r:
        kept_terms = [
            token
            for text_line in f_r
            for token in text_line.strip().split(' ')
            if token in dictionary
        ]

    target_path = os.path.join(new_report_dir, report_file_name)
    # NOTE(review): the input is decoded as UTF-8, but the output is written with
    # the platform default encoding - confirm this is intended.
    with open(target_path, 'w') as f_w:
        f_w.write("%s" % ' '.join(kept_terms))
        

In [20]:
def filter_ready_reports():
    """Filter every report of ``dir_reports_ready`` into ``dir_extention_reports``.

    Each directory entry except ``.DS_Store`` is passed to
    ``filter_report_by_terms`` together with the module-level ``terms_set``.
    """
    report_names = [
        name for name in os.listdir(dir_reports_ready) if name != ".DS_Store"
    ]
    for report_file_name in report_names:
        filter_report_by_terms(
            report_file_name, dir_reports_ready, dir_extention_reports, terms_set
        )

In [21]:
# Re-filtering rewrites every file in the extension reports folder, so it is
# only done when explicitly requested.
if flag_rerun_filter_by_terms:
    filter_ready_reports()

# Clone run to extend

## Init

In [22]:
if flag_clone:
    # Start from a clean slate: any previous copy of the new run folder is
    # deleted, including everything it contains.
    if os.path.exists(dir_run):
        shutil.rmtree(dir_run)
        
    # With default arguments copytree() needs a destination that does not exist
    # yet, which the removal above guarantees.
    shutil.copytree(dir_run_to_extend, dir_run)

# Build run data

## Read reports in extension period

In [23]:
# Group the filtered reports by year. Only reports whose ticker belongs to the
# base run and whose year is part of year_series are kept.
year_to_pull = set(year_series)
reports_extension = {year: [] for year in year_series}
amount_reports = 0

for file_name in os.listdir(dir_extention_reports):
    if file_name == '.DS_Store':
        continue

    # The code below takes the ticker from the text before the first '-' and
    # the year from the text between that '-' and the following '_'.
    name_parts = file_name.split('-')
    ticker = name_parts[0]
    if ticker not in tickers:
        continue

    file_year = int(name_parts[1].split('_')[0])
    if file_year not in year_to_pull:
        continue

    reports_extension[file_year].append(file_name)
    amount_reports += 1

# os.listdir() returns names in arbitrary order. The later cells walk through
# *all* years of year_series (not only the extension years) and rely on a
# stable per-year order, so every year is sorted here.
for year in year_series:
    reports_extension[year].sort()

In [24]:
# The counter covers every kept report of all years in year_series, not only the
# reports of the extension years.
print('We have %s new reports' % amount_reports)

We have 5562 new reports


## Prepare data for run

### Get the number of documents in every year

##### Restore amount of documents in every year

In [25]:
# Start with a zero count for every year of the series.
amount_documents_in_series_dict = dict.fromkeys(year_series, 0)

seq_file = os.path.join(dir_run, run_to_extend_prefix + '-seq.dat')
with open(seq_file, 'r') as f_r:
    # The first line is skipped; every following line holds the count of the
    # next year, beginning with date_start.year.
    for year, line in enumerate(islice(f_r, 1, None), start=date_start.year):
        amount_documents_in_series_dict[year] = int(line.strip())

for year, amount in amount_documents_in_series_dict.items():
    print('year: %s amount:%s' %(year, amount))

year: 2005 amount:234
year: 2006 amount:268
year: 2007 amount:274
year: 2008 amount:285
year: 2009 amount:303
year: 2010 amount:302
year: 2011 amount:304
year: 2012 amount:301
year: 2013 amount:320
year: 2014 amount:335
year: 2015 amount:382
year: 2016 amount:363
year: 2017 amount:368
year: 2018 amount:0
year: 2019 amount:0
year: 2020 amount:0


#### Update existing data and extend with new reports

In [26]:
# Replace the count of every year with the number of reports collected for it.
amount_documents_in_series_dict.update(
    (year, len(reports_extension[year])) for year in year_series
)

for year, amount in amount_documents_in_series_dict.items():
    print('year: %s amount:%s' %(year, amount))

year: 2005 amount:235
year: 2006 amount:273
year: 2007 amount:279
year: 2008 amount:288
year: 2009 amount:304
year: 2010 amount:303
year: 2011 amount:306
year: 2012 amount:302
year: 2013 amount:318
year: 2014 amount:353
year: 2015 amount:438
year: 2016 amount:428
year: 2017 amount:436
year: 2018 amount:474
year: 2019 amount:412
year: 2020 amount:413


Remove old file

In [27]:
if flag_remove_old_files:
    # Sequence file that still carries the base run's prefix.
    stale_seq_file = os.path.join(dir_run, run_to_extend_prefix + '-seq.dat')
    os.remove(stale_seq_file)

### Get list of reports names

#### Restore reports names and extend

In [28]:
# The name list is built from scratch (nothing is read back from the old
# documents file): years follow the order of year_series, and within a year
# the names keep the order stored in reports_extension[year].
documents_name_list = []

for year in year_series:
    documents_name_list.extend(reports_extension[year])
print('Combined %s reports' % len(documents_name_list))

Combined 5562 reports


Remove old file

In [29]:
if flag_remove_old_files:
    # Documents file that still carries the base run's prefix.
    stale_documents_file = os.path.join(dir_run, run_to_extend_prefix + '-documents.dat')
    os.remove(stale_documents_file)

### Vectorize documents

#### Vectorization function

In [30]:
def vectorize_report_dtm(file_report):
    """Convert a report file into a list of ``"term_id:count"`` strings.

    Every line of ``file_report`` is stripped and split on single spaces. Each
    token is translated to its id through the module-level ``dict_term2id`` and
    the occurrences per id are counted.

    Args:
        file_report: Path of the report to vectorize.

    Returns:
        A list with one ``"term_id:count"`` string per distinct term id, in the
        iteration order of the counting dict (an empty list for an empty file).

    Raises:
        KeyError: If a non-empty token is missing from ``dict_term2id``.
    """
    document_bow = dict()
    with open(file_report, 'r') as f_r:
        for text_line in f_r:
            for term in text_line.strip().split(' '):
                # A blank line or repeated spaces produce empty tokens. They
                # carry no term, so they are skipped instead of failing with
                # KeyError('') - unless the vocabulary really contains ''.
                if not term and term not in dict_term2id:
                    continue
                term_id = dict_term2id[term]
                document_bow[term_id] = document_bow.get(term_id, 0) + 1

    return [
        "%s:%s" % (term_id, term_counter)
        for term_id, term_counter in document_bow.items()
    ]

#### Restore vectorized docs and extend with new reports in the same order

In [31]:
# Vectorize every collected report. The traversal (years in year_series order,
# then the reports of each year) is the same one used for documents_name_list,
# so both lists line up entry by entry.
documents_vector_list = [
    vectorize_report_dtm(os.path.join(dir_extention_reports, report))
    for year in year_series
    for report in reports_extension[year]
]
print('Combined %s reports' % len(documents_vector_list))

Combined 5562 reports


Remove old file

In [32]:
if flag_remove_old_files:
    # Vector file that still carries the base run's prefix.
    stale_mult_file = os.path.join(dir_run, run_to_extend_prefix + '-mult.dat')
    os.remove(stale_mult_file)

### Generate run files

#### Serialize for DTM

save prefix-seq.dat

In [33]:
with open(os.path.join(dir_run, run_prefix + '-seq.dat'), 'w') as f_w:
    # First line: number of years in year_series; then one document count per
    # line, in ascending order of the year keys.
    seq_entries = [len(year_series)]
    seq_entries += [
        amount_documents_in_series_dict[year]
        for year in sorted(amount_documents_in_series_dict)
    ]
    f_w.writelines("%s\n" % entry for entry in seq_entries)

save prefix-mult.dat

In [34]:
with open(os.path.join(dir_run, run_prefix + '-mult.dat'), 'w') as f_w:
    # One line per document: the number of distinct terms followed by the
    # space-separated "term_id:count" entries.
    f_w.writelines(
        "%s %s\n" % (len(document), ' '.join(document))
        for document in documents_vector_list
    )

#### Serialize data for interpretation

save prefix-documents.dat, with the documents in the same order as in mult.dat

In [35]:
with open(os.path.join(dir_run, run_prefix + '-documents.dat'), 'w') as f_w:
    # One report file name per line, in list order.
    f_w.writelines("%s\n" % document for document in documents_name_list)

save prefix-terms.dat, with terms in ID order 

In [36]:
with open(os.path.join(dir_run, run_prefix + '-terms.dat'), 'w') as f_w:
    # One term per line; the position in term_list is the term id.
    f_w.writelines("%s\n" % term for term in term_list)

Copy returns to the run folder

In [37]:
# Store the returns table inside the run folder under the new run's prefix.
shutil.copyfile(file_return_table, os.path.join(dir_run, run_prefix + '-returns.csv'))

'../data_runs/run_19-ext_22/run_19-ext_22-returns.csv'

Copy industry indices returns to the run folder

In [38]:
# Store the industry indices table inside the run folder under the new run's prefix.
shutil.copyfile(file_indices_table, os.path.join(dir_run, run_prefix + '-industry-returns.csv'))

'../data_runs/run_19-ext_22/run_19-ext_22-industry-returns.csv'

remove old files 

In [39]:
def silent_remove(filename):
    """Delete ``filename`` if it exists; do nothing when it does not."""
    if not os.path.exists(filename):
        return
    os.remove(filename)

In [40]:
if flag_remove_old_files:
    # Remaining files that still carry the base run's prefix; a missing file is
    # simply skipped.
    for stale_suffix in ('-terms.dat', '-returns.csv', '-industry-returns.csv'):
        silent_remove(os.path.join(dir_run, run_to_extend_prefix + stale_suffix))

#### Serialize run settings

Rename run setting file

In [41]:
# NOTE(review): the spelling 'preprocesssing' is kept unchanged; presumably it
# matches the name of the file that already exists in the cloned run.
settings_suffix = '-preprocesssing_settings.dat'
old_setting_file = os.path.join(dir_run, run_to_extend_prefix + settings_suffix)
new_setting_file = os.path.join(dir_run, run_prefix + settings_suffix)
os.rename(old_setting_file, new_setting_file)

### Create result directories

In [42]:
dir_results = os.path.join(dir_run, 'results')

# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(dir_results, exist_ok=True)

In [43]:
dir_interpretation = os.path.join(dir_run, 'interpretation')

# Always start with an empty interpretation folder: whatever is left from an
# earlier run is removed (removal errors are ignored) before it is recreated.
if os.path.exists(dir_interpretation):
    shutil.rmtree(dir_interpretation, ignore_errors=True)

os.makedirs(dir_interpretation)

### Copy reports

Copy reports to the run folder only if flag_copy_used_reports is set to True

In [44]:
# Folder inside the new run that receives the copied reports.
dir_run_reports = os.path.join(dir_run, 'reports')

In [45]:
if flag_copy_used_reports:
    if not os.path.exists(dir_run_reports):
        os.makedirs(dir_run_reports)

    # Every regular file of dir_reports_ready except '.DS_Store' is copied; the
    # selection is not restricted by ticker or year.
    for report_file_name in os.listdir(dir_reports_ready):
        if report_file_name == '.DS_Store':
            continue
        path_report_src = os.path.join(dir_reports_ready, report_file_name)
        if not os.path.isfile(path_report_src):
            continue
        path_report_dst = os.path.join(dir_run_reports, report_file_name)
        shutil.copyfile(path_report_src, path_report_dst)

# EnD