# Initialization

In [1]:
import os
import shutil
import re
import json
import pandas as pd
import datetime as dt
import numpy as np
import operator
import math
import multiprocessing

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline

### set experiment dates

In [2]:
# Experiment timeframe: first and last calendar day, both as midnight datetimes.
date_start = dt.datetime(2005, 1, 1)
date_end = dt.datetime(2018, 6, 30)

### set root directories

In [3]:
# All data folders live one level above the notebook's working directory.
dir_root = os.pardir
dir_data_raw, dir_data_processing, dir_data_runs = (
    os.path.join(dir_root, sub_dir)
    for sub_dir in ("data_raw", "data_processing", "data_runs")
)
dir_prices = os.path.join(dir_data_processing, "prices")

### Set reports directories

In [4]:
# Raw text reports are the pipeline input; each later stage writes to its own folder.
dir_reports_txt = os.path.join(dir_data_raw, "reports_txt")
(
    dir_reports_words,
    dir_reports_terms,
    dir_reports_grams,
    dir_reports_ready,
) = (
    os.path.join(dir_data_processing, stage_dir)
    for stage_dir in ("reports_words", "reports_terms", "reports_gramms", "reports_ready")
)

### Set terms directories

In [5]:
# Folder for the per-ticker JSON files of eliminated terms and their summary.
# The folder name is used verbatim as a path component, so its spelling must not be "fixed" here alone.
dir_terms_eliminated = os.path.join(dir_data_processing, "terms_elemenated")
# Folder for the corpus-wide term statistics (terms.json).
dir_terms_counts = os.path.join(dir_data_processing, "terms_counts")

### set report name RegExp

In [6]:
# Report file name pattern, assembled from its parts (adjacent literals are concatenated).
# NOTE(review): the [1-9] classes exclude the digit 0 in ticker/number/subnumber - confirm this is intended.
check_report_name_reg_exp = (
    "(?P<ticker>[A-Z1-9]+)"      # ticker symbol
    "[_-]"
    "(?P<type>[A-Z]+)"           # report type letters
    "(?P<number>[1-9]*)"         # optional report number
    "(?P<subnumber>-[1-9]+)?"    # optional "-<n>" sub-number
    "[-_]"
    "(?P<year>[0-9]{4})"         # first four-digit year
    "[_-]"
    "(?P<p_year>[0-9]{4})"       # second four-digit year; the one the pipeline reads
)

### set flags

In [7]:
# Verbosity switches: extra prints in the processing functions / dumps of eliminated terms.
flag_debug = False
flag_terms_filter_debug = False

# Behaviour switches.
flag_extend_stopwords = True  # add the custom stop words file to the NLTK stop words
flag_test_report_names = True  # run the report file name check
flag_filtering_with_bigramms = True  # final filtering reads the bigram reports instead of the plain terms

# Re-run switches for the expensive pre-processing stages (results are cached on disk).
flag_rerun_text_2_words = False
flag_rerun_words_2_terms = True
flag_rerun_terms_2_gramms = True
flag_rerun_filter_terms = True ### keep it True, it generates the last reports ready data

### set time series

In [8]:
# Years every ticker must have a report for.
# NOTE(review): range() excludes date_end.year, so the last year of the timeframe is not required - confirm this is intended.
year_series = list(range(date_start.year, date_end.year))

In [9]:
# Show the required years only in debug mode.
if flag_debug:
    print(year_series)

# Test reports names

In [10]:
def test_reports_names(dir_findex, ticker):
    """Print every report file of one ticker whose name does not fit the naming pattern.

    Args:
        dir_findex: Path of the index directory that contains the ticker folders.
        ticker: Name of the ticker sub-directory inside ``dir_findex``.

    Entries that are not directories are ignored. Nothing is returned; problems
    are only reported on stdout.
    """
    dir_ticker = os.path.join(dir_findex, ticker)
    if not os.path.isdir(dir_ticker):
        return

    # Take the index name from the path instead of the caller's global loop
    # variable ``findex``, which is not guaranteed to exist in a worker process.
    index_name = os.path.basename(os.path.normpath(dir_findex))
    ticker_code = "%s_%s" % (ticker, index_name)

    for report_file_name in os.listdir(dir_ticker):
        if report_file_name == ".DS_Store":
            continue
        if not re.search(check_report_name_reg_exp, report_file_name):
            print("working on %s, filename %s doesn't fit pattern" % (ticker_code, report_file_name))

In [11]:
# Check the report file names of every ticker, one worker pool per index folder.
# The flag is tested once here; the former nested re-check was redundant.
if flag_test_report_names:
    for findex in os.listdir(dir_reports_txt):
        dir_findex = os.path.join(dir_reports_txt, findex)
        if os.path.isdir(dir_findex):
            with multiprocessing.Pool(processes=4) as pool:
                pool.starmap(test_reports_names, [(dir_findex, ticker) for ticker in os.listdir(dir_findex)])


## Possible reports completeness

We want to find all tickers of companies which have reports for the experiment's timeframe.

In [12]:
# Output file listing the available report years per ticker (written one JSON object per line).
file_tickers_years = os.path.join(dir_data_processing, "tickers", "possible_tickers_years.datajson")

Iterate over all reports and collect the years of publishing for every company

In [13]:
# Map "<ticker>_<index>" -> sorted list of the years found in that ticker's report file names.
tickers = dict()
for findex in os.listdir(dir_reports_txt):
    dir_findex = os.path.join(dir_reports_txt, findex)
    if not os.path.isdir(dir_findex):
        continue
    for ticker in os.listdir(dir_findex):
        dir_ticker = os.path.join(dir_findex, ticker)
        if not os.path.isdir(dir_ticker):
            continue
        ticker_years_set = set()
        for report_file_name in os.listdir(dir_ticker):
            if report_file_name == ".DS_Store":
                continue
            match = re.search(check_report_name_reg_exp, report_file_name)
            if match is None:
                # A badly named file used to crash the cell with AttributeError
                # on ``match.group``; report it and keep going instead.
                print("filename %s doesn't fit pattern" % report_file_name)
                continue
            ticker_years_set.add(int(match.group("p_year")))
        tickers["%s_%s" % (ticker, findex)] = sorted(ticker_years_set)

save years of companies

In [14]:
# open(..., "w") does not create missing folders, so make sure the target folder exists first.
os.makedirs(os.path.dirname(file_tickers_years), exist_ok=True)
# One JSON object per line: {"ticker": ..., "available_years": [...]}.
with open(file_tickers_years, "w") as f_w:
    for ticker in tickers:
        f_w.write("%s\n" % json.dumps({"ticker": ticker, "available_years": tickers[ticker]}))
        

Find tickers with reports for every year in the experiment timeframe

In [15]:
possible_tickers_for_analysis = set()
tickers_all = 0
required_years = set(year_series)
for ticker, available_years in tickers.items():
    # A ticker qualifies when it has a report for every required year ...
    flag_complete_series = required_years.issubset(available_years)
    # ... and a price file is available for it.
    if flag_complete_series and os.path.exists(os.path.join(dir_prices, "%s.csv" % ticker)):
        possible_tickers_for_analysis.add(ticker)

In [16]:
# Report how many tickers qualified and list them alphabetically.
print("there are %s tickers possible for the experiment" % len(possible_tickers_for_analysis))
for ticker in sorted(possible_tickers_for_analysis):
    print(ticker)

there are 79 tickers possible for the experiment
AAL_FTSE
ACA_CAC
AC_CAC
ADM_FTSE
AGK_FTSE
AIR_CAC
ALV_DAX
ANTO_FTSE
BAB_FTSE
BARC_FTSE
BAYN_DAX
BA_FTSE
BMW_DAX
BNP_CAC
CAP_CAC
CBK_DAX
CSCO_DJIA
CS_CAC
CVX_DJIA
DAI_DAX
DBK_DAX
DIS_DJIA
DPW_DAX
DTE_DAX
EDF_CAC
EI_CAC
EOAN_DAX
FRE_DAX
GS_DJIA
HD_DJIA
HEN3_DAX
HSBC_DJIA
IFX_DAX
JPM_DJIA
KO_DJIA
LHA_DAX
LIN_DAX
LLOY_FTSE
LR_CAC
MCD_DJIA
MC_CAC
MKS_FTSE
MRO_FTSE
MRW_FTSE
MUV2_DAX
OR_CAC
PFC_FTSE
PFE_DJIA
PG_DJIA
PRU_FTSE
RBS_FTSE
RB_FTSE
REL_FTSE
RI_CAC
RRS_FTSE
RSA_FTSE
RWE_DAX
SAP_DAX
SBRY_FTSE
SDF_DAX
SHP_FTSE
SIE_DAX
SKY_FTSE
SLA_FTSE
SNN_FTSE
SRP_FTSE
SSE_FTSE
TKA_DAX
TSCO_FTSE
TVE_DJIA
ULVR_FTSE
UTX_DJIA
VED_FTSE
VOD_FTSE
VZ_DJIA
WEIR_FTSE
WMT_DJIA
WPP_FTSE
WTB_FTSE


# Reports pre-processing

## Initial clean up

First, remove everything except English letters and re-save the reports as sequences of lower-case words consisting only of the letters a-z.

In [17]:
# Digits, underscore and a handful of mis-encoded characters.
# NOTE(review): not referenced by any cell visible in this part of the notebook - confirm it is still needed.
regexp_to_remove = re.compile(r"[\dâºâãï½ã\_]")
# Despite its name, this pattern matches the characters to DROP: everything that is
# not a lower-case a-z letter or whitespace. It is substituted with a space during clean up.
regexp_to_keep = re.compile(r"[^a-z\s]")

In [18]:
def convert_raw_text_2_words(file_report_path):
    """Read a raw text report and return its words, lower-cased and letters-only.

    Every character matched by ``regexp_to_keep`` (anything but a-z and
    whitespace) is replaced by a space after lower-casing; the line is then
    split and only tokens longer than one character are kept.

    Args:
        file_report_path: Path of the raw report text file.

    Returns:
        List of words in reading order.
    """
    words = []
    with open(file_report_path, "r") as f_r:
        for text_line in f_r:
            cleaned_text = regexp_to_keep.sub(" ", text_line.lower())
            # Raw string: "\W" in a normal literal is an invalid escape sequence.
            for possible_word in re.split(r"\W+", cleaned_text):
                word = possible_word.strip()
                if len(word) > 1:
                    words.append(word)
    return words

In [19]:
def reports_2_words_processing(dir_findex, ticker, findex, years_set):
    """Clean all raw reports of one ticker and store them as word files.

    Tickers not present in ``possible_tickers_for_analysis`` are skipped. Each
    report whose file name fits the naming pattern and whose ``p_year`` is in
    ``years_set`` is converted with ``convert_raw_text_2_words`` and, when not
    empty, written to ``dir_reports_words/<findex>/<ticker>/<year>_<n>.txt``
    where ``n`` is the running count of non-empty reports.

    Args:
        dir_findex: Path of the index directory holding the ticker folders.
        ticker: Name of the ticker sub-directory.
        findex: Index name, used for the ticker code and the output path.
        years_set: Years whose reports are kept; other years are only reported.

    Returns:
        None. Progress is printed to stdout.
    """
    ticker_code = "%s_%s" % (ticker, findex)

    if ticker_code not in possible_tickers_for_analysis:
        print("Skip %s" % ticker_code)
        return

    dir_ticker = os.path.join(dir_findex, ticker)
    if not os.path.isdir(dir_ticker):
        return

    good_documents_amount = 0
    empty_documents_amount = 0
    bad_years = set()
    # The output folder does not depend on the report, so build it once.
    new_path = os.path.join(dir_reports_words, findex, ticker)

    # os.listdir order is arbitrary; sorting makes the running number in the
    # output file names reproducible between runs.
    for report_file_name in sorted(os.listdir(dir_ticker)):
        if report_file_name == ".DS_Store":
            continue
        match = re.search(check_report_name_reg_exp, report_file_name)
        if not match:
            print("filename %s doesn't fit pattern" % report_file_name)
            continue

        year = int(match.group("p_year"))
        if year not in years_set:
            bad_years.add(year)
            continue

        list_words = convert_raw_text_2_words(os.path.join(dir_ticker, report_file_name))
        if not list_words:
            empty_documents_amount += 1
            if flag_debug:
                print("report %s is empty after cleaning" % report_file_name)
            continue

        good_documents_amount += 1
        new_file_name = "%s_%s.txt" % (year, good_documents_amount)
        os.makedirs(new_path, exist_ok=True)
        with open(os.path.join(new_path, new_file_name), "w") as f_w:
            f_w.write("%s" % ' '.join(list_words))

    print("Done on %s, reports: %s, empty: %s, bad years: [%s]" %
          (ticker_code, good_documents_amount, empty_documents_amount, ", ".join(map(str, sorted(bad_years)))))
                    

Run the previously defined function reports_2_words_processing in a pool of 4 processes to speed up the cleaning. The following cell takes quite a while, so be careful and do not rerun it without a reason; the results are stored on the file system.

**Please make sure that flag_rerun_text_2_words is set to True if you want to run/re-run this preprocessing step.**

In [20]:
if flag_rerun_text_2_words:
    filtering_years_set = set(year_series)
    for findex in os.listdir(dir_reports_txt):
        dir_findex = os.path.join(dir_reports_txt, findex)
        if not os.path.isdir(dir_findex):
            continue
        # One task per entry of the index folder; a fresh pool is used per index.
        words_tasks = [
            (dir_findex, ticker, findex, filtering_years_set)
            for ticker in os.listdir(dir_findex)
        ]
        with multiprocessing.Pool(processes=4) as pool:
            pool.starmap(reports_2_words_processing, words_tasks)
                

## Lemmatization and English words filter

In [21]:
import nltk
from nltk.corpus import brown

Make sure that all nltk data sets are available

In [22]:
# Fetch the NLTK resources requested by this notebook; the stopwords and words corpora are read in the cells below.
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download("words")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Alan_Spark/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Alan_Spark/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/Alan_Spark/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### Create stop words set and white words set

#### Create stop words set

In [23]:
# NLTK's English stop words plus a few additional tokens.
stop_words = nltk.corpus.stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']
stop_words_set = set(stop_words)
# Vocabulary used later to decide whether a term is an English word.
english_words = set(nltk.corpus.words.words())

Extend stop words with custom stop words

In [24]:
if flag_extend_stopwords:
    extra_stopwords_path = os.path.join(dir_data_raw, "english", "extra_stopwords.txt")
    with open(extra_stopwords_path, "r") as f_r:
        # One stop word per line; surrounding whitespace is stripped.
        stop_words_set.update(text_line.strip() for text_line in f_r)

#### Create a white list of words

In [25]:
# Terms listed here are never treated as stop words (one term per line).
white_list_set = set()
white_list_path = os.path.join(dir_data_raw, "english", "white_stopwords.txt")
with open(white_list_path, "r") as f_r:
    white_list_set.update(text_line.strip() for text_line in f_r)

#### Filter functions

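### Define a function that checks for stop words. A term on the white list is never a stop word; otherwise a term is a stop word if any of the following is true:
- its length is shorter than 3 characters
- it appears in the stop words set (NLTK stop words plus the custom extensions)

Digits need no separate check here: the initial clean up already reduced the reports to the letters a-z.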

In [26]:
def is_stop_term(term):
    """Tell whether ``term`` must be dropped as a stop word.

    White-listed terms are always kept. Any other term is a stop word when it
    is shorter than three characters or is contained in ``stop_words_set``.
    """
    if term in white_list_set:
        return False
    return len(term) < 3 or term in stop_words_set

Define a lemmatizer function

In [27]:
import spacy

Make sure that en spacy data set is available

In [28]:
# Download the small English spaCy model that is loaded with spacy.load() below.
# NOTE(review): `!python` may resolve to a different interpreter than the one running the kernel - confirm.
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [29]:
# Load the small English spaCy pipeline; tokenize() uses it to obtain lemmas.
nlp = spacy.load('en_core_web_sm')#, disable=['parser', 'ner'])
# Alternative lemmatizer/stemmer kept for reference; neither is used.
#lemmatizer = WordNetLemmatizer().lemmatize
#stemmer = SnowballStemmer("english").stem

In [30]:
# Quick sanity check: print "<lemma> <original text>" for a few sample words.
tokens = nlp('bank banking go going')
for token in tokens:
    print(token.lemma_ + ' ' + token.text)

bank bank
banking banking
go go
go going


#### Processing functions

In [31]:
def tokenize(text_line, terms, eliminated_terms):
    """Lemmatize ``text_line`` and sort its tokens into kept and eliminated terms.

    Args:
        text_line: Text to run through the spaCy pipeline ``nlp``.
        terms: List that receives, in order, every accepted term.
        eliminated_terms: Dict ``term -> count`` updated for every rejected term.

    A term is the token's lemma, or the token text when the lemma is the
    "-PRON-" placeholder. It is rejected when ``is_stop_term`` says so or when
    it is not in ``english_words``.
    """
    for token in nlp(text_line):
        lemma = token.lemma_
        term = token.text if lemma == "-PRON-" else lemma
        if not is_stop_term(term) and term in english_words:
            terms.append(term)
        else:
            eliminated_terms[term] = eliminated_terms.get(term, 0) + 1

In [32]:
def convert_words_2_terms(file_report_path, eliminated_terms, chunk_size=30):
    """Convert a words file into a list of accepted terms.

    Every line is split into words and handed to ``tokenize`` in chunks of
    ``chunk_size`` words, so the spaCy pipeline only sees short texts.

    Args:
        file_report_path: Path of the UTF-8 words file to read.
        eliminated_terms: Dict ``term -> count`` that ``tokenize`` updates with
            the rejected terms.
        chunk_size: Number of words passed to ``tokenize`` at once (default 30,
            the previously hard-coded value).

    Returns:
        List of accepted terms in reading order.
    """
    terms = []
    with open(file_report_path, "r", encoding="utf-8") as f_r:
        for text_line in f_r:
            # Raw string: "\W" in a normal literal is an invalid escape sequence.
            words_in_line = re.split(r"\W+", text_line)
            # Stepping by chunk_size yields the same chunks as before, without
            # the extra call on an empty remainder.
            for start in range(0, len(words_in_line), chunk_size):
                chunk_text = " ".join(words_in_line[start:start + chunk_size])
                tokenize(chunk_text, terms, eliminated_terms)
    return terms

In [33]:
def words_2_terms_processing(dir_findex, ticker, findex=None):
    """Turn all word files of one ticker into term files.

    Each file is converted with ``convert_words_2_terms``; non-empty results
    are written under ``dir_reports_terms/<findex>/<ticker>/`` with the same
    file name. The terms rejected for this ticker are dumped as JSON to
    ``dir_terms_eliminated/<ticker>_<findex>.json``.

    Args:
        dir_findex: Path of the index directory holding the ticker folders.
        ticker: Name of the ticker sub-directory.
        findex: Index name used in output paths. When omitted it is taken from
            the last component of ``dir_findex`` rather than from a global
            loop variable of the calling cell.

    Returns:
        None. Progress is printed to stdout.
    """
    dir_ticker = os.path.join(dir_findex, ticker)
    if not os.path.isdir(dir_ticker):
        return
    if findex is None:
        findex = os.path.basename(os.path.normpath(dir_findex))

    dict_eliminated_terms = {}
    good_documents_amount = 0
    empty_documents_amount = 0
    new_path = os.path.join(dir_reports_terms, findex, ticker)

    for report_file_name in os.listdir(dir_ticker):
        if report_file_name == ".DS_Store":
            continue
        terms_list = convert_words_2_terms(os.path.join(dir_ticker, report_file_name), dict_eliminated_terms)
        if terms_list:
            good_documents_amount += 1
            os.makedirs(new_path, exist_ok=True)
            with open(os.path.join(new_path, report_file_name), "w") as f_w:
                f_w.write("%s" % ' '.join(terms_list))
        else:
            empty_documents_amount += 1
            if flag_debug:
                print("report %s is empty after cleaning" % report_file_name)

    if dict_eliminated_terms:
        # This folder is shared by all pool workers; exist_ok avoids the
        # check-then-create race between them.
        os.makedirs(dir_terms_eliminated, exist_ok=True)
        with open(os.path.join(dir_terms_eliminated, "%s_%s.json" % (ticker, findex)), "w") as f_w:
            json.dump(dict_eliminated_terms, f_w)
    print("Done on %s, good reports: %s, empty reports: %s" %
          (dir_ticker, good_documents_amount, empty_documents_amount))
                

Run the previously defined function words_2_terms_processing in a pool of 4 processes to speed up the cleaning. The following cell takes quite a while, so be careful and do not rerun it without a reason; the results are stored on the file system.

**Please make sure that flag_rerun_words_2_terms is set to True if you want to run/re-run this preprocessing step.**

In [34]:
if flag_rerun_words_2_terms:
    for findex in os.listdir(dir_reports_words):
        dir_findex = os.path.join(dir_reports_words, findex)
        if not os.path.isdir(dir_findex):
            continue
        # One task per entry of the index folder; a fresh pool is used per index.
        terms_tasks = [(dir_findex, ticker) for ticker in os.listdir(dir_findex)]
        with multiprocessing.Pool(processes=4) as pool:
            pool.starmap(words_2_terms_processing, terms_tasks)
                

Done on ../data_processing/reports_words/CAC/AC, good reports: 27, empty reports: 0
Done on ../data_processing/reports_words/CAC/EDF, good reports: 55, empty reports: 0
Done on ../data_processing/reports_words/CAC/RI, good reports: 39, empty reports: 0
Done on ../data_processing/reports_words/CAC/AIR, good reports: 70, empty reports: 0
Done on ../data_processing/reports_words/CAC/ACA, good reports: 53, empty reports: 0
Done on ../data_processing/reports_words/CAC/CAP, good reports: 39, empty reports: 0
Done on ../data_processing/reports_words/CAC/BNP, good reports: 27, empty reports: 0
Done on ../data_processing/reports_words/CAC/MC, good reports: 95, empty reports: 0
Done on ../data_processing/reports_words/CAC/CS, good reports: 53, empty reports: 0
Done on ../data_processing/reports_words/CAC/EI, good reports: 76, empty reports: 0
Done on ../data_processing/reports_words/CAC/LR, good reports: 78, empty reports: 0
Done on ../data_processing/reports_words/CAC/OR, good reports: 45, empt

## Summarize the eliminated words

Set flag_terms_filter_debug to True to print all the eliminated words

In [35]:
# word -> {"count": total occurrences, "ticker": number of per-ticker files containing it}
eliminated_words = {}
for file in os.listdir(dir_terms_eliminated):
    # Skip Finder metadata and the summary file this cell writes further down.
    if file in ('.DS_Store', "all_elimintated_words"):
        continue
    with open(os.path.join(dir_terms_eliminated, file), "r") as f_r:
        el = json.load(f_r)
    for word, count in el.items():
        word_stats = eliminated_words.setdefault(word, {"count": 0, "ticker": 0})
        word_stats["count"] += count
        word_stats["ticker"] += 1

print("list of eliminated terms, size(%s)" % len(eliminated_words))

with open(os.path.join(dir_terms_eliminated, "all_elimintated_words"), "w") as f_w:
    json.dump(eliminated_words, f_w)

if flag_terms_filter_debug:
    for word in sorted(eliminated_words):
        word_stats = eliminated_words[word]
        print("%s, usage - total: %s, tickers %s" % (word, word_stats["count"], word_stats["ticker"]))
              
              

list of eliminated terms, size(207278)


## Condense bigrams and trigrams

Read all documents as data: a list of lists (one list of terms per document)

In [36]:
# Corpus for the phrase model: one list of terms per report, filled by read_data_terms().
data_terms = []

In [37]:
def read_data_terms(dir_findex, ticker, data):
    """Append the terms of every report of one ticker to ``data``.

    Args:
        dir_findex: Path of the index directory holding the ticker folders.
        ticker: Name of the ticker sub-directory; ignored when it is not a
            directory.
        data: List that receives one list of terms per report file.

    Sub-directories and the macOS ``.DS_Store`` file are skipped, as in the
    other processing steps of this notebook.
    """
    dir_ticker = os.path.join(dir_findex, ticker)
    if not os.path.isdir(dir_ticker):
        return
    for report_file_name in os.listdir(dir_ticker):
        file_path = os.path.join(dir_ticker, report_file_name)
        if report_file_name == ".DS_Store" or not os.path.isfile(file_path):
            continue
        terms_list = []
        with open(file_path, 'r') as f_r:
            for text_line in f_r:
                # extend() grows the list in place instead of copying it per line.
                terms_list.extend(text_line.strip().split(" "))
        data.append(terms_list)

In [38]:
if flag_rerun_terms_2_gramms:
    # Start from an empty corpus so that re-running this cell does not append
    # every document a second time and distort the phrase counts.
    data_terms.clear()
    for findex in os.listdir(dir_reports_terms):
        dir_findex = os.path.join(dir_reports_terms, findex)
        if os.path.isdir(dir_findex):
            for ticker in os.listdir(dir_findex):
                ticker_code = "%s_%s" % (ticker, findex)
                read_data_terms(dir_findex, ticker, data_terms)
                print("%s data collected" % ticker_code)

EDF_CAC data collected
AC_CAC data collected
CS_CAC data collected
ACA_CAC data collected
RI_CAC data collected
AIR_CAC data collected
CAP_CAC data collected
MC_CAC data collected
BNP_CAC data collected
LR_CAC data collected
EI_CAC data collected
OR_CAC data collected
AAL_FTSE data collected
WEIR_FTSE data collected
BARC_FTSE data collected
MKS_FTSE data collected
WPP_FTSE data collected
PFC_FTSE data collected
SSE_FTSE data collected
SKY_FTSE data collected
ANTO_FTSE data collected
SHP_FTSE data collected
PRU_FTSE data collected
SNN_FTSE data collected
BAB_FTSE data collected
RSA_FTSE data collected
MRW_FTSE data collected
WTB_FTSE data collected
RBS_FTSE data collected
VOD_FTSE data collected
REL_FTSE data collected
AGK_FTSE data collected
VED_FTSE data collected
ADM_FTSE data collected
BA_FTSE data collected
RRS_FTSE data collected
LLOY_FTSE data collected
SLA_FTSE data collected
TSCO_FTSE data collected
MRO_FTSE data collected
ULVR_FTSE data collected
SRP_FTSE data collected
SBRY_F

In [39]:
import gensim

In [40]:
# Learn bigram phrases from the corpus in data_terms.
# Note: data_terms is only filled when flag_rerun_terms_2_gramms is True.
bigram = gensim.models.Phrases(data_terms, min_count=30, threshold=100) # higher threshold fewer phrases.
#trigram = gensim.models.Phrases(bigram[data_terms], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
#trigram_mod = gensim.models.phrases.Phraser(trigram)

Re-read all docs and concatenate bi/trigrams

In [41]:
def term_2_gramms_processing(dir_findex, ticker, findex=None):
    """Apply the bigram model to every term file of one ticker.

    Each report is read as one list of terms, passed through ``bigram_mod`` and,
    when the result is not empty, written under
    ``dir_reports_grams/<findex>/<ticker>/`` with the same file name.

    Args:
        dir_findex: Path of the index directory holding the ticker folders.
        ticker: Name of the ticker sub-directory; ignored when not a directory.
        findex: Index name used in the output path. When omitted it is taken
            from the last component of ``dir_findex`` rather than from a global
            loop variable of the calling cell.

    Returns:
        None. Progress is printed to stdout.
    """
    dir_ticker = os.path.join(dir_findex, ticker)
    if not os.path.isdir(dir_ticker):
        return
    if findex is None:
        findex = os.path.basename(os.path.normpath(dir_findex))

    new_path = os.path.join(dir_reports_grams, findex, ticker)
    # Was never initialised, so the first empty report raised UnboundLocalError.
    empty_documents_amount = 0

    for report_file_name in os.listdir(dir_ticker):
        file_path = os.path.join(dir_ticker, report_file_name)
        if report_file_name == ".DS_Store" or not os.path.isfile(file_path):
            continue

        terms_list = []
        with open(file_path, 'r') as f_r:
            for text_line in f_r:
                terms_list.extend(text_line.strip().split(" "))
        terms_list = bigram_mod[terms_list]
        #terms_list = trigram_mod[bigram_mod[terms_list]]

        if len(terms_list):
            os.makedirs(new_path, exist_ok=True)
            with open(os.path.join(new_path, report_file_name), "w") as f_w:
                f_w.write("%s" % ' '.join(terms_list))
        else:
            empty_documents_amount += 1
            if flag_debug:
                print("report %s is empty after cleaning" % report_file_name)
    print("Done on %s" % dir_ticker)
        

Run the previously defined function term_2_gramms_processing in a pool of 4 processes to speed up the processing. The following cell takes quite a while, so be careful and do not rerun it without a reason; the results are stored on the file system.

**Please make sure that flag_rerun_terms_2_gramms is set to True if you want to run/re-run this preprocessing step.**

In [42]:
if flag_rerun_terms_2_gramms:
    if not os.path.exists(dir_reports_grams):
        os.makedirs(dir_reports_grams)

    for findex in os.listdir(dir_reports_terms):
        dir_findex = os.path.join(dir_reports_terms, findex)
        if not os.path.isdir(dir_findex):
            continue
        # One task per entry of the index folder; a fresh pool is used per index.
        gramms_tasks = [(dir_findex, ticker) for ticker in os.listdir(dir_findex)]
        with multiprocessing.Pool(processes=4) as pool:
            pool.starmap(term_2_gramms_processing, gramms_tasks)

Done on ../data_processing/reports_terms/CAC/AC
Done on ../data_processing/reports_terms/CAC/EDF
Done on ../data_processing/reports_terms/CAC/RI
Done on ../data_processing/reports_terms/CAC/AIR
Done on ../data_processing/reports_terms/CAC/ACA
Done on ../data_processing/reports_terms/CAC/CAP
Done on ../data_processing/reports_terms/CAC/BNP
Done on ../data_processing/reports_terms/CAC/MC
Done on ../data_processing/reports_terms/CAC/CS
Done on ../data_processing/reports_terms/CAC/EI
Done on ../data_processing/reports_terms/CAC/LR
Done on ../data_processing/reports_terms/CAC/OR
Done on ../data_processing/reports_terms/FTSE/AAL
Done on ../data_processing/reports_terms/FTSE/WPP
Done on ../data_processing/reports_terms/FTSE/SSE
Done on ../data_processing/reports_terms/FTSE/WEIR
Done on ../data_processing/reports_terms/FTSE/PFC
Done on ../data_processing/reports_terms/FTSE/SKY
Done on ../data_processing/reports_terms/FTSE/ANTO
Done on ../data_processing/reports_terms/FTSE/BAB
Done on ../data_p

## Get corpus statistics 

In [43]:
def ticker_reports_term_counting(dir_findex, ticker, dict_terms_counts):
    """Add the term statistics of one ticker's reports to ``dict_terms_counts``.

    For every term the dict holds ``{"count", "document", "tickers"}``: total
    occurrences, number of reports containing it and number of tickers
    containing it.

    Args:
        dir_findex: Path of the index directory holding the ticker folders.
        ticker: Name of the ticker sub-directory; ignored when not a directory.
        dict_terms_counts: Dict ``term -> stats`` updated in place.

    Returns:
        Number of report files processed for this ticker. Sub-directories and
        the macOS ``.DS_Store`` file are not counted.
    """
    number_of_repors = 0
    dir_ticker = os.path.join(dir_findex, ticker)
    if not os.path.isdir(dir_ticker):
        return number_of_repors

    terms_visited_ticker = set()
    for report_file_name in os.listdir(dir_ticker):
        file_path = os.path.join(dir_ticker, report_file_name)
        if report_file_name == ".DS_Store" or not os.path.isfile(file_path):
            continue
        number_of_repors += 1
        terms_visited_document = set()
        with open(file_path, 'r') as f_r:
            for text in f_r:
                for term in text.strip().split(' '):
                    stats = dict_terms_counts.get(term)
                    if stats is None:
                        dict_terms_counts[term] = {
                            "count": 1,
                            "document": 1,
                            "tickers": 1}
                    else:
                        # Per-ticker and per-document counters grow only on the
                        # first sighting within this ticker / this report.
                        if term not in terms_visited_ticker:
                            stats["tickers"] += 1
                        if term not in terms_visited_document:
                            stats["document"] += 1
                        stats["count"] += 1
                    terms_visited_ticker.add(term)
                    terms_visited_document.add(term)
    return number_of_repors

In [44]:
terms_counts = {}  # key: {count: int, document: int, tickers: int}
number_of_documents = 0
number_of_tickers = 0

# NOTE(review): the statistics come from dir_reports_terms, while the final filtering may
# read the bigram reports (flag_filtering_with_bigramms) - confirm this is intended.
for findex in os.listdir(dir_reports_terms):
    dir_findex = os.path.join(dir_reports_terms, findex)
    if not os.path.isdir(dir_findex):
        continue
    for ticker in os.listdir(dir_findex):
        # Count only real ticker folders; stray files (e.g. .DS_Store) used to
        # inflate number_of_tickers and therefore the l2_max limit.
        if not os.path.isdir(os.path.join(dir_findex, ticker)):
            continue
        number_of_tickers += 1
        ticker_code = "%s_%s" % (ticker, findex)
        number_of_documents += ticker_reports_term_counting(dir_findex, ticker, terms_counts)
        print("%s terms stats collected" % ticker_code)


os.makedirs(dir_terms_counts, exist_ok=True)
with open(os.path.join(dir_terms_counts, 'terms.json'), 'w') as f_w:
    json.dump(terms_counts, f_w)

EDF_CAC terms stats collected
AC_CAC terms stats collected
CS_CAC terms stats collected
ACA_CAC terms stats collected
RI_CAC terms stats collected
AIR_CAC terms stats collected
CAP_CAC terms stats collected
MC_CAC terms stats collected
BNP_CAC terms stats collected
LR_CAC terms stats collected
EI_CAC terms stats collected
OR_CAC terms stats collected
AAL_FTSE terms stats collected
WEIR_FTSE terms stats collected
BARC_FTSE terms stats collected
MKS_FTSE terms stats collected
WPP_FTSE terms stats collected
PFC_FTSE terms stats collected
SSE_FTSE terms stats collected
SKY_FTSE terms stats collected
ANTO_FTSE terms stats collected
SHP_FTSE terms stats collected
PRU_FTSE terms stats collected
SNN_FTSE terms stats collected
BAB_FTSE terms stats collected
RSA_FTSE terms stats collected
MRW_FTSE terms stats collected
WTB_FTSE terms stats collected
RBS_FTSE terms stats collected
VOD_FTSE terms stats collected
REL_FTSE terms stats collected
AGK_FTSE terms stats collected
VED_FTSE terms stats col

In [45]:
# Number of distinct terms collected in terms_counts.
print("There're %s unique terms for topic analysis" % len(terms_counts))

There're 27918 unique terms for topic analysis


In [46]:
# Number of report files that contributed to the statistics.
print("There're %s reports for topic analysis" % number_of_documents)

There're 4039 reports for topic analysis


In [47]:
# Number of ticker entries counted while collecting the statistics.
print("There're %s tickers for topic analysis" % number_of_tickers)

There're 79 tickers for topic analysis


## Filtered term set by document frequency

### set terms limits

In [68]:
# Document-frequency limits: a term is dropped when it occurs in fewer than l1_min
# or in more than l1_max reports.
l1_min = 15 # min_number_of_doc
max_partition_of_doc = 0.8
l1_max = int(number_of_documents * max_partition_of_doc) #max_number_of_doc

In [69]:
# Show the effective document-frequency limits.
print("l1_min: %s" % l1_min)
print("l1_max: %s" % l1_max)

l1_min: 15
l1_max: 3231


In [77]:
# Ticker-frequency limits: a term is dropped when it occurs for fewer than l2_min
# or for more than l2_max tickers.
l2_min = 1 # min_number_of_tickers
max_partition_of_tickers = 0.98
l2_max = int(number_of_tickers * max_partition_of_tickers) #max_number_of_tickers

In [78]:
# Show the effective ticker-frequency limits (the second label used to read "l2_min" as well).
print("l2_min: %s" % l2_min)
print("l2_max: %s" % l2_max)

l2_min: 1
l2_min: 77


### build eliminated terms set

In [79]:
set_eliminated_by_l1_min = set()
set_eliminated_by_l1_max = set()
set_eliminated_by_l2_min = set()
set_eliminated_by_l2_max = set()
# terms_counts layout: term -> {count: int, document: int, tickers: int}

for term, stats in terms_counts.items():
    df = int(stats["document"])
    tf = int(stats["tickers"])

    # The first violated limit decides which set receives the term,
    # so every term lands in at most one set.
    if df < l1_min:
        eliminated_bucket = set_eliminated_by_l1_min
    elif df > l1_max:
        eliminated_bucket = set_eliminated_by_l1_max
    elif tf < l2_min:
        eliminated_bucket = set_eliminated_by_l2_min
    elif tf > l2_max:
        eliminated_bucket = set_eliminated_by_l2_max
    else:
        continue
    eliminated_bucket.add(term)

print("eliminated by l1_min: %s" % len(set_eliminated_by_l1_min))
print("eliminated by l1_max: %s" % len(set_eliminated_by_l1_max))
print("eliminated by l2_min: %s" % len(set_eliminated_by_l2_min))
print("eliminated by l2_max: %s" % len(set_eliminated_by_l2_max))
eliminated_total = sum(
    len(eliminated_set)
    for eliminated_set in (
        set_eliminated_by_l1_min,
        set_eliminated_by_l2_min,
        set_eliminated_by_l1_max,
        set_eliminated_by_l2_max,
    )
)
print("eliminated %s" % eliminated_total)



eliminated by l1_min: 14609
eliminated by l1_max: 264
eliminated by l2_min: 0
eliminated by l2_max: 1261
eliminated 16134


Set flag_terms_filter_debug to True to print eliminated words sets

In [82]:
# Debug only: list the terms dropped for appearing in too few reports.
if flag_terms_filter_debug:
    for term in sorted(set_eliminated_by_l1_min):
        print(term)


In [83]:
# Debug only: list the terms dropped for appearing in too many reports.
if flag_terms_filter_debug:
    for term in sorted(set_eliminated_by_l1_max):
        print(term)

In [84]:
# Debug only: list the terms dropped for appearing for too few tickers.
if flag_terms_filter_debug:
    for term in sorted(set_eliminated_by_l2_min):
        print(term)


In [85]:
# Debug only: list the terms dropped for appearing for too many tickers.
if flag_terms_filter_debug:
    for term in sorted(set_eliminated_by_l2_max):
        print(term)

#### Processing functions

In [86]:
def filter_terms(file_report_path):
    """Return the terms of a report that survive the frequency filters.

    Args:
        file_report_path: Path of the UTF-8 report file to read.

    Returns:
        List of terms, in reading order, that are in none of the four
        ``set_eliminated_by_*`` sets. Empty tokens are never returned, so a
        blank report yields an empty list.
    """
    eliminated_sets = (
        set_eliminated_by_l1_min,
        set_eliminated_by_l1_max,
        set_eliminated_by_l2_min,
        set_eliminated_by_l2_max,
    )
    result = []
    with open(file_report_path, 'r', encoding='utf-8') as f_r:
        for text_line in f_r:
            # Raw string: '\W' in a normal literal is an invalid escape sequence.
            for term in re.split(r'\W+', text_line.strip()):
                # re.split yields '' for a blank line or a leading separator;
                # keeping it made blank reports look non-empty.
                if not term:
                    continue
                if any(term in eliminated_set for eliminated_set in eliminated_sets):
                    continue
                result.append(term)
    return result

In [87]:
def terms_filtering(dir_findex, ticker, findex):
    """Filter every report of one ticker and store the non-empty results.

    Each regular file inside ``<dir_findex>/<ticker>`` (except '.DS_Store')
    is passed through ``filter_terms``. When terms survive, they are written,
    space separated, to ``dir_reports_ready`` under the name
    ``<ticker>_<findex>-<report file name>``. Reports that end up empty are
    only counted. A summary line is printed once the ticker directory has
    been processed; nothing happens when that directory does not exist.

    Args:
        dir_findex: directory of one index, containing one folder per ticker.
        ticker: name of the ticker folder to process.
        findex: index name, appended to the ticker to build the output prefix.

    Returns:
        None.
    """
    dir_ticker = os.path.join(dir_findex, ticker)
    if not os.path.isdir(dir_ticker):
        return

    ticker_code = '%s_%s' % (ticker, findex)
    good_documents_amount = 0
    empty_documents_amount = 0
    for report_file_name in os.listdir(dir_ticker):
        full_report_name = os.path.join(dir_ticker, report_file_name)
        if not os.path.isfile(full_report_name) or report_file_name == '.DS_Store':
            continue
        terms_list = filter_terms(full_report_name)
        if terms_list:
            good_documents_amount += 1
            file_ready = os.path.join(
                dir_reports_ready, '%s-%s' % (ticker_code, report_file_name))
            # Explicit UTF-8: the source reports are read as UTF-8, so the
            # output must not depend on the platform's default encoding.
            with open(file_ready, 'w', encoding='utf-8') as f_w:
                f_w.write('%s' % ' '.join(terms_list))
        else:
            empty_documents_amount += 1
            if flag_debug:
                print('report %s is empty after cleaning' % report_file_name)
    print('Done on %s, good reports: %s, empty reports: %s' %
          (dir_ticker, good_documents_amount, empty_documents_amount))

Run the previously defined function `terms_filtering` in a pool of 4 processes to speed up the cleaning. The following cell doesn't take much time, so feel free to experiment with the `l1_min`/`l1_max` and `l2_min`/`l2_max` thresholds.

**Please make sure that flag_rerun_filter_terms is set to True if you want to run/re-run this preprocessing step**

In [88]:
if flag_rerun_filter_terms:
    # Source of the reports: the bigram output when bigram filtering is
    # enabled, the plain terms output otherwise.
    if flag_filtering_with_bigramms:
        dir_source = dir_reports_grams
    else:
        dir_source = dir_reports_terms

    if not os.path.exists(dir_reports_ready):
        os.makedirs(dir_reports_ready)

    for findex in os.listdir(dir_source):
        dir_findex = os.path.join(dir_source, findex)
        if not os.path.isdir(dir_findex):
            continue
        # One pool of 4 workers per index; each job handles one ticker entry.
        with multiprocessing.Pool(processes=4) as pool:
            filtering_jobs = [
                (dir_findex, ticker, findex) for ticker in os.listdir(dir_findex)
            ]
            pool.starmap(terms_filtering, filtering_jobs)
                

Done on ../data_processing/reports_gramms/CAC/AC, good reports: 27, empty reports: 0
Done on ../data_processing/reports_gramms/CAC/EDF, good reports: 55, empty reports: 0
Done on ../data_processing/reports_gramms/CAC/RI, good reports: 39, empty reports: 0
Done on ../data_processing/reports_gramms/CAC/AIR, good reports: 70, empty reports: 0
Done on ../data_processing/reports_gramms/CAC/ACA, good reports: 53, empty reports: 0
Done on ../data_processing/reports_gramms/CAC/CAP, good reports: 39, empty reports: 0
Done on ../data_processing/reports_gramms/CAC/BNP, good reports: 27, empty reports: 0
Done on ../data_processing/reports_gramms/CAC/CS, good reports: 53, empty reports: 0
Done on ../data_processing/reports_gramms/CAC/MC, good reports: 95, empty reports: 0
Done on ../data_processing/reports_gramms/CAC/EI, good reports: 76, empty reports: 0
Done on ../data_processing/reports_gramms/CAC/LR, good reports: 78, empty reports: 0
Done on ../data_processing/reports_gramms/CAC/OR, good repor

# Get tickers for analysis

## Reports completeness

We want to find all tickers of companies which have reports for the experiment's timeframe.

In [89]:
file_tickers_years = os.path.join(dir_data_processing, "tickers", "tickers_years.datajson")

Iterate over all reports and collect the publishing years for every company

In [90]:
# ticker -> set of report years, derived from the ready report file names
# (expected shape: "<ticker>-<year>_<rest>").
temp_tickers = dict()
for report_file_name in os.listdir(dir_reports_ready):
    if report_file_name != ".DS_Store":
        ticker, date = report_file_name.split('-')
        year, leftovers = date.split('_')
        temp_tickers.setdefault(ticker, set()).add(int(year))

# Same mapping with the years turned into ascending lists.
tickers = {ticker: sorted(years) for ticker, years in temp_tickers.items()}

save years of companies

In [91]:
# One JSON object per line: {"ticker": ..., "available_years": [...]}.
with open(file_tickers_years, "w") as f_w:
    for ticker, ticker_years in tickers.items():
        record = {"ticker": ticker, "available_years": ticker_years}
        f_w.write("%s\n" % json.dumps(record))
        

Find tickers with reports for every year in the experiment timeframe

In [92]:
dir_data_tickers = os.path.join(dir_data_processing, "tickers")
file_tickers_for_analysis = os.path.join(dir_data_tickers, "ticker_for_analysis.csv")

In [93]:
tickers_fits_for_analysis = set()
tickers_all = 0
for ticker, available_years in tickers.items():
    available_years_set = set(available_years)
    # A ticker qualifies only if it has a report for every year of the series...
    flag_complete_series = all(year in available_years_set for year in year_series)
    if not flag_complete_series:
        continue
    # ...and a price file named "<ticker>.csv" exists in dir_prices.
    if os.path.exists(os.path.join(dir_prices, "%s.csv" % ticker)):
        tickers_fits_for_analysis.add(ticker)

In [94]:
print("there are %s tickers available for the experiment" % len(tickers_fits_for_analysis))
for ticker in tickers_fits_for_analysis:
    print(ticker)

there are 79 tickers available for the experiment
SBRY_FTSE
AGK_FTSE
CBK_DAX
AC_CAC
TVE_DJIA
ADM_FTSE
KO_DJIA
SIE_DAX
BAYN_DAX
AIR_CAC
WPP_FTSE
PFC_FTSE
WMT_DJIA
AAL_FTSE
FRE_DAX
HEN3_DAX
IFX_DAX
BARC_FTSE
CSCO_DJIA
MCD_DJIA
BNP_CAC
TSCO_FTSE
EOAN_DAX
RWE_DAX
DAI_DAX
ACA_CAC
LIN_DAX
LLOY_FTSE
DIS_DJIA
PRU_FTSE
MKS_FTSE
UTX_DJIA
JPM_DJIA
LR_CAC
LHA_DAX
SAP_DAX
RB_FTSE
HD_DJIA
MRW_FTSE
PFE_DJIA
SHP_FTSE
CS_CAC
MC_CAC
WEIR_FTSE
VZ_DJIA
ANTO_FTSE
OR_CAC
MRO_FTSE
BMW_DAX
RI_CAC
MUV2_DAX
RBS_FTSE
ULVR_FTSE
VOD_FTSE
CVX_DJIA
SKY_FTSE
BAB_FTSE
EI_CAC
REL_FTSE
PG_DJIA
HSBC_DJIA
SSE_FTSE
DBK_DAX
RSA_FTSE
CAP_CAC
WTB_FTSE
SDF_DAX
SLA_FTSE
SNN_FTSE
DPW_DAX
EDF_CAC
BA_FTSE
RRS_FTSE
SRP_FTSE
ALV_DAX
DTE_DAX
VED_FTSE
TKA_DAX
GS_DJIA


save companies with complete years series

In [95]:
# One selected ticker per line.
with open(file_tickers_for_analysis, "w") as f_w:
    f_w.writelines("%s\n" % ticker for ticker in tickers_fits_for_analysis)

## Stock data completeness

In [96]:
dir_ticker_prices = os.path.join(dir_data_processing, "prices")

Iterate over the tickers available for analysis and build a return table; log every ticker with missing stock data

In [97]:
tickers_prices_table = {}
# The price window starts on 1 January of the year after date_start and is
# the same for every ticker, so its start date is computed once.
date_stat_price = dt.datetime.strptime("%s-01-01" % (date_start.year + 1), "%Y-%m-%d")

for ticker in tickers_fits_for_analysis:
    file_ticker_prices = os.path.join(dir_ticker_prices, ticker + ".csv")
    if not os.path.isfile(file_ticker_prices):
        print("Stock data is missing for %s" % ticker)
        continue

    price_df = pd.read_csv(file_ticker_prices)
    price_df["Date"] = pd.to_datetime(price_df["Date"])
    price_df = price_df.sort_values(by=["Date"]).set_index("Date")

    ticker_data = {}
    prev_day = None
    for index, day in price_df[date_stat_price : date_end].iterrows():
        # First row of the window gets 1; every later row gets its
        # "Adj Close" divided by the previous row's "Adj Close".
        ticker_data[index] = 1 if prev_day is None else day["Adj Close"] / prev_day["Adj Close"]
        prev_day = day
    tickers_prices_table[ticker] = ticker_data

Convert to pandas data frame

In [98]:
df_return = pd.DataFrame.from_dict(tickers_prices_table)

In [99]:
file_return_table = os.path.join(dir_ticker_prices, "all-returns.csv")
df_return.to_csv(file_return_table)

## Industry indices data completeness

read mapping file

In [100]:
dir_ticker_mappings = os.path.join(dir_data_raw, "topics_industries_mapping")
file_mapping = os.path.join(dir_ticker_mappings, "mapping.json")

In [101]:
with open(file_mapping, "r") as f_r:
    mappings = json.load(f_r)
# Keep the "ticker" value of every mapping entry whose ticker is truthy.
ticker_industries = [mapping["ticker"] for mapping in mappings if mapping["ticker"]]

In [102]:
print(len(ticker_industries))
print(ticker_industries)

19
['XLF', 'KIE', 'XLK', 'XTN', 'XHS', 'XME', 'PEJ', 'XLC', 'XHB', 'XLB', 'XLP', 'XLV', 'XLI', 'XLRE', 'XAR', 'CARZ', 'XLE', 'XHE', 'XLY']


In [103]:
dir_ticker_industries_prices = os.path.join(dir_data_raw, "industies_indeces")

Iterate over the tickers available for mapping to topics and build a return table; log every ticker with missing data

In [104]:
tickers_industries_prices_table = {}
# The price window starts on 1 January of the year after date_start and is
# the same for every index, so its start date is computed once.
date_stat_price = dt.datetime.strptime("%s-01-01" % (date_start.year + 1), "%Y-%m-%d")

for ticker in ticker_industries:
    file_ticker_prices = os.path.join(dir_ticker_industries_prices, ticker + ".csv")
    if not os.path.isfile(file_ticker_prices):
        print("Stock data is missing for %s" % ticker)
        continue

    price_df = pd.read_csv(file_ticker_prices)
    price_df["Date"] = pd.to_datetime(price_df["Date"])
    price_df = price_df.sort_values(by=["Date"]).set_index("Date")

    ticker_data = {}
    first_date_set = False
    first_date_value = 1

    for index, day in price_df[date_stat_price : date_end].iterrows():
        # A zero "Adj Close" is replaced by the placeholder 1.
        day_value = day["Adj Close"] if day["Adj Close"] else 1
        # The baseline is the first usable price of the window. NaN is truthy
        # and differs from 1, so it must be excluded explicitly; otherwise a
        # leading NaN becomes the baseline and turns the whole series into NaN.
        if not first_date_set and day_value != 1 and not pd.isna(day_value):
            first_date_set = True
            first_date_value = day_value

        # Every value is expressed relative to the baseline price.
        ticker_data[index] = day_value / first_date_value
    tickers_industries_prices_table[ticker] = ticker_data

### Take care about NaN

In [105]:
df_indeces = pd.DataFrame.from_dict(tickers_industries_prices_table)

#### Fill empty non-tradable days with the value of the previous tradable day (currently disabled: the cell below is commented out)

In [106]:
#df_indeces.fillna(method='ffill', inplace=True)

#### Identify those with bad series

In [107]:
df_indeces.columns[df_indeces.isna().any()].tolist()

['XTN', 'XHS', 'XME', 'XLC', 'XHB', 'XLRE', 'XAR', 'CARZ', 'XHE']

#### Fill all incomplete series with ones (currently disabled: the cell below is commented out)

In [108]:
#df_indeces.fillna(1.0, inplace=True)

In [109]:
file_indices_table = os.path.join(dir_ticker_prices, "all-industries-indices.csv")
df_indeces.to_csv(file_indices_table)

# Build run data

## Init

In [110]:
run_prefix = "run_27_xx"

In [111]:
dir_run = os.path.join(dir_data_runs, run_prefix)

In [112]:
if not os.path.exists(dir_run):
    os.makedirs(dir_run)

## Reconstruct the terms dictionary from the reports-ready directory

In [113]:
# Collect the distinct terms of all ready reports that belong to a ticker
# selected for the analysis.
terms_set = set()
for report_name in os.listdir(dir_reports_ready):
    full_report_name = os.path.join(dir_reports_ready, report_name)
    if not os.path.isfile(full_report_name) or report_name == ".DS_Store":
        continue
    ticker, tail = report_name.split('-')
    if ticker not in tickers_fits_for_analysis:
        continue
    with open(full_report_name, 'r') as f_r:
        for text_line in f_r:
            terms_set.update(text_line.strip().split(' '))

# Ids are the positions of the terms in alphabetical order.
term_list = sorted(terms_set)
dict_term2id = {term: term_id for term_id, term in enumerate(term_list)}
dict_id2term = dict(enumerate(term_list))
id_counter = len(term_list)

## Define the report vectorization function

In [114]:
def vectorize_report_dtm(file_report):
    """Turn a ready report into a list of "term_id:count" strings.

    Every line of the file is stripped and split on single spaces; each term
    is translated to its id through the module-level ``dict_term2id`` and the
    occurrences per id are counted.

    Args:
        file_report: path of the report file to vectorize.

    Returns:
        list of "<term_id>:<count>" strings, ordered by the first appearance
        of each term id in the file.

    Raises:
        KeyError: if the file contains a term missing from ``dict_term2id``.
    """
    document_bow = dict()
    with open(file_report, 'r') as f_r:
        for text_line in f_r:
            for term in text_line.strip().split(' '):
                term_id = dict_term2id[term]
                document_bow[term_id] = document_bow.get(term_id, 0) + 1

    return ["%s:%s" % id_and_count for id_and_count in document_bow.items()]

## Read all reports (terms quantity map) for every ticker for every year in analysis

In [115]:
amount_documents_in_series_dict = dict()
documents_name_list = list()
documents_vector_list = list()
# The directory content does not change while this cell runs, so it is
# listed once instead of once per year.
ready_report_names = os.listdir(dir_reports_ready)
for year in year_series:
    amount_documents_in_series = 0
    # File names of the year look like "<TICKER>_<INDEX>-<year>_<number>.txt".
    # Raw strings are required: '\d' and '\.' are invalid escape sequences in
    # a plain string literal.
    regExp = re.compile(r'[A-Z\d]+_[A-Z\d]+-' + str(year) + r'_\d+\.txt$')
    reports_of_year = sorted(f for f in ready_report_names if regExp.search(f))
    # Reports are processed in file-name order so that mult.dat and
    # documents.dat stay aligned.
    for report_name in reports_of_year:
        ticker, tail = report_name.split('-')
        if ticker not in tickers_fits_for_analysis:
            continue
        amount_documents_in_series += 1
        documents_vector_list.append(vectorize_report_dtm(os.path.join(dir_reports_ready, report_name)))
        documents_name_list.append(report_name)

    # keep track of the number of documents in each yearly slice
    amount_documents_in_series_dict[int(year)] = amount_documents_in_series

### write results into files

save prefix-seq.dat

In [116]:
# First line: number of years in the series; then one document count per
# year, in ascending year order.
seq_values = [len(year_series)]
seq_values.extend(
    amount_documents_in_series_dict[year] for year in sorted(amount_documents_in_series_dict)
)
with open(os.path.join(dir_run, run_prefix + '-seq.dat'), 'w') as f_w:
    for seq_value in seq_values:
        f_w.write("%s\n" % seq_value)

save prefix-mult.dat

In [117]:
# One line per document: number of distinct term ids, then the
# "term_id:count" pairs separated by spaces.
file_mult = os.path.join(dir_run, run_prefix + '-mult.dat')
with open(file_mult, 'w') as f_w:
    f_w.writelines(
        "%s %s\n" % (len(document), ' '.join(document)) for document in documents_vector_list
    )

save prefix-documents.dat, with the documents in the same order as in mult.dat

In [118]:
# One report file name per line, in the order the reports were vectorized.
file_documents = os.path.join(dir_run, run_prefix + '-documents.dat')
with open(file_documents, 'w') as f_w:
    f_w.writelines("%s\n" % document for document in documents_name_list)

save prefix-terms.dat, one term per line; the (0-based) line position of a term is the term id used in mult.dat

In [119]:
# One term per line, in term_list order.
file_terms = os.path.join(dir_run, run_prefix + '-terms.dat')
with open(file_terms, 'w') as f_w:
    f_w.writelines("%s\n" % term for term in term_list)

Copy returns to the run folder

In [120]:
shutil.copyfile(file_return_table, os.path.join(dir_run, run_prefix + '-returns.csv'))

'../data_runs/run_27_xx/run_27_xx-returns.csv'

Save run setting to a file

In [121]:
# Snapshot of the term-filtering thresholds used for this run; it is
# serialized to JSON in the run directory by the next cell.
# NOTE(review): the key 'max_partition_of_ticker' is singular while the
# variable it stores is max_partition_of_tickers — confirm which spelling
# the consumers of the settings file expect before renaming either.
preprocesssing_settings = {
    'l1_min': l1_min,
    'max_partition_of_doc': max_partition_of_doc,
    'l1_max': l1_max,
    'l2_min': l2_min,
    'max_partition_of_ticker': max_partition_of_tickers,
    'l2_max': l2_max,
}

In [122]:
# Store the settings dictionary as a single JSON document.
file_settings = os.path.join(dir_run, run_prefix + '-preprocesssing_settings.dat')
with open(file_settings, 'w') as f_w:
    json.dump(preprocesssing_settings, f_w)


Copy reports for the run folder

In [123]:
dir_run_reports = os.path.join(dir_run, 'reports')

In [124]:
if not os.path.exists(dir_run_reports):
    os.makedirs(dir_run_reports)

In [125]:
# Copy every regular file of the reports-ready directory (except the macOS
# '.DS_Store' artefact) into the run's reports folder under the same name.
for report_file_name in os.listdir(dir_reports_ready):
    path_report_src = os.path.join(dir_reports_ready, report_file_name)
    path_report_dst = os.path.join(dir_run_reports, report_file_name)

    if report_file_name == '.DS_Store':
        continue
    if not os.path.isfile(path_report_src):
        continue
    shutil.copyfile(path_report_src, path_report_dst)

### create result directories

In [126]:
dir_results = os.path.join(dir_run, 'results')

if not os.path.exists(dir_results):
    os.makedirs(dir_results)

In [127]:
# Separate name for the interpretation folder: reusing `dir_results` here
# would leave that variable pointing at 'interpretation' instead of the
# 'results' directory it names.
dir_interpretation = os.path.join(dir_run, 'interpretation')

if not os.path.exists(dir_interpretation):
    os.makedirs(dir_interpretation)

# EnD