## Initialization

In [1]:
import os
import re
import json
import nltk
import pandas as pd
import datetime as dt
import numpy as np
import operator
import math
from nltk.stem.wordnet import WordNetLemmatizer
import multiprocessing

### set experiment dates

In [2]:
# Experiment window. Both bounds are parsed from date-only strings.
# `date_start.year` and `date_end.year` drive the list of report years, and
# `date_end` is the upper bound of the price slice used for the return table.
date_start = dt.datetime.strptime('2005-01-01', '%Y-%m-%d')
date_end = dt.datetime.strptime('2018-06-30', '%Y-%m-%d') 

### set run prefix

In [3]:
# Identifier of this run: names the run directory and prefixes every output file.
run_prefix = 'run_01_01'

### set root directories

In [4]:
# All paths are relative to the parent of the notebook's working directory.
dir_root = os.path.join('..')
# Raw inputs (text reports, extra stop words).
dir_data_raw = os.path.join(dir_root, 'data_raw')
# Derived data (cleaned reports, ticker lists, runs).
dir_data_processing = os.path.join(dir_root, 'data_processing')
# One "<ticker>.csv" price file per ticker is looked up here.
dir_prices = os.path.join(dir_data_processing, 'prices')

### set flags

In [5]:
# Print extra diagnostics (excluded words, removed terms, ...).
flag_debug = False
# Run the (slow) report cleaning step.
flag_rerun_cleaning = True
# Only validate raw report file names; when set, it takes precedence over cleaning.
flag_test_report_names = False

### create a run directory

In [6]:
# Every artefact of this run is written below data_processing/runs/<run_prefix>.
dir_run = os.path.join(dir_data_processing, 'runs', run_prefix)

In [7]:
# Create the run directory (and any missing parents). `exist_ok=True` makes the
# cell safe to re-run and avoids the check-then-create race.
os.makedirs(dir_run, exist_ok=True)

## Clean reports

Make sure that all nltk data sets are available

In [8]:
# Fetch the NLTK resources used below: WordNet backs the lemmatizer, 'words'
# is the English vocabulary and 'stopwords' the base stop-word list.
# NOTE(review): 'omw' is presumably needed by WordNet lookups — confirm.
nltk.download('wordnet')
nltk.download('words')
nltk.download('stopwords')
nltk.download('omw')

[nltk_data] Downloading package wordnet to /Users/alan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /Users/alan/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/alan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw to /Users/alan/nltk_data...
[nltk_data]   Package omw is already up-to-date!


True

In [9]:
# Vocabulary of accepted English words; lemmas that are not in it are discarded.
english_words_set = set(nltk.corpus.words.words())
# Bound `lemmatize` method, called as lemmatizer(word) or lemmatizer(word, pos).
lemmatizer = WordNetLemmatizer().lemmatize

### Create stop words set

In [10]:
# Base stop-word set from NLTK; it is extended with custom words afterwards.
stop_words_set = set(nltk.corpus.stopwords.words('english'))

Extend stop words with custom stop words

In [11]:
# Merge the project-specific stop words (one per line, surrounding
# whitespace stripped) into the NLTK set.
extra_stopwords_path = os.path.join(dir_data_raw, 'english', 'extra_stopwords.txt')
with open(extra_stopwords_path, 'r') as f_r:
    stop_words_set.update(text_line.strip() for text_line in f_r)

Define a checker function for stop words. A word is a stop word if any of the following is true:
- its length is shorter than 3 characters
- it contains digits
- it appears in the stop words set (NLTK stop words plus the custom ones)

In [12]:
def is_stop_term(term):
    """Tell whether ``term`` must be excluded from the vocabulary.

    Args:
        term: candidate term (a string).

    Returns:
        True when the term is shorter than three characters, contains at
        least one digit, or is a member of the module-level
        ``stop_words_set``; False otherwise.
    """
    if len(term) < 3:
        return True
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions. A single digit is
    # enough for the search to succeed, so no quantifier is needed.
    if re.search(r'\d', term):
        return True
    return term in stop_words_set

Define a lemmatizer function

In [13]:
def get_word_lemma(possible_word):
    """Reduce a token to its lemma, or reject it.

    The token is lemmatized twice: first with the lemmatizer's default part
    of speech, then the result is lemmatized again as a verb ('v').

    Args:
        possible_word: token to normalise (the caller passes it lower-cased).

    Returns:
        The lemma when it is not a stop term and belongs to
        ``english_words_set``; otherwise None.
    """
    base_form = lemmatizer(possible_word)
    candidate = lemmatizer(base_form, 'v')
    if is_stop_term(candidate) or candidate not in english_words_set:
        return None
    return candidate

Define a clean report function

In [14]:
def clean_report(file_report_path):
    """Build a term-frequency map for one raw text report.

    The file is read line by line as ISO-8859-1, split on runs of non-word
    characters, lower-cased and passed through ``get_word_lemma``. Tokens for
    which no lemma is returned are skipped (and printed when ``flag_debug``
    is set).

    Args:
        file_report_path: path of the raw text report.

    Returns:
        dict mapping each accepted lemma to its number of occurrences in the
        report; empty when nothing survives the cleaning.
    """
    document = dict()
    with open(file_report_path, 'r', encoding='ISO-8859-1') as f_r:
        for text_line in f_r:
            # Raw string: '\W' in a plain literal is an invalid escape sequence.
            for word in re.split(r'\W+', text_line):
                word_lower = word.lower()
                term = get_word_lemma(word_lower)
                if term is not None:
                    document[term] = document.get(term, 0) + 1
                elif flag_debug:
                    print("%s is excluded from analisys" % word_lower)
    return document

Iterate over all reports, clean and convert them to term frequency map, store as csv file

In [15]:
# Expected layout of a raw report file name (matched with re.search):
#   <ticker>[_-]<type><number?><-subnumber?>[-_]<year>[_-]<p_year>
# NOTE(review): the classes `[A-Z1-9]` / `[1-9]` do not accept the digit 0, so a
# ticker or report number containing '0' does not match — confirm this is intended.
check_report_name_reg_exp = '(?P<ticker>[A-Z1-9]+)[_-](?P<type>[A-Z]+)(?P<number>[1-9]*)(?P<subnumber>-[1-9]+)?[-_](?P<year>[0-9]{4})[_-](?P<p_year>[0-9]{4})'


In [16]:
# Input: raw text reports, organised as <index>/<ticker>/<report file>.
dir_reports_raw = os.path.join(dir_data_raw, 'reports_txt')
# Output: cleaned term-frequency CSV files, one sub-folder per "<ticker>_<index>".
dir_reports =  os.path.join(dir_data_processing, 'reports')

The following cell takes quite a while to run. Be careful and do not rerun it without a reason; its results are stored on the file system.

In [17]:
def ticker_reports_processing(dir_findex, ticker):
    """Clean every raw report of one ticker and store the term frequencies.

    Each report whose file name matches ``check_report_name_reg_exp`` is
    cleaned with ``clean_report``. Non-empty results are written to
    ``<dir_reports>/<ticker>_<index>/<p_year>-<n>.csv`` as ``term,count``
    lines, where ``n`` counts the non-empty reports of this ticker.

    Args:
        dir_findex: directory of one financial index; its last path component
            is used as the index name.
        ticker: name of the ticker sub-directory inside ``dir_findex``.

    Returns:
        None. Nothing happens when ``<dir_findex>/<ticker>`` is not a directory.
    """
    dir_ticker = os.path.join(dir_findex, ticker)
    if not os.path.isdir(dir_ticker):
        return

    # Derive the index name from the path instead of reading a global
    # `findex`, which only exists inside a worker process when it happens to
    # be inherited from the driver loop.
    findex = os.path.basename(os.path.normpath(dir_findex))
    ticker_code = "%s_%s" % (ticker, findex)
    print("working on %s at %s" % (ticker_code, dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    ticker_documents_amount = 0
    # Sorted listing keeps the <n> suffix of the output files reproducible
    # between runs (os.listdir returns entries in arbitrary order).
    for report_file_name in sorted(os.listdir(dir_ticker)):
        match = re.search(check_report_name_reg_exp, report_file_name)
        if not match:
            print("filename %s doesn't fit pattern" % report_file_name)
            continue

        dict_report = clean_report(os.path.join(dir_ticker, report_file_name))
        if len(dict_report):
            ticker_documents_amount += 1
            new_file_name = "%s-%s.csv" % (match.group('p_year'), ticker_documents_amount)
            new_path = os.path.join(dir_reports, ticker_code)
            # exist_ok avoids a check-then-create race between worker processes.
            os.makedirs(new_path, exist_ok=True)
            with open(os.path.join(new_path, new_file_name), 'w') as f_w:
                for term, tf in dict_report.items():
                    f_w.write("%s,%s\n" % (term, tf))
        elif flag_debug:
            print("report %s is empty after cleaning" % report_file_name)

In [18]:
def test_reports_names(dir_findex, ticker):
    dir_ticker = os.path.join(dir_findex, ticker)
    if os.path.isdir(dir_ticker):
        ticker_documents_amount = 0
        ticker_code = "%s_%s" % (ticker, findex)
        for report_file_name in os.listdir(dir_ticker):
            match = re.search(check_report_name_reg_exp, report_file_name)
            if not match:
                print("working on %s, filename %s doesn't fit pattern" % (ticker_code, report_file_name))

In [19]:
if flag_rerun_cleaning or flag_test_report_names:
    # File-name checking takes precedence over the full cleaning pass.
    worker = test_reports_names if flag_test_report_names else ticker_reports_processing
    # NOTE(review): `findex` is a notebook-level name here; keep it as long as
    # any worker still looks it up as a global.
    for findex in os.listdir(dir_reports_raw):
        dir_findex = os.path.join(dir_reports_raw, findex)
        if not os.path.isdir(dir_findex):
            continue
        jobs = [(dir_findex, ticker) for ticker in os.listdir(dir_findex)]
        # One pool of four processes per index; each job handles one ticker.
        with multiprocessing.Pool(processes=4) as pool:
            pool.starmap(worker, jobs)

working on TUI1_XETRA at 2019-01-10 07:33:12
working on CA_CAC at 2019-01-10 07:33:35
working on AC_CAC at 2019-01-10 07:33:35
working on EDF_CAC at 2019-01-10 07:33:35
working on ACA_CAC at 2019-01-10 07:33:35
working on CS_CAC at 2019-01-10 07:34:11
working on ORA_CAC at 2019-01-10 07:34:46
working on AIR_CAC at 2019-01-10 07:34:59
working on ML_CAC at 2019-01-10 07:35:14
working on AI_CAC at 2019-01-10 07:35:25
working on MC_CAC at 2019-01-10 07:35:28
working on CAP_CAC at 2019-01-10 07:36:23
working on BN_CAC at 2019-01-10 07:37:11
working on RI_CAC at 2019-01-10 07:37:21
working on BNP_CAC at 2019-01-10 07:37:48
working on EN_CAC at 2019-01-10 07:37:55
working on EI_CAC at 2019-01-10 07:37:59
working on OR_CAC at 2019-01-10 07:38:50
working on ENGI_CAC at 2019-01-10 07:39:02
working on LR_CAC at 2019-01-10 07:39:09
working on ALO_CAC at 2019-01-10 07:39:39
working on KER_CAC at 2019-01-10 07:40:20
working on UU_FTSE at 2019-01-10 07:41:12
working on WEIR_FTSE at 2019-01-10 07:41:1

## Get tickers for analysis

We want to find all tickers of companies which have reports for the experiment's timeframe.

In [20]:
# Output file listing, per ticker, the years for which cleaned reports exist
# (one JSON object per line).
file_tickers_years = os.path.join(dir_data_processing, 'tickers','tickers_years.datajson')

In [21]:
# Map every ticker folder to the sorted list of years that have at least one
# cleaned report. The year is the first four characters of the file name.
tickers = dict()
for ticker in os.listdir(dir_reports):
    dir_ticker = os.path.join(dir_reports, ticker)
    if os.path.isdir(dir_ticker):
        # Only cleaned report files are considered: a stray entry such as
        # '.DS_Store' would otherwise make int(...) raise ValueError.
        ticker_years_set = {
            int(report[:4])
            for report in os.listdir(dir_ticker)
            if report.endswith('.csv')
        }
        tickers[ticker] = sorted(ticker_years_set)

save years of companies

In [22]:
# The target folder is not created by any earlier cell; make sure it exists so
# the notebook also runs on a fresh checkout.
os.makedirs(os.path.dirname(file_tickers_years), exist_ok=True)
# One JSON object per line: {"ticker": ..., "available_years": [...]}.
with open(file_tickers_years, 'w') as f_w:
    for ticker, available_years in tickers.items():
        f_w.write("%s\n" % json.dumps({"ticker": ticker, "available_years": available_years}))

Find tickers with reports for every year in the experiment timeframe

In [23]:
# Years covered by the experiment. range() excludes its stop value, so
# date_end.year itself is not part of the series.
# NOTE(review): confirm that leaving out the final year is intended.
year_series = list(range(date_start.year, date_end.year)) 

In [24]:
# Show the computed year series only in debug mode.
if flag_debug:
    print(year_series)

In [25]:
# Folder holding ticker lists, and the file that receives the tickers selected
# for the experiment (one ticker per line).
dir_data_tickers = os.path.join(dir_data_processing, 'tickers')
file_tickers_for_analysis = os.path.join(dir_data_tickers,'ticker_for_analysis.csv')

In [26]:
# A ticker qualifies when every year of the series has at least one report
# and a "<ticker>.csv" price file exists.
required_years = set(year_series)
tickers_fits_for_analysis = set()
for ticker, available_years in tickers.items():
    if not required_years.issubset(available_years):
        continue
    if os.path.exists(os.path.join(dir_prices, "%s.csv" % ticker)):
        tickers_fits_for_analysis.add(ticker)

In [27]:
# Display how many tickers passed the complete-series and price-file checks.
"there are %s tickers available for the experiment" % len(tickers_fits_for_analysis)

'there are 78 tickers available for the experiment'

save companies with complete years series

In [28]:
# Write one ticker per line. The set is sorted so the file content is
# identical between runs (set iteration order is not stable).
with open(file_tickers_for_analysis, 'w') as f_w:
    for ticker in sorted(tickers_fits_for_analysis):
        f_w.write("%s\n" % ticker)

## Return table

### Set paths

In [29]:
# Folder with one "<ticker>.csv" price file per ticker (same location as `dir_prices`).
dir_ticker_prices = os.path.join(dir_data_processing, 'prices')

Iterate over the tickers available for analysis and build a return table

In [30]:
# Build {ticker: {date: gross return}} where the gross return of a day is
# Adj Close(day) / Adj Close(previous row); the first row of the window is
# set to 1 because it has no predecessor.
tickers_prices_table = {}

# The price window starts on 1 January of the year after `date_start` and
# ends at `date_end`.
date_stat_price = dt.datetime.strptime("%s-01-01" % (date_start.year + 1), '%Y-%m-%d')

for ticker in tickers_fits_for_analysis:
    file_ticker_prices = os.path.join(dir_ticker_prices, ticker + '.csv')
    if not os.path.isfile(file_ticker_prices):
        continue

    price_df = pd.read_csv(file_ticker_prices)
    price_df['Date'] = pd.to_datetime(price_df['Date'])
    # Chronological, date-indexed adjusted close restricted to the window.
    adj_close = price_df.sort_values(by=['Date']).set_index('Date')['Adj Close']
    adj_close = adj_close.loc[date_stat_price:date_end]

    # Vectorised ratio to the previous row instead of a Python-level row loop.
    ticker_returns = adj_close / adj_close.shift(1)
    if len(ticker_returns):
        ticker_returns.iloc[0] = 1
    tickers_prices_table[ticker] = ticker_returns.to_dict()

Convert to pandas data frame

In [31]:
# One column per ticker, one row per date; dates missing for a ticker become NaN.
df_return = pd.DataFrame.from_dict(tickers_prices_table)

In [32]:
# Persist the return table as <run_prefix>-returns.csv inside the run directory.
file_return_table = os.path.join(dir_run, run_prefix + '-returns.csv')
df_return.to_csv(file_return_table)

## Get corpus statistics 

In [33]:
# Corpus-wide counters: total number of report files, and per-term statistics
# keyed by term.
number_of_documents, term_in_documents_amount = 0, dict()

In [34]:
# Reset the counters here so that re-running this cell does not accumulate on
# top of a previous execution.
number_of_documents = 0
term_in_documents_amount = {}

# For every cleaned report of every selected ticker, count the report and
# accumulate, per term, its total usage and the number of reports it occurs in.
for ticker in tickers_fits_for_analysis:
    dir_ticker = os.path.join(dir_reports, ticker)
    if not os.path.isdir(dir_ticker):
        continue
    for report in os.listdir(dir_ticker):
        # Match on the extension rather than on any '.csv' substring.
        if not report.endswith('.csv'):
            continue
        number_of_documents += 1
        with open(os.path.join(dir_ticker, report), 'r') as f_r:
            for text_line in f_r:
                # Each line is "term,count"; a term appears once per report file.
                term, amount = text_line.strip().split(',')
                term_stats = term_in_documents_amount.get(term)
                if term_stats is None:
                    term_stats = {
                        'term': term,
                        'total_usage': 0,
                        'in_documents_amount': 0}
                    term_in_documents_amount[term] = term_stats
                term_stats['total_usage'] += int(amount)
                term_stats['in_documents_amount'] += 1


In [35]:
# Display the total number of cleaned report files counted above.
number_of_documents

8696

In [36]:
# Display the size of the unfiltered vocabulary.
"there are %s terms available for the experiment" % len(term_in_documents_amount) 

'there are 27441 terms available for the experiment'

## Filtered term set

### set terms limits

In [37]:
# A term is kept only if it occurs in strictly more than `min_number_of_doc`
# documents and in strictly fewer than `max_number_of_doc` documents.
min_number_of_doc = 1
# Number of most used terms (by total usage) to drop unconditionally.
remove_n_top_terms = 0
# Upper bound on document frequency, as a fraction of all documents.
max_partition_of_doc = 0.5
max_number_of_doc = number_of_documents * max_partition_of_doc

### get top N most common terms

In [38]:
# Term statistics ordered by total usage, most used first.
sorted_terms = sorted(term_in_documents_amount.values(),
                      key=operator.itemgetter('total_usage'),
                      reverse=True)

# The `remove_n_top_terms` most used terms are excluded. Slicing (instead of
# indexing by position) copes with a limit larger than the vocabulary; the
# max() keeps a negative limit equivalent to "exclude nothing".
set_top_n_terms_filter = set()
for top_term in sorted_terms[:max(remove_n_top_terms, 0)]:
    set_top_n_terms_filter.add(top_term['term'])
    if flag_debug:
        print("Excluded term: %s" % top_term['term'])


### build terms set

In [39]:
# Keep the terms that are not in the top-N filter and whose document frequency
# lies strictly between the two limits; in debug mode report why a term is dropped.
terms_set = set()
for term_o in sorted_terms:
    term = term_o['term']
    in_documents_amount = term_o['in_documents_amount']

    if term in set_top_n_terms_filter:
        if flag_debug:
            print("'%s' removed by top_n_term_filter" % term)
        continue

    if in_documents_amount <= min_number_of_doc:
        if flag_debug:
            print("'%s' removed by min_number_of_doc" % term)
        continue

    if in_documents_amount >= max_number_of_doc:
        if flag_debug:
            print("'%s' removed by max_number_of_doc" % term)
        continue

    terms_set.add(term)

In [40]:
# Display the size of the vocabulary after filtering.
"there are %s filtered terms available for the experiment" % len(terms_set) 

'there are 25368 filtered terms available for the experiment'

# Build run data

### Create term -> id dictionary

In [41]:
# Terms are numbered 0..N-1 following their sorted order; `terms_list` keeps
# that order for later output.
terms_list = sorted(terms_set)
dict_term2id = {term: term_id for term_id, term in enumerate(terms_list)}
id_counter = len(terms_list)

Define a report vectorization function

In [42]:
def vectorize_report(file_report):
    """Turn a cleaned report file into a sparse "id:count" representation.

    Args:
        file_report: path of a cleaned report whose lines are ``term,count``.

    Returns:
        list of strings ``"<term id>:<count>"``, in file order, for the terms
        present in ``dict_term2id``; other terms are skipped.
    """
    with open(file_report, 'r') as f_r:
        pairs = (row.strip().split(',') for row in f_r)
        return [
            "%s:%s" % (dict_term2id[term], tf)
            for term, tf in pairs
            if term in dict_term2id
        ]

### Read all reports (terms quantity map) for every ticker for every year in analysis

In [43]:
amount_documents_in_series_dict = dict()
documents_name_list = list()
documents_vector_list = list()

# List every ticker folder once. Only cleaned report files are kept (a stray
# non-report entry would make int(report[:4]) fail) and the listing is sorted
# so the document order is reproducible between runs.
reports_by_ticker = dict()
for ticker in sorted(tickers_fits_for_analysis):
    dir_ticker_reports = os.path.join(dir_reports, ticker)
    reports_by_ticker[ticker] = (
        dir_ticker_reports,
        sorted(report for report in os.listdir(dir_ticker_reports) if report.endswith('.csv')),
    )

# Documents are emitted year by year, and by ticker within a year; names and
# vectors are appended together so both lists stay aligned.
for year in year_series:
    amount_documents_in_series = 0
    for ticker in sorted(reports_by_ticker):
        dir_ticker_reports, ticker_reports = reports_by_ticker[ticker]
        for report in ticker_reports:
            if int(report[:4]) == year:
                amount_documents_in_series += 1
                documents_vector_list.append(vectorize_report(os.path.join(dir_ticker_reports, report)))
                documents_name_list.append(ticker + '-' + report)

    # keep track of the number of documents in each yearly slice
    amount_documents_in_series_dict[int(year)] = amount_documents_in_series

### write results into files

In [44]:
# <prefix>-seq.dat: first the number of yearly slices, then the number of
# documents in each slice, in ascending year order.
seq_values = [len(year_series)]
seq_values.extend(amount_documents_in_series_dict[year]
                  for year in sorted(amount_documents_in_series_dict.keys()))
with open(os.path.join(dir_run, run_prefix + '-seq.dat'), 'w') as f_w:
    f_w.writelines("%s\n" % value for value in seq_values)

In [45]:
# <prefix>-mult.dat: one line per document, formatted as
# "<number of entries> <term id>:<count> <term id>:<count> ...".
mult_lines = ("%s %s\n" % (len(document), ' '.join(document))
              for document in documents_vector_list)
with open(os.path.join(dir_run, run_prefix + '-mult.dat'), 'w') as f_w:
    f_w.writelines(mult_lines)

In [46]:
# <prefix>-documents.dat: one document name per line, in the same order as
# the lines of <prefix>-mult.dat.
with open(os.path.join(dir_run, run_prefix + '-documents.dat'), 'w') as f_w:
    f_w.writelines("%s\n" % document_name for document_name in documents_name_list)

In [47]:
# <prefix>-terms.dat: one term per line; the zero-based line number of a term
# is its id in `dict_term2id`.
with open(os.path.join(dir_run, run_prefix + '-terms.dat'), 'w') as f_w:
    f_w.writelines("%s\n" % term for term in terms_list)

### create result directories

In [48]:
# Folder for the experiment results; created if missing, safe to re-run.
dir_results = os.path.join(dir_run, 'results')
os.makedirs(dir_results, exist_ok=True)

In [49]:
# Folder for plots. A dedicated name is used so that `dir_results` is not
# overwritten with an unrelated path.
dir_plots = os.path.join(dir_run, 'plots')
os.makedirs(dir_plots, exist_ok=True)

In [50]:
# Folder for interpretation artefacts. A dedicated name is used so that
# `dir_results` is not overwritten with an unrelated path.
dir_interpretation = os.path.join(dir_run, 'interpretation')
os.makedirs(dir_interpretation, exist_ok=True)

# Go and run experiment

gogogogogo