# Initialization

In [1]:
import os
import re
import json
import pandas as pd
import datetime as dt
import numpy as np
import operator
import math
import multiprocessing

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline

### set experiment dates

In [2]:
# Start and end dates of the experiment window, parsed from ISO "YYYY-MM-DD" strings.
date_start = dt.datetime.strptime("2005-01-01", "%Y-%m-%d")
date_end = dt.datetime.strptime("2018-06-30", "%Y-%m-%d") 

### set root directories

In [3]:
# All data directories are resolved relative to the parent of the working directory.
dir_root = os.path.join('..')
dir_data_raw = os.path.join(dir_root, "data_raw")                # input data (reports, word lists)
dir_data_processing = os.path.join(dir_root, "data_processing")  # intermediate results of each stage
dir_data_runs = os.path.join(dir_root, "data_runs")              # one sub-directory per run
dir_prices = os.path.join(dir_data_processing, "prices")         # one "<ticker>.csv" price file per ticker

### Set reports directories

In [4]:
# One directory per pre-processing stage: txt -> words -> terms -> grams -> ready.
# The directory names are runtime paths; their spelling (e.g. "reports_gramms") is kept as-is.
dir_reports_txt = os.path.join(dir_data_raw, "reports_txt")
dir_reports_words = os.path.join(dir_data_processing, "reports_words")
dir_reports_terms = os.path.join(dir_data_processing, "reports_terms")
dir_reports_grams = os.path.join(dir_data_processing, "reports_gramms")
dir_reports_ready =  os.path.join(dir_data_processing, "reports_ready")

### Set terms directories

In [5]:
# Per-ticker JSON files of terms dropped during lemmatization are written here
# (the directory name is a runtime path; its spelling is kept as-is).
dir_terms_eliminated = os.path.join(dir_data_processing, "terms_elemenated")
# Corpus-wide term statistics ("terms.json") are written here.
dir_terms_counts = os.path.join(dir_data_processing, "terms_counts")

### Set report name RegExp

In [6]:
# Pattern for report file names:
#   <ticker>{_|-}<type><number>[-<subnumber>]{_|-}<year>{_|-}<p_year>
# The digit classes use 0-9 (the previous 1-9 classes rejected any name containing
# the digit 0 in the ticker, number or subnumber, e.g. "AR10").
check_report_name_reg_exp = (
    r"(?P<ticker>[A-Z0-9]+)[_-]"
    r"(?P<type>[A-Z]+)(?P<number>[0-9]*)(?P<subnumber>-[0-9]+)?[-_]"
    r"(?P<year>[0-9]{4})[_-](?P<p_year>[0-9]{4})"
)

### set flags

In [7]:
# Verbose-output switches.
flag_debug = False
flag_terms_filter_debug = False

# Pipeline options: load extra stop words, check report file names, filter the bigram-condensed reports.
flag_extend_stopwords = True
flag_test_report_names = True
flag_filtering_with_bigramms = True

# Re-run switches for the pre-processing stages; each stage stores its results on disk.
flag_rerun_text_2_words = False
flag_rerun_words_2_terms = False
flag_rerun_terms_2_gramms = False
# Set to True to (re)generate the final reports_ready data.
# NOTE(review): the previous comment said "keep it True" while the value is False - confirm the intended default.
flag_rerun_filter_terms = False

# Test reports names

In [8]:
def test_reports_names(dir_findex, ticker):
    dir_ticker = os.path.join(dir_findex, ticker)
    if os.path.isdir(dir_ticker):
        ticker_documents_amount = 0
        ticker_code = "%s_%s" % (ticker, findex)
        for report_file_name in os.listdir(dir_ticker):
            if report_file_name == ".DS_Store":
                continue

            match = re.search(check_report_name_reg_exp, report_file_name)
            if not match:
                print("working on %s, filename %s doesn't fit pattern" % (ticker_code, report_file_name))

In [9]:
# Check the report file names of every ticker, one worker pool per index directory.
if flag_test_report_names:
    for findex in os.listdir(dir_reports_txt):
        dir_findex = os.path.join(dir_reports_txt, findex)
        if not os.path.isdir(dir_findex):
            continue
        name_check_jobs = [(dir_findex, ticker) for ticker in os.listdir(dir_findex)]
        with multiprocessing.Pool(processes=4) as pool:
            pool.starmap(test_reports_names, name_check_jobs)


# Get tickers for analysis

## Reports completeness

We want to find all tickers of companies which have reports for the experiment's timeframe.

In [10]:
# Output file: one JSON object per line with a ticker and its available report years.
file_tickers_years = os.path.join(dir_data_processing, "tickers", "tickers_years.datajson")

Iterate over all reports and collect the years of publishing for every company

In [11]:
# Map "<ticker>_<index>" -> sorted list of years (the `p_year` group of the file name)
# for which the ticker has at least one report.
tickers = dict()
for findex in os.listdir(dir_reports_txt):
    dir_findex = os.path.join(dir_reports_txt, findex)
    if not os.path.isdir(dir_findex):
        continue
    for ticker in os.listdir(dir_findex):
        dir_ticker = os.path.join(dir_findex, ticker)
        if not os.path.isdir(dir_ticker):
            continue
        ticker_years_set = set()
        for report_file_name in os.listdir(dir_ticker):
            if report_file_name == ".DS_Store":
                continue
            match = re.search(check_report_name_reg_exp, report_file_name)
            if match is None:
                # A file that does not fit the naming pattern carries no usable year;
                # skip it instead of failing on `None.group(...)`.
                print("filename %s doesn't fit pattern" % report_file_name)
                continue
            ticker_years_set.add(int(match.group("p_year")))
        tickers["%s_%s" % (ticker, findex)] = sorted(ticker_years_set)

save years of companies

In [12]:
# Write one JSON object per line: {"ticker": ..., "available_years": [...]}.
# The target directory is created first; nothing earlier in the notebook creates it.
os.makedirs(os.path.dirname(file_tickers_years), exist_ok=True)
with open(file_tickers_years, "w") as f_w:
    for ticker, available_years in tickers.items():
        f_w.write("%s\n" % json.dumps({"ticker": ticker, "available_years": available_years}))
        

Find tickers with reports for every year in the experiment timeframe

In [13]:
# Years covered by the experiment, from date_start.year up to but not including date_end.year.
# NOTE(review): range() excludes the end year itself - confirm this is intended.
year_series = list(range(date_start.year, date_end.year)) 

In [14]:
# Show the year series only in debug mode.
if flag_debug:
    print(year_series)

In [15]:
# Output file listing the tickers selected for the analysis, one per line.
dir_data_tickers = os.path.join(dir_data_processing, "tickers")
file_tickers_for_analysis = os.path.join(dir_data_tickers, "ticker_for_analysis.csv")

In [16]:
# Keep a ticker when it has a report for every year of the series and a price file exists for it.
tickers_fits_for_analysis = set()
tickers_all = 0
required_years = set(year_series)
for ticker, available_years in tickers.items():
    has_every_year = required_years.issubset(available_years)
    if has_every_year and os.path.exists(os.path.join(dir_prices, "%s.csv" % ticker)):
        tickers_fits_for_analysis.add(ticker)

In [17]:
# Cell output: the number of tickers in tickers_fits_for_analysis.
"there are %s tickers available for the experiment" % len(tickers_fits_for_analysis)

'there are 79 tickers available for the experiment'

save companies with complete years series

In [18]:
# One selected ticker per line.
with open(file_tickers_for_analysis, "w") as f_w:
    f_w.writelines("%s\n" % selected for selected in tickers_fits_for_analysis)

## Stock data completeness

In [19]:
# Directory with one "<ticker>.csv" price file per ticker (the same path as dir_prices).
dir_ticker_prices = os.path.join(dir_data_processing, "prices")

Iterate over the tickers available for analysis and build a return table; print a log message for missing stock data

In [20]:
# Build, per ticker, a mapping date -> ratio of "Adj Close" to the previous row's "Adj Close".
# The first row inside the window gets the value 1.
tickers_prices_table = {}
# Prices are used from January 1st of the year after the experiment start.
date_stat_price = dt.datetime.strptime("%s-01-01" % (date_start.year + 1), "%Y-%m-%d")
for ticker in tickers_fits_for_analysis:
    file_ticker_prices = os.path.join(dir_ticker_prices, ticker + ".csv")
    if not os.path.isfile(file_ticker_prices):
        print("Stock data is missing for %s" % ticker)
        continue

    price_df = pd.read_csv(file_ticker_prices)
    price_df["Date"] = pd.to_datetime(price_df["Date"])
    price_df = price_df.sort_values(by=["Date"]).set_index("Date")

    ticker_data = {}
    prev_day = None
    for index, day in price_df[date_stat_price : date_end].iterrows():
        ticker_data[index] = 1 if prev_day is None else day["Adj Close"] / prev_day["Adj Close"]
        prev_day = day
    tickers_prices_table[ticker] = ticker_data

Convert to pandas data frame

In [21]:
# One column per ticker; the row index is built from the dates used as inner dictionary keys.
df_return = pd.DataFrame.from_dict(tickers_prices_table)

In [22]:
# Store the return table next to the price files.
file_return_table = os.path.join(dir_ticker_prices, "all-returns.csv")
df_return.to_csv(file_return_table)

# Reports pre-processing

## Initial clean up

First remove everything but English letters and re-save the reports as a sequence of lower-case words consisting only of the letters a-z

In [23]:
# NOTE(review): regexp_to_remove is not referenced by any code visible in this notebook.
regexp_to_remove = re.compile(r"[\dâºâãï½ã\_]")
# Matches every character that is neither a lower-case letter a-z nor whitespace.
regexp_to_keep = re.compile(r"[^a-z\s]")

In [24]:
def convert_raw_text_2_words(file_report_path):
    """Read a raw report and return its words as a list of lower-case a-z strings.

    Each line is lower-cased, every character matched by ``regexp_to_keep`` is replaced
    with a space, and the rest is split on non-word characters. Words shorter than two
    characters are dropped.

    Parameters:
        file_report_path: path of the raw text report (read as ISO-8859-1).

    Returns:
        list of str, in reading order.
    """
    words = []
    with open(file_report_path, "r", encoding="ISO-8859-1") as f_r:
        for text_line in f_r:
            cleaned_text = re.sub(regexp_to_keep, " ", text_line.lower())
            # Raw string: "\W" in a plain string literal is an invalid escape sequence.
            for possible_word in re.split(r"\W+", cleaned_text):
                word = possible_word.strip()
                if len(word) > 1:
                    words.append(word)
    return words

In [25]:
def reports_2_words_processing(dir_findex, ticker, findex, years_set):
    """Convert the raw reports of one ticker into word files under ``dir_reports_words``.

    Tickers that are not in ``tickers_fits_for_analysis`` are skipped. Only reports whose
    ``p_year`` (taken from the file name) is in ``years_set`` are converted; each
    non-empty result is written as "<year>_<n>.txt", where n counts the files written
    so far for this ticker.

    Parameters:
        dir_findex: index directory that contains the ticker directory.
        ticker: ticker directory name.
        findex: index name, used for the ticker code and the output path.
        years_set: set of years to keep.
    """
    ticker_code = "%s_%s" % (ticker, findex)
    if ticker_code not in tickers_fits_for_analysis:
        print("Skip %s" % ticker_code)
        return

    dir_ticker = os.path.join(dir_findex, ticker)
    if not os.path.isdir(dir_ticker):
        return

    written = 0
    blank = 0
    for report_file_name in os.listdir(dir_ticker):
        if report_file_name == ".DS_Store":
            continue
        match = re.search(check_report_name_reg_exp, report_file_name)
        if not match:
            print("filename %s doesn't fit pattern" % report_file_name)
            continue
        year = int(match.group("p_year"))
        if year not in years_set:
            continue

        list_words = convert_raw_text_2_words(os.path.join(dir_ticker, report_file_name))
        if not len(list_words):
            blank += 1
            if flag_debug:
                print("report %s is empty after cleaning" % report_file_name)
            continue

        written += 1
        target_dir = os.path.join(dir_reports_words, findex, ticker)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        target_file = os.path.join(target_dir, "%s_%s.txt" % (year, written))
        with open(target_file, "w") as f_w:
            f_w.write("%s" % ' '.join(list_words))

    print("Done on %s, good reports: %s, empty reports: %s" %
          (dir_ticker, written, blank))
                    

Run the previously defined function reports_2_words_processing in a pool of 4 processes to speed up the cleaning. The following cell takes quite a while; be careful and do not rerun it without a reason. The results are stored in the file system.

**Please make sure that flag_rerun_text_2_words is set to True if you want to run/re-run this preprocessing step**

In [26]:
# Convert raw reports to word files, one worker pool per index directory.
if flag_rerun_text_2_words:
    filtering_years_set = set(year_series)
    for findex in os.listdir(dir_reports_txt):
        dir_findex = os.path.join(dir_reports_txt, findex)
        if not os.path.isdir(dir_findex):
            continue
        words_jobs = [(dir_findex, ticker, findex, filtering_years_set)
                      for ticker in os.listdir(dir_findex)]
        with multiprocessing.Pool(processes=4) as pool:
            pool.starmap(reports_2_words_processing, words_jobs)
                

## Lemmatization and English words filter

In [27]:
import nltk
from nltk.corpus import brown

Make sure that all nltk data sets are available

In [28]:
# Fetch the nltk data packages; "stopwords" and "words" are read via nltk.corpus below.
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download("words")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Alan_Spark/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Alan_Spark/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/Alan_Spark/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### Create stop words set and white words set

#### Create stop words set

In [29]:
# English stop words from nltk, plus a few hard-coded extras.
stop_words = nltk.corpus.stopwords.words('english')
# Vocabulary used to decide whether a term is an English word.
english_words = set(nltk.corpus.words.words())
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
stop_words_set = set(stop_words)

Extend stop words with custom stop words

In [30]:
# Add the custom stop words (one per line, surrounding whitespace stripped).
if flag_extend_stopwords:
    with open(os.path.join(dir_data_raw, "english", "extra_stopwords.txt"), "r") as f_r:
        stop_words_set.update(extra_line.strip() for extra_line in f_r)

#### Create a white list of words

In [31]:
# Words listed here (one per line) are never treated as stop words.
with open(os.path.join(dir_data_raw, "english", "white_stopwords.txt"), "r") as f_r:
    white_list_set = {white_line.strip() for white_line in f_r}

#### Filter functions

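### Define a checker function for stop words. A word is a stop word if any of the following is true (words on the white list are never treated as stop words):
- its length is shorter than 3 characters
- it contains digits (digits are already stripped by the initial clean-up step, so the checker does not test for them)
- it appears in the stop words set (nltk stop words plus the custom extensions)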

In [32]:
def is_stop_term(term):
    """Return True when ``term`` should be discarded as a stop word.

    White-listed terms are never stop words. Otherwise a term is a stop word when it
    is shorter than three characters or is contained in ``stop_words_set``.
    """
    white_listed = term in white_list_set
    return (not white_listed) and (len(term) < 3 or term in stop_words_set)

Define a lemmatizer function

In [33]:
import spacy

Make sure that en spacy data set is available

In [34]:
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/Users/Alan_Spark/opt/anaconda3/lib/python3.7/site-packages/en_core_web_sm -->
/Users/Alan_Spark/opt/anaconda3/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [35]:
# Load the spaCy English pipeline with the parser and NER components disabled.
nlp = spacy.load('en', disable=['parser', 'ner'])
#lemmatizer = WordNetLemmatizer().lemmatize
#stemmer = SnowballStemmer("english").stem

#### Processing functions

In [36]:
def tokenize(text_line, terms, eliminated_terms):
    """Lemmatize ``text_line`` and sort each token into kept or eliminated terms.

    Parameters:
        text_line: text to run through the spaCy pipeline ``nlp``.
        terms: list that receives the kept terms (appended in place).
        eliminated_terms: dict term -> count that is incremented for every dropped term.

    A token's lemma is used as the term, except for the "-PRON-" placeholder, where
    the token text is used. A term is dropped when it is a stop term or is not in
    ``english_words``.
    """
    for token in nlp(text_line):
        lemma = token.lemma_
        term = token.text if lemma == "-PRON-" else lemma
        if not is_stop_term(term) and term in english_words:
            terms.append(term)
        else:
            eliminated_terms[term] = eliminated_terms.get(term, 0) + 1

In [37]:
def convert_words_2_terms(file_report_path, eliminated_terms):
    """Lemmatize and filter the words of one report file.

    Every line is split into words and passed to ``tokenize`` in chunks of 30 words.

    Parameters:
        file_report_path: path of a word file (read as UTF-8).
        eliminated_terms: dict term -> count, updated in place by ``tokenize``.

    Returns:
        list of kept terms, in reading order.
    """
    terms = []
    chunk_size = 30
    with open(file_report_path, "r", encoding="utf-8") as f_r:
        for text_line in f_r:
            # Raw string: "\W" in a plain string literal is an invalid escape sequence.
            words_in_line = re.split(r"\W+", text_line)
            # Integer division gives the number of full chunks without a float round trip.
            steps = len(words_in_line) // chunk_size
            for i in range(steps):
                chunk = words_in_line[i * chunk_size:(i + 1) * chunk_size]
                tokenize(" ".join(chunk), terms, eliminated_terms)
            # Remainder that does not fill a whole chunk (may be empty).
            tokenize(" ".join(words_in_line[steps * chunk_size:]), terms, eliminated_terms)

    return terms

In [38]:
def words_2_terms_processing(dir_findex, ticker):
    """Convert all word files of one ticker into term files under ``dir_reports_terms``.

    Terms dropped along the way are counted and, if there are any, written to
    "<ticker>_<index>.json" in ``dir_terms_eliminated``.

    Parameters:
        dir_findex: index directory (inside the words directory) that contains the ticker directory.
        ticker: ticker directory name.

    Returns None. Does nothing when ``dir_findex/ticker`` is not a directory.
    """
    dir_ticker = os.path.join(dir_findex, ticker)
    if not os.path.isdir(dir_ticker):
        return

    # The index name is taken from the directory itself instead of a global `findex`,
    # so the function also works in worker processes that do not inherit notebook globals.
    findex = os.path.basename(os.path.normpath(dir_findex))

    dict_eliminated_terms = {}
    good_documents_amount = 0
    empty_documents_amount = 0
    for report_file_name in os.listdir(dir_ticker):
        if report_file_name == ".DS_Store":
            continue
        terms_list = convert_words_2_terms(os.path.join(dir_ticker, report_file_name), dict_eliminated_terms)
        if len(terms_list):
            good_documents_amount += 1
            new_path = os.path.join(dir_reports_terms, findex, ticker)
            os.makedirs(new_path, exist_ok=True)
            with open(os.path.join(new_path, report_file_name), "w") as f_w:
                f_w.write("%s" % ' '.join(terms_list))
        else:
            empty_documents_amount += 1
            if flag_debug:
                print("report %s is empty after cleaning" % report_file_name)

    if len(dict_eliminated_terms):
        os.makedirs(dir_terms_eliminated, exist_ok=True)
        with open(os.path.join(dir_terms_eliminated, "%s_%s.json" % (ticker, findex)), "w") as f_w:
            json.dump(dict_eliminated_terms, f_w)
    print("Done on %s, good reports: %s, empty reports: %s" %
          (dir_ticker, good_documents_amount, empty_documents_amount))
                

Run the previously defined function words_2_terms_processing in a pool of 4 processes to speed up the cleaning. The following cell takes quite a while; be careful and do not rerun it without a reason. The results are stored in the file system.

**Please make sure that flag_rerun_words_2_terms is set to True if you want to run/re-run this preprocessing step**

In [39]:
# Lemmatize and filter the word files, one worker pool per index directory.
if flag_rerun_words_2_terms:
    for findex in os.listdir(dir_reports_words):
        dir_findex = os.path.join(dir_reports_words, findex)
        if not os.path.isdir(dir_findex):
            continue
        terms_jobs = [(dir_findex, ticker) for ticker in os.listdir(dir_findex)]
        with multiprocessing.Pool(processes=4) as pool:
            pool.starmap(words_2_terms_processing, terms_jobs)
                

## Summarize the eliminated words

Set flag_terms_filter_debug to True to print all the eliminated words

In [40]:
# Merge the per-ticker eliminated-term files into one summary:
# word -> {"count": total occurrences, "ticker": number of ticker files containing it}.
eliminated_words = {}
skipped_names = ('.DS_Store', "all_elimintated_words")  # the summary file itself is not an input
for file in os.listdir(dir_terms_eliminated):
    if file in skipped_names:
        continue
    with open(os.path.join(dir_terms_eliminated, file), "r") as f_r:
        el = json.load(f_r)
        for word, count in el.items():
            totals = eliminated_words.setdefault(word, {"count": 0, "ticker": 0})
            totals["count"] += count
            totals["ticker"] += 1

print("list of eliminated terms, size(%s)" % len(eliminated_words))

with open(os.path.join(dir_terms_eliminated, "all_elimintated_words"), "w") as f_w:
    json.dump(eliminated_words, f_w)

if flag_terms_filter_debug:
    for word in sorted(eliminated_words.keys()):
        word_totals = eliminated_words[word]
        print("%s, usage - total: %s, tickers %s" % (word, word_totals["count"], word_totals["ticker"]))
              
              

list of eliminated terms, size(170820)


## Condense bigrams and trigrams

Read all documents as data: list of list

In [41]:
# Corpus as a list of documents, each document being a list of terms.
data_terms = []

In [42]:
def read_data_terms(dir_findex, ticker, data):
    """Append the term list of every report file of ``ticker`` to ``data``.

    Parameters:
        dir_findex: index directory that contains the ticker directory.
        ticker: ticker directory name.
        data: list that receives one list of terms per report file (modified in place).

    Each line of a file is stripped and split on single spaces. Entries that are not
    regular files are ignored; nothing happens when the ticker directory does not exist.
    """
    ticker_path = os.path.join(dir_findex, ticker)
    if not os.path.isdir(ticker_path):
        return
    for entry in os.listdir(ticker_path):
        report_path = os.path.join(ticker_path, entry)
        if not os.path.isfile(report_path):
            continue
        document_terms = []
        with open(report_path, 'r') as f_r:
            for line in f_r:
                document_terms.extend(line.strip().split(" "))
        data.append(document_terms)

In [43]:
# Load every term file into data_terms (only when the bigram stage is re-run).
if flag_rerun_terms_2_gramms:
    for findex in os.listdir(dir_reports_terms):
        dir_findex = os.path.join(dir_reports_terms, findex)
        if not os.path.isdir(dir_findex):
            continue
        for ticker in os.listdir(dir_findex):
            read_data_terms(dir_findex, ticker, data_terms)
            ticker_code = "%s_%s" % (ticker, findex)
            print("%s data collected" % ticker_code)

In [44]:
import gensim

In [45]:
# Train the phrase (bigram) model on data_terms.
# NOTE(review): data_terms is only filled when flag_rerun_terms_2_gramms is True; otherwise the model is trained on an empty corpus.
bigram = gensim.models.Phrases(data_terms, min_count=30, threshold=100) # higher threshold fewer phrases.
#trigram = gensim.models.Phrases(bigram[data_terms], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
#trigram_mod = gensim.models.phrases.Phraser(trigram)

Re-read all docs and concatenate bi/trigramms

In [46]:
def term_2_gramms_processing(dir_findex, ticker):
    """Apply the bigram model to every term file of one ticker and store the result.

    Each report is read as a list of terms, passed through ``bigram_mod`` and, when
    the result is not empty, written under ``dir_reports_grams/<index>/<ticker>``
    with the same file name.

    Parameters:
        dir_findex: index directory (inside the terms directory) that contains the ticker directory.
        ticker: ticker directory name.

    Returns None. Does nothing when ``dir_findex/ticker`` is not a directory.
    """
    dir_ticker = os.path.join(dir_findex, ticker)
    if not os.path.isdir(dir_ticker):
        return

    # The index name is taken from the directory itself instead of a global `findex`,
    # so the function also works in worker processes that do not inherit notebook globals.
    findex = os.path.basename(os.path.normpath(dir_findex))
    # Initialised here: the counter was previously incremented without ever being
    # defined, which raised UnboundLocalError on the first empty report.
    empty_documents_amount = 0

    for report_file_name in os.listdir(dir_ticker):
        file_path = os.path.join(dir_ticker, report_file_name)
        if not os.path.isfile(file_path):
            continue

        terms_list = []
        with open(file_path, 'r') as f_r:
            for text_line in f_r:
                terms_list.extend(text_line.strip().split(" "))
        terms_list = bigram_mod[terms_list]
        #terms_list = trigram_mod[bigram_mod[terms_list]]

        if len(terms_list):
            new_path = os.path.join(dir_reports_grams, findex, ticker)
            os.makedirs(new_path, exist_ok=True)
            with open(os.path.join(new_path, report_file_name), "w") as f_w:
                f_w.write("%s" % ' '.join(terms_list))
        else:
            empty_documents_amount += 1
            if flag_debug:
                print("report %s is empty after cleaning" % report_file_name)
    print("Done on %s" % dir_ticker)
        

Run the previously defined function term_2_gramms_processing in a pool of 4 processes to speed up the processing. The following cell takes quite a while; be careful and do not rerun it without a reason. The results are stored in the file system.

**Please make sure that flag_rerun_terms_2_gramms is set to True if you want to run/re-run this preprocessing step**

In [47]:
# Condense bigrams in every term file, one worker pool per index directory.
if flag_rerun_terms_2_gramms:
    if not os.path.exists(dir_reports_grams):
        os.makedirs(dir_reports_grams)

    for findex in os.listdir(dir_reports_terms):
        dir_findex = os.path.join(dir_reports_terms, findex)
        if not os.path.isdir(dir_findex):
            continue
        gramms_jobs = [(dir_findex, ticker) for ticker in os.listdir(dir_findex)]
        with multiprocessing.Pool(processes=4) as pool:
            pool.starmap(term_2_gramms_processing, gramms_jobs)

## Get corpus statistics 

In [48]:
def ticker_reports_term_counting(dir_findex, ticker, dict_terms_counts):
    """Update corpus term statistics with all report files of one ticker.

    Parameters:
        dir_findex: index directory that contains the ticker directory.
        ticker: ticker directory name.
        dict_terms_counts: dict term -> {"count", "document", "tickers"}, updated in place:
            "count" is the total number of occurrences, "document" the number of report
            files containing the term, "tickers" the number of tickers containing it.

    Returns:
        Number of report files read (0 when the ticker directory does not exist).
    """
    ticker_path = os.path.join(dir_findex, ticker)
    if not os.path.isdir(ticker_path):
        return 0

    reports_read = 0
    seen_in_ticker = set()
    for entry in os.listdir(ticker_path):
        report_path = os.path.join(ticker_path, entry)
        if not os.path.isfile(report_path):
            continue
        reports_read += 1
        seen_in_report = set()
        with open(report_path, 'r') as f_r:
            for line in f_r:
                for term in line.strip().split(' '):
                    if term in dict_terms_counts:
                        stats = dict_terms_counts[term]
                        # First sighting within this ticker / this report.
                        if term not in seen_in_ticker:
                            stats["tickers"] += 1
                        if term not in seen_in_report:
                            stats["document"] += 1
                        stats["count"] += 1
                    else:
                        dict_terms_counts[term] = {
                            "count": 1,
                            "document": 1,
                            "tickers": 1}
                    seen_in_ticker.add(term)
                    seen_in_report.add(term)
    return reports_read

In [49]:
# Collect term statistics over every report in dir_reports_terms and store them as JSON.
terms_counts = {} #key: {count: int, document: int, tickers: int}
number_of_documents = 0

for findex in os.listdir(dir_reports_terms):
    dir_findex = os.path.join(dir_reports_terms, findex)
    if not os.path.isdir(dir_findex):
        continue
    for ticker in os.listdir(dir_findex):
        ticker_code = "%s_%s" % (ticker, findex)
        reports_read = ticker_reports_term_counting(dir_findex, ticker, terms_counts)
        number_of_documents += reports_read
        print("%s terms stats collected" % ticker_code)


if not os.path.exists(dir_terms_counts):
    os.makedirs(dir_terms_counts)
terms_json_path = os.path.join(dir_terms_counts, 'terms.json')
with open(terms_json_path, 'w') as f_w:
    json.dump(terms_counts, f_w)

EDF_CAC terms stats collected
AC_CAC terms stats collected
CS_CAC terms stats collected
ACA_CAC terms stats collected
RI_CAC terms stats collected
AIR_CAC terms stats collected
CAP_CAC terms stats collected
MC_CAC terms stats collected
BNP_CAC terms stats collected
LR_CAC terms stats collected
EI_CAC terms stats collected
OR_CAC terms stats collected
AAL_FTSE terms stats collected
WEIR_FTSE terms stats collected
BARC_FTSE terms stats collected
MKS_FTSE terms stats collected
WPP_FTSE terms stats collected
PFC_FTSE terms stats collected
SSE_FTSE terms stats collected
SKY_FTSE terms stats collected
ANTO_FTSE terms stats collected
SHP_FTSE terms stats collected
PRU_FTSE terms stats collected
SNN_FTSE terms stats collected
BAB_FTSE terms stats collected
RSA_FTSE terms stats collected
MRW_FTSE terms stats collected
WTB_FTSE terms stats collected
RBS_FTSE terms stats collected
VOD_FTSE terms stats collected
REL_FTSE terms stats collected
AGK_FTSE terms stats collected
VED_FTSE terms stats col

In [50]:
# Number of distinct terms found in the term files.
print("There're %s unique terms for topic analysis" % len(terms_counts))

There're 27919 unique terms for topic analysis


In [51]:
# Number of report files read while collecting the term statistics.
print("There're %s reports for topic analysis" % number_of_documents)

There're 4038 reports for topic analysis


## Filtered term set by document frequency

### set terms limits

In [52]:
# Document-frequency limits: terms found in fewer than l_min or more than l_max documents are eliminated.
l_min = 3 # min_number_of_doc
max_partition_of_doc = 0.4  # upper limit as a fraction of all documents
l_max = int(number_of_documents * max_partition_of_doc) #max_number_of_doc

In [53]:
# "f1" is the lower limit (l_min), "f2" the upper limit (l_max).
print("f1: %s" % l_min)
print("f2: %s" % l_max)

f1: 3
f2: 1615


### build eliminated terms set

In [54]:
# Split the terms that fall outside the document-frequency limits into two sets.
set_eliminated_by_max_df = set()
set_eliminated_by_min_df = set()

#terms_counts = {} #key: {count: int, document: int, tickers: int}

for term, stats in terms_counts.items():
    doc_freq = int(stats["document"])
    if doc_freq < l_min:
        bucket = set_eliminated_by_min_df
    elif doc_freq > l_max:
        bucket = set_eliminated_by_max_df
    else:
        continue
    bucket.add(term)

eliminated_low = len(set_eliminated_by_min_df)
eliminated_high = len(set_eliminated_by_max_df)
print("eliminated by l_min: %s" % eliminated_low)
print("eliminated by l_max: %s" % eliminated_high)
print("eliminated %s" % (eliminated_low + eliminated_high))



eliminated by l_min: 6819
eliminated by l_max: 1209
eliminated 8028


Set flag_terms_filter_debug to True to print eliminated words sets

In [55]:
# Debug only: list the terms eliminated by the lower limit.
if flag_terms_filter_debug:
    for term in sorted(set_eliminated_by_min_df):
        print(term)


In [56]:
# Debug only: list the terms eliminated by the upper limit.
if flag_terms_filter_debug:
    for term in sorted(set_eliminated_by_max_df):
        print(term)

#### Processing functions

In [57]:
def filter_terms(file_report_path):
    """Return the terms of a report that survive the document-frequency filter.

    Parameters:
        file_report_path: path of a term file (read as UTF-8).

    Returns:
        list of terms, in reading order, without the terms contained in
        ``set_eliminated_by_min_df`` or ``set_eliminated_by_max_df``.
    """
    result = []
    with open(file_report_path, 'r', encoding='utf-8') as f_r:
        for text_line in f_r:
            # Raw string: '\W' in a plain string literal is an invalid escape sequence.
            for term in re.split(r'\W+', text_line.strip()):
                if not term:
                    # re.split yields '' for blank lines and at line edges; keeping it
                    # would add an empty "term" and make a blank report look non-empty.
                    continue
                if term in set_eliminated_by_min_df or term in set_eliminated_by_max_df:
                    continue
                result.append(term)
    return result

In [58]:
def terms_filtering(dir_findex, ticker, findex):
    """Filter every report of one ticker and write the result into ``dir_reports_ready``.

    Parameters:
        dir_findex: index directory that contains the ticker directory.
        ticker: ticker directory name.
        findex: index name, used to build the "<ticker>_<index>" output prefix.

    Each non-empty filtered report is written as "<ticker>_<index>-<file name>".
    Nothing happens when ``dir_findex/ticker`` is not a directory.
    """
    dir_ticker = os.path.join(dir_findex, ticker)
    if not os.path.isdir(dir_ticker):
        return

    ticker_code = '%s_%s' % (ticker, findex)
    kept = 0
    blank = 0
    for report_file_name in os.listdir(dir_ticker):
        full_report_name = os.path.join(dir_ticker, report_file_name)
        if report_file_name == '.DS_Store' or not os.path.isfile(full_report_name):
            continue
        terms_list = filter_terms(full_report_name)
        if not len(terms_list):
            blank += 1
            if flag_debug:
                print('report %s is empty after cleaning' % report_file_name)
            continue
        kept += 1
        ready_path = os.path.join(dir_reports_ready, '%s-%s' % (ticker_code, report_file_name))
        with open(ready_path, 'w') as f_w:
            f_w.write('%s' % ' '.join(terms_list))
    print('Done on %s, good reports: %s, empty reports: %s' %
          (dir_ticker, kept, blank))

Run the previously defined function terms_filtering in a pool of 4 processes to speed up the filtering. The following cell doesn't take much time, so feel free to experiment with l_min and l_max

**Please make sure that flag_rerun_filter_terms is set to True if you want to run/re-run this preprocessing step**

In [59]:
# Filter the reports (bigram-condensed or plain terms), one worker pool per index directory.
if flag_rerun_filter_terms:
    dir_source = dir_reports_terms
    if flag_filtering_with_bigramms:
        dir_source = dir_reports_grams

    if not os.path.exists(dir_reports_ready):
        os.makedirs(dir_reports_ready)

    for findex in os.listdir(dir_source):
        dir_findex = os.path.join(dir_source, findex)
        if not os.path.isdir(dir_findex):
            continue
        filter_jobs = [(dir_findex, ticker, findex) for ticker in os.listdir(dir_findex)]
        with multiprocessing.Pool(processes=4) as pool:
            pool.starmap(terms_filtering, filter_jobs)
                

# Build run data

## Init

In [60]:
# Name of this run; used as the run directory name and as the prefix of the output files.
run_prefix = "run_19_xx"

In [61]:
# Directory that receives all files of this run.
dir_run = os.path.join(dir_data_runs, run_prefix)

In [62]:
# Create the run directory unless it is already there.
os.path.exists(dir_run) or os.makedirs(dir_run)

## Reconstruct the terms dictionary from the reports-ready directory

In [63]:
# Collect the vocabulary of all ready reports and number the terms in sorted order.
terms_set = set()
for report_name in os.listdir(dir_reports_ready):
    full_report_name = os.path.join(dir_reports_ready, report_name)
    if report_name == ".DS_Store" or not os.path.isfile(full_report_name):
        continue
    with open(full_report_name, 'r') as f_r:
        for text_line in f_r:
            terms_set.update(text_line.strip().split(' '))

term_list = sorted(terms_set)
dict_term2id = {term: term_id for term_id, term in enumerate(term_list)}
dict_id2term = dict(enumerate(term_list))
id_counter = len(term_list)

## Define the report vectorization function

In [64]:
def vectorize_report_dtm(file_report):
    """Turn a report file into a list of "<term id>:<count>" strings.

    Parameters:
        file_report: path of a ready report (terms separated by single spaces).

    Returns:
        One entry per distinct term, ordered by first occurrence in the file.

    Raises:
        KeyError: when a term is missing from ``dict_term2id``.
    """
    counts_by_id = dict()
    with open(file_report, 'r') as f_r:
        for text_line in f_r:
            for term in text_line.strip().split(' '):
                term_id = dict_term2id[term]
                counts_by_id[term_id] = counts_by_id.get(term_id, 0) + 1

    return ["%s:%s" % (term_id, occurrences) for term_id, occurrences in counts_by_id.items()]

## Read all reports (terms quantity map) for every ticker for every year in analysis

In [65]:
# Vectorize the ready reports year by year; within a year the reports are sorted by file name.
amount_documents_in_series_dict = dict()
documents_name_list = list()
documents_vector_list = list()
# The directory content does not change inside the loop, so it is listed once.
ready_file_names = os.listdir(dir_reports_ready)
for year in year_series:
    # Ready files are named "<TICKER>_<INDEX>-<year>_<n>.txt".
    # Raw strings: "\d" and "\." in plain string literals are invalid escape sequences.
    regExp = re.compile(r'[A-Z\d]+_[A-Z\d]+-' + str(year) + r'_\d+\.txt$')
    reports_of_year = sorted(f for f in ready_file_names if regExp.search(f))
    for report_name in reports_of_year:
        documents_vector_list.append(vectorize_report_dtm(os.path.join(dir_reports_ready, report_name)))
        documents_name_list.append(report_name)

    # keep track of the number of documents in each year
    amount_documents_in_series = len(reports_of_year)
    amount_documents_in_series_dict[int(year)] = amount_documents_in_series

### write results into files

In [66]:
# Save prefix-seq.dat: the first line holds the number of years in the series,
# followed by one line per year (ascending) with that year's document count.
with open(os.path.join(dir_run, run_prefix + '-seq.dat'), 'w') as f_w:
    f_w.write("%s\n" % len(year_series))
    for year in sorted(amount_documents_in_series_dict.keys()):
        f_w.write("%s\n" % amount_documents_in_series_dict[year])

In [67]:
# Save prefix-mult.dat: one line per document, holding the number of distinct terms
# followed by the space-separated "<term id>:<count>" pairs.
with open(os.path.join(dir_run, run_prefix + '-mult.dat'), 'w') as f_w:
    for document in documents_vector_list:
        f_w.write("%s %s\n" % (len(document), ' '.join(document)))

In [68]:
# Save prefix-documents.dat: one report file name per line, in the same order as the lines of mult.dat.
with open(os.path.join(dir_run, run_prefix + '-documents.dat'), 'w') as f_w:
    for document in documents_name_list:
        f_w.write("%s\n" % document)

In [69]:
# Save prefix-terms.dat: one term per line in sorted order, so the 0-based line index equals the term id.
with open(os.path.join(dir_run, run_prefix + '-terms.dat'), 'w') as f_w:
    for term in term_list:
        f_w.write("%s\n" % term)

### create result directories

In [70]:
# Create the "results" sub-directory of the run.
dir_results = os.path.join(dir_run, 'results')

if not os.path.exists(dir_results):
    os.makedirs(dir_results)

In [71]:
# Create the "interpretation" sub-directory of the run.
# NOTE(review): this rebinds dir_results, which pointed to the "results" directory until now.
dir_results = os.path.join(dir_run, 'interpretation')

if not os.path.exists(dir_results):
    os.makedirs(dir_results)

# EnD