# Initialization

In [1]:
import os
import shutil
import re
import json
import pandas as pd
import datetime as dt
import numpy as np
import operator
import math
import multiprocessing
import time

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline

### set root directories

In [2]:
# Every project folder is resolved relative to the parent of the working directory.
dir_root = os.path.join('..')
dir_data_raw, dir_data_processing, dir_data_runs = (
    os.path.join(dir_root, folder)
    for folder in ("data_raw", "data_processing", "data_runs")
)

# Source and destination folders for ticker prices.
dir_ticker_prices_source = os.path.join(dir_data_raw, "prices", "ready")
dir_ticker_prices_destination = os.path.join(dir_data_processing, "prices")

### Set reports directories

In [3]:
# Raw report texts are read from the raw-data tree.
dir_reports_txt = os.path.join(dir_data_raw, "reports_txt")

# One folder per processing stage, all under the processing tree.
dir_reports_words, dir_reports_terms, dir_reports_grams, dir_reports_ready = (
    os.path.join(dir_data_processing, folder)
    for folder in ("reports_words", "reports_terms", "reports_gramms", "reports_ready")
)

### Set terms directories

In [4]:
# NOTE(review): the folder name "terms_elemenated" is misspelled but kept as-is;
# presumably existing data on disk already uses it.
dir_terms_eliminated, dir_terms_counts = (
    os.path.join(dir_data_processing, folder)
    for folder in ("terms_elemenated", "terms_counts")
)

## Set experiment dates

In [5]:
# Overall experiment window (inclusive calendar dates).
date_start, date_end = (
    dt.datetime.strptime(day, '%Y-%m-%d')
    for day in ('2005-01-01', '2021-06-30')
)

In [6]:
# Extension window; it ends on the same day as the overall experiment window.
date_extension_start, date_extension_end = (
    dt.datetime.strptime(day, '%Y-%m-%d')
    for day in ('2018-01-01', '2021-06-30')
)

### Set time series

In [7]:
# range() excludes its upper bound, so the final calendar year of each window
# (date_end.year / date_extension_end.year) is not part of the series.
year_series = [*range(date_start.year, date_end.year)]
year_extension_series = [*range(date_extension_start.year, date_extension_end.year)]

In [8]:
# Show both series, one per line.
print(year_series, year_extension_series, sep="\n")

[2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
[2018, 2019, 2020]


### Set report name RegExp

In [9]:
# Pattern a report file name has to contain; adjacent literals are concatenated
# into one string. Parts are separated by "_" or "-".
check_report_name_reg_exp = (
    "(?P<ticker>[A-Z1-9]+)"          # ticker: upper-case letters and digits 1-9
    "[_-]"
    "(?P<type>[A-Z]+)"               # report type letters
    "(?P<number>[1-9A-Z]*)"          # optional alphanumeric tail after the type
    "(?P<subnumber>[_-]+[0-9]+)?"    # optional numeric sub-number
    "[_-]"
    "(?P<year>[0-9]{4})"             # first four-digit year
    "[_-]"
    "(?P<p_year>[0-9]{4})"           # second four-digit year; used for year filtering
)


### set flags

In [10]:
# Diagnostics. flag_debug makes words_2_terms_processing report documents that
# end up empty after cleaning.
flag_debug = False
flag_terms_filter_debug = False

# Report file-name handling (usage not shown in this part of the notebook).
flag_fix_reports_names = False
flag_test_report_names = True

# Term filtering. flag_extend_stopwords loads english/extra_stopwords.txt into
# the stop-word set.
flag_extend_stopwords = True
flag_filtering_with_bigramms = True

# Gates for the two expensive preprocessing cells (text -> words, words -> terms).
flag_rerun_text_2_words = True
flag_rerun_words_2_terms = True

Set tickers for which you want to run preprocessing

In [11]:
# Financial index -> tickers whose reports should be preprocessed.
tickers_to_run = dict(
    FTSE=['WPP'],
    DJIA=['TVE'],
)

In [12]:
# Ticker codes of the form "<ticker>_<findex>" that reports_2_words_processing()
# accepts when it is called with check_for_inclussion=True; starts out empty.
possible_tickers_for_analysis = set()

# Reports pre-processing

## Initial clean up

First, remove everything except English letters and re-save each report as a sequence of lower-case words consisting only of the letters a-z.

In [13]:
regexp_to_remove = re.compile(r"[\dâºâãï½ã\_]")
# Despite its name, this pattern matches the characters that are REPLACED by a
# space in convert_raw_text_2_words(): everything except a-z and whitespace.
regexp_to_keep = re.compile(r"[^a-z\s]")

In [14]:
def convert_raw_text_2_words(file_report_path):
    """Read a raw text report and return its cleaned, lower-cased words.

    Each line is lower-cased, every character matched by the module-level
    ``regexp_to_keep`` pattern is replaced by a space, and the result is split
    on runs of non-word characters. Tokens shorter than two characters are
    dropped.

    Args:
        file_report_path: path of the text file to read.

    Returns:
        list: the words in file order. An empty list is returned (and a
        message printed) when the file cannot be opened.
    """
    words = []
    try:
        # errors='ignore' drops undecodable bytes instead of raising, so no
        # UnicodeDecodeError handling is needed here.
        f_r = open(file_report_path, encoding="utf8", errors='ignore')
    except OSError:
        print ("Could not open/read file: %s" % file_report_path)
        return words

    # The context manager guarantees the handle is closed, even if reading fails.
    with f_r:
        for text_line in f_r:
            cleaned_text = re.sub(regexp_to_keep, " ", text_line.lower())
            # Raw string: "\W" in a plain literal is an invalid escape sequence.
            for possible_word in re.split(r"\W+", cleaned_text):
                word = possible_word.strip()
                if len(word) > 1:
                    words.append(word)
    return words

In [15]:
def reports_2_words_processing(dir_findex, ticker, findex, years_set, check_for_inclussion=False):
    """Clean every report of one ticker and store the result as word files.

    Reports are read from ``<dir_findex>/<ticker>``. A report is processed when
    its file name matches ``check_report_name_reg_exp`` and its ``p_year`` is in
    ``years_set``. Non-empty results are written to
    ``<dir_reports_words>/<findex>/<ticker>/<year>_<n>.txt``, where ``n`` is the
    running count of saved reports. Nothing happens when the ticker folder
    does not exist.

    Args:
        dir_findex: folder of the financial index holding the ticker folders.
        ticker: ticker symbol (name of the sub-folder).
        findex: financial index name, used in the output path and ticker code.
        years_set: years (ints) that should be processed.
        check_for_inclussion: when True, skip tickers whose "<ticker>_<findex>"
            code is not in ``possible_tickers_for_analysis``.

    Returns:
        None. Progress and a summary line are printed.
    """
    ticker_code = "%s_%s" % (ticker, findex)

    if check_for_inclussion and ticker_code not in possible_tickers_for_analysis:
        print("Skip %s" % ticker_code)
        return

    dir_ticker = os.path.join(dir_findex, ticker)
    if not os.path.isdir(dir_ticker):
        return

    saved_count = 0
    empty_count = 0
    skipped_years = set()

    for report_file_name in os.listdir(dir_ticker):
        # macOS Finder metadata, not a report.
        if report_file_name == ".DS_Store":
            continue

        match = re.search(check_report_name_reg_exp, report_file_name)
        if match is None:
            print("filename %s doesn't fit pattern" % report_file_name)
            continue

        year = int(match.group("p_year"))
        if year not in years_set:
            skipped_years.add(year)
            continue

        list_words = convert_raw_text_2_words(os.path.join(dir_ticker, report_file_name))
        if not list_words:
            empty_count += 1
            print("report %s is empty after cleaning" % report_file_name)
            continue

        saved_count += 1
        target_dir = os.path.join(dir_reports_words, findex, ticker)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        target_file = os.path.join(target_dir, "%s_%s.txt" % (year, saved_count))
        with open(target_file, "w") as f_w:
            f_w.write("%s" % ' '.join(list_words))

    years_listing = ", ".join(map(str, sorted(skipped_years)))
    print("Done on %s, reports: %s, empty: %s, extra years available: [%s]" %
          (ticker_code, saved_count, empty_count, years_listing))
                    

Run the previously defined function `reports_2_words_processing` for every selected ticker, one after another. The following cell takes quite a while, so be careful not to rerun it without a reason; the results are stored on the file system.

**Please make sure that flag_rerun_text_2_words is set to True if you want to run/re-run this preprocessing step**

In [16]:
# Text -> words stage: process every selected ticker of every index whose
# raw-report folder exists.
if flag_rerun_text_2_words:
    filtering_years_set = set(year_series)
    for findex, tickers in tickers_to_run.items():
        dir_findex = os.path.join(dir_reports_txt, findex)
        if not os.path.isdir(dir_findex):
            continue
        for ticker in tickers:
            reports_2_words_processing(dir_findex, ticker, findex, filtering_years_set)

Done on WPP_FTSE, reports: 67, empty: 0, extra years available: [1998, 1999, 2000, 2001, 2002, 2003, 2004, 2021]
Done on TVE_DJIA, reports: 35, empty: 0, extra years available: []


## Lemmatization and English words filter

In [17]:
import nltk
from nltk.corpus import brown

Make sure that all nltk data sets are available

In [18]:
# NLTK resources: "stopwords" and "words" feed the stop-word set and the
# English vocabulary built below.
# NOTE(review): "wordnet" is not referenced by the code shown here — presumably
# it was needed by the commented-out WordNetLemmatizer; confirm before removing.
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download("words")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Alan_Spark/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Alan_Spark/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/Alan_Spark/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### Create stop words set and white words set

#### Create stop words set

In [19]:
# Vocabulary used by tokenize() to decide whether a term is an English word.
english_words = set(nltk.corpus.words.words())

# NLTK's English stop words plus a few hand-picked extras.
stop_words = [*nltk.corpus.stopwords.words('english'), 'from', 'subject', 're', 'edu', 'use']
stop_words_set = set(stop_words)

Extend stop words with custom stop words

In [20]:
# Each line of the file contributes one (whitespace-stripped) stop word.
if flag_extend_stopwords:
    with open(os.path.join(dir_data_raw, "english", "extra_stopwords.txt"), "r") as f_r:
        stop_words_set.update(text_line.strip() for text_line in f_r)

#### Create a white list of words

In [21]:
# Terms listed here are never treated as stop terms (see is_stop_term);
# one whitespace-stripped term per line of the file.
white_list_set = set()
with open(os.path.join(dir_data_raw, "english", "white_stopwords.txt"), "r") as f_r:
    white_list_set.update(text_line.strip() for text_line in f_r)

#### Filter functions

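### Define a checker function for stop words. A word on the white list is never a stop word; otherwise a word is a stop word if any of the following is true:
- its length is shorter than 3 characters
- it contains digits (digits are already stripped during the initial clean-up, so no explicit check is made)
- it appears in the stop words set (nltk stop words plus the custom extensions)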

In [22]:
def is_stop_term(term):
    """Tell whether ``term`` should be discarded as a stop term.

    White-listed terms are never stop terms. Any other term is a stop term
    when it is shorter than three characters or is in ``stop_words_set``.

    Args:
        term: the term to check.

    Returns:
        bool: True when the term must be filtered out.
    """
    return term not in white_list_set and (len(term) < 3 or term in stop_words_set)

Define a lemmatizer function

In [23]:
import spacy

Make sure that en spacy data set is available

In [24]:
#!python -m spacy download en_core_web_sm

In [25]:
# Load spaCy's small English pipeline; tokenize() uses it to obtain lemmas.
# The full pipeline is loaded: the option to disable the parser and NER
# components is left commented out on the line below.
nlp = spacy.load('en_core_web_sm')#, disable=['parser', 'ner'])
# Alternative normalizers, kept for reference and currently unused:
#lemmatizer = WordNetLemmatizer().lemmatize
#stemmer = SnowballStemmer("english").stem

In [26]:
# Quick sanity check of the lemmatizer: print "<lemma> <original text>" per token.
tokens = nlp('bank banking go going')
for token in tokens:
    print(token.lemma_, token.text)

bank bank
banking banking
go go
go going


#### Processing functions

In [27]:
def tokenize(text_line, terms, eliminated_terms):
    """Lemmatize ``text_line`` and sort its tokens into kept and eliminated terms.

    Every token is replaced by its lemma (or by its original text when the
    lemma is the "-PRON-" placeholder). A term is kept when it is not a stop
    term and is present in ``english_words``; otherwise its elimination count
    is increased.

    Args:
        text_line: text to process.
        terms: list that kept terms are appended to (modified in place).
        eliminated_terms: dict mapping eliminated term -> count (modified in place).

    Returns:
        None.
    """
    for token in nlp(text_line):
        lemma = token.lemma_
        term = token.text if lemma == "-PRON-" else lemma
        keep = not is_stop_term(term) and term in english_words
        if keep:
            terms.append(term)
        else:
            eliminated_terms[term] = eliminated_terms.get(term, 0) + 1

In [28]:
def convert_words_2_terms(file_report_path, eliminated_terms, chunk_size=30):
    """Turn a word file into a list of filtered, lemmatized terms.

    Each line of the file is split on runs of non-word characters and passed
    to ``tokenize`` in consecutive chunks of at most ``chunk_size`` words.

    Args:
        file_report_path: path of the UTF-8 word file to read.
        eliminated_terms: dict of eliminated term -> count, updated in place
            by ``tokenize``.
        chunk_size: maximum number of words handed to ``tokenize`` at once
            (defaults to 30, the value previously hard-coded).

    Returns:
        list: the kept terms, in reading order.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    terms = []
    with open(file_report_path, "r", encoding="utf-8") as f_r:
        for text_line in f_r:
            # Raw string: "\W" in a plain literal is an invalid escape sequence.
            words_in_line = re.split(r"\W+", text_line)
            # Stepping by chunk_size yields the same chunks as before, without
            # an extra empty tail chunk when the length is an exact multiple.
            for chunk_start in range(0, len(words_in_line), chunk_size):
                chunk = words_in_line[chunk_start:chunk_start + chunk_size]
                tokenize(" ".join(chunk), terms, eliminated_terms)

    print("Done with %s" % file_report_path)
    return terms

In [29]:
def words_2_terms_processing(dir_findex, ticker, findex=None):
    """Convert all word files of one ticker into term files.

    Word files are read from ``<dir_findex>/<ticker>``. Non-empty term lists
    are written under the same file name to
    ``<dir_reports_terms>/<findex>/<ticker>``. The counts of eliminated terms
    collected over all files are dumped to
    ``<dir_terms_eliminated>/<ticker>_<findex>.json``.

    Args:
        dir_findex: folder of the financial index holding the ticker folders.
        ticker: ticker symbol (name of the sub-folder).
        findex: financial index name used in output paths. When omitted it is
            taken from the last path component of ``dir_findex``, so the
            function no longer relies on a module-level ``findex`` variable.

    Returns:
        None. Progress and a timing summary are printed.
    """
    start = time.time()
    print("\n Start on %s %s" % (dir_findex, ticker))

    if findex is None:
        # normpath drops a trailing separator so basename is never empty.
        findex = os.path.basename(os.path.normpath(dir_findex))

    dir_ticker = os.path.join(dir_findex, ticker)
    if not os.path.isdir(dir_ticker):
        print("\n Skip non folder %s %s" % (dir_findex, ticker))
        return

    dict_eliminated_terms = {}
    good_documents_amount = 0
    empty_documents_amount = 0
    new_path = os.path.join(dir_reports_terms, findex, ticker)

    for report_file_name in os.listdir(dir_ticker):
        # macOS Finder metadata, not a report.
        if report_file_name == ".DS_Store":
            continue
        terms_list = convert_words_2_terms(os.path.join(dir_ticker, report_file_name), dict_eliminated_terms)
        if len(terms_list):
            good_documents_amount += 1
            # Created lazily so no empty folder appears when nothing is saved.
            os.makedirs(new_path, exist_ok=True)
            with open(os.path.join(new_path, report_file_name), "w") as f_w:
                f_w.write("%s" % ' '.join(terms_list))
        else:
            empty_documents_amount += 1
            if flag_debug:
                print("report %s is empty after cleaning" % report_file_name)

    if len(dict_eliminated_terms):
        os.makedirs(dir_terms_eliminated, exist_ok=True)
        with open(os.path.join(dir_terms_eliminated, "%s_%s.json" % (ticker, findex)), "w") as f_w:
            json.dump(dict_eliminated_terms, f_w)

    end = time.time()
    print("\n Done in %s minutes on %s, good reports: %s, empty reports: %s" %
          ("{:0.2f}".format((end - start) / 60), dir_ticker, good_documents_amount, empty_documents_amount))

Run the previously defined function `words_2_terms_processing` for every selected ticker, one after another. The following cell takes quite a while, so be careful not to rerun it without a reason; the results are stored on the file system.

**Please make sure that flag_rerun_words_2_terms is set to True if you want to run/re-run this preprocessing step**

In [30]:
# Words -> terms stage: process every selected ticker of every index whose
# word folder exists.
if flag_rerun_words_2_terms:
    for findex, tickers in tickers_to_run.items():
        dir_findex = os.path.join(dir_reports_words, findex)
        if not os.path.isdir(dir_findex):
            continue
        for ticker in tickers:
            words_2_terms_processing(dir_findex, ticker)


 Start on ../data_processing/reports_words/FTSE WPP
Done with ../data_processing/reports_words/FTSE/WPP/2009_51.txt
Done with ../data_processing/reports_words/FTSE/WPP/2013_34.txt
Done with ../data_processing/reports_words/FTSE/WPP/2010_65.txt
Done with ../data_processing/reports_words/FTSE/WPP/2012_48.txt
Done with ../data_processing/reports_words/FTSE/WPP/2016_10.txt
Done with ../data_processing/reports_words/FTSE/WPP/2005_50.txt
Done with ../data_processing/reports_words/FTSE/WPP/2017_5.txt
Done with ../data_processing/reports_words/FTSE/WPP/2007_57.txt
Done with ../data_processing/reports_words/FTSE/WPP/2013_23.txt
Done with ../data_processing/reports_words/FTSE/WPP/2011_22.txt
Done with ../data_processing/reports_words/FTSE/WPP/2006_12.txt
Done with ../data_processing/reports_words/FTSE/WPP/2007_46.txt
Done with ../data_processing/reports_words/FTSE/WPP/2013_4.txt
Done with ../data_processing/reports_words/FTSE/WPP/2010_63.txt
Done with ../data_processing/reports_words/FTSE/WPP/2

## Condense bigrams and trigrams

Read all documents as data: list of list

**!Important** rerun Gramms in the main notebook before preprocessing

# EnD