!Important the notebook should be run after pre-processing notebook

# Initialization

In [1]:
import os
import datetime as dt
import re
import glob 


## Experiment period

In [2]:
date_start = dt.datetime.strptime("2005-01-01", "%Y-%m-%d")
date_end = dt.datetime.strptime("2021-06-30", "%Y-%m-%d") 

In [3]:
year_series = list(range(date_start.year, date_end.year)) 
year_set = set(year_series)

print(year_series)

[2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]


### set run

In [4]:
run = '19-ext_22'

In [5]:
run_prefix = 'run_%s' % run

## Paths

In [6]:
dir_root = os.path.join('..')
dir_data_raw = os.path.join(dir_root, "data_raw")
dir_data_processing = os.path.join(dir_root, "data_processing")

dir_reports_pdf = os.path.join(dir_data_raw, "reports_pdf")
dir_reports_txt = os.path.join(dir_data_raw, "reports_txt")

dir_reports_words = os.path.join(dir_data_processing, "reports_words")
dir_reports_terms = os.path.join(dir_data_processing, "reports_terms")
dir_reports_gramms = os.path.join(dir_data_processing, "reports_gramms")
dir_reports_ready = os.path.join(dir_data_processing, "reports_ready")
dir_reports_ready_extension = os.path.join(dir_data_processing, "reports_ready_extentsion")

dir_data_runs = os.path.join(dir_root, 'data_runs')
dir_run = os.path.join(dir_data_runs, run_prefix)

In [7]:
check_report_name_reg_exp = "(?P<ticker>[A-Z1-9]+)[_-](?P<type>[A-Z]+)(?P<number>[1-9A-Z]*)(?P<subnumber>[_-]+[0-9]+)?[_-](?P<year>[0-9]{4})[_-](?P<p_year>[0-9]{4})"


# Companies/reports stats

In [8]:
dir_data_tickers = os.path.join(dir_data_processing, 'tickers')
file_tickers_for_analysis = os.path.join(dir_data_tickers, 'ticker_for_analysis.csv')

In [9]:
tickers_fits_for_analysis = set()

with open(file_tickers_for_analysis, 'r') as f_r:
    for text_line in f_r:
        ticker = text_line.strip()
        tickers_fits_for_analysis.add(ticker)

# Textual stats

define a function to count dictionary size and average lenght

In [10]:
def get_text_stats(reports):
    all_words_counter = 0
    real_reports_counter = 0
    dictionary = set()

    for report in reports:
        with open(report, 'r', encoding='ISO-8859-1') as f_r: #todo check encoding in pre-processing notebook
            real_reports_counter += 1
            for text_line in f_r:
                words = re.split('\s+', text_line.strip())
                all_words_counter += len(re.split('\s+', text_line.strip()))
                for word in words:
                    dictionary.add(word)
    
    return {
        "reports": real_reports_counter,
        "average_length": all_words_counter / (real_reports_counter if real_reports_counter > 0 else 1),
        "dictionary size": len(dictionary),
    }

# Reports PDF

Remove -copy from pdfs if there any

In [33]:
path_glob_copy = os.path.join(dir_reports_pdf, '**', '**','*-copy.pdf')
print('"Copy" files to investigate: %s ' % len(glob.glob(path_glob_copy)))

removed = 0
for report in sorted(glob.glob(path_glob_copy)):
    original_report = report.replace('-copy', '');
    if os.path.isfile(original_report):
        removed += 1
        os.remove(report)

print('Removed %s "Copy" files' % removed)
      

"Copy" files to investigate: 0 
Removed 0 "Copy" files


In [34]:
path_glob = os.path.join(dir_reports_pdf, '**', '**','*.pdf')
print(path_glob)
    
print(len(glob.glob(path_glob)))


../data_raw/reports_pdf/**/**/*.pdf
9869


## Raw text

Count the length of every document available for study, raw in words

In [12]:
path_glob = os.path.join(dir_reports_txt, '**', '**','*.txt')
print(path_glob)

print(get_text_stats(sorted(glob.glob(path_glob))))

../data_raw/reports_txt/**/**/*.txt
{'reports': 9866, 'average_length': 39659.459051287246, 'dictionary size': 3374204}


Count the length of every document eligible for study, raw in words

In [13]:
path_glob = os.path.join(dir_reports_txt, '**', '**','*.txt')
print(path_glob)

reports = []
for report in sorted(glob.glob(path_glob)):
    components = report.split('/')
    ticker = '%s_%s' % (components[4], components[3])
    if ticker not in tickers_fits_for_analysis:
        continue
    match = re.search(check_report_name_reg_exp, components[-1])
    publishing_year = None
    
    if match:
        publishing_year = int(match.group("p_year"))
    else:
        print(components[-1])
        

    if publishing_year not in year_set:
        continue    
    reports.append(report)

print(get_text_stats(reports))

../data_raw/reports_txt/**/**/*.txt
{'reports': 5613, 'average_length': 43848.557277748085, 'dictionary size': 2370384}


Count the length of every document eligible for study, superfluous character removaland lowercase conv in words

In [14]:
path_glob = os.path.join(dir_reports_words, '**', '**','*.txt')
print(path_glob)

reports = []
for report in sorted(glob.glob(path_glob)):
    components = report.split('/')
    
    ticker = '%s_%s' % (components[4], components[3])
    if ticker not in tickers_fits_for_analysis:
        continue
    
    match = components[-1].split('_')
    publishing_year = int(match[0])
    if publishing_year not in year_set:
        continue    
    
    reports.append(report)

print(get_text_stats(reports))

../data_processing/reports_words/**/**/*.txt
{'reports': 5598, 'average_length': 34730.845837799214, 'dictionary size': 238893}


Count the length of every document eligible for study, noise removaland information concentration in words

In [15]:
path_glob = os.path.join(dir_reports_terms, '**', '**','*.txt')
print(path_glob)

reports = []
for report in sorted(glob.glob(path_glob)):
    components = report.split('/')
    
    ticker = '%s_%s' % (components[4], components[3])
    if ticker not in tickers_fits_for_analysis:
        continue
    
    match = components[-1].split('_')
    publishing_year = int(match[0])
    if publishing_year not in year_set:
        continue    
    
    reports.append(report)

print(get_text_stats(reports))

../data_processing/reports_terms/**/**/*.txt
{'reports': 5596, 'average_length': 18968.560400285918, 'dictionary size': 29356}


Count the length of every document eligible for study, bigramms information concentration in words

In [16]:
path_glob = os.path.join(dir_reports_gramms, '**', '**','*.txt')
print(path_glob)

reports = []
for report in sorted(glob.glob(path_glob)):
    components = report.split('/')
    
    ticker = '%s_%s' % (components[4], components[3])
    if ticker not in tickers_fits_for_analysis:
        continue
    
    match = components[-1].split('_')
    publishing_year = int(match[0])
    if publishing_year not in year_set:
        continue    
    
    reports.append(report)

print(get_text_stats(reports))

../data_processing/reports_gramms/**/**/*.txt
{'reports': 5596, 'average_length': 18884.917619728378, 'dictionary size': 31513}


Count the length of every document eligible for study, l_min l_max filters in words

In [17]:
path_glob = os.path.join(dir_reports_ready,'*.txt')
print(path_glob)

print(get_text_stats(glob.glob(path_glob)))

../data_processing/reports_ready/*.txt
{'reports': 5649, 'average_length': 9483.41706496725, 'dictionary size': 24570}


In [18]:
path_glob = os.path.join(dir_reports_ready_extension,'*.txt')
print(path_glob)

print(get_text_stats(glob.glob(path_glob)))

../data_processing/reports_ready_extentsion/*.txt
{'reports': 5649, 'average_length': 2536.343423614799, 'dictionary size': 21486}


Count the length of average length of a document from dtm data

In [19]:
dir_run
file_mult = os.path.join(dir_run, '%s-mult.dat' % run_prefix)
with (open(file_mult, 'r')) as f_r:
    real_reports_counter = 0
    all_words_counter = 0
    for text_line in f_r:
        real_reports_counter += 1
        all_words_counter += int(text_line.strip().split(' ')[0])
    print({
        "reports": real_reports_counter,
        "average_length": all_words_counter / real_reports_counter,
    }) 

{'reports': 5344, 'average_length': 702.2249251497007}


# End