!Important the notebook should be run after pre-processing notebook

# Initialization

In [12]:
import os
import datetime as dt
import re
import glob 


## Experiment period

In [2]:
date_start = dt.datetime.strptime("2005-01-01", "%Y-%m-%d")
date_end = dt.datetime.strptime("2018-06-30", "%Y-%m-%d") 

In [43]:
year_series = list(range(date_start.year, date_end.year)) 
year_set = set(year_series)

### set run

In [69]:
run = 19

In [70]:
run_prefix = 'run_%s_xx' % run

## Paths

In [71]:
dir_root = os.path.join('..')
dir_data_raw = os.path.join(dir_root, "data_raw")
dir_data_processing = os.path.join(dir_root, "data_processing")

dir_reports_txt = os.path.join(dir_data_raw, "reports_txt")

dir_reports_words = os.path.join(dir_data_processing, "reports_words")
dir_reports_terms = os.path.join(dir_data_processing, "reports_terms")
dir_reports_gramms = os.path.join(dir_data_processing, "reports_gramms")
dir_reports_ready = os.path.join(dir_data_processing, "reports_ready")

dir_data_runs = os.path.join(dir_root, 'data_runs')
dir_run = os.path.join(dir_data_runs, run_prefix)

In [45]:
check_report_name_reg_exp = "(?P<ticker>[A-Z1-9]+)[_-](?P<type>[A-Z]+)(?P<number>[1-9]*)(?P<subnumber>-[1-9]+)?[-_](?P<year>[0-9]{4})[_-](?P<p_year>[0-9]{4})"


# Companies/reports stats

In [35]:
dir_data_tickers = os.path.join(dir_data_processing, 'tickers')
file_tickers_for_analysis = os.path.join(dir_data_tickers, 'ticker_for_analysis.csv')

In [38]:
tickers_fits_for_analysis = set()

with open(file_tickers_for_analysis, 'r') as f_r:
    for text_line in f_r:
        ticker = text_line.strip()
        tickers_fits_for_analysis.add(ticker)

# Textual stats

define a function to count dictionary size and average lenght

In [50]:
def get_text_stats(reports):
    all_words_counter = 0
    real_reports_counter = 0
    dictionary = set()

    for report in reports:
        with open(report, 'r', encoding='ISO-8859-1') as f_r: #todo check encoding in pre-processing notebook
            real_reports_counter += 1
            for text_line in f_r:
                words = re.split('\s+', text_line.strip())
                all_words_counter += len(re.split('\s+', text_line.strip()))
                for word in words:
                    dictionary.add(word)
    
    return {
        "reports": real_reports_counter,
        "average_length": all_words_counter / real_reports_counter,
        "dictionary size": len(dictionary),
    }

## Raw text

Count the length of every document available for study, raw in words

In [51]:
path_glob = os.path.join(dir_reports_txt, '**', '**','*.txt')
print(path_glob)

print(get_text_stats(sorted(glob.glob(path_glob))))

../data_raw/reports_txt/**/**/*.txt
{'reports': 8039, 'average_length': 38169.49035949745, 'dictionary size': 3002785}


Count the length of every document eligible for study, raw in words

In [52]:
path_glob = os.path.join(dir_reports_txt, '**', '**','*.txt')
print(path_glob)

reports = []
for report in sorted(glob.glob(path_glob)):
    components = report.split('/')
    ticker = '%s_%s' % (components[4], components[3])
    if ticker not in tickers_fits_for_analysis:
        continue
    match = re.search(check_report_name_reg_exp, components[-1])
    publishing_year = int(match.group("p_year"))

    if publishing_year not in year_set:
        continue    
    reports.append(report)

print(get_text_stats(reports))

../data_raw/reports_txt/**/**/*.txt
{'reports': 4053, 'average_length': 43889.377004687885, 'dictionary size': 2064821}


Count the length of every document eligible for study, superfluous character removaland lowercase conv in words

In [57]:
path_glob = os.path.join(dir_reports_words, '**', '**','*.txt')
print(path_glob)

reports = []
for report in sorted(glob.glob(path_glob)):
    components = report.split('/')
    
    ticker = '%s_%s' % (components[4], components[3])
    if ticker not in tickers_fits_for_analysis:
        continue
    
    match = components[-1].split('_')
    publishing_year = int(match[0])
    if publishing_year not in year_set:
        continue    
    
    reports.append(report)

print(get_text_stats(reports))

../data_processing/reports_words/**/**/*.txt
{'reports': 4042, 'average_length': 34792.778080158336, 'dictionary size': 206804}


Count the length of every document eligible for study, noise removaland information concentration in words

In [58]:
path_glob = os.path.join(dir_reports_terms, '**', '**','*.txt')
print(path_glob)

reports = []
for report in sorted(glob.glob(path_glob)):
    components = report.split('/')
    
    ticker = '%s_%s' % (components[4], components[3])
    if ticker not in tickers_fits_for_analysis:
        continue
    
    match = components[-1].split('_')
    publishing_year = int(match[0])
    if publishing_year not in year_set:
        continue    
    
    reports.append(report)

print(get_text_stats(reports))

../data_processing/reports_terms/**/**/*.txt
{'reports': 4039, 'average_length': 18957.568457538993, 'dictionary size': 27918}


Count the length of every document eligible for study, bigramms information concentration in words

In [63]:
path_glob = os.path.join(dir_reports_gramms, '**', '**','*.txt')
print(path_glob)

reports = []
for report in sorted(glob.glob(path_glob)):
    components = report.split('/')
    
    ticker = '%s_%s' % (components[4], components[3])
    if ticker not in tickers_fits_for_analysis:
        continue
    
    match = components[-1].split('_')
    publishing_year = int(match[0])
    if publishing_year not in year_set:
        continue    
    
    reports.append(report)

print(get_text_stats(reports))

../data_processing/reports_gramms/**/**/*.txt
{'reports': 4039, 'average_length': 18863.85739044318, 'dictionary size': 29746}


Count the length of every document eligible for study, l_min l_max filters in words

In [66]:
path_glob = os.path.join(dir_reports_ready,'*.txt')
print(path_glob)

print(get_text_stats(glob.glob(path_glob)))

../data_processing/reports_ready/*.txt
{'reports': 4039, 'average_length': 2208.8395642485766, 'dictionary size': 13621}


Count the length of average length of a document from dtm data

In [75]:
dir_run
file_mult = os.path.join(dir_run, '%s-mult.dat' % run_prefix)
with (open(file_mult, 'r')) as f_r:
    real_reports_counter = 0
    all_words_counter = 0
    for text_line in f_r:
        real_reports_counter += 1
        all_words_counter += int(text_line.strip().split(' ')[0])
    print({
        "reports": real_reports_counter,
        "average_length": all_words_counter / real_reports_counter,
    }) 

{'reports': 4039, 'average_length': 710.177766773954}


# End