In [1]:
import csv
import json
import os
import pickle
import re
import pandas as pd
import numpy as np
import dateparser
from ipywidgets import FileUpload, widgets
from IPython.display import display

# # workaround via specifying an invalid value first
# %config Application.log_level='WORKAROUND'
# # => fails, necessary on Fedora 27, ipython3 6.2.1
# %config Application.log_level='DEBUG'
import logging
# Lower the root logger to DEBUG so the log.debug(...) progress messages
# emitted by the cells below become visible.
logging.getLogger().setLevel(logging.DEBUG)
log = logging.getLogger()

# Directory holding both the pickled uploads and the diskcache store.
cache_dir = "./.cache"
# exist_ok=True replaces the separate exists()/makedirs() pair, which could
# fail if the directory appeared between the check and the creation.
os.makedirs(cache_dir, exist_ok=True)
from diskcache import Cache
# Persistent cache used by the memoized functions further down.
cache = Cache(os.path.join(cache_dir, "diskcache"))

Select the text files to create the corpus

In [2]:
# File picker restricted to .txt that accepts several files at once; the
# selection is read from `upload.value` by the following cell.
upload = FileUpload(accept=".txt", multiple=True)
display(upload)

FileUpload(value={}, accept='.txt', description='Upload', multiple=True)

In [3]:
# Development convenience: the latest upload is pickled to disk so the
# notebook can be re-run without selecting the corpus files again.

cached_input_file = os.path.join(cache_dir, "last_cached_input.pickle")
if upload.value:
    # Something was uploaded in this session: use it and refresh the disk copy.
    files = upload.value
    with open(cached_input_file, "wb") as f:
        pickle.dump(files, f)
elif os.path.exists(cached_input_file):
    # Empty widget: fall back to the previously pickled upload.
    # NOTE(review): pickle.load can execute arbitrary code; this is only
    # acceptable because the file is written by this notebook itself.
    with open(cached_input_file, "rb") as f:
        files = pickle.load(f)
else:
    # Neither an upload nor a cached copy is available.
    files = {}

Select the stopwords CSV file

In [4]:
# Single-file picker restricted to .csv; the uploaded bytes are read from
# `stopword_upload.data[0]` by the following cell.
stopword_upload = FileUpload(accept=".csv", multiple=False)
display(stopword_upload)

FileUpload(value={}, accept='.csv', description='Upload')

In [5]:
# Same caching scheme as for the corpus files: reuse the last uploaded
# stopword CSV when nothing was uploaded in this session.
cached_stopword_upload_file = os.path.join(cache_dir, "last_stopword_upload.pickle")
if not stopword_upload.value:
    if os.path.exists(cached_stopword_upload_file):
        # NOTE(review): pickle.load can execute arbitrary code; this is only
        # acceptable because the file is written by this notebook itself.
        with open(cached_stopword_upload_file, "rb") as f:
            stopword_content = pickle.load(f)
    else:
        # Must be bytes: the str(..., "utf-8") call below raises TypeError
        # for any non-bytes value (the previous `{}` default crashed here).
        stopword_content = b""
else:
    stopword_content = stopword_upload.data[0]
    with open(cached_stopword_upload_file, "wb") as f:
        pickle.dump(stopword_content, f)

# Decode the raw upload and parse it line by line as CSV.
stopwords_string = str(stopword_content, "utf-8")
reader = csv.reader(stopwords_string.split("\n"))

# Collect the first column of every row, skipping blank lines and a
# single-column 'stopword' header row.
stopwords = []
for row in reader:
    if not row or row == ['stopword']:
        continue
    stopwords.append(row[0])

In [6]:
def extract(file_details):
    """Build the initial `sources` row for one uploaded file.

    Only the file name (``file_details['metadata']['name']``) is inspected.
    It must contain ``<ref>_<YYYY-MM-DD>_``: the reference id is taken from
    that pattern, and the second ``_``-separated token is parsed as the date.

    Returns:
        ``[file_name, date, ref, pub_name, "", nan, nan, nan]`` where
        ``pub_name`` is ``None`` for reference ids without a known title.

    Raises:
        IndexError: if the file name does not match the expected pattern.
    """
    # Reference id -> publication title for the known newspapers.
    publications = {
        'sn85066408': "L'Italia",
        '2012271201': 'Cronaca Sovversiva',
    }

    name = file_details['metadata']['name']
    ref = re.findall(r'(\w+\d+)_\d{4}-\d{2}-\d{2}_', name)[0]
    date_token = name.split("_")[1]
    date = dateparser.parse(date_token, settings={'TIMEZONE': 'Europe/Berlin'})
    pub_name = publications.get(ref)

    # Text and token columns start out empty; they are filled in later.
    return [name, date, ref, pub_name, "", np.nan, np.nan, np.nan]

# One metadata row per uploaded file; the text/token columns are placeholders
# that get filled in lazily once a date range is selected.
data = [extract(file_details) for file_details in files.values()]
sources = (
    pd.DataFrame(
        data,
        columns=['file_name', 'date', 'ref', 'pub_name', "text", "tokenized", "doc_prep", "doc_prep_nonstop"],
    )
    .set_index('file_name')
    # The token columns must be object dtype so that single cells can hold lists.
    .astype({
        "text": str,
        "tokenized": 'object',
        "doc_prep": 'object',
        "doc_prep_nonstop": 'object',
    })
)

In [7]:
import nltk
from nltk.corpus import stopwords as st

# Fetch the tokenizer model and the stopword lists (no-op when up to date).
nltk.download('punkt')
nltk.download('stopwords')

# NOTE(review): the Italian list is loaded but not merged into `stopwords`
# in this cell — confirm whether that is intended.
ital_stopwords = st.words('italian')
en_stopwords = st.words('english')
# Only add words that are not listed yet, so re-running this cell does not
# append the English list to `stopwords` a second time.
_already_listed = set(stopwords)
stopwords.extend(w for w in en_stopwords if w not in _already_listed)

[nltk_data] Downloading package punkt to /home/markus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/markus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def prepare_data_for_file(row, start_date, end_date):
    """Tokenize the file behind `row` if it still needs it and is in range.

    Args:
        row: one row of `sources`; `row.name` is the file name, and
            `row.text` / `row.date` are read.
        start_date: inclusive lower bound of the selected range.
        end_date: inclusive upper bound of the selected range.

    Returns:
        The result of `process_file` for the row's file, or None when the
        row already has text or its date lies outside the range.
    """
    # Skip rows that were already processed or fall outside the window.
    if row.text or row.date < start_date or row.date > end_date:
        return None

    return process_file(row.name)

@cache.memoize(typed=True, tag="tokenize")
def process_file(file_name):
    """Tokenize one uploaded file and derive its cleaned token lists.

    Args:
        file_name: key into the module-level `files` mapping.

    Returns:
        dict with
        - "text": the first 20 characters of the decoded content (a preview),
        - "tokenized": the raw `nltk.word_tokenize` output,
        - "doc_prep": lower-cased alphabetic tokens longer than 2 characters,
        - "doc_prep_nonstop": `doc_prep` without the entries of `stopwords`.

    NOTE(review): the result is memoized with `file_name` as the only
    argument, so a cached entry presumably does not notice changes to the
    file content or to `stopwords` — clear the cache after changing either.
    """
    log.debug(f"computing: {file_name}")

    content_bytes = files[file_name]['content']
    # Newlines become spaces and the text is padded with one space per side.
    content = ' ' + str(content_bytes, 'utf-8').replace('\n', ' ') + ' '

    tokenized = nltk.word_tokenize(content)  # TODO: language italian?
    doc_prep = [w.lower() for w in tokenized if (w.isalpha() and len(w) > 2)]
    # Build the set once: membership tests against the list were linear in
    # the number of stopwords for every single token.
    stopword_set = set(stopwords)
    doc_prep_nonstop = [w for w in doc_prep if w not in stopword_set]

    result = {"text": content[0:20], "tokenized": tokenized, "doc_prep": doc_prep, "doc_prep_nonstop": doc_prep_nonstop}
    log.debug(f"finished computing: {file_name}")
    return result

def get_filtered_df(start_date, end_date):
    """Return the rows of `sources` dated within [start_date, end_date].

    Rows in the range that have no text yet are tokenized first, and the
    results are written back into `sources`, so files are only read once
    they fall into a requested time frame.
    """
    # Per row: either a dict of freshly computed columns or None.
    computed = sources.apply(prepare_data_for_file, axis=1, args=(start_date, end_date))

    filled_columns = ('text', 'tokenized', 'doc_prep', 'doc_prep_nonstop')
    for file_name, payload in computed.items():
        if payload:
            for column in filled_columns:
                sources.at[file_name, column] = payload[column]

    not_before_start = sources['date'] >= start_date
    not_after_end = sources['date'] <= end_date
    return sources[not_before_start & not_after_end]

#     # print(t['tokenized'])
#     # print(sources.loc[sources.file_name == t['file_name']])
#     sources.loc[sources.file_name == t['file_name'], 'text'] = t['text']
#     # sources.loc[sources.file_name == t['file_name'], 'tokenized'] = t['tokenized']
#     # sources.loc[sources.file_name == t['file_name'], 'text'] = t['text']
#     # sources.loc[sources.file_name == t['file_name'], 'tokenized'] = t['tokenized']
#     # sources.loc[sources.file_name == t['file_name'], 'doc_prep'] = t['doc_prep']
#     # sources.loc[sources.file_name == t['file_name'], 'doc_prep_nonstop'] = t['doc_prep_nonstop']



In [9]:
# Series.min()/.max() skip missing dates (NaT). The builtin min()/max() are
# not NaT-aware: every comparison with NaT is False, so their result depended
# on where an unparsable date happened to sit in the column.
min_date = sources['date'].min()
max_date = sources['date'].max()

# One selectable option per calendar day between the first and last file.
dates = pd.date_range(min_date, max_date, freq='D')

options = [(date.strftime(' %d %b %Y '), date) for date in dates]
# Start with the full range selected.
index = (0, len(options)-1)

selection_range_slider = widgets.SelectionRangeSlider(
    options=options,
    index=index,
    description='Dates',
    orientation='horizontal',
    layout={'width': '500px'},
    # Only fire the change event when the handle is released.
    continuous_update=False
)

# Output area that the change handler renders the filtered table into.
filtered_table = widgets.Output()

def date_range_change_handler(change):
    """Re-render the filtered table after the date slider changed.

    Args:
        change: change notification of the slider; `change.new` holds the
            newly selected (start, end) pair.
    """
    range_start, range_end = change.new[0], change.new[1]
    selected = get_filtered_df(range_start, range_end)

    # Replace the previous output with a summary followed by the rows.
    filtered_table.clear_output()
    with filtered_table:
        for view in (selected.describe(), selected):
            display(view)


# React to changes of the slider's selected range only, then show the slider
# followed by the output area the handler renders into.
selection_range_slider.observe(date_range_change_handler, names='value')
display(selection_range_slider, filtered_table)

SelectionRangeSlider(continuous_update=False, description='Dates', index=(0, 5808), layout=Layout(width='500px…

Output()