In [1]:
import copy
import csv
import json
import os
import pickle
import re
import time
from typing import Union, Optional
import pandas as pd
import numpy as np
import dateparser
from ipywidgets import FileUpload, widgets
from IPython.display import display
import datetime

# # workaround via specifying an invalid value first
# %config Application.log_level='WORKAROUND'
# # => fails, necessary on Fedora 27, ipython3 6.2.1
# %config Application.log_level='DEBUG'
import logging
logging.getLogger().setLevel(logging.DEBUG)
log = logging.getLogger()

cache_dir = "./.cache"
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)
from diskcache import Cache
cache = Cache(os.path.join(cache_dir, "diskcache"))


In [2]:
import nltk
from nltk.corpus import stopwords as st

def create_sources_data_frame(data=None):
    sources = pd.DataFrame(data, columns=['file_name', 'date'])
    sources.set_index('file_name', inplace=True)

    return sources

sources = create_sources_data_frame()
all_uploaded_files = {}

nltk.download('punkt')
nltk.download('stopwords')

ital_stopwords = st.words('italian')
en_stopwords = st.words('english')

stopwords = []


[nltk_data] Downloading package punkt to /home/markus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/markus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# widgets
input_widgets = widgets.Output()
input_details_widget = widgets.Output()
data_widget = widgets.Output()

start_input_processing_button = widgets.Button(description="Process input", disabled=True)

upload_data = FileUpload(accept=".txt", multiple=True)
upload_stopword = FileUpload(accept=".csv", multiple=False)
dates = [pd.Timestamp.now(), pd.Timestamp.now()]
options = [(date.strftime(' %d %b %Y '), date) for date in dates]
index = (0, len(options)-1)
selection_range_slider = widgets.SelectionRangeSlider(
    options=options,
    index=index,
    description='Dates',
    orientation='horizontal',
    layout={'width': '500px'},
    continuous_update=False
)
selection_range_slider.disabled = True

filtered_table = widgets.Output()

In [4]:
def disable_input(disable: bool):

    if disable:
        selection_range_slider.disabled = True
        upload_data.disabled = True
        upload_stopword.disabled = True
        start_input_processing_button.disabled = True
    else:

        if len(sources) > 0:
            if len(stopwords) > 0:
                start_input_processing_button.disabled = False
            else:
                start_input_processing_button.disabled = True
            selection_range_slider.disabled = False
        else:
            selection_range_slider.disabled = True
            start_input_processing_button.disabled = True

        upload_data.disabled = False
        upload_stopword.disabled = False

# event-handling methods
def refresh_input_details():

    disable_input(False)

    start_date = selection_range_slider.value[0]
    end_date = selection_range_slider.value[1]

    filtered_sources = sources[(sources['date']>=start_date) & (sources['date']<=end_date)]

    input_details_widget.clear_output()
    with input_details_widget:
        print(f"Selected timeframe: {start_date} - {end_date}")
        print(f"Number of stopwords: {len(stopwords)}")
        display(filtered_sources)
        display(start_input_processing_button)

def extract_date(file_details):

    file_name = file_details['metadata']['name']
    _date = dateparser.parse(file_name.split("_")[1], settings={'TIMEZONE': 'Europe/Berlin'})
    return (file_name, _date)

def prepare_data_for_file(row):

    file_name = row.name

    result = process_file_content(file_name)
    result["file_name"] = file_name
    result["date"] = row.date

    ref = re.findall(r'(\w+\d+)_\d{4}-\d{2}-\d{2}_', file_name)[0]
    if (ref == 'sn85066408'):
        pub_name = 'L\'Italia'
    elif (ref == '2012271201'):
        pub_name = 'Cronaca Sovversiva'
    else:
        pub_name = None

    result["ref"] = ref
    result["pub_name"] = pub_name
    return result

@cache.memoize(typed=True, tag="tokenize")
def process_file_content(file_name):

    log.debug(f"computing: {file_name}")

    content_bytes = all_uploaded_files[file_name]['content']
    content = ' ' + str(content_bytes, 'utf-8').replace('\n', ' ') + ' '

    tokenized = nltk.word_tokenize(content)  # TODO: language italian?
    doc_prep = [w.lower() for w in tokenized if (w.isalpha() and len(w) > 2 )]
    doc_prep_nonstop = [w for w in doc_prep if not w in stopwords]

    result = {"text": content[0:20], "tokenized": tokenized, "doc_prep": doc_prep, "doc_prep_nonstop": doc_prep_nonstop}
    log.debug(f"finished computing: {file_name}")
    return result

def get_filtered_df(start_date, end_date):
    # we only read the files in the timeframe we are interested in

    # df["text"] = sources.text.astype(str)
    # df["tokenized"] = sources.tokenized.astype('object')
    # df["doc_prep"] = sources.doc_prep.astype('object')
    # df["doc_prep_nonstop"] = sources.doc_prep_nonstop.astype('object')

    processed_data = sources[(sources['date']>=start_date) & (sources['date']<=end_date)].apply(lambda x: prepare_data_for_file(x), axis=1)
    df = pd.DataFrame(processed_data.to_list(), index=processed_data.index)
    return df

    # for k, v in processed_data.items():
    #     if not v:
    #         continue
    #
    #     df.at[k, 'text'] = v['text']
    #     df.at[k, 'tokenized'] = v['tokenized']
    #     df.at[k, 'doc_prep'] = v['doc_prep']
    #     df.at[k, 'doc_prep_nonstop'] = v['doc_prep_nonstop']
    #
    # return df


def display_update(start_date, end_date):

    df = get_filtered_df(start_date, end_date)
    filtered_table.clear_output()
    with filtered_table:
        display(df.describe())
        display(df)

def date_range_change_handler(change):

    disable_input(True)
    # print(change.new)

    # start_date = change.new[0]
    # end_date = change.new[1]

    refresh_input_details()


def input_file_change_handler(change):

    # upload_data.disabled = True
    disable_input(True)

    data = {}
    for file_details in change.new.values():
        file_id, _date = extract_date(file_details)
        data[file_id] = _date
        all_uploaded_files[file_id] = file_details

    for file_id, _date in data.items():
        if file_id in sources:
            log.error(f"Duplicate file: {file_id}")
            continue

        sources.loc[file_id] = (_date,)

    min_date = min(sources['date'])
    max_date = max(sources['date'])

    dates = pd.date_range(min_date, max_date, freq='D')
    if len(dates) == 1:
        dates = (dates[0], dates[0])
    for date in dates:
        print(date)
        print(type(date))
    options = [(date.strftime(' %d %b %Y '), date) for date in dates]

    selection_range_slider.options = options
    selection_range_slider.index = (0, len(options)-1)

    upload_data.value.clear()
    upload_data._counter = 0

    refresh_input_details()

    # upload_data.disabled = False


def stopwords_input_change_handler(change):

    disable_input(True)

    stopwords.clear()
    stopword_content = upload_stopword.data[0]
    stopwords_string = str(stopword_content, "utf-8")

    reader = csv.reader(stopwords_string.split("\n"))
    for row in reader:
        if not row or row == ['stopword']:
            continue
        stopwords.append(row[0])

    stopwords.extend(en_stopwords)

    refresh_input_details()

def start_input_processing(event_source):

    disable_input(True)

    data_widget.clear_output()

    with data_widget:
        print("tokenizing input data...")

    start_date = selection_range_slider.value[0]
    end_date = selection_range_slider.value[1]

    filtered_df = get_filtered_df(start_date=start_date, end_date=end_date)

    data_widget.clear_output()

    with data_widget:

        display(filtered_df)

    disable_input(False)


## Input

In [5]:
upload_data.observe(input_file_change_handler, names=['value'])
upload_stopword.observe(stopwords_input_change_handler, names=['value'])
selection_range_slider.observe(date_range_change_handler, names=['value'])

with input_widgets:
    print("Input data (text files)")
    display(upload_data)
    print("Stopword (csv format)")
    display(upload_stopword)
    print("Select corpus timeframe")
    display(selection_range_slider)

display(input_widgets)
refresh_input_details()

Output()

2020-12-21
<class 'datetime.date'>
2020-12-21
<class 'datetime.date'>


## Input details

In [6]:
display(input_details_widget)
start_input_processing_button.on_click(start_input_processing)

Output()

## Data

In [9]:
display(data_widget)

Output()

In [10]:
# for development, if the value of the upload widget is empty, we 'preload' some local data

# cached_input_file = os.path.join(cache_dir, "last_cached_input.pickle")
# if not upload.value:
#     if os.path.exists(cached_input_file):
#         with open(cached_input_file, "rb") as f:
#             files = pickle.load(f)
#     else:
#         files = {}
# else:
#     files = upload.value
#     with open(cached_input_file, "wb") as f:
#         pickle.dump(files, f)


In [11]:
# cached_stopword_upload_file = os.path.join(cache_dir, "last_stopword_upload.pickle")
# if not stopword_upload.value:
#     if os.path.exists(cached_stopword_upload_file):
#         with open(cached_stopword_upload_file, "rb") as f:
#             stopword_content = pickle.load(f)
#     else:
#         stopword_content = {}
# else:
#     stopword_content = stopword_upload.data[0]
#     with open(cached_stopword_upload_file, "wb") as f:
#         pickle.dump(stopword_content, f)
#
# stopwords_string = str(stopword_content, "utf-8")
# reader = csv.reader(stopwords_string.split("\n"))
#
# stopwords = []
# for row in reader:
#     if not row or row == ['stopword']:
#         continue
#     stopwords.append(row[0])

In [12]:
# min_date = min(sources['date'])
# max_date = max(sources['date'])
#
# dates = pd.date_range(min_date, max_date, freq='D')
#
# options = [(date.strftime(' %d %b %Y '), date) for date in dates]
# index = (0, len(options)-1)
#
# selection_range_slider = widgets.SelectionRangeSlider(
#     options=options,
#     index=index,
#     description='Dates',
#     orientation='horizontal',
#     layout={'width': '500px'},
#     continuous_update=False
#
# )
#
#
#
# selection_range_slider.observe(date_range_change_handler, names='value')
# display(selection_range_slider)
# display(filtered_table)