# Exploring your TroveHarvester data

In [None]:
import os
import pandas as pd # makes manipulating the data easier
import plotly.offline as py # for charts
import plotly.graph_objs as go

py.init_notebook_mode()  # initialise plotly's offline mode for use inside the notebook

In [None]:
def get_latest_harvest():
    '''
    Get the timestamp of the most recent harvest.

    Harvests are the sub-directories of ``data``. Their names are compared as
    strings and the one that sorts last is returned. Plain files and hidden
    (dot-prefixed) entries are ignored, so stray items such as a README or
    ``.DS_Store`` are never mistaken for a harvest.

    Returns:
        The name of the harvest directory that sorts last.

    Raises:
        FileNotFoundError: if the ``data`` directory does not exist.
        IndexError: if ``data`` contains no harvest directories.
    '''
    harvests = sorted(
        entry for entry in os.listdir('data')
        if not entry.startswith('.')
        and os.path.isdir(os.path.join('data', entry))
    )
    if not harvests:
        # Same exception type as indexing an empty list, but with a message
        # that says what is actually wrong.
        raise IndexError("no harvest directories found in 'data'")
    return harvests[-1]

In [None]:
def open_harvest_data(timestamp=None):
    '''
    Load the ``results.csv`` file of a harvest into a DataFrame.

    If no timestamp is supplied the most recent harvest is used. The
    timestamp that was opened is printed.

    Returns a DataFrame.
    '''
    # Fall back to the latest harvest when no (or an empty) timestamp is given
    timestamp = timestamp or get_latest_harvest()
    print(timestamp)
    results_path = os.path.join('data', timestamp, 'results.csv')
    return pd.read_csv(results_path)

In [None]:
# Load the results of the most recent harvest into a DataFrame
df = open_harvest_data()

In [None]:
# Let's have a look! The notebook renders the value of a cell's last expression.
df

In [None]:
# Count the rows for each newspaper title, most common first
newspaper_counts = df['newspaper_title'].value_counts()
# Keep the first 20 for charting
top_newspapers = newspaper_counts.head(20)
# Show the full list of counts
newspaper_counts

In [None]:
# Bar chart of the most common newspapers: titles on x, counts on y
trace = go.Bar(
    x=top_newspapers.index.values,
    y=top_newspapers.values,
)
plot_data = [trace]
py.iplot(plot_data, filename='top-newspapers')


In [None]:
# Get the date distribution of articles: count per date, sorted by date
date_counts = (
    df['date']
    .value_counts()
    .sort_index()
)

In [None]:
# Line chart of the date distribution: dates on x, counts on y
trace = go.Scatter(
    x=date_counts.index.values,
    y=date_counts.values,
)
plot_data = [trace]
py.iplot(plot_data, filename='number-by-date')

## Explore article texts in Voyant Tools

First we need to zip up all the little text files for easy transport.

In [None]:
import zipfile

def zip_harvest_texts(timestamp=None):
    '''
    Zip up the text files of the specified harvest (most recent by default).

    Every ``.txt`` file in ``data/<timestamp>/text`` is added, under its bare
    file name, to ``data/<timestamp>/<timestamp>-texts.zip``.

    Raises:
        FileNotFoundError: if the harvest has no ``text`` directory.
    '''
    if not timestamp:
        timestamp = get_latest_harvest()
    data_dir = os.path.join('data', timestamp)
    texts_dir = os.path.join(data_dir, 'text')
    # List the texts before creating the archive, so that a missing text
    # directory doesn't leave an empty zip file behind. Sorting keeps the
    # archive's contents in a predictable order.
    text_files = sorted(t for t in os.listdir(texts_dir) if t.endswith('.txt'))
    zip_path = os.path.join(data_dir, '{}-texts.zip'.format(timestamp))
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for text_file in text_files:
            # Second argument stores the file at the top level of the archive
            zip_file.write(os.path.join(texts_dir, text_file), text_file)

# Zip the texts of the most recent harvest
zip_harvest_texts()

In [None]:
# Save the 'title' column to titles.txt in the most recent harvest's directory
timestamp = get_latest_harvest()
titles_path = os.path.join('data', timestamp, 'titles.txt')
# header=False: recent versions of pandas write the Series name ('title') as a
# header row by default, which would add a stray first line to the text file.
df['title'].to_csv(titles_path, index=False, header=False)

In [None]:
# Embed the Voyant Tools Cirrus view of a corpus in an iframe inside the notebook.
# NOTE(review): the corpus ID in the URL is hard-coded; presumably it needs to be
# replaced with the ID of a corpus created from your own harvest — confirm.
from IPython.display import display, HTML
display(HTML("<iframe style='width: 100%; height: 800px;' src='//voyant-tools.org/?view=Cirrus&corpus=ffc0ad785961dadd9cb00e698822e4ea'></iframe>"))