# Exploring your TroveHarvester data

In [None]:
import os
import pandas as pd # makes manipulating the data easier
import plotly.offline as py # for charts
import plotly.graph_objs as go

py.init_notebook_mode() # initialise plotly's offline mode for use inside this notebook

In [None]:
def get_latest_harvest():
    '''
    Get the timestamp of the most recent harvest.

    Each harvest is expected to live in its own sub-directory of 'data',
    named with the harvest's timestamp, so the name that sorts last is
    treated as the most recent.

    Only directories are considered, and hidden ones (names starting with
    '.') are ignored, so stray files or checkpoint folders inside 'data'
    can't be mistaken for a harvest.

    Returns the name of the harvest directory as a string.

    Raises FileNotFoundError (from os.listdir) if there is no 'data'
    directory, or IndexError if it contains no harvest directories.
    '''
    harvests = sorted(
        name for name in os.listdir('data')
        if not name.startswith('.') and os.path.isdir(os.path.join('data', name))
    )
    if not harvests:
        # Same exception type as indexing an empty list, but with a useful message
        raise IndexError("No harvests found in the 'data' directory")
    return harvests[-1]

In [None]:
def open_harvest_data(timestamp=None):
    '''
    Load the results of a harvest into a DataFrame.

    If no timestamp is supplied, the most recent harvest is used.
    The timestamp of the harvest being opened is printed.
    '''
    harvest = timestamp if timestamp else get_latest_harvest()
    print(harvest)
    results_path = os.path.join('data', harvest, 'results.csv')
    return pd.read_csv(results_path)

In [None]:
# Load the most recent harvest into a DataFrame (its timestamp is printed)
df = open_harvest_data()

In [None]:
# Let's have a look! Leaving the DataFrame on the last line of the cell displays it.
df

In [None]:
# Count the articles per newspaper title (most frequent first) and keep the top 20
newspaper_counts = df['newspaper_title'].value_counts()
top_newspapers = newspaper_counts.head(20)
top_newspapers

In [None]:
# Chart the most common newspapers as a bar chart (one bar per newspaper title)
trace = go.Bar(
    x=top_newspapers.index.values,
    y=top_newspapers.values
)
layout = go.Layout(
    # Margins are given as a plain dict rather than go.Margin, which is
    # deprecated in later plotly releases (in favour of go.layout.Margin).
    # Presumably the large bottom margin is there to leave room for the
    # newspaper titles along the x axis.
    margin=dict(l=0, r=100, b=150, t=50, pad=4),
    title='Most common newspapers'
)
plot_data = [trace]
fig = go.Figure(data=plot_data, layout=layout)
py.iplot(fig, filename='top-newspapers')


## Show when the articles were published

In [None]:
# Get the date distribution of articles: count the articles for each date,
# then sort the counts by date value
date_counts = (
    df['date']
    .value_counts()
    .sort_index()
)

In [None]:
# Draw a bar chart of the number of articles per date
trace = go.Bar(x=date_counts.index.values, y=date_counts.values)
layout = go.Layout(
    xaxis={'title': 'Date'},
    yaxis={'rangemode': 'tozero', 'title': 'Number of articles'},
)
plot_data = [trace]
fig = go.Figure(data=plot_data, layout=layout)
py.iplot(fig, filename='number-by-date')

## Find the longest article

In [None]:
# Which article (or articles, if there is a tie) has the highest word count?
df.loc[df['words'].eq(df['words'].max())]

## Make a simple word cloud

In [None]:
# Lower-case every article title and join them, separated by spaces, into a single string
title_text = df['title'].str.lower().str.cat(sep=' ')

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Build a 1200 x 800 word cloud from the combined article titles
wordcloud = WordCloud(width=1200, height=800).generate(title_text)

# Show the generated image with matplotlib, with the axes hidden
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')

## Using TextBlob

In [None]:
import nltk
from operator import itemgetter
from textblob import TextBlob

# Fetch the NLTK resources used by the cells below
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# English stopwords, used to filter the word counts
stopwords = nltk.corpus.stopwords.words('english')

# Wrap the combined titles in a TextBlob for analysis
blob = TextBlob(title_text)

In [None]:
# Collect [word, count] pairs for every word that isn't a stopword
word_counts = [
    [token, total]
    for token, total in blob.lower().word_counts.items()
    if token not in stopwords
]

# Keep the 25 most frequent words, highest count first
word_counts.sort(key=itemgetter(1), reverse=True)
word_counts = word_counts[:25]

# Show them as a table, with an in-cell bar drawn over the count column
(
    pd.DataFrame(word_counts)
    .style
    .format({1: '{:,}'})
    .bar(subset=[1], color='#d65f5f')
    .set_properties(subset=[1], **{'width': '300px'})
)

## Mapping newspaper locations

This makes use of a spreadsheet file that maps Trove newspaper titles to locations. Once we've loaded the spreadsheet we can use it to locate all of the harvested articles.

In [None]:
# URL of the Trove places spreadsheet (the query string asks for CSV output)
trove_places = 'https://docs.google.com/spreadsheets/d/1rURriHBSf3MocI8wsdl1114t0YeyU0BVSXWeg232MZs/gviz/tq?tqx=out:csv&sheet=198244298'

# Load the CSV into a DataFrame with pandas
place_df = pd.read_csv(trove_places)

In [None]:
# We're going to map the locations using ipyleaflet, a Python implementation of the popular Leaflet javascript library.
# Let's import what we need.
from ipyleaflet import Map, Marker, MarkerCluster

# Create the map
m = Map(center=(-28, 140), zoom=4)

# Loop through the results creating a marker for each article
markers = []
for row in df.itertuples(index=False):
    try:
        # Look up the newspaper identifier in the locations spreadsheet
        # (.iloc[0] raises IndexError when no row matches)
        location = place_df.loc[place_df['title_id'] == row.newspaper_id].iloc[0]
    except IndexError:
        # No matching row in the spreadsheet -- presumably a Government Gazette.
        # Report it and skip this article; without the else branch below the
        # loop would reuse the location left over from a previous article
        # (or fail with a NameError if this is the very first row).
        print('Not found: {}'.format(row.newspaper_id))
    else:
        # Only build a marker when the lookup succeeded
        marker = Marker(location=(float(location['latitude']), float(location['longitude'])))
        markers.append(marker)

# Group all the markers into a single clustered layer
marker_cluster = MarkerCluster(
    markers=markers
)

m.add_layer(marker_cluster);
m

## Explore article texts in Voyant Tools

First we need to zip up all the little text files for easy transport.

In [None]:
import zipfile

def zip_harvest_texts(timestamp=None):
    '''
    Zip up the text files from a harvest (most recent by default).

    Every '.txt' file in the harvest's 'text' directory is added to
    'data/<timestamp>/<timestamp>-texts.zip'. Files are stored at the top
    level of the archive under their own file names.

    Raises FileNotFoundError (from os.listdir) if the harvest has no
    'text' directory.
    '''
    if not timestamp:
        timestamp = get_latest_harvest()
    data_dir = os.path.join('data', timestamp)
    texts_dir = os.path.join(data_dir, 'text')
    # List the files before creating the archive, so that a missing 'text'
    # directory doesn't leave an empty zip file behind. Sorting gives the
    # archive a predictable order.
    text_files = sorted(t for t in os.listdir(texts_dir) if t.endswith('.txt'))
    zip_path = os.path.join(data_dir, '{}-texts.zip'.format(timestamp))
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for text_file in text_files:
            # The second argument stores the file under its bare name, without the directory path
            zip_file.write(os.path.join(texts_dir, text_file), text_file)

# Zip up the texts from the most recent harvest
zip_harvest_texts()

In [None]:
# Save the article titles to a text file in the latest harvest's directory, one title per row
timestamp = get_latest_harvest()
# header=False keeps the column name out of the file: newer versions of pandas
# write a header row for a Series by default, while older versions did not.
df['title'].to_csv(os.path.join('data', timestamp, 'titles.txt'), index=False, index_label=False, header=False)