In [None]:
# This notebook computes and visualises some statistics on the OCR quality of the data 

# marieke.van.erp@dh.huc.knaw.nl
# 2 April 2018 


import pandas as pd
import glob
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 
from dateutil.parser import parse
from dateutil.tz import gettz

In [None]:
# Collect every file in the working directory whose name ends in "tsv".
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so sort
# them: the row order of the combined frame, and anything derived from it,
# is then the same on every machine and every run.
species = sorted(glob.glob('*tsv'))

In [None]:
# All species in one go: read every TSV and stack the rows into one frame.
#
# The rows are stacked as raw arrays (not pd.concat) on purpose: building the
# DataFrame from a bare array gives positional integer column labels
# (0, 1, 2, ...), and the later cells address columns as 2 and 6.
# The header line of each file (header=0) is consumed by read_csv and dropped
# when the frame is converted to an array.

import numpy as np

if not species:
    # np.vstack([]) fails with an opaque message; say what is actually wrong.
    raise ValueError("No input files: glob pattern '*tsv' matched nothing in the working directory")

np_array_list = []
for file_ in species:
    df = pd.read_csv(file_, index_col=None, header=0, sep="\t")
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
    # supported replacement and returns the same 2-D array of values.
    np_array_list.append(df.to_numpy())

# All files must have the same number of columns for vstack to succeed.
comb_np_array = np.vstack(np_array_list)
big_frame = pd.DataFrame(comb_np_array)

big_frame.head()


In [None]:
# Figure 2-left in the paper: OCR score compared across newspaper titles.
# One jittered point per row; column 6 is the value that later cells label
# 'OCR Quality Score'.
# NOTE(review): column 2 is also the column the later cells cut into date
# bins, so the x-axis here may be a date rather than a title - confirm
# against the TSV header.
sns.stripplot(
    data=big_frame,
    x=2,
    y=6,
    jitter=True,
)

In [None]:
# Cut column 2 into 10 equal-width, right-closed intervals and store the
# interval of each row in a new 'binned' column.
# The hand-written tick labels used for the plot of these bins step by 14
# years (1799-1813, 1813-1827, ...), i.e. each bin covers roughly 14 years.
big_frame['binned'] = pd.cut(big_frame[2], bins=10, right=True)

# Non-null value counts per column for every bin.
big_frame.groupby(by='binned').count()



In [None]:
# OCR quality score per date bin, 10 bins (labelled Figure 2-right in the paper).
# NOTE(review): the 14-bin cell further down carries the same "Figure 2-right"
# label - confirm which of the two is the published figure.
plt.figure(figsize=(10,5))

# Hand-written tick labels, one per interval of big_frame['binned'], in bin order.
text = ["1799-1813", "1813-1827", '1827-1841', "1841-1855", "1855-1869","1869-1883", "1883-1897", "1897-1911", "1911-1925", "1925-1939"]

# 'binned' is overwritten by a later cell with a different number of bins, so
# running cells out of order would silently mislabel the axis. Fail loudly instead.
assert len(text) == len(big_frame['binned'].cat.categories), \
    "tick labels do not match the number of bins in big_frame['binned']; re-run the 10-bin cell first"

# plot the figure
# NOTE(review): light_palette defaults to 6 colours, fewer than the 10 bins,
# so the colours repeat across bins - confirm this is intended.
ax = sns.stripplot(x=big_frame['binned'], y=6, data=big_frame, jitter=True, palette=sns.light_palette("navy"))
# rotation must be a number (or 'vertical'/'horizontal'); the string '45' is
# rejected with a ValueError by current matplotlib.
ax.set_xticklabels(text, rotation=45, fontsize=10)
ax.set_xlabel('Date')
ax.set_ylabel('OCR Quality Score')
plt.show()

In [None]:
# Re-bin column 2 into 14 equal-width, right-closed intervals (about ten
# years each, per the tick labels below). This overwrites the 10-bin 'binned'
# column created earlier; the per-bin counts are displayed by the next cell.
big_frame['binned'] = pd.cut(big_frame[2], 14, right=True)

# OCR quality score per date bin, 14 bins (labelled Figure 2-right in the paper).
# NOTE(review): the 10-bin cell above carries the same "Figure 2-right"
# label - confirm which of the two is the published figure.
plt.figure(figsize=(10,5))

# Hand-written tick labels, one per interval of big_frame['binned'], in bin order.
text = ["1799-1809", "1809-1819", '1819-1829', "1829-1839", "1839-1849","1849-1859", "1859-1869", "1869-1879", "1879-1889", "1889-1899", "1899-1909", "1909-1919", "1919-1929", "1929-1939"]

# Guard against the label list and the bin count drifting apart when one of
# them is edited.
assert len(text) == len(big_frame['binned'].cat.categories), \
    "tick labels do not match the number of bins in big_frame['binned']"

# plot the figure
# NOTE(review): light_palette defaults to 6 colours, fewer than the 14 bins,
# so the colours repeat across bins - confirm this is intended.
ax = sns.stripplot(x=big_frame['binned'], y=6, data=big_frame, jitter=True, palette=sns.light_palette("navy"))
# rotation must be a number (or 'vertical'/'horizontal'); the string '45' is
# rejected with a ValueError by current matplotlib.
ax.set_xticklabels(text, rotation=45, fontsize=10)
ax.set_xlabel('Date')
ax.set_ylabel('OCR Quality Score')
plt.show()

In [None]:
# Non-null value counts per column for each interval currently stored in
# 'binned' (whichever binning cell ran last).
big_frame.groupby(by='binned').count()