In [5]:
import csv, os

# Names of entries in the working directory that start with "exp2" and end
# with "csv"; these are the cluster files examined later in the notebook.
clusters = [
    entry
    for entry in os.listdir(".")
    if entry.startswith("exp2") and entry.endswith("csv")
]

from clustering_capitals import NewspaperArchive, ClusterDB
from burney_data import BurneyDB

# Handle on the newspaper text-area archive rooted at the given directory.
# NOTE(review): `n` is not referenced again in the visible cells - confirm
# whether later cells still need it.
n = NewspaperArchive("/datastore/burneytextareas")
# Metadata database; used just below to enumerate the newspapers it knows.
bdb = BurneyDB("burney.db")

# Cluster database; the filename suggests it covers 1745-1756 - TODO confirm.
# NOTE(review): `db` is not referenced again in the visible cells.
db = ClusterDB("1745_1756_with_linestarts.db")

# Print each newspaper's title next to its abbreviation; the abbreviation is
# the identifier used as the `gold_sets` key and in the cluster CSV rows.
for md in bdb.list_all_newspapers():
    print(md['title'], md['titleAbbreviation'])

ORIGINAL WEEKLY JOURNAL B0001ORIWEEJO
AYRE'S SUNDAY LONDON GAZETTE B0002SUNLONGA
AYRE'S SUNDAY LONDON GAZETTE AND WEEKLY MONITOR B0003SUNLONGA
DIE VENERIS 16 MAII 1645 BY THE LORDS AND COMMONS ASSEMBLED IN PARLIAMENT AT WESTMINSTER B0004LORDCOMM
E. JOHNSON'S BRITISH GAZETTE AND SUNDAY MONITOR B0007BRGASUMO
WILLIAMSON'S LIVERPOOL ADVERTISER AND MERCANTILE CHRONICLE B0010LIVEADVE
BRIEFE RELATION OF SOME AFFAIRES AND TRANSACTIONS B0011BRIEFREL
COLLECTION FOR IMPROVEMENT OF HUSBANDRY AND TRADE B0013COLLTRAD
COLLECTION OF THE SEVERAL ADDRESSES IN THE LATE KING JAMES'S TIME B0014COLLADDR
CONTINUATION OF CERTAIN SPECIALL AND REMARKABLE PASSAGES FROM BOTH HOUSES OF PARLIAMENT (COOKE AND WOOD) B0015CNTPSPRL
CONTINUATION OF CERTAIN SPECIALL AND REMARKABLE PASSAGES FROM BOTH HOUSES OF PARLIAMENT (COLES AND LEACH) B0016CNTINPRL
CONTINUATION OF OUR WEEKLY INTELLIGENCE FROM HIS MAJESTIES ARMY B0017CNTWKINT
PROCEEDINGS OF THE ARMY UNDER THE COMMAND OF SIR THOMAS FAIRFAX B0018CONTPROC
PARTICULAR RELAT

In [15]:
# Gold-standard annotation files, keyed by newspaper title abbreviation.
gold_sets = {
    "B0237GENEVEPO": "geneveningpost_utf8.csv",
    "B0911WESTJOUR": "westminsterjournal_utf8.csv",
}

# Lower-case English month name -> month number (1-12).
_MONTH_NAMES = (
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
)
m_map = {name: number for number, name in enumerate(_MONTH_NAMES, start=1)}

def gold_set_md(sources=None, month_map=None):
    """Yield one metadata dict per data row of each gold-set CSV file.

    Each file's first row is treated as a header and skipped. Every remaining
    row is read positionally:

    * columns 2/3/4 hold a day / month name / year,
    * columns 5/6/7 hold a second day / month name / year; when column 5 is
      non-empty this second date is used instead of the first,
    * column 9 is the page number and column 10 the column label.

    The year is taken from the chosen date's own year column, then from the
    other date's year column, and finally from the most recent year seen
    earlier in the same file (rows often leave the year blank).

    Args:
        sources: mapping of title abbreviation -> CSV path. Defaults to the
            module-level ``gold_sets``.
        month_map: mapping of lower-case month name -> month number. Defaults
            to the module-level ``m_map``.

    Yields:
        dict with string values under the keys ``titleAbbreviation``,
        ``day`` (2 digits), ``month`` (2 digits), ``year``, ``page``
        (4 digits) and ``column``.

    Raises:
        KeyError: if a month name is not present in ``month_map``.
        ValueError: if a day or page cell is not an integer.
        IndexError: if a non-blank row has fewer than 11 columns.
    """
    if sources is None:
        sources = gold_sets
    if month_map is None:
        month_map = m_map

    for abbreviation, path in sources.items():
        # The gold files are UTF-8 (see their names); newline="" is what the
        # csv module requires so embedded newlines are parsed correctly.
        with open(path, "r", encoding="utf-8", newline="") as csvfile:
            reader = csv.reader(csvfile)
            # Skip the header. The default keeps an empty file from raising
            # StopIteration inside this generator (a RuntimeError, PEP 479).
            next(reader, None)
            year = ''  # last year seen in this file, reused for blank cells
            for row in reader:
                # Blank lines / all-empty records carry no annotation.
                if not any(cell.strip() for cell in row):
                    continue
                if row[5] != "":
                    day, month_name = row[5], row[6]
                    year = row[7] or row[4] or year
                else:
                    day, month_name = row[2], row[3]
                    year = row[4] or row[7] or year
                yield {
                    "titleAbbreviation": abbreviation,
                    "day": "{0:02d}".format(int(day)),
                    "month": "{0:02d}".format(month_map[month_name.strip().lower()]),
                    "year": year,
                    "page": "{0:04d}".format(int(row[9])),
                    "column": row[10],
                }

def get_hash_set():
    """Build the lookup keys for the gold-set annotations.

    Returns:
        A pair ``(page_keys, paper_year_keys)`` of string sets.
        ``page_keys`` holds abbreviation + year + month + day + page for
        every gold-set row; ``paper_year_keys`` holds abbreviation + year.
    """
    page_keys, paper_year_keys = set(), set()
    for md in gold_set_md():
        paper_year = md['titleAbbreviation'] + md['year']
        paper_year_keys.add(paper_year)
        # The page-level key is the paper/year key extended by the date and page.
        page_keys.add(paper_year + md['month'] + md['day'] + md['page'])
    return page_keys, paper_year_keys

def test_clusters(cluster_list):
    spread = {}
    nspread = {}
    cfile = {}
    for clusterfile in cluster_list:
        cno = int(clusterfile.split(".")[0][12:])
        cfile[cno] = clusterfile
        spread[cno] = 0
        nspread[cno] = 0
        gsethash, nhash = get_hash_set()
        with open(clusterfile, "r") as csf:
            cdoc = csv.DictReader(csf)
            for row in cdoc:
                hashv = row['titleAbbreviation']+row['year']+row['month']+row['day']+row['page']
                nhashv = row['titleAbbreviation']+row['year']
                if hashv in gsethash:
                    spread[cno] += 1
                if nhashv in nhash:
                    nspread[cno] += 1

    print("If clustering was random with respect to poems, this spread should be roughly equal in each cluster.")
    print("Coverage: (each hit = a block found in a cluster within a golden set marked page)")
    for cno in sorted(spread.keys()):
        if spread[cno] != 0 and nspread[cno] != 0:
            print("Cluster {0} - {1}/{2} ({3:.2f}%) hits/total blocks from 2 targetted newspapers in 1745".format(cfile[cno], spread[cno], nspread[cno], spread[cno]/nspread[cno]*100))
        else:
            print("Cluster {0} - {1}/{2} hits/total blocks (0%) from targetted newspapers+year".format(cfile[cno], spread[cno], nspread[cno]))

test_clusters(clusters)

If clustering was random with respect to poems, this spread should be roughly equal in each cluster.
Coverage: (each hit = a block found in a cluster within a golden set marked page)
Cluster exp2_cluster0.csv - 0/56 hits/total blocks (0%) from targetted newspapers+year
Cluster exp2_cluster1.csv - 0/10 hits/total blocks (0%) from targetted newspapers+year
Cluster exp2_cluster2.csv - 7/225 (3.11%) hits/total blocks from 2 targetted newspapers in 1745
Cluster exp2_cluster3.csv - 0/6 hits/total blocks (0%) from targetted newspapers+year
Cluster exp2_cluster4.csv - 0/0 hits/total blocks (0%) from targetted newspapers+year
Cluster exp2_cluster5.csv - 64/2438 (2.63%) hits/total blocks from 2 targetted newspapers in 1745
Cluster exp2_cluster6.csv - 0/22 hits/total blocks (0%) from targetted newspapers+year
Cluster exp2_cluster7.csv - 0/46 hits/total blocks (0%) from targetted newspapers+year
Cluster exp2_cluster8.csv - 8/641 (1.25%) hits/total blocks from 2 targetted newspapers in 1745
Cluster

Even though cluster 5 has the most hits (64), it also holds far more blocks of text from the two newspapers (2438) than any other cluster, so its hit rate (2.63%) is diluted: a good but watered-down response. Cluster 2, with 7 hits in 225 blocks (3.11%), is proportionally more relevant.

The blocks that definitely have poems on or around them all fall into just 3 clusters. This is promising, because the 2 newspapers are present in all but one of the 12 clusters.

In [3]:
import pandas as pd
# Load cluster 5 with every column kept as a string (dtype=str), so no values
# are converted by pandas' type inference.
c5 = pd.read_csv('exp2_cluster5.csv', dtype=str)
# Last expression of the cell: display the first rows as a preview.
c5.head()

Unnamed: 0,title,titleAbbreviation,year,month,day,issueNumber,printedDate,page,article,block_number,filepath,st_caps,st_nums,x1_var1,x2_var1,ltcount
0,UNIVERSAL SPECTATOR AND WEEKLY JOURNAL,B0848UNSPWEJO,1745,1,5,848,SATURDAY JANUARY 5 1745,1,1,1,/cygdrive/w/APEX/B0848UNSPWEJO/1745/01/05/service,0.4166666666666667,0.0,17702.061728395063,2082.131944444444,2713
1,UNIVERSAL SPECTATOR AND WEEKLY JOURNAL,B0848UNSPWEJO,1745,1,5,848,SATURDAY JANUARY 5 1745,1,1,3,/cygdrive/w/APEX/B0848UNSPWEJO/1745/01/05/service,0.4102564102564102,0.0,367.1452991452991,3908.794214332676,3254
2,UNIVERSAL SPECTATOR AND WEEKLY JOURNAL,B0848UNSPWEJO,1745,1,5,848,SATURDAY JANUARY 5 1745,2,2,0,/cygdrive/w/APEX/B0848UNSPWEJO/1745/01/05/service,0.5454545454545454,0.0454545454545454,1473.97520661157,42.58057851239669,1863
3,UNIVERSAL SPECTATOR AND WEEKLY JOURNAL,B0848UNSPWEJO,1745,1,5,848,SATURDAY JANUARY 5 1745,2,2,3,/cygdrive/w/APEX/B0848UNSPWEJO/1745/01/05/service,0.3333333333333333,0.0,21515.36507936508,2513.8004535147397,1637
4,UNIVERSAL SPECTATOR AND WEEKLY JOURNAL,B0848UNSPWEJO,1745,1,5,848,SATURDAY JANUARY 5 1745,2,2,6,/cygdrive/w/APEX/B0848UNSPWEJO/1745/01/05/service,0.5,0.0,9.0,1.0,3
