# Training the Lbl2Vec model

## Imports

In [3]:
import json
import pandas as pd
from lbl2vec import Lbl2Vec
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument
from gensim.parsing.preprocessing import strip_tags

## Creating Lbl2Vec Data
The training dataframe has two columns: `title` and `text` (the article's OCR text).

### Get Keywords

In [4]:
labels = pd.read_csv("keywords.csv")


def _split_keywords(synonyms):
    """Split a ';'-separated synonym string into a list of lowercase keywords.

    Whitespace around each keyword is stripped, so "colonization; liberia"
    yields 'liberia' rather than ' liberia', and empty entries (e.g. from a
    trailing separator) are dropped.
    """
    stripped = (keyword.strip() for keyword in synonyms.split(';'))
    return [keyword.lower() for keyword in stripped if keyword]


# replace the raw synonym string with the cleaned keyword list
labels['keywords'] = labels['Synonyms'].apply(_split_keywords)
del labels['Synonyms']

# get number of keywords for each class
labels['number_of_keywords'] = labels['keywords'].apply(len)

labels.head()

Unnamed: 0,Subjects,URIs,keywords,number_of_keywords
0,Africa--Colonization,http://id.loc.gov/authorities/subjects/sh85001542,"[colonize, colonization, liberia, the]",4
1,Chinese Americans,http://id.loc.gov/authorities/subjects/sh85024244,"[chinese, asian, oriental, the]",4
2,Christianity and politics--United States,http://id.loc.gov/authorities/subjects/sh20081...,"[church, religion, christian, protestant, meth...",7
3,Constitutions--United States,http://id.loc.gov/authorities/subjects/sh20091...,"[constitution, amendment, declaration, bill, o...",7
4,Cotton trade,http://id.loc.gov/authorities/subjects/sh85033353,"[cotton, gin, picking, plantation, the]",5


### Read JSON

In [5]:
# Load the pipeline output; the context manager closes the file handle
# as soon as the JSON has been parsed.
path = "../data/JSON_outputs/data.json"
with open(path) as f:
    json_object = json.load(f)

# list of issues, each holding its own list of articles
issues = json_object["issues"]

### Get the Title/Text Dataframe

In [6]:
# Flatten the issues into one list of articles (issue order, then article order)
# and collect the title and OCR text of each one.
all_articles = [article for issue in issues for article in issue["articles"]]
texts = [article["ocr_text"] for article in all_articles]
titles = [article["title"] for article in all_articles]

# Build the dataframe. When the previous pipeline step fails to get a title
# it emits the string "null"; such values are replaced with the empty string.
df = pd.DataFrame({"title": titles, "text": texts}).replace({"null": ""})

df.head()

Unnamed: 0,title,text
0,FOREIGN ITEMS.,"FOREIGN ITEMS.\nIt is stated from London, Ja. ..."
1,Fatal Temerity,"Fatal Temerity.--The Erie Observer states,\nth..."
2,,The new Lunatic Hospital in Worcester is nearl...
3,Cure for the Gout,"Cure for the Gout. Pray, Mr Abernethy,\nwhat i..."
4,,A few years\nBEWARE OF A DRUNKEN HUSBAND!\nOh ...


### Tokenize Data

In [8]:
def tokenize(doc):
    """Return the tokenized form of a document.

    doc: document text string.

    The text is first passed through ``strip_tags`` (removes meta tags from
    the text) and then through ``simple_preprocess``, which converts the
    document into a list of lowercase tokens, ignoring tokens that are too
    short (fewer than 2 characters) or too long (more than 15 characters).
    ``simple_preprocess`` also removes numerical values and punctuation
    characters.
    """
    without_tags = strip_tags(doc)
    return simple_preprocess(without_tags, deacc=True, min_len=2, max_len=15)

In [9]:
def _to_tagged_document(row):
    """Tokenize one row's title + text and tag it with the row index (as a string)."""
    combined_text = row['title'] + '. ' + row['text']
    return TaggedDocument(tokenize(combined_text), [str(row.name)])


# tokenize and tag every document (combined title + text) for Lbl2Vec training
df['tagged_docs'] = df.apply(_to_tagged_document, axis=1)

## Train and Save the Model

### Understanding Parameters

In [10]:
# keywords_list: print each label's keywords without the final entry of the list

for label_keywords in list(labels['keywords']):
    print(label_keywords[:-1])

['colonize', 'colonization', 'liberia']
['chinese', 'asian', 'oriental']
['church', 'religion', 'christian', 'protestant', 'methodist', 'preach']
['constitution', 'amendment', 'declaration', 'bill', 'of', 'rights']
['cotton', 'gin', 'picking', 'plantation']
['freedmen', 'manumission', 'christiana']
['runaway', 'fugitive', 'abduction', 'catcher']
['cherokee', 'choctaw', 'seminole', 'chickasaw', 'hopewell', 'removal', 'tribe', 'tribal', 'indian']
['miscegenation', 'interracial', 'race-mixing', 'amalgamation']
['abduction', 'fugitive', 'kidnap', 'catcher', 'snatch']
['hanging', 'lynch', 'mob', 'vigilante']
['nonviolence', 'pacifism', 'pacifists']
['quadroon', 'quarteron', 'octoroon', 'hexadecaroon', 'biracial', 'mulatto']
['rebellion', 'uprising', 'haiti', 'revolt', 'boukman', 'louverture', 'nat', 'turner']
['poll', 'ballot', 'vote', 'voting', 'literacy', 'test', 'election']


In [11]:
# tagged documents: the tokenized title + text of each article, tagged with its row index

df['tagged_docs']

0     ([foreign, items, foreign, items, it, is, stat...
1     ([fatal, temerity, fatal, temerity, the, erie,...
2     ([the, new, lunatic, hospital, in, worcester, ...
3     ([cure, for, the, gout, cure, for, the, gout, ...
4     ([few, years, beware, of, drunken, husband, oh...
                            ...                        
95    ([new, england, anti, new, england, anti, slav...
96    ([george, putman, george, putman, hair, dresse...
97    ([cena, cena, john, pero, no, in, rear, of, do...
98    ([from, the, daily, advocate, from, the, daily...
99    ([their, rights, were, acknowledged, and, the,...
Name: tagged_docs, Length: 100, dtype: object

In [12]:
# label_names: the subject heading of every label, in the same order as the keyword lists

labels['Subjects'].tolist()

['Africa--Colonization',
 'Chinese Americans',
 'Christianity and politics--United States',
 'Constitutions--United States',
 'Cotton trade',
 'Freedmen',
 'Fugitive slaves',
 'Indians of North America',
 'Interracial marriage',
 'Kidnapping',
 'Lynching',
 'Nonviolence',
 'Racially mixed people',
 'Slave insurrections',
 'Voting--United States']

In [13]:
# init model with parameters
# Optional parameters currently left at their defaults:
# similarity_threshold=0.30, min_num_docs=100, epochs=1
lbl2vec_model = Lbl2Vec(
    keywords_list=list(labels['keywords']),
    tagged_documents=df['tagged_docs'],
    label_names=list(labels['Subjects']),
)

['colonize', 'colonization', ' liberia', 'the']
['chinese', 'asian', 'oriental', 'the']
['church', 'religion', 'christian', 'protestant', 'methodist', 'preach', 'the']
['constitution', 'amendment', 'declaration', 'bill', 'of', 'rights', 'the']
['cotton', 'gin', 'picking', 'plantation', 'the']
['freedmen', 'manumission', 'christiana', 'the']
['runaway', 'fugitive', 'abduction', 'catcher', 'the']
['cherokee', 'choctaw', 'seminole', 'chickasaw', 'hopewell', 'removal', 'tribe', 'tribal', 'indian', 'the']
['miscegenation', 'interracial', 'race-mixing', 'amalgamation', 'the']
['abduction', 'fugitive', 'kidnap', 'catcher', 'snatch', 'the']
['hanging', 'lynch', 'mob', 'vigilante', 'the']
['nonviolence', 'pacifism', 'pacifists', 'the']
['quadroon', 'quarteron', 'octoroon', 'hexadecaroon', 'biracial', 'mulatto', 'the']
['rebellion', 'uprising', 'haiti', 'revolt', 'boukman', 'louverture', 'nat', 'turner', 'the']
['poll', 'ballot', 'vote', 'voting', 'literacy', 'test', 'election', 'the']

In [14]:
# Train the model; per the log output it trains document and word embeddings first, then label embeddings
lbl2vec_model.fit()

2022-05-04 15:46:02,433 - Lbl2Vec - INFO - Train document and word embeddings


DOC2VEC ARGS: {'documents': 0     ([foreign, items, foreign, items, it, is, stat...
1     ([fatal, temerity, fatal, temerity, the, erie,...
2     ([the, new, lunatic, hospital, in, worcester, ...
3     ([cure, for, the, gout, cure, for, the, gout, ...
4     ([few, years, beware, of, drunken, husband, oh...
                            ...                        
95    ([new, england, anti, new, england, anti, slav...
96    ([george, putman, george, putman, hair, dresse...
97    ([cena, cena, john, pero, no, in, rear, of, do...
98    ([from, the, daily, advocate, from, the, daily...
99    ([their, rights, were, acknowledged, and, the,...
Name: tagged_docs, Length: 100, dtype: object, 'epochs': 10, 'vector_size': 300, 'min_count': 0, 'window': 15, 'sample': 1e-05, 'negative': 5, 'workers': 4, 'hs': 1, 'dm': 0, 'dbow_words': 1}


2022-05-04 15:46:06,984 - Lbl2Vec - INFO - Train label embeddings


asking for words: ['colonize', 'colonization', 'liberia', 'the']
using words: {'colonization', 'the', 'liberia'}
asking for words: ['chinese', 'asian', 'oriental', 'the']
using words: {'the'}
asking for words: ['church', 'religion', 'christian', 'protestant', 'methodist', 'preach', 'the']
using words: {'christian', 'protestant', 'religion', 'preach', 'church', 'methodist', 'the'}
asking for words: ['constitution', 'amendment', 'declaration', 'bill', 'of', 'rights', 'the']
using words: {'declaration', 'rights', 'of', 'constitution', 'the'}
asking for words: ['cotton', 'gin', 'picking', 'plantation', 'the']
using words: {'picking', 'cotton', 'the'}
asking for words: ['freedmen', 'manumission', 'christiana', 'the']
using words: {'manumission', 'the'}
asking for words: ['runaway', 'fugitive', 'abduction', 'catcher', 'the']
using words: {'catcher', 'fugitive', 'the'}
asking for words: ['cherokee', 'choctaw', 'seminole', 'chickasaw', 'hopewell', 'removal', 'tribe', 'tribal', 'indian', 'the']

In [15]:
# lbl2vec_model.save(".")

## Make predictions

In [16]:
# Compute the document<->label similarities for the documents held by the model
model_docs_lbl_similarities = lbl2vec_model.predict_model_docs()

2022-05-04 15:46:07,738 - Lbl2Vec - INFO - Get document embeddings from model
2022-05-04 15:46:07,744 - Lbl2Vec - INFO - Calculate document<->label similarities


In [17]:
# Show the similarity table: one row per document, one score column per label
model_docs_lbl_similarities

Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,Africa--Colonization,Chinese Americans,Christianity and politics--United States,Constitutions--United States,Cotton trade,Freedmen,Fugitive slaves,Indians of North America,Interracial marriage,Kidnapping,Lynching,Nonviolence,Racially mixed people,Slave insurrections,Voting--United States
0,0,Africa--Colonization,0.709996,0.709996,0.428333,0.693960,0.566940,0.355575,0.707694,0.224068,0.589931,0.428333,0.224068,0.488490,0.428333,0.428333,0.301777,0.471014
1,1,Cotton trade,0.696382,0.404825,0.462857,0.455743,0.470591,0.696382,0.425603,0.385575,0.612112,0.462857,0.385575,0.591580,0.462857,0.462857,0.438228,0.434838
2,2,Indians of North America,0.672383,0.620828,0.505278,0.633313,0.590515,0.556339,0.669802,0.321214,0.672383,0.505278,0.321214,0.524023,0.505278,0.505278,0.498799,0.660515
3,3,Constitutions--United States,0.841666,0.674209,0.739377,0.728831,0.841666,0.815882,0.691609,0.730258,0.809734,0.739377,0.730258,0.832017,0.739377,0.739377,0.829521,0.758492
4,4,Slave insurrections,0.379634,0.150345,0.350662,0.190933,0.266263,0.332981,0.163883,0.336914,0.246277,0.350662,0.336914,0.268486,0.350662,0.350662,0.379634,0.334491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,Voting--United States,0.651611,0.444519,0.351769,0.469157,0.490843,0.339678,0.538170,0.163099,0.435109,0.351769,0.163099,0.370413,0.351769,0.351769,0.489355,0.651611
96,96,Freedmen,0.733368,0.713781,0.463116,0.694496,0.651753,0.429786,0.733368,0.194129,0.673136,0.463116,0.194129,0.485669,0.463116,0.463116,0.485552,0.711469
97,97,Cotton trade,0.574762,0.323989,0.379717,0.402667,0.444049,0.574762,0.374659,0.364397,0.528508,0.379717,0.364397,0.461816,0.379717,0.379717,0.390239,0.358193
98,98,Cotton trade,0.431116,0.202127,0.249510,0.251978,0.355965,0.431116,0.218819,0.308520,0.388936,0.249510,0.308520,0.395357,0.249510,0.249510,0.375792,0.261016


In [29]:
# Get top three most similar labels for each row

Subjects = list(labels["Subjects"])


def _top_labels(scores, n):
    """Return the names of the n highest-scoring labels of one row, best first."""
    best = scores.nlargest(n)
    return pd.Series(best.index)


label_scores = model_docs_lbl_similarities[Subjects]
top3_df = label_scores.apply(_top_labels, axis=1, n=3)

In [30]:
# Collapse the three label columns of each row into a single list per document
top3_lists = top3_df.apply(list, axis=1)
model_docs_lbl_similarities["top3"] = top3_lists

In [33]:
# Iterate through the JSON, and write the predictions to the JSON object.
#
# The documents were numbered globally across all issues when the dataframe
# was built (the tag of each document is its dataframe row index as a string),
# so the per-issue article index must NOT be used to look up predictions:
# it restarts at 0 for every issue and would give every issue the predictions
# of the first one. A running counter over all articles is used instead,
# matched against the `doc_key` column.
top3_by_doc_key = dict(
    zip(
        model_docs_lbl_similarities["doc_key"].astype(str),
        model_docs_lbl_similarities["top3"],
    )
)

doc_index = 0
for issue in issues:
    for article in issue["articles"]:
        # a missing key raises KeyError, i.e. a document without a prediction
        article["subjects"] = top3_by_doc_key[str(doc_index)]
        doc_index += 1

In [34]:
# Show the predictions: one line per article, in issue order

for issue in issues:
    for article in issue["articles"]:
        print(article["subjects"])

['Africa--Colonization', 'Freedmen', 'Christianity and politics--United States']
['Cotton trade', 'Indians of North America', 'Lynching']
['Indians of North America', 'Freedmen', 'Voting--United States']
['Constitutions--United States', 'Lynching', 'Slave insurrections']
['Slave insurrections', 'Chinese Americans', 'Interracial marriage']
['Freedmen', 'Christianity and politics--United States', 'Africa--Colonization']
['Voting--United States', 'Slave insurrections', 'Constitutions--United States']
['Africa--Colonization', 'Freedmen', 'Indians of North America']
['Slave insurrections', 'Cotton trade', 'Fugitive slaves']
['Cotton trade', 'Lynching', 'Fugitive slaves']
['Lynching', 'Constitutions--United States', 'Indians of North America']
['Slave insurrections', 'Fugitive slaves', 'Kidnapping']
['Fugitive slaves', 'Kidnapping', 'Slave insurrections']
['Voting--United States', 'Africa--Colonization', 'Freedmen']
['Slave insurrections', 'Voting--United States', 'Constitutions--United Stat

In [37]:
# Write the JSON object, now carrying the predicted subjects, to disk
out_path = "../data/JSON_outputs/example__final_data.json"
with open(out_path, mode="w") as out_file:
    json.dump(json_object, out_file)