# Training the Lbl2Vec model

## Imports

In [5]:
import json
import pandas as pd
from lbl2vec import Lbl2Vec
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import strip_tags

## Creating Lbl2Vec Data
The training dataframe has two columns: `title` and `text` (the article's OCR text, i.e. its description).

### Get Keywords

In [2]:
# Load the label table: one row per subject heading, with its synonyms packed
# into a single ';'-separated string in the 'Synonyms' column.
labels = pd.read_csv("keywords.csv")

# Split the synonyms on ';', trim the whitespace left around each separator,
# lowercase, and drop empty entries. Without the trim, "a; b" yields ' b',
# which cannot match the whitespace-free tokens produced by the tokenizer.
labels['keywords'] = labels['Synonyms'].apply(
    lambda synonyms: [
        keyword.strip().lower()
        for keyword in synonyms.split(';')
        if keyword.strip()
    ]
)
del labels['Synonyms']

# get number of keywords for each class
labels['number_of_keywords'] = labels['keywords'].apply(len)

labels.head()

Unnamed: 0,Subjects,URIs,keywords,number_of_keywords
0,Africa--Colonization,http://id.loc.gov/authorities/subjects/sh85001542,"[colonize, colonization, liberia, the]",4
1,Chinese Americans,http://id.loc.gov/authorities/subjects/sh85024244,"[chinese, asian, oriental, the]",4
2,Christianity and politics--United States,http://id.loc.gov/authorities/subjects/sh20081...,"[church, religion, christian, protestant, meth...",7
3,Constitutions--United States,http://id.loc.gov/authorities/subjects/sh20091...,"[constitution, amendment, declaration, bill, o...",7
4,Cotton trade,http://id.loc.gov/authorities/subjects/sh85033353,"[cotton, gin, picking, plantation, the]",5


### Read JSON

In [3]:
# Load the example newspaper issues from the JSON export.
# The context manager closes the file handle as soon as parsing finishes, and
# the explicit encoding keeps the read independent of the platform default
# (assumes the export is UTF-8, as JSON normally is — confirm).
path = "../data/JSON_outputs/example_data.json"
with open(path, encoding="utf-8") as f:
    json_object = json.load(f)

# The articles are nested under the top-level "issues" key
issues = json_object["issues"]

### Get the Title/Text Dataframe

In [4]:
# Flatten the issue -> article nesting into one list of article records,
# keeping the original issue/article order.
all_articles = [article for issue in issues for article in issue["articles"]]

# Pull the two fields needed downstream out of every article
texts = [article["ocr_text"] for article in all_articles]
titles = [article["title"] for article in all_articles]

# Assemble the dataframe: one row per article
df = pd.DataFrame({"title": titles, "text": texts})

# Cells holding the literal string "null" (used when the previous pipeline
# step fails to get a title) become empty strings
df = df.replace({"null": ""})

df.head()

Unnamed: 0,title,text
0,,ulace. The only effectual way to refute false\...
1,"FOR SALE AT THIS OFFICE,\nPAMPHLET,","FOR SALE AT THIS OFFICE,\nPAMPHLET, entitled '..."
2,ANTI-SLAVERY PUBLICATIONS,ANTI-SLAVERY PUBLICATIONS.\nTHERE SOCIETY have...
3,SINGING SCHOOL,SINGING SCHOOL.\nTHE subscriber would most res...
4,,th\nto\nca\nm\nW\nve\n02\nW\nW\nso\na\n0\n74\n...


In [5]:
# df["text"][0]

### Tokenize Data

In [6]:
def tokenize(doc):
    """Turn a raw document string into a list of tokens.

    strip_tags first removes meta tags from the text. simple_preprocess then
    converts the result into a list of lowercase tokens, removing accents
    (deacc=True), punctuation characters and numerical values, and ignoring
    tokens shorter than 2 or longer than 15 characters.

    doc: document text string
    returns: tokenized document
    """
    text_without_tags = strip_tags(doc)
    tokens = simple_preprocess(text_without_tags, deacc=True, min_len=2, max_len=15)
    return tokens

In [7]:
def _tag_row(row):
    """Build the TaggedDocument for one dataframe row.

    The tokens come from the title and text joined by '. '; the single tag is
    the row's index label rendered as a string.
    """
    combined = row['title'] + '. ' + row['text']
    return TaggedDocument(tokenize(combined), [str(row.name)])


# tokenize and tag every article (title + text) for Lbl2Vec training
df['tagged_docs'] = df.apply(_tag_row, axis=1)

## Train and Save the Model

### Load Pre-trained Doc2vec

In [8]:
# Load the pre-trained Doc2Vec model from disk.
# NOTE(review): the output saved with this cell is an AttributeError about
# 'DocvecsArray' on gensim.models.doc2vec, which looks like the model file was
# written by a different gensim version than the one installed — confirm and
# pin a compatible gensim before relying on this cell.
doc2vec_fpath = "enwiki_dbow/doc2vec.bin"
doc2vec_model = Doc2Vec.load(doc2vec_fpath)

# Sanity-check the vocabulary with a short sample; the full index_to_key list
# holds every word in the model and would flood the cell output.
doc2vec_model.wv.index_to_key[:20]

AttributeError: Can't get attribute 'DocvecsArray' on <module 'gensim.models.doc2vec' from '/Users/danieldelijani/Library/Python/3.8/lib/python/site-packages/gensim/models/doc2vec.py'>

### Train Model

In [13]:
# Gather the per-label keyword lists and the matching label names, then
# initialise Lbl2Vec on top of the pre-trained Doc2Vec model.
# NOTE(review): in the saved predictions further down, every label column has
# the same score for a given document — confirm the keyword lists are
# discriminative enough (each list shown in the output ends with 'the').
keyword_lists = labels['keywords'].tolist()
subject_names = labels['Subjects'].tolist()
lbl2vec_model = Lbl2Vec(
    keywords_list=keyword_lists,
    doc2vec_model=doc2vec_model,
    label_names=subject_names,
)

['colonize', 'colonization', ' liberia', 'the']
['chinese', 'asian', 'oriental', 'the']
['church', 'religion', 'christian', 'protestant', 'methodist', 'preach', 'the']
['constitution', 'amendment', 'declaration', 'bill', 'of', 'rights', 'the']
['cotton', 'gin', 'picking', 'plantation', 'the']
['freedmen', 'manumission', 'christiana', 'the']
['runaway', 'fugitive', 'abduction', 'catcher', 'the']
['cherokee', 'choctaw', 'seminole', 'chickasaw', 'hopewell', 'removal', 'tribe', 'tribal', 'indian', 'the']
['miscegenation', 'interracial', 'race-mixing', 'amalgamation', 'the']
['abduction', 'fugitive', 'kidnap', 'catcher', 'snatch', 'the']
['hanging', 'lynch', 'mob', 'vigilante', 'the']
['nonviolence', 'pacifism', 'pacifists', 'the']
['quadroon', 'quarteron', 'octoroon', 'hexadecaroon', 'biracial', 'mulatto', 'the']
['rebellion', 'uprising', 'haiti', 'revolt', 'boukman', 'louverture', 'nat', 'turner', 'the']
['poll', 'ballot', 'vote', 'voting', 'literacy', 'test', 'election', 'the']

In [14]:
# Fit the Lbl2Vec model; per the saved log output this step trains the label embeddings
lbl2vec_model.fit()

2022-04-28 17:15:42,793 - Lbl2Vec - INFO - Train document and word embeddings


DOC2VEC ARGS: {'documents': 0     ([ulace, the, only, effectual, way, to, refute...
1     ([for, sale, at, this, office, pamphlet, for, ...
2     ([anti, slavery, publications, anti, slavery, ...
3     ([singing, school, singing, school, the, subsc...
4     ([th, to, ca, ve, so, ste, dreadful, consequen...
5     ([boston, august, th, mr, wm, garrison, dear, ...
6     ([webster, thus, defines, blasphemy, blas, phe...
7     ([for, the, liberator, baptist, association, t...
8     ([mr, birney, letter, the, editor, of, the, ly...
9     ([attempted, murder, and, suicide, colored, ma...
10    ([list, of, letters, list, of, letters, receiv...
11    ([church, action, on, slavery, church, action,...
12    ([the, question, of, abolition, is, warmly, ag...
13    ([from, the, new, world, the, episcopal, spiri...
14    ([from, the, new, world, the, jury, law, some,...
15    ([the, letter, of, dr, dionysius, lardner, bru...
16    ([anti, slavery, in, indiana, angola, steuben,...
17    ([friendly, ad

2022-04-28 17:15:46,853 - Lbl2Vec - INFO - Train label embeddings


asking for words: ['colonize', 'colonization', 'liberia', 'the']
using words: {'colonization', 'the'}
asking for words: ['chinese', 'asian', 'oriental', 'the']
using words: {'the'}
asking for words: ['church', 'religion', 'christian', 'protestant', 'methodist', 'preach', 'the']
using words: {'church', 'preach', 'the', 'christian', 'religion'}
asking for words: ['constitution', 'amendment', 'declaration', 'bill', 'of', 'rights', 'the']
using words: {'constitution', 'bill', 'declaration', 'of', 'the', 'rights'}
asking for words: ['cotton', 'gin', 'picking', 'plantation', 'the']
using words: {'the', 'plantation'}
asking for words: ['freedmen', 'manumission', 'christiana', 'the']
using words: {'the'}
asking for words: ['runaway', 'fugitive', 'abduction', 'catcher', 'the']
using words: {'the', 'fugitive', 'runaway'}
asking for words: ['cherokee', 'choctaw', 'seminole', 'chickasaw', 'hopewell', 'removal', 'tribe', 'tribal', 'indian', 'the']
using words: {'removal', 'the', 'tribe'}
asking for

In [None]:
# lbl2vec_model.save(".")

## Make predictions

In [15]:
# Compute document<->label similarities for the documents already held by the model
model_docs_lbl_similarities = lbl2vec_model.predict_model_docs()

model_docs_lbl_similarities

2022-04-28 17:16:26,817 - Lbl2Vec - INFO - Get document embeddings from model
2022-04-28 17:16:26,821 - Lbl2Vec - INFO - Calculate document<->label similarities


In [16]:
# Score the tagged documents from df against every label via the new-document prediction path
new_docs_lbl_similarities = lbl2vec_model.predict_new_docs(tagged_docs=df['tagged_docs'])
# Display the resulting similarity table
new_docs_lbl_similarities

Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,Africa--Colonization,Chinese Americans,Christianity and politics--United States,Constitutions--United States,Cotton trade,Freedmen,Fugitive slaves,Indians of North America,Interracial marriage,Kidnapping,Lynching,Nonviolence,Racially mixed people,Slave insurrections,Voting--United States
0,0,Indians of North America,0.843478,0.843478,0.843478,0.843478,0.843478,0.843478,0.843478,0.843478,0.843478,0.843478,0.843478,0.843478,0.843478,0.843478,0.843478,0.843478
1,1,Kidnapping,0.930938,0.930938,0.930938,0.930938,0.930938,0.930938,0.930938,0.930938,0.930938,0.930938,0.930938,0.930938,0.930938,0.930938,0.930938,0.930938
2,2,Fugitive slaves,0.892044,0.892044,0.892044,0.892044,0.892044,0.892044,0.892044,0.892044,0.892044,0.892044,0.892044,0.892044,0.892044,0.892044,0.892044,0.892044
3,3,Indians of North America,0.688551,0.688551,0.688551,0.688551,0.688551,0.688551,0.688551,0.688551,0.688551,0.688551,0.688551,0.688551,0.688551,0.688551,0.688551,0.688551
4,4,Christianity and politics--United States,0.523468,0.523468,0.523468,0.523468,0.523468,0.523468,0.523468,0.523468,0.523468,0.523468,0.523468,0.523468,0.523468,0.523468,0.523468,0.523468
5,5,Christianity and politics--United States,0.735022,0.735022,0.735022,0.735022,0.735022,0.735022,0.735022,0.735022,0.735022,0.735022,0.735022,0.735022,0.735022,0.735022,0.735022,0.735022
6,6,Slave insurrections,0.875167,0.875167,0.875167,0.875167,0.875167,0.875167,0.875167,0.875167,0.875167,0.875167,0.875167,0.875167,0.875167,0.875167,0.875167,0.875167
7,7,Racially mixed people,0.701596,0.701596,0.701596,0.701596,0.701596,0.701596,0.701596,0.701596,0.701596,0.701596,0.701596,0.701596,0.701596,0.701596,0.701596,0.701596
8,8,Indians of North America,0.738099,0.738099,0.738099,0.738099,0.738099,0.738099,0.738099,0.738099,0.738099,0.738099,0.738099,0.738099,0.738099,0.738099,0.738099,0.738099
9,9,Constitutions--United States,0.648984,0.648984,0.648984,0.648984,0.648984,0.648984,0.648984,0.648984,0.648984,0.648984,0.648984,0.648984,0.648984,0.648984,0.648984,0.648984
