# Training the Lbl2Vec model

## Imports

In [16]:
import json
import pandas as pd
from lbl2vec import Lbl2Vec
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument
from gensim.parsing.preprocessing import strip_tags

## Creating Lbl2Vec Data
Columns: `title`, `text`

### Read JSON

In [2]:
# Load the JSON export and pull out its list of issues.
path = "./data/JSON_outputs/example_data.json"

# Use a context manager so the file handle is closed even if parsing fails;
# read as UTF-8 (the standard JSON encoding) instead of the platform default.
with open(path, encoding="utf-8") as f:
    json_object = json.load(f)

issues = json_object["issues"]

### Get the Title/Text Dataframe

In [28]:
# Walk every article of every issue and collect its OCR text and title.
# Iterate directly: the enumerate() counters were never used.

texts = []
titles = []

for issue in issues:
    for article in issue["articles"]:
        texts.append(article["ocr_text"])
        titles.append(article["title"])

# Generate the dataframe: one row per article.

df = pd.DataFrame({
    "title": titles,
    "text": texts
})

# When the previous step of the pipeline fails to get the title,
# replace the title with the empty string. The "null" marker is matched
# as a whole cell value and the replacement applies to every column.

df = df.replace({
    "null": ""
})

# A JSON null title is loaded as None rather than the string "null";
# normalise it as well so later string concatenation on the title works.
df["title"] = df["title"].fillna("")

df.head()

Unnamed: 0,title,text
0,,ulace. The only effectual way to refute false\...
1,"FOR SALE AT THIS OFFICE,\nPAMPHLET,","FOR SALE AT THIS OFFICE,\nPAMPHLET, entitled '..."
2,ANTI-SLAVERY PUBLICATIONS,ANTI-SLAVERY PUBLICATIONS.\nTHERE SOCIETY have...
3,SINGING SCHOOL,SINGING SCHOOL.\nTHE subscriber would most res...
4,,th\nto\nca\nm\nW\nve\n02\nW\nW\nso\na\n0\n74\n...


### Tokenize Data

In [23]:
def tokenize(doc):
    """Convert a raw document string into a list of tokens.

    The text is first passed through ``strip_tags`` to drop markup tags,
    then through ``simple_preprocess``, which lowercases the text and splits
    it into tokens, discarding numerical values and punctuation characters.

    Args:
        doc: document text string.

    Returns:
        The tokenized document: a list of lowercase tokens with accents
        removed (``deacc=True``), keeping only tokens between 2 and 15
        characters long.
    """
    without_tags = strip_tags(doc)
    tokens = simple_preprocess(without_tags, deacc=True, min_len=2, max_len=15)
    return tokens

In [29]:
# Build one TaggedDocument per row for Lbl2Vec training: the words are the
# tokens of "<title>. <text>" and the single tag is the row's index label.
def _row_to_tagged_document(row):
    """Return the TaggedDocument for one dataframe row."""
    combined_text = row['title'] + '. ' + row['text']
    doc_tag = str(row.name)
    return TaggedDocument(tokenize(combined_text), [doc_tag])


df['tagged_docs'] = df.apply(_row_to_tagged_document, axis=1)

## Train and Save the Model

In [None]:
# Configure the Lbl2Vec model.
# NOTE(review): `labels` is not defined in any cell visible in this notebook;
# it is indexed by 'keywords' and 'class_name' below, so it must be loaded
# before this cell runs - confirm where it comes from.
label_keywords = list(labels['keywords'])
label_names = list(labels['class_name'])

lbl2vec_model = Lbl2Vec(
    keywords_list=label_keywords,
    tagged_documents=df['tagged_docs'],
    label_names=label_names,
    similarity_threshold=0.30,
    min_num_docs=100,
    epochs=10,
)

In [None]:
# Train the model configured in the previous cell.
lbl2vec_model.fit()

In [None]:
# Lbl2Vec.save() takes the path of the file to write; "." is a directory,
# so give the saved model an explicit file name in the current directory.
lbl2vec_model.save("./lbl2vec_model")