In [1]:
import lxml.html
import html
from nltk import word_tokenize, sent_tokenize
from tqdm import tqdm
import pandas as pd
import gcld3
import os
import re

In [2]:
# Load the OJS languages/disciplines CSV into a DataFrame.
ojs_csv_path = os.path.join("data", "OJS_languages_disciplines.csv")
with open(ojs_csv_path, "r") as csv_handle:
    ojs = pd.read_csv(csv_handle)

In [3]:
# Keep only the rows whose "discipline" column equals "Education",
# then drop the reference to the full table.
is_education = ojs["discipline"] == "Education"
ojsED = ojs[is_education]
del ojs

In [4]:
# Collect the primary and alternate ISSNs of the Education journals
# (primary ISSNs first, then the alternates).
# Empty CSV cells are read by pandas as NaN, and NaN is truthy, so a plain
# `if issn` test would keep them. pd.notna() drops the missing values
# explicitly; the truthiness check still removes empty strings.
ed_issns = ojsED["issn"].tolist() + ojsED["issn_alt"].tolist()
ed_issns = [issn for issn in ed_issns if pd.notna(issn) and issn]
print(f"{len(ed_issns)} education journal issns in the OJS data dump.")

5076 education journal issns in the OJS data dump.


In [5]:
# Relative location of the metadata dump that is scanned line by line below.
datadump_parts = ("data", "datadump.txt")
path_to_datadump = os.path.join(*datadump_parts)

In [6]:
# Language identifier used to keep only English titles/descriptions.
# The two arguments are the minimum and maximum number of bytes of the input
# text that the model considers.
identifier = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=1000)

In [7]:
# Scan the data dump line by line and collect the English titles and
# descriptions of every record whose <source> matches an Education ISSN.
#
# Reads : path_to_datadump, ed_issns, identifier
# Writes: docs (list of unescaped title/description strings)
docs = []
skipped_fields = 0  # fields dropped because language detection/unescaping failed
metadata_pattern = "<metadata>.+</metadata>"

# The ISSN collection is probed once per <source> element; a set makes each
# probe O(1) instead of a linear scan over the list.
ed_issn_lookup = set(ed_issns)

with open(path_to_datadump, "r") as infile:

    for line in tqdm(infile):
        # Greedy + DOTALL: grabs everything from the first <metadata> to the
        # last </metadata> found on this line.
        content = re.search(metadata_pattern, line, re.MULTILINE | re.DOTALL)
        if content is None:
            continue

        tree = lxml.html.fromstring(content.group())

        for child in tree.getchildren():
            # A record counts as Education when any of its <source> elements
            # carries one of the Education ISSNs.
            ed_record = any(
                source.text is not None and source.text.strip() in ed_issn_lookup
                for source in child.xpath(".//source")
            )
            if not ed_record:
                continue

            # Titles first, then descriptions (same order as they are appended).
            fields = child.xpath(".//title") + child.xpath(".//description")
            for field in fields:
                text = field.text
                if text is None:
                    # Empty element: nothing to classify. Skip just this field
                    # rather than abandoning the rest of the record.
                    continue
                try:
                    if identifier.FindLanguage(text).language == "en":
                        docs.append(html.unescape(text))
                except Exception:
                    # Best effort: one bad field must not abort the long scan.
                    # Unlike a bare `except:`, this still lets
                    # KeyboardInterrupt/SystemExit stop the loop.
                    skipped_fields += 1

        # Remove any earlier siblings of the parsed root before moving on.
        # NOTE(review): presumably meant to release memory; confirm it is needed.
        while tree.getprevious() is not None:
            del tree.getparent()[0]
        del content

print(f"{len(docs)} docs pulled from the data dump. Each is a title or description of an OJS Education article.")
if skipped_fields:
    print(f"{skipped_fields} title/description fields skipped because they could not be processed.")

27707it [52:12,  8.84it/s]

339649 docs pulled from the data dump. Each is a title or description of an OJS Education article.





In [10]:
# Relative location of the text file that receives one sentence per line.
txt_parts = ("data", "ojs_docs.txt")
path_to_txt = os.path.join(*txt_parts)

In [13]:
# Split every collected title/description into sentences and write each
# sentence on its own line; num_docs counts the sentences written.
num_docs = 0
with open(path_to_txt, "w") as outfile:
    for doc in docs:
        for sent in sent_tokenize(doc):
            # Replace the literal two-character "\n" escape with a space, and do
            # the same for raw CR/LF characters: a real line break inside a
            # sentence would otherwise split it over several lines and make
            # num_docs disagree with the number of lines in the file.
            sent_clean = re.sub(r"\\n|[\r\n]+", " ", sent)
            # Drop any backslashes that remain.
            sent_clean = sent_clean.replace("\\", "")
            outfile.write(f"{sent_clean}\n")  # Write each title or sentence to file
            num_docs += 1

In [14]:
# Report how many sentence-level documents were written, then drop the
# in-memory list of titles/descriptions.
docs_report = f"{num_docs} docs saved as OJS topic model inputs. Each doc is a title or sentence describing an Ed article."
print(docs_report)
del docs

1651014 docs saved as OJS topic model inputs. Each doc is a title or sentence describing an Ed article.


In [16]:
%%time

# Count the word tokens in the saved file, tokenizing one line at a time.
with open(path_to_txt, "r") as infile:
    num_toks = sum(len(word_tokenize(saved_line)) for saved_line in infile)

CPU times: user 3min 6s, sys: 299 ms, total: 3min 7s
Wall time: 3min 7s


In [17]:
# Report the total token count of the saved topic-model input file.
tokens_report = f"{num_toks} tokens saved as OJS topic model inputs."
print(tokens_report)

43120464 tokens saved as OJS topic model inputs.
