In [1]:
import pandas as pd
import json
import os
import re
from lxml import etree
import time
import pickle
# Move up to the parent directory so that the relative "data/" paths used
# further down resolve. The guard keeps the cell idempotent: once "data/" is
# visible from the working directory, re-running the cell no longer climbs
# one more level.
# NOTE(review): assumes the notebook's own folder has no "data" sub-directory.
if not os.path.isdir("data"):
    os.chdir("..")

In [32]:
class ContentParser:
    """
    Parse one XML document and pull out the article fields used downstream.

    The XPath expressions below expect a root element in a default namespace
    (registered here under the prefix ``atom``) that contains
    ``content/articleDoc``; the ``articleDoc`` subtree is searched without a
    namespace. All fields are extracted eagerly in ``__init__`` and any field
    whose element or attribute is missing is stored as ``None`` instead of
    raising.

    Attributes
    ----------
    tree:             parsed root element
    nsmap:            prefix -> namespace URI mapping used for lookups
    title:            text of ``atom:title`` (falls back to first ``nitf:hl1``)
    author:           text of ``author/person/nameText``
    publication_date: text of ``atom:published``
    body_text:        text fragments below ``bodyText`` joined by blank lines
    lang:             value of ``xml:lang`` on ``articleDoc`` (case untouched)
    """

    # Clark-notation name of the ``xml:lang`` attribute.
    _XML_LANG_ATTR = "{http://www.w3.org/XML/1998/namespace}lang"

    def __init__(self, xml_str):
        self.tree = etree.fromstring(xml_str)
        self.nsmap = self.get_nsmap()
        # Parse
        self.title = self.get_title()
        self.author = self.get_author()
        self.publication_date = self.get_published()
        self.body_text = self.get_body_text()
        self.lang = self.get_language()

    def get_nsmap(self):
        """
        Collect every namespace declared anywhere in the document.

        Prefixed namespaces keep their prefix; a non-empty default namespace
        is registered under the prefix ``atom`` so it can be addressed in
        path expressions. When the same key is seen more than once, the last
        one encountered wins.
        """
        nsmap = {}
        for prefix, uri in self.tree.xpath("//namespace::*"):
            if prefix:
                nsmap[prefix] = uri
            elif uri != "":
                nsmap["atom"] = uri
        return nsmap

    def _find_atom(self, path):
        """
        Return the first element matching ``path`` (which uses the ``atom``
        prefix) or ``None``. Also returns ``None`` when the document declared
        no default namespace, because the lookup would otherwise fail on the
        unknown prefix.
        """
        if "atom" not in self.nsmap:
            return None
        return self.tree.find(path, namespaces=self.nsmap)

    def _article_doc(self):
        """Return the ``articleDoc`` element under ``atom:content``, or None."""
        return self._find_atom("atom:content/articleDoc")

    def get_title(self):
        """
        Return the entry title, or ``None`` when no title can be found.

        Falls back to the first ``nitf:hl1`` headline when ``atom:title`` is
        missing or empty. The fallback is only attempted when the ``nitf``
        prefix is declared, since searching with an undeclared prefix raises.
        """
        title_node = self._find_atom("atom:title")
        title = title_node.text if title_node is not None else None
        if not title and "nitf" in self.nsmap:
            headline = self.tree.find(".//nitf:hl1", namespaces=self.nsmap)
            title = headline.text if headline is not None else None
        return title

    def get_author(self):
        """Return the text of ``author/person/nameText``, or ``None``."""
        name_node = self.tree.find(".//author/person/nameText")
        return name_node.text if name_node is not None else None

    def get_published(self):
        """Return the text of ``atom:published``, or ``None`` if absent."""
        published = self._find_atom("atom:published")
        return published.text if published is not None else None

    def get_body_text(self):
        """
        Return all text fragments below ``bodyText`` joined by blank lines,
        or ``None`` when the document has no ``articleDoc``/``bodyText``.
        """
        article = self._article_doc()
        body = article.find(".//bodyText") if article is not None else None
        if body is None:
            return None
        return "\n\n".join(body.itertext())

    def get_content_map(self):
        """Return the extracted fields as a flat dict (one record per article)."""
        return {
            "title": self.title,
            "author": self.author,
            "publication_date": self.publication_date,
            "text": self.body_text,
            "lang": self.lang,
        }

    def get_language(self):
        """
        Return the ``xml:lang`` attribute of ``articleDoc``, or ``None`` when
        the element or the attribute is missing.
        """
        article = self._article_doc()
        if article is None:
            return None
        return article.get(self._XML_LANG_ATTR)


def parse_content(xml):
    """Parse one XML document string and return its extracted fields as a dict."""
    return ContentParser(xml_str=xml).get_content_map()


#
# Helper functions
#


def json_to_dataframe(json_list, cols=None):
    """
    Flatten a list of JSON records into a DataFrame, optionally keeping only
    selected columns.

    params
    ------
    json_list:   list of dicts (JSON), data fetched from NU api
    cols:   optional, list of column names to keep; names that are not
            present in the flattened frame are ignored

    returns
    -------
    pandas dataframe
    """
    print("Converting json list to Pandas DataFrame...")
    flattened = pd.json_normalize(json_list)
    return flattened.filter(items=cols) if cols else flattened


def parse_results_df(results_df):
    """
    Parse the XML in the "Document.Content" column and append the extracted
    fields (whatever keys `parse_content` returns) as new columns.

    params
    ------
    results_df:   dataframe with a "Document.Content" column of XML strings

    returns
    -------
    new pandas dataframe: the input columns plus one column per parsed field.
    The input frame is left untouched.

    raises
    ------
    ValueError (from pandas) if a parsed field name collides with an existing
    column name.
    """
    print("\nParsing results...")
    parsed_records = results_df["Document.Content"].apply(parse_content).tolist()
    parsed_df = pd.json_normalize(parsed_records)
    # json_normalize numbers its rows 0..n-1; re-use the caller's index so the
    # join lines rows up even when results_df has a non-default index.
    parsed_df.index = results_df.index
    return results_df.join(parsed_df)


def clean_lang_field_and_subset(results_df, lang="en"):
    """
    Lower-case the "lang" column and keep only the rows in the given language.

    params
    ------
    results_df:   dataframe with a "lang" column
    lang:   language code to keep; compared case-insensitively

    returns
    -------
    new pandas dataframe holding the matching rows, with "lang" lower-cased.
    The input frame is left untouched; rows with a missing language are dropped.
    """
    print(f"Cleaning language field and subsetting df to language: {lang} ...")
    lang_lower = results_df["lang"].str.lower()
    # The column is lower-cased, so the requested code has to be as well,
    # otherwise e.g. lang="EN" could never match.
    wanted = lang.lower() if isinstance(lang, str) else lang
    keep = lang_lower == wanted
    return results_df.loc[keep].assign(lang=lang_lower[keep])


def clean_doc_ids(results_df):
    """
    Strip everything up to and including "contentItem:" from the
    "Document.DocumentId" column.

    params
    ------
    results_df:   dataframe with a string "Document.DocumentId" column

    returns
    -------
    new pandas dataframe with the cleaned ids. The input frame is left
    untouched. Ids that do not contain the "contentItem:" marker are kept
    as they are instead of being turned into NaN.
    """
    print("Cleaning document ids...")
    raw_ids = results_df["Document.DocumentId"]
    stripped = raw_ids.str.split("contentItem:").str[1]
    # .str[1] is NaN when the marker is absent; fall back to the original id.
    cleaned = stripped.fillna(raw_ids)
    return results_df.assign(**{"Document.DocumentId": cleaned})


def transform_pipeline(json_list):
    """
    Run the full chain of transformations on the fetched records:
    flatten to a dataframe, clean document ids, parse the XML content and
    keep only rows in the default language.
    """
    wanted_columns = ["ResultID", "Document.DocumentId", "Document.Content", "Source.Name"]
    flattened = json_to_dataframe(json_list=json_list, cols=wanted_columns)
    # Add search string to results?
    return (
        flattened
        .pipe(clean_doc_ids)
        .pipe(parse_results_df)
        .pipe(clean_lang_field_and_subset)
    )


def load_json_list(path):
    """Read the JSON file at `path` and return the decoded content."""
    print(f"Loading file: {path}")
    with open(path, "rb") as handle:
        return json.load(handle)


def df_to_json(path, df):
    """
    Write `df` to `path` as JSON Lines (one record per line) unless a file
    already exists there.

    params
    ------
    path:   destination file path
    df:     pandas dataframe to serialise

    raises
    ------
    Whatever `df.to_json` raises. A partially written file is removed first,
    so a later call does not mistake it for finished output and skip writing.
    """
    if os.path.exists(path):
        print("JSON already exists.")
        return

    print(f"JSON does not exist. Writing {path}...")
    try:
        df.to_json(path, orient="records", lines=True)
    except BaseException:
        # Also covers a kernel interrupt in the middle of the write.
        if os.path.exists(path):
            os.remove(path)
        raise
    print("JSON saved.")


In [33]:
# Fetched result files are named "<word>-<word>?<query>.json". The pattern is a
# raw string so the regex escapes (\w, \?, \.) are not read as (invalid)
# Python string escapes. os.listdir returns names in arbitrary order, so the
# list is sorted to make files[1] refer to the same file on every run.
files = sorted(f for f in os.listdir("data/") if re.match(r"\w+-\w+\?.*\.json", f))
json_list = load_json_list(path=os.path.join("data/", files[1]))
df = transform_pipeline(json_list=json_list)
df

Loading file: data/Ethiopia-Oromo?Date_gt_1991-01-01_and_Date_lt_1999-12-31.json
Converting json list to Pandas DataFrame...
Cleaning document ids...

Parsing results...
Cleaning language field and subsetting df to language: en ...


Unnamed: 0,Document.DocumentId,Document.Content,Source.Name,title,author,publication_date,text,lang
0,3SJF-45G0-005H-0450-00000-00,"<entry xmlns=""http://www.w3.org/2005/Atom""><id...",The Associated Press,Ousted Marxists To Be Tried For War Crimes,,1992-09-03T00:00:00Z,Ethiopia plans to name a prosecutor this week ...,en
1,3TD9-9720-0091-H1VM-00000-00,"<entry xmlns=""http://www.w3.org/2005/Atom""><id...",Africa Intelligence : Indian Ocean Newsletter,ETHIOPIA,,1993-12-25T00:00:00Z,The stance of the Ethiopian government toward...,en
2,3RFX-6KG0-00BT-M03H-00000-00,"<entry xmlns=""http://www.w3.org/2005/Atom""><id...",Associated Press International,Ethiopian rebels enter Somalia,HAROUN HASSAN,1997-12-03T00:00:00Z,Several hundred Ethiopians from the Oromo Libe...,en
3,3SJB-4F30-0034-T419-00000-00,"<entry xmlns=""http://www.w3.org/2005/Atom""><id...",TASS,TALKS ON SETTLEMENT IN ETHIOPIA OPEN IN LONDON,,1991-05-28T00:00:00Z,TALKS ON A PEACE SETTLEMENT OF THE CONFLICT IN...,en
4,3SJB-5740-0012-24V8-00000-00,"<entry xmlns=""http://www.w3.org/2005/Atom""><id...",The Christian Science Monitor,"In New Ethiopia, Main Tribe Takes Peaceful Rou...",,1991-07-15T00:00:00Z,ETHNIC divisions run deep in Ethiopia. And the...,en
...,...,...,...,...,...,...,...,...
583,3SJ4-DND0-000G-J0DM-00000-00,"<entry xmlns=""http://www.w3.org/2005/Atom""><id...",Federal News Service,prepared statement,,1994-07-29T00:00:00Z,"Mr. Chairman and members of the committee, I t...",en
584,3SJ4-G7S0-000G-J0G7-00000-00,"<entry xmlns=""http://www.w3.org/2005/Atom""><id...",Federal News Service,HEARING OF THE AFRICA SUBCOMMITTEE OF THE,,1992-09-17T00:00:00Z,"REP. DYMALLY: Good morning, and welcome to the...",en
585,3SJD-NFH0-002D-R3HK-00000-00,"<entry xmlns=""http://www.w3.org/2005/Atom""><id...",Department of State Dispatch,ETHIOPIA,,2022-06-12T13:58:40Z,A coalition of ethnic-based insurgencies topp...,en
586,3SJD-MXV0-002D-R0GY-00000-00,"<entry xmlns=""http://www.w3.org/2005/Atom""><id...",Department of State Dispatch,"Ethiopia Human Rights Practices, 1995",,2022-06-12T13:58:40Z,"Ethiopia Human Rights Practices, 1995\n\nEthio...",en
