In [13]:
import pandas as pd
import json
import os
import re
from lxml import etree
import time
import pickle

#
# Data transformation utility classes
#


class ContentParser:
    def __init__(self, xml_str):
        self.tree = etree.fromstring(xml_str)
        self.nsmap = self.get_nsmap()
        # Parse
        self.title = self.get_title()
        self.author = self.get_author()
        self.publication_date = self.get_published()
        self.body_text = self.get_body_text()
        self.lang = self.get_language()

    def get_nsmap(self):
        nsmap = {}
        for ns in self.tree.xpath("//namespace::*"):
            if not ns[0] and ns[1] != "":
                nsmap["atom"] = ns[1]
            elif ns[0]:
                nsmap[ns[0]] = ns[1]
        return nsmap

    def get_title(self):
        title = self.tree.find("atom:title", namespaces=self.nsmap).text
        if not title:
            title_list = self.tree.xpath(".//nitf:hl1", namespaces=self.nsmap)
            title = title_list[0].text if len(title_list) > 0 else None
        return title

    def get_author(self):
        if self.tree.find(".//author/person/nameText") is not None:
            return self.tree.find(".//author/person/nameText").text
        else:
            return None

    def get_published(self):
        return self.tree.find("atom:published", namespaces=self.nsmap).text

    def get_body_text(self):
        text = None
        try:
            text = "\n\n".join(
                self.tree.find("atom:content/articleDoc", namespaces=self.nsmap)
                .find(".//bodyText")
                .itertext()
            )
        except Exception as e:
            print(e)
        return text

    def get_content_map(self):
        return {
            "title": self.title,
            "author": self.author,
            "publication_date": self.publication_date,
            "text": self.body_text,
            "lang": self.lang,
        }

    def get_language(self):
        return self.tree.find("atom:content/articleDoc", namespaces=self.nsmap).xpath(
            "./@xml:lang"
        )[0]


def parse_content(xml):
    content = ContentParser(xml_str=xml)
    return content.get_content_map()


#
# Helper functions
#


def json_to_dataframe(json_list, cols=None):
    """
    Converts json list of fetched results into dataframe and subsets to columns of interest

    params
    ------
    json_list:   list of dicts (JSON), data fetched from NU api
    cols:   optional, list of column names to subset returned dataframe on

    returns
    -------
    pandas dataframe
    """
    print("Converting json list to Pandas DataFrame...")
    df = pd.json_normalize(json_list)
    if cols:
        df = df.filter(items=cols)
    return df


def parse_results_df(results_df):
    """
    Extracts the body text from news articles in API search results
    """
    # Parse XML and append to df
    print("Parsing results...")
    doc_cont_parsed = "Document.Content.Parsed"
    results_df[doc_cont_parsed] = results_df["Document.Content"].apply(parse_content)
    results_df = results_df.join(pd.json_normalize(results_df[doc_cont_parsed]))
    results_df.drop(columns=doc_cont_parsed, inplace=True)
    return results_df


def clean_lang_field_and_subset(results_df, lang="en"):
    print(f"Cleaning language field and subsetting df to language: {lang} ...")
    results_df["lang"] = results_df["lang"].str.lower()
    en_idx = results_df["lang"] == lang  # Subset to desired language
    return results_df.loc[en_idx]


def clean_doc_ids(results_df):
    print("Cleaning document ids...")
    results_df["Document.DocumentId"] = (
        results_df["Document.DocumentId"].str.split("contentItem:").str[1]
    )
    return results_df


def drop_null_rows(df):
    n_rows = df[df['Document.Content'].isnull()].shape[0]
    print(f"Dropping {n_rows} rows with missing content...")
    return df[~df['Document.Content'].isnull()]
    

def drop_unnecessary_cols(df, cols=["Document.Content", "author", "lang"]):
    print("Removing unneeded columns...")
    return df.drop(columns=cols)


def sort_by_date(df, asc=True):
    print(f"Sorting documents by date, {'ascending' if asc else 'descending'}...")
    df["publication_date"] = pd.to_datetime(df["publication_date"])
    return df.sort_values(by="publication_date", ascending=asc)


def transform_pipeline(json_list):
    """
    Pipeline of transformations to perform on the fetched data
    """
    df = json_to_dataframe(
        json_list=json_list,
        cols=["ResultID", "Document.DocumentId", "Document.Content", "Source.Name"],
    )
    df = clean_doc_ids(results_df=df)
    df = drop_null_rows(df=df)
    df = parse_results_df(results_df=df)
    df = clean_lang_field_and_subset(results_df=df)
    df = drop_unnecessary_cols(df=df)
    df = sort_by_date(df=df)
    return df


def load_json_list(path):
    print(f"Loading file: {path}")
    with open(path, "rb") as infile:
        json_list = json.load(infile)
    return json_list


def df_to_json(path, df):
    print(f"JSON does not exist. Writing {path}...")
    try:
        df.to_json(path, orient="records", lines=True)
    except Exception as e:
        raise e
    print("JSON saved.")

In [3]:
os.chdir("..")

In [4]:
# Load data
files = [f for f in os.listdir("data/") if re.match("\w+-\w+\?.*\.json", f)]
json_list = load_json_list(path=os.path.join("data/", files[1]))

Loading file: data/Sudan-Southerns?Date_gt_2011-01-01_and_Date_lt_2015-12-31.json


In [14]:
df = transform_pipeline(json_list=json_list)

Converting json list to Pandas DataFrame...
Cleaning document ids...
Dropping 1 rows with missing content...

Parsing results...
Cleaning language field and subsetting df to language: en ...
Removing unneeded columns...
Sorting documents by date, ascending...


In [15]:
df

Unnamed: 0,Document.DocumentId,Source.Name,title,publication_date,text
4541,5CVT-F3F1-DY91-K3BH-00000-00,The Toronto Star,Trouble spots to watch as a new year begins,2011-01-01 00:00:00+00:00,"When conflicts explode onto world headlines, t..."
8990,51VC-YP71-DYRV-3467-00000-00,BBC Monitoring: International Reports,Sudanese presidential aide says Darfur peace t...,2011-01-01 00:00:00+00:00,Text of report in English by Sudanese governme...
14021,578J-SN91-DXHR-92JP-00000-00,defenceWeb,Sudan's president says he will accept referend...,2011-01-01 00:00:00+00:00,Sudanese\n\nPresident Omar al-Bashir \n\nsays ...
9431,51VK-1CS1-DYTJ-13VP-00000-00,The Canadian Press,A South Sudan vote for independence may make m...,2011-01-01 00:00:00+00:00,"AWEIL, Sudan _ Since fleeing violence in his n..."
2174,51W1-05C1-F11P-X46C-00000-00,Kuwait News Agency (KUNA),UN says all preparations in place for S. Sudan...,2011-01-01 00:00:00+00:00,All preparations are in place for South Sudan\...
...,...,...,...,...,...
29537,53CC-9WK1-JDPT-T4P5-00000-00,CQ Transcriptions,Libya: Key to the Caliphate Thrust Against Egy...,2022-06-19 04:01:52+00:00,THE CONSOLIDATION OF A SELF-PROCLAIMED Calipha...
29528,52B5-5FJ1-DY7X-Y1S4-00000-00,Federal News Service,The Visionaries 2012,2022-06-19 04:01:52+00:00,25 YEARS OF TRUTH IN TRAVEL\n\nA Chinese artis...
29591,5BTP-RXD1-F11P-X2XK-00000-00,Sudan Tribune,Men's Health The 100 Greatest Adventures on Ea...,2022-06-19 04:02:14+00:00,Adventure is medicine. Whether you're dangling...
29587,55FC-CT61-JDPT-T34J-00000-00,CQ Transcriptions,South Africa and Africa,2022-06-19 04:02:14+00:00,1. See .\n\n2. DIRCO said its mission was to p...
