In [1]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Discovery

Before starting, I need to know which file types the dataset contains. Listing every file in the directory and collecting the file extensions tells me that, so I can handle the files accordingly.

In [2]:
# How the extension check works:
# - os.listdir returns the name of every entry in the dataset directory.
# - os.path.splitext splits a name into a (root, extension) tuple, so index 1
#   is the extension.
# - Collecting the extensions in a set keeps only the distinct values.
xml_list_files = os.listdir('/kaggle/input/nsf-research-awards-abstracts')
{os.path.splitext(name)[1] for name in xml_list_files}

{'.xml'}

Since all files are XML, I'll use Python's ElementTree, which parses each file into a tree structure that can be processed and analyzed.

The XML content starts at `rootTag`, which nests `Award`. `Award` contains a lot of information, such as the title, agency, award expiration date, award amount, abstract, and much more.

Since not all of this information is necessary to classify awards by topic, I'll focus on only three fields:

1. `AwardTitle`: This serves as a summary of the abstract. *Hypothesis*: It provides context about the topic.
2. `Division`: The division name could provide more information about the topic. *Hypothesis*: Each division focuses on specific knowledge areas.
3. `AbstractNarration`: The abstract of the award; this has the most useful information for clustering.

In [3]:
# Parse one sample award file and print the tag of every element nested
# directly under <Award>, to see which fields are available.
sample_path = '/kaggle/input/nsf-research-awards-abstracts/2000009.xml'
tree = ET.parse(sample_path)
root = tree.getroot()
award = root.find("Award")
for child in award:
    print(child.tag)

AwardTitle
AGENCY
AwardEffectiveDate
AwardExpirationDate
AwardTotalIntnAmount
AwardAmount
AwardInstrument
Organization
ProgramOfficer
AbstractNarration
MinAmdLetterDate
MaxAmdLetterDate
ARRAAmount
TRAN_TYPE
CFDA_NUM
NSF_PAR_USE_FLAG
FUND_AGCY_CODE
AWDG_AGCY_CODE
AwardID
Investigator
Institution
Performance_Institution
ProgramElement
ProgramReference
ProgramReference
ProgramReference
Appropriation
Fund
FUND_OBLG
POR


The dataset contains more than 13 thousand documents. To keep resource consumption low, I'll use iterators. My first thought was to use a recursive function; however, Python limits the maximum recursion depth.

Iterators, on the other hand, are evaluated lazily, which helps when reading many files because each one is only read when it is needed.

To be more memory efficient, I'll first split `xml_list_files` into training and test sets. After that, I'll create batches from the training data, so each batch can be processed faster.

In [4]:
# Recursive function. Not used because of maximum recursion depth
# def parsing_docs(doc_list):
#     docs = []
#     directory = '/kaggle/input/nsf-research-awards-abstracts/'
    
#     # recursive case
#     if len(doc_list) > 0:
#         doc = doc_list[0]
#         full_dir = os.path.join(directory, doc)
#         award_title = ET.parse(full_dir).find("./Award/AwardTitle")
#         organization = ET.parse(full_dir).find("./Award/Organization")
#         abstract_narration = ET.parse(full_dir).find("./Award/AbstractNarration")
#         docs.append((award_title, organization, abstract_narration))
#         docs.extend(parsing_docs(doc_list[1:]))
        
#     return docs

In [5]:
from sklearn.model_selection import train_test_split

# Split the file list into train and test sets.
# os.listdir returns names in arbitrary order, so the list is sorted first and
# a fixed random_state is passed; together they make the split reproducible
# across kernel restarts.
RANDOM_STATE = 42
train_list, test_list = train_test_split(
    sorted(xml_list_files),
    test_size=0.33,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [6]:
class CreateTrainingBatches:
    """Split a sliceable training set into a fixed number of contiguous batches.

    Every item of ``train_set`` ends up in exactly one batch. When the size is
    not divisible by ``num_batches``, the leftover items are spread one each
    over the first batches, so batch sizes differ by at most one.

    Parameters
    ----------
    train_set : sequence
        Any object supporting ``len()`` and slicing (e.g. a list of file names).
    num_batches : int, default 4
        Number of batches to produce. Must be at least 1.

    Raises
    ------
    ValueError
        If ``num_batches`` is smaller than 1.
    """

    def __init__(self, train_set, num_batches=4):
        if num_batches < 1:
            raise ValueError("num_batches must be at least 1")
        self.__train_set = train_set
        self.__num_batches = num_batches
        self.__train_size = len(self.__train_set)
        # Base size shared by every batch, plus the number of leftover items.
        # (Rounding train_size / num_batches could round down and silently
        # drop the trailing items.)
        self.__batch_size, self.__remainder = divmod(
            self.__train_size, self.__num_batches
        )

    def create_batches(self):
        """Lazily yield ``num_batches`` consecutive slices of the training set.

        Batches may be empty when the training set has fewer items than
        ``num_batches``.
        """
        start = 0
        for i in range(self.__num_batches):
            # The first `remainder` batches take one extra item each.
            stop = start + self.__batch_size + (1 if i < self.__remainder else 0)
            yield self.__train_set[start:stop]
            start = stop

In [7]:
class XMLProcessor:
    """Turn batches of award XML file names into pandas DataFrames.

    Each call to :meth:`process` takes the next batch of file names from the
    supplied iterator, parses every file lazily and returns one row per file
    with the columns ``AwardTitle``, ``Division`` and ``AbstractNarration``.

    Parameters
    ----------
    batches : iterator
        Iterator yielding lists of XML file names (not full paths).
    xml_dir : str, optional
        Directory the file names are relative to. Defaults to the Kaggle
        dataset directory used throughout this notebook.

    Notes
    -----
    Rows are built positionally from the matching elements in document order,
    so this assumes each file contains AwardTitle, Organization and
    AbstractNarration exactly once and in that order -- TODO confirm for the
    whole dataset.
    """

    def __init__(self, batches: iter,
                 xml_dir='/kaggle/input/nsf-research-awards-abstracts'):
        self.__batches = batches
        self.__xml_dir = xml_dir

    def filter_xml_element(self, file_path):
        """Lazily yield the ``(event, element)`` pairs of the tags of interest."""
        tags_of_interest = {'AwardTitle', 'Organization', 'AbstractNarration'}
        context = ET.iterparse(file_path)
        return filter(lambda xml: xml[1].tag in tags_of_interest, context)

    def process_files(self, xml_file):
        """Map every file path to its lazy stream of filtered elements."""
        return map(self.filter_xml_element, xml_file)

    def prepare_files(self, lst):
        """Prefix every file name in ``lst`` with the data directory."""
        return map(lambda file: os.path.join(self.__xml_dir, file), lst)

    def get_text_elements(self, xml):
        """Return the text for one ``(event, element)`` pair.

        For ``Organization`` the text is the ``LongName`` nested under
        ``Division``; ``None`` is returned when that path is missing instead
        of raising ``IndexError``. Any other element returns its own text.
        """
        element = xml[1]
        if element.tag == 'Organization':
            long_name = element.find('Division/LongName')
            return long_name.text if long_name is not None else None
        return element.text

    def get_data_from_generator(self, data):
        """Build one lazy row of text values per parsed file."""
        return [map(self.get_text_elements, elem) for elem in data]

    def convert_to_dataframe(self, text_data):
        """Materialise the lazy rows into a DataFrame with the three columns."""
        return pd.DataFrame(
            data=[list(data) for data in text_data],
            columns=['AwardTitle', 'Division', 'AbstractNarration']
        )

    def process(self):
        """Process the next batch and return it as a DataFrame.

        Returns
        -------
        pandas.DataFrame or None
            The DataFrame for the next batch, or ``None`` (after printing a
            message) once the batch iterator is exhausted.
        """
        # Only the call to next() is guarded, so a StopIteration raised
        # elsewhere in the pipeline is not mistaken for "no more batches".
        try:
            lst = next(self.__batches)
        except StopIteration:
            print("Iterator is empty")
            return None

        xml_files = self.prepare_files(lst)
        xml_data = self.process_files(xml_files)
        xml_text_data = self.get_data_from_generator(xml_data)
        return self.convert_to_dataframe(xml_text_data)

In [8]:
batch_gen = CreateTrainingBatches(train_list)
it = batch_gen.create_batches()

xml_pr = XMLProcessor(it)
# Each call to process() consumes the next batch and returns it as a new DataFrame.
# With the default of 4 batches that is roughly a quarter of the training files (about
# 2 thousand records here), so I can explore one sample and call process() again when I need the next.
# Once the iterator is exhausted, process() prints a message instead of returning a DataFrame.
df1 = xml_pr.process()

In [9]:
# Preview the first batch; pandas abbreviates the middle rows when rendering a large frame.
df1

Unnamed: 0,AwardTitle,Division,AbstractNarration
0,Assessing Students' Integration of Knowledge f...,Division Of Undergraduate Education,This project aims to serve the national intere...
1,Collaborative Research: Apparatus for Normaliz...,Division Of Physics,A central goal of nuclear and particle physics...
2,SBIR Phase I: Novel injectable long-acting lo...,Translational Impacts,The broader impact /commercial potential of th...
3,Collaborative Research: FoMR: Taming the Instr...,Division of Computing and Communication Founda...,Data centers are the power plants that drive t...
4,RAPID: Data-driven Multiscale Integrative Mode...,Division Of Chemistry,Gregory Voth of the University of Chicago is s...
...,...,...,...
2223,Processes Underlying the Rise of Social Comple...,Division Of Behavioral and Cognitive Sci,The goal of this project is to contribute to g...
2224,Millennials and Corporate Employment Practices,Divn Of Social and Economic Sciences,Millennials are now the largest generation in ...
2225,"FRG: Collaborative Research: Matroids, Graphs,...",Division Of Mathematical Sciences,Recent advances in matroid and graph theory fu...
2226,RAPID:NSF-BSF: Analysis of the spreading patte...,Division Of Environmental Biology,The recent COVID-19 pandemic has created an ur...
