# Data Cleaning and Processing
- This small data set uses only 1 folder from the NYT Corpus
- This is to make sure that the script works before scaling up to the entire corpus

In [1]:
# import libraries
import xml.etree.ElementTree as Et
import glob
import pandas as pd
import nltk

In [77]:
# create new dataframe with empty data
columns = ['Date', 'Month', 'Year', 'Name', 'Text']
data = pd.DataFrame(columns=columns)
data

Unnamed: 0,Date,Month,Year,Name,Text


In [78]:
# open each xml file in the specified folder, open it and print out the names of mentioned people
for file in glob.glob("../data/NYT Corpus/nyt_corpus/data/2007/01/01/*.xml"):
    # parse the xml file into an element tree to extract data
    tree = Et.parse(file)
    root = tree.getroot()
    
    # get publication date information
    date = root.find(".//meta[@name='publication_day_of_month']").attrib['content']
    month = root.find(".//meta[@name='publication_month']").attrib['content']
    year = root.find(".//meta[@name='publication_year']").attrib['content']
    
    # get article text information
    # some articles seem to lack text - this is caught and handled in the if/else
    article = root.find(".//block[@class='full_text']/p")
    if article is not None:
        text = (article.text).lower()
    else:
        text = None
        
    # for each person mentioned, create a new row of data for them in the dataframe    
    for c in root.iter('person'):
        name = str(c.text).upper()
        data = data.append([{'Date': date, 'Month': month, 'Year': year, 'Name': name, 'Text': text}])
data

Unnamed: 0,Date,Month,Year,Name,Text
0,1,1,2007,"BLUMENTHAL, MARTIN",blumenthal--martin. a new york business man an...
0,1,1,2007,"BRADLEY, CAROL L.","bradley--carol l., 84, of tinton falls, nj die..."
0,1,1,2007,"CRAWFORD, PERRY JR.","crawford--perry jr., died at 89 on december 13..."
0,1,1,2007,"FLOOD, ROBERT FRANCIS","flood--robert francis, husband of the late cat..."
0,1,1,2007,"GEISLER, ENID (FRIEDMAN)","geisler--enid (friedman), on december 29, 2006..."
0,1,1,2007,"GIUDICE, EMILY","giudice--emily, 94 of ridgewood, nj died decem..."
0,1,1,2007,"HIRSCH, TRUDE","hirsch--trude, born in vienna, austria. belove..."
0,1,1,2007,"KERRIGAN, MARGARET H. M. (MIMI)","kerrigan--margaret h. m. (mimi). december 28, ..."
0,1,1,2007,"KLEIN, ABRAHAM E., PH.D.","klein--abraham e., ph.d., professor emeritus, ..."
0,1,1,2007,"LONG, WILLIAM ALBERS OF POTOMAC, MD","long--william albers of potomac, md, died peac..."
