# This is a utility derived from my data_processing file to generate a pickled file of a years worth of data in a dataframe
This can take several minutes

In [1]:
# import libraries
import xml.etree.ElementTree as Et
import glob
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
import matplotlib.pyplot as plt

In [2]:
# create new dataframe with empty data
columns = ['DOCID', 'Date', 'Month', 'Year', 'Name', 'Text']
data = pd.DataFrame(columns=columns)

In [3]:
# create year variable
# use this to select the year you want to save
year = "2007"

In [4]:
# open each xml file in the specified folder, open it and print out the names of mentioned people
for file in glob.glob("../data/NYT Corpus/nyt_corpus/data/" + year + "/*/*/*.xml"):
    # parse the xml file into an element tree to extract data
    tree = Et.parse(file)
    root = tree.getroot()
    
    # get document id information (not sure if I need this yet, seems like it could be helpful)
    docid = root.find('.//doc-id[@id-string]').attrib['id-string']
    
    # get publication date information
    date = root.find(".//meta[@name='publication_day_of_month']").attrib['content']
    month = root.find(".//meta[@name='publication_month']").attrib['content']
    year = root.find(".//meta[@name='publication_year']").attrib['content']
    
    # get article text information
    # some articles seem to lack text - this is caught and handled in the if/else
    article = root.find(".//block[@class='full_text']/p")
    if article is not None:
        text = (article.text).lower()
    else:
        text = None
        
    # for each person mentioned, create a new row of data for them in the dataframe    
    for c in root.iter('person'):
        name = str(c.text).upper()
        data = data.append([{'DOCID': docid, 'Date': date, 'Month': month, 'Year': year, 'Name': name, 'Text': text}])


In [5]:
data.head()

Unnamed: 0,DOCID,Date,Month,Year,Name,Text
0,1815718,1,1,2007,"BLUMENTHAL, MARTIN",blumenthal--martin. a new york business man an...
0,1815719,1,1,2007,"BRADLEY, CAROL L.","bradley--carol l., 84, of tinton falls, nj die..."
0,1815720,1,1,2007,"CRAWFORD, PERRY JR.","crawford--perry jr., died at 89 on december 13..."
0,1815721,1,1,2007,"FLOOD, ROBERT FRANCIS","flood--robert francis, husband of the late cat..."
0,1815722,1,1,2007,"GEISLER, ENID (FRIEDMAN)","geisler--enid (friedman), on december 29, 2006..."


In [6]:
data.tail()

Unnamed: 0,DOCID,Date,Month,Year,Name,Text
0,1855658,19,6,2007,"ABBAS, MAHMOUD (PRES)",so the masked men of fatah have the run of the...
0,1855660,19,6,2007,"BERGER, RUSS",i've been in the business of designing broadca...
0,1855660,19,6,2007,"RAYMOND, JOAN",i've been in the business of designing broadca...
0,1855660,19,6,2007,"BOLTON, MICHAEL",i've been in the business of designing broadca...
0,1855662,19,6,2007,"SHARKEY, JOE",i don't know if the air taxi model will work. ...


In [7]:
# PICKLE THE FILE TO SAVE IT
import pickle
pickle.dump(data, open("nyt-" + year + ".p", "wb"))