# Export a text corpus (dataset) as a single .csv using a Full Text noise function

This is a modified version of the Python sample script

## Import Libraries

In [17]:
# Libraries for parsing data
import os
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
import numpy as np

## Load and Sample Data

Depending on the size and vocabulary of the input dataset, runtime of this script may vary. To process the entire dataset, set `sample_size` to `len(input_files)`. Larger datasets can be run on the multiprocessing version of this script.

In [3]:
# Set corpus to the folder of files you want to use
# ACTION: modify corpus name to match dataset created in TDM Studio
corpus = '/home/ec2-user/SageMaker/data/aspartame/'

# Read in files
input_files = os.listdir(corpus)

In [6]:
# Select the number of articles to sample
# ACTION: modify with len(input_files) to include entire corpus
# sample_size = 100
sample_size = len(input_files)

# Create a sample of input filenames (or entire corpus)
try:
    sample_input_files = input_files[0:sample_size]

except ValueError:
    sample_input_files = input_files
    
print("Currently sampling", len(sample_input_files), "documents.")

Currently sampling 100 documents.


## Specify Output Filename

Define the `output_file` variable to the desired save location and file name. This variable will be used at the end of the script to save the processed data.

In [None]:
# Modify output_file to desired save name
output_file = 'test_python_output.csv'

## Gather Metadata Fields

This section will gather text fields from the articles and add them to lists that will be used to make a dataframe. By default, this script will collect article ID, title, and the publishing date of the articles.

NOTE: This version includes additional metadata including FullText

In [8]:
# Function to strip html tags from text portion
def strip_html_tags(text):
    stripped = BeautifulSoup(text).get_text().replace('\n', ' ').replace('\\', '').strip()
    return stripped

In [32]:
# Retrieve metadata from XML document
# ACTION: add logic to look for pubtitle, AND add as a returned value below
def getxmlcontent(corpus, file, strip_html=True):
    try:
        tree = etree.parse(corpus + file)
        root = tree.getroot()

        if root.find('.//GOID') is not None:
            goid = root.find('.//GOID').text
        else:
            goid = None

        if root.find('.//Title') is not None:
            title = root.find('.//Title').text
        else:
            title = None

        if root.find('.//NumericDate') is not None:
            date = root.find('.//NumericDate').text
        else:
            date = None
            
        if root.find('.//PublisherName') is not None:
            publisher = root.find('.//PublisherName').text
        else:
            publisher = None
            
        if root.find('.//DFS/PubFrosting/Title') is not None:
            pubtitle = root.find('.//DFS/PubFrosting/Title').text
        else:
            pubtitle = None 

        if root.find('.//FullText') is not None:
            text = root.find('.//FullText').text

        elif root.find('.//HiddenText') is not None:
            text = root.find('.//HiddenText').text

        elif root.find('.//Text') is not None:
            text = root.find('.//Text').text

        else:
            text = None

        # Strip html from text portion
        if text is not None and strip_html == True:
            text = strip_html_tags(text)
    
    except Exception as e:
        print(f"Error while parsing file {file}: {e}")
    
    return goid, title, date, publisher, text, pubtitle

In [33]:
# Create lists to store article IDs, titles, dates,  text, and publisher
goid_list = []
title_list = []
date_list = []
text_list = []
publisher_list = []
pubtitle_list = []
year_list = []

# Parse files and add data to lists
for file in sample_input_files:
    
    # Retrieve the metadata
    goid, title, date, publisher, text, pubtitle = getxmlcontent(corpus, file, strip_html=True)

    # Create year var from date for easier filtering later
    year = date[0:4]
    
    # optional step to create a year var (from date_list) and filter b
    # list_of_years = [1924, 1934, 1944, 1954, 1964, 1974, 1984, 1994, 2004, 2014, 2024]    
    # if year in list_of_years:
    if True:
        # Store metadata to lists
        goid_list.append(goid)
        title_list.append(title)
        date_list.append(date)
        text_list.append(text)
        publisher_list.append(publisher)
        pubtitle_list.append(pubtitle)
        year_list.append(year)
        
    #implicitly, there's an empty "else" here. if the conditions above aren't met, we do nothing.
    

## Assemble lists of vars into single dataframe

This section uses the collected fields to make a dataframe.

NOTE: This is also more extensive than sample notebook

In [37]:
# Create a dataframe, setting each of the columns to one of the lists made in the cell above
df = pd.DataFrame({'GOID': goid_list,'Title': title_list, 'Date': date_list, 'Publisher' : publisher_list, "PubTitle": pubtitle_list, "Text": text_list, "Year": year_list})

In [38]:
# View dataframe
df

Unnamed: 0,GOID,Title,Date,Publisher,PubTitle,Text,Year
0,307816407,PARENTS MAY MOTIVATE A CHILD'S OVEREATING,1994-11-29,WP Company LLC d/b/a The Washington Post,The Washington Post (pre-1997 Fulltext),"Bribing, threatening or rewarding children t...",1994
1,426631736,Proposal for Nutrasweet,1987-10-26,New York Times Company,New York Times,LEAD: The Nutrasweet Company said Friday that ...,1987
2,428563824,Food Notes,1992-06-10,New York Times Company,New York Times,As One Dad Cooked It For a Father's Day gift ...,1992
3,290824130,SACCHARIN'S SWEET LIFE MAY GET 2 MORE YEARS,1985-04-18,"Tribune Publishing Company, LLC",Chicago Tribune (pre-1997 Fulltext),"Saccharin, an artificial sweetener whose safet...",1985
4,397970722,Genex Eliminates 16% Of Total Staff Positions,1984-12-04,Dow Jones & Company Inc.,Wall Street Journal,"ROCKVILLE, Md. -- Genex Corp. said it eliminat...",1984
...,...,...,...,...,...,...,...
95,1282415746,Coke and the Calorie Wars; We already know the...,2013-01-30,Dow Jones & Company Inc.,Wall Street Journal (Online),Most of the world has grown fatter since the 1...,2013
96,1925432336,Ponte a prueba: ¿cuáles de mis bebidas favorit...,2017-08-02,"Tribune Publishing Company, LLC",Chicago Tribune (Online),breakerbreakerbreakerbreakerbreakerbreakerbrea...,2017
97,2550962880,Coke Zero fans brace themselves as company ann...,2021-07-13,WP Company LLC d/b/a The Washington Post,The Washington Post (Online),There's another shake-up in the world of fizzy...,2021
98,292036446,Genteel Pause Cup of Tea: Americans Drink It Up,1985-01-02,Los Angeles Times Communications LLC,Los Angeles Times (pre-1997 Fulltext),"It's 7 a.m. Sharon Fraser, an Encino financial...",1985


In [45]:
df[df['Year'] == '1991']

Unnamed: 0,GOID,Title,Date,Publisher,PubTitle,Text,Year
31,283160133,Popping pills to ease a splitting headache may...,1991-04-03,"Tribune Publishing Company, LLC",Chicago Tribune (pre-1997 Fulltext),If you're one of the millions of Americans who...,1991
47,428000906,Court Ruling on Nutrasweet,1991-03-25,New York Times Company,New York Times,,1991
55,283159058,Excess of 1 mineral may squeeze out another,1991-12-16,"Tribune Publishing Company, LLC",Chicago Tribune (pre-1997 Fulltext),Q-I heard a discussion by a dietician of the p...,1991
84,398241871,Health: Label Rules to Foster Healthful Foods,1991-12-26,Dow Jones & Company Inc.,Wall Street Journal,WASHINGTON -- The unprecedented truth-in-label...,1991


## Noise function

In order to comply with ProQuest licensing rules, we **cannot** export original/unaltered Full Text values. We have two choices: (1) do our text analysis within the TDM Studio Workbench environment, or (2) create a dataset that only uses partial, derivative, or altered full text data.

In this notebook, I used a "noise" function that essentially rearranges paragraphs in the `Text` column. This retains our ability to use most text analysis methods, e.g. topic modeling, on a paragraph level, and allows us to comply with PQ licensing limits. You could also explore other noise functions here.

In [43]:
# Function to shuffle paragraphs
def shuffle_paragraphs(text):
    # deal with None/empty text values
    if text is None:
        return None
    paragraphs = text.split('\n\n')  # Split the text into paragraphs
    np.random.shuffle(paragraphs)    # Shuffle the paragraphs
    return '\n\n'.join(paragraphs)   # Join the paragraphs back into a single text block

# Apply the function to the 'Text' column
df['Text'] = df['Text'].apply(shuffle_paragraphs)

In [44]:
df[df['Text'].isnull()]

Unnamed: 0,GOID,Title,Date,Publisher,PubTitle,Text,Year
47,428000906,Court Ruling on Nutrasweet,1991-03-25,New York Times Company,New York Times,,1991
70,417903669,NutraSweet and Pepsi Reach Pact,1992-04-22,"Tribune Publishing Company, LLC",Chicago Tribune,,1992


## Save Dataframe as CSV

Make sure to change the `output_file` variable (defined at the top of script) to desired output file name before running this cell.

In [15]:
# Save output to file
df.to_csv(output_file)

In [50]:
!pwd

/home/ec2-user/SageMaker/Getting Started/2022.05.25/ProQuest TDM Studio Samples


## Export from Workbench environment

To export from Workbench (with limit of about 30MB/week), use shell commands to copy the dataset export from the steps above into the "drop folder" location. This triggers a hook/script on PQ's side that will email you the file directly.

Note you may have to do some investigating to set the exact dataset file path.

In [None]:
# replace with the dataset name you set above (or any other file you wish to export
# from the workbench environment)
data_to_export = 'test_python_output.csv'

Now run the shell script (with ! prefix) to move to a specific export folder. If you run into issues, it's likely a problem with the filepath above -- make sure your dataset is in the same folder as the Notebook you're running.

In [None]:
!aws s3 cp $data_to_export s3://pq-tdm-studio-results/tdm-ale-data/a1014/results/

upload: ./2024-02-23-nyc-2k-no-full.csv to s3://pq-tdm-studio-results/tdm-ale-data/a1014/results/2024-02-23-nyc-2k-no-full.csv


In [None]:
Note you can replace the filepath above