In [234]:
# Libaries for parsing data
from lxml import etree
from bs4 import BeautifulSoup
import pandas as pd
import os

# Libraries from nltk
import nltk
import string
from nltk import collocations 
from nltk.text import Text
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords

# Library for plotting
import matplotlib.pyplot as plt

In [235]:
# Modify corpus to the dataset path
corpus = './data/Boston_Globe_Source/'
#corpus = './data/Chicago_Tribune_Source/'
#corpus = './data/Los_Angeles_Times_Source/'
#corpus = './data/The_Washington_Post_Source/'
#corpus = './data/New_York_Times_Source/'
#corpus = './data/The_Wall_Street_Journal_Source/'
# Read in files
input_files = os.listdir(corpus)

In [236]:
# Select the number of articles to sample
sample_size = 100

# Generate a sample of articles
try:
    #sample_input_files = input_files[0:sample_size]
    sample_input_files = input_files

except ValueError:
    sample_input_files = input_files
    
print("Currently sampling", len(sample_input_files), "documents.")
print(input_files)

Currently sampling 1703 documents.
['2667058638.xml', '2152088797.xml', '2418789818.xml', '1811958975.xml', '1974509364.xml', '2460562215.xml', '1816413153.xml', '1796411889.xml', '1779826223.xml', '2633558148.xml', '2649820622.xml', '2701235704.xml', '2297159505.xml', '2694615000.xml', '1974668519.xml', '2329344451.xml', '1634393092.xml', '2584432988.xml', '2202084219.xml', '1974672336.xml', '1974660904.xml', '2205451625.xml', '2225977292.xml', '2437865582.xml', '2349191081.xml', '2398550670.xml', '1790227394.xml', '1974491944.xml', '2176522685.xml', '2232147322.xml', '1768397354.xml', '2131719756.xml', '2477131339.xml', '2743864346.xml', '2637120523.xml', '2443061969.xml', '1515458240.xml', '1974438883.xml', '1766897610.xml', '2635873015.xml', '1974576146.xml', '2019729332.xml', '2066445584.xml', '1974588446.xml', '1640598023.xml', '1512815456.xml', '1947356470.xml', '2495540093.xml', '1961490863.xml', '2555715498.xml', '1867205849.xml', '2543873334.xml', '2471114524.xml', '192814428

In [237]:
# Function to strip html tags from text portion
def strip_html_tags(text):
    stripped = BeautifulSoup(text).get_text().replace('\n', ' ').replace('\\', '').strip()
    return stripped

In [238]:
# Retrieve metadata from XML document
def getxmlcontent(corpus, file, strip_html=True):
    try:
        tree = etree.parse(corpus + file)
        root = tree.getroot()

        if root.find('.//GOID') is not None:
            goid = root.find('.//GOID').text
        else:
            goid = None

        if root.find('.//Title') is not None:
            title = root.find('.//Title').text
        else:
            title = None

        if root.find('.//NumericDate') is not None:
            date = root.find('.//NumericDate').text
        else:
            date = None
            
        if root.find('.//PublisherName') is not None:
            publisher = root.find('.//PublisherName').text
        else:
            publisher = None

        if root.find('.//FullText') is not None:
            text = root.find('.//FullText').text

        elif root.find('.//HiddenText') is not None:
            text = root.find('.//HiddenText').text

        elif root.find('.//Text') is not None:
            text = root.find('.//Text').text

        else:
            text = None

        # Strip html from text portion
        if text is not None and strip_html == True:
            text = strip_html_tags(text)
    
    except Exception as e:
        print(f"Error while parsing file {file}: {e}")
    
    return goid, title, date, publisher, text

In [239]:
# Create lists to store GOIDs, texts, and dates
goid_list = []
text_list = []
date_list = []

for file in sample_input_files:
    
    goid, title, date, publisher, text = getxmlcontent(corpus, file, strip_html=True)
    
    if text is not None:
        goid_list.append(goid)
        text_list.append(text)
        date_list.append(date)

In [240]:
# Create a dataframe, setting each of the columns to one of the lists we made in the cell above
df = pd.DataFrame({'GOID': goid_list, 'Text': text_list, 'Date': date_list})

In [241]:
print(len(df['Text'][0]))

8330


In [242]:
nltk.data.path.append('../Resources/nltk_data/')

In [243]:
import re

In [244]:
# Function to find contextual neighborhood
def keyword_neighbor_counter(keyword, text, window_size, freq):

    tknr = WhitespaceTokenizer() 
#     stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
#     word_tokens = word_tokenize(text)
#     filtered_sentence = [w for w in word_tokens if not w in stop_words]
    #with no lower case conversion
#     filtered_sentence = []

#     for w in word_tokens:
#         if w not in stop_words:
#             filtered_sentence.append(w)
            
#     print(filtered_sentence)
    word = r"\W*([\w]+)"
    grps = re.search(r'{}\W*{}{}'.format(word*window_size,keyword,word*window_size), text)
    if grps is None:
        return None
    else:
        groups = grps.groups()
        return groups[:window_size],groups[window_size:]

In [245]:
# Define keyword
keyword = 'gentrification of'

In [246]:
# Define window size
window_size = 3

# Define frequency
frequency = 5

In [247]:
df["left_context"] = ""
df["right_context"] = ""
df["Sentences"] = ""
for idx in df.index:
    text = str(df["Text"][idx])
    if keyword in text:
#         print("has gentrification of ", df["GOID"][idx])
        
#         print(text)
        
        # Use function to gather the contextual neighborhood
        left, right = keyword_neighbor_counter(keyword, text, window_size=window_size, freq=frequency)
        df["left_context"][idx]=left
        df["right_context"][idx]=right
        df["Sentences"][idx] = re.findall(r"([^.]*?gentrification of[^.]*\.)",text)
        print(df["left_context"][idx])

('coal', 'mine', 'the')
('lead', 'to', 'further')
('been', 'called', 'the')
('tributary', 'brewing', 'the')
('affordable', 'housing', 'the')
('fries', 'is', 'the')
('battling', 'the', 'relentless')
('lynn', 'because', 'the')
('he', 'said', 'the')
('step', 'in', 'the')
('battling', 'the', 'gentrification')
('the', 'table', 'the')
('expressing', 'concern', 'about')
('thus', 'began', 'the')
('dealing', 'with', 'the')
('aligned', 'with', 'the')
('the', 'table', 'the')
('battling', 'the', 'relentless')
('fries', 'is', 'the')
('housing', 'the', 'gentrification')
('more', 'specifically', 'the')
('80s', 'the', 'hellbent')
('town', 'noting', 'the')
('increase', 'we', 'see')
('units', 'could', 'hasten')
('further', 'accelerating', 'the')
('to', 'increased', 'rents')
('accelerating', 'the', 'gentrification')
('town', 'noting', 'the')
('participating', 'in', 'the')
('further', 'accelerating', 'the')
('lynn', 'because', 'the')
('for', 'the', 'gentrification')
('battling', 'the', 'relentless')
('the

In [248]:
#print("left context : ", df["left_context"])
#print("right context : ", df["right_context"])
print("sentences: ", df["Sentences"])

sentences:  0                                                        
1                                                        
2                                                        
3       [ The gentrification of Mystic is marked by th...
4                                                        
                              ...                        
1698                                                     
1699                                                     
1700                                                     
1701                                                     
1702                                                     
Name: Sentences, Length: 1703, dtype: object


In [249]:
df = df[df["left_context"] != ""]

In [250]:
print(len(df["Sentences"]))

112


In [251]:
lefts = ' '.join(str(' '.join(x)) for x in df["left_context"])
print(lefts)
rights = ' '.join(str(' '.join(x)) for x in df["right_context"])
print(rights)

coal mine the lead to further been called the tributary brewing the affordable housing the fries is the battling the relentless lynn because the he said the step in the battling the gentrification the table the expressing concern about thus began the dealing with the aligned with the the table the battling the relentless fries is the housing the gentrification more specifically the 80s the hellbent town noting the increase we see units could hasten further accelerating the to increased rents accelerating the gentrification town noting the participating in the further accelerating the lynn because the for the gentrification battling the relentless the most rapid thus began the the relentless relentless symbol of the symbol of the the most rapid been called the penalty and fought jarbeau said for the most rapid fries is the displacement and the pointing to the to the ongoing tributary brewing the displacement and the with continued continued aligned with the break neighborhoods because c

In [252]:
from collections import Counter
lfreq = Counter(lefts.split()).most_common()
print("LEFTS: ",lfreq)
print("\n")
rfreq = Counter(rights.split()).most_common()
print("RIGHTS: ",rfreq)

LEFTS:  [('the', 86), ('gentrification', 14), ('to', 10), ('with', 9), ('said', 6), ('further', 5), ('battling', 5), ('relentless', 5), ('for', 5), ('most', 5), ('rapid', 5), ('of', 5), ('and', 5), ('at', 5), ('because', 4), ('he', 4), ('in', 4), ('table', 4), ('been', 3), ('called', 3), ('housing', 3), ('fries', 3), ('is', 3), ('expressing', 3), ('concern', 3), ('about', 3), ('thus', 3), ('began', 3), ('aligned', 3), ('80s', 3), ('hellbent', 3), ('noting', 3), ('could', 3), ('hasten', 3), ('accelerating', 3), ('increased', 3), ('rents', 3), ('jarbeau', 3), ('displacement', 3), ('pointing', 3), ('continued', 3), ('call', 3), ('that', 3), ('white', 3), ('courtesy', 3), ('deal', 3), ('look', 3), ('coal', 2), ('mine', 2), ('lead', 2), ('tributary', 2), ('brewing', 2), ('affordable', 2), ('lynn', 2), ('step', 2), ('town', 2), ('increase', 2), ('we', 2), ('see', 2), ('units', 2), ('participating', 2), ('symbol', 2), ('ongoing', 2), ('neighborhoods', 2), ('laments', 2), ('result', 2), ('belo

### Result : A Dataframe with GO_ID, Text, Date, Left, Right context (3 words) and the surrounding sentence

In [253]:
df.head(10)

Unnamed: 0,GOID,Text,Date,left_context,right_context,Sentences
3,1811958975,MYSTIC -- This one-time shipbuilding center ha...,2016-08-17,"(coal, mine, the)","(mystic, is, marked)",[ The gentrification of Mystic is marked by th...
28,2176522685,"CAMBRIDGE — For almost 40 years, Marcelle Harr...",2019-02-06,"(lead, to, further)","(a, neighborhood, once)",[ A lawyer for Harrison's relatives said they ...
47,2495540093,"ANALYSIS A blow to America, felt across the po...",2021-01-08,"(been, called, the)","(roxbury, roxbury, and)","[ As a councilor, Janey has faced criticism of..."
62,1974589383,Foreside merchants craft their own allure ► KI...,2015-03-08,"(tributary, brewing, the)","(the, foreside, has)",[ The gentrification of the Foreside has ample...
73,2245723510,"At the national level, top-tier Democratic can...",2019-06-16,"(affordable, housing, the)","(neighborhoods, and, the)","[ Affordable housing, the gentrification of ne..."
75,2082131653,What are Dunkin’s new treats like? We decided ...,2018-04-11,"(fries, is, the)","(the, churro, wrote)",[ “Dunkin’ Donuts ‘inventing’ donut fries is t...
79,1280627967,Jon Niedzwiecki has watched enough reality TV ...,2013-01-26,"(battling, the, relentless)","(their, neighborhood, follows)","[ The show, which the network has said is focu..."
99,1531853044,Judging from photos taken in recent days on th...,2014-06-04,"(lynn, because, the)","(the, south, boston)","[ In the scene, which was shot in Lynn because..."
120,2545306083,"FALMOUTH, Maine — One bright, brisk morning la...",2021-06-27,"(he, said, the)","(the, maine, coastline)",[ “The gentrification of the Maine coastline i...
125,1081392361,"On Thursday, the Boston licensing board approv...",2012-10-01,"(step, in, the)","(a, neighborhood, that)","[ For some, this will be seen as a symbolic ev..."


In [254]:
df.to_csv("Boston-Globe.csv")

In [259]:
for i in df["Sentences"]:
    print("Sentence : ", i)

Sentence :  [' The gentrification of Mystic is marked by the eateries Red 36 (red36ct.']
Sentence :  [" A lawyer for Harrison's relatives said they are likely to sell the property once they seize control, an outcome that could lead to further gentrification of a neighborhood once known for its rich community of immigrant, working-class families."]
Sentence :  [' As a councilor, Janey has faced criticism of what has been called the gentrification of Roxbury.']
Sentence :  [' The gentrification of the Foreside has ample room for ethnicity too: One of the area’s longer-running restaurants, Tulsi, offers award-winning Indian food from chef Rajesh Mandekar, while the Kittery Dance Hall, home to an arts organization offering classes and community projects, books several several shows each month featuring a global mix of performers.']
Sentence :  [' Affordable housing, the gentrification of neighborhoods, and the displacement of residents remain citywide concerns, even as Boston experiences i

In [263]:
df["Date"] = pd.to_datetime(df['Date'])


In [264]:
df.sort_values(by='Date')
df

Unnamed: 0,GOID,Text,Date,left_context,right_context,Sentences
3,1811958975,MYSTIC -- This one-time shipbuilding center ha...,2016-08-17,"(coal, mine, the)","(mystic, is, marked)",[ The gentrification of Mystic is marked by th...
28,2176522685,"CAMBRIDGE — For almost 40 years, Marcelle Harr...",2019-02-06,"(lead, to, further)","(a, neighborhood, once)",[ A lawyer for Harrison's relatives said they ...
47,2495540093,"ANALYSIS A blow to America, felt across the po...",2021-01-08,"(been, called, the)","(roxbury, roxbury, and)","[ As a councilor, Janey has faced criticism of..."
62,1974589383,Foreside merchants craft their own allure ► KI...,2015-03-08,"(tributary, brewing, the)","(the, foreside, has)",[ The gentrification of the Foreside has ample...
73,2245723510,"At the national level, top-tier Democratic can...",2019-06-16,"(affordable, housing, the)","(neighborhoods, and, the)","[ Affordable housing, the gentrification of ne..."
...,...,...,...,...,...,...
1622,928777474,Note: If you go . . . Fiercely proud of its t...,2012-03-18,"(he, laments, that)","(ten, displaces, the)","[ As a veteran of the South End, he is resigne..."
1626,2704031598,"LAURIE ESSIG Like gentrification, hetrificatio...",2022-08-08,"(of, white, gentrification)","(neighborhoods, that, were)",[ Many gay and lesbian spaces were the result ...
1677,2522400410,When Northeastern University filed plans in 20...,2021-05-06,"(displacement, and, the)","(lower, roxbury, on)",[ “I believe they're driving displacement and ...
1684,2667340908,"MR. FULLERTON, BETWEEN THE SHEETSPresented by ...",2022-05-22,"(look, at, the)","(detroit, in, a)",[ “It's a historical look at the gentrificatio...


In [None]:
#Do Entity Recognition and Frequencies of articles with gentrification v/s gentrification of