In [1]:
# we have (article text, article id) tuples for each neighborhood
# turn all articles into spacy-processible list of documents, but maintain the tuples
    # the result would be a tuple of a list of documents and the article id
# for each list, extract out all people, feed their last names to ethnicolr to predict races and then tag the result with the article id
# for each neighborhood, get percentage of races
# journalism team will compare results with U.S. Census data

In [2]:
import spacy
import pandas as pd
import en_core_web_md
import ast

In [3]:
# load medium English model in case we need to work with vectors
nlp = en_core_web_md.load()

In [4]:
df = pd.read_csv('Neighborhood_Separated_Articles/2014.csv')

In [5]:
black_neighborhoods = ['dorchester', 'roxbury', 'mattapan', 'hyde_park']
white_neighborhoods = ['fenway', 'beacon_hill', 'downtown', 'south_boston', 'east_boston', 'back_bay', 'jamaica_plain',
                      'south_end', 'charlestown', 'brighton', 'allston', 'west_end', 'roslindale', 'north_end',
                      'mission_hill', 'harbor_islands', 'longwood_medical_area', 'west_roxbury']
df = df.fillna("('no article', 'no_id')")
df['dorchester'] = df['dorchester'].apply(ast.literal_eval)
df['roxbury'] = df['roxbury'].apply(ast.literal_eval)
df['mattapan'] = df['mattapan'].apply(ast.literal_eval)
df['hyde_park'] = df['hyde_park'].apply(ast.literal_eval)
df['fenway'] = df['fenway'].apply(ast.literal_eval)
df['beacon_hill'] = df['beacon_hill'].apply(ast.literal_eval)
df['downtown'] = df['downtown'].apply(ast.literal_eval)
df['south_boston'] = df['south_boston'].apply(ast.literal_eval)
df['east_boston'] = df['east_boston'].apply(ast.literal_eval)
df['back_bay'] = df['back_bay'].apply(ast.literal_eval)
df['jamaica_plain'] = df['jamaica_plain'].apply(ast.literal_eval)
df['south_end'] = df['south_end'].apply(ast.literal_eval)
df['charlestown'] = df['charlestown'].apply(ast.literal_eval)
df['brighton'] = df['brighton'].apply(ast.literal_eval)
df['allston'] = df['allston'].apply(ast.literal_eval)
df['west_end'] = df['west_end'].apply(ast.literal_eval)
df['roslindale'] = df['roslindale'].apply(ast.literal_eval)
df['north_end'] = df['north_end'].apply(ast.literal_eval)
df['mission_hill'] = df['mission_hill'].apply(ast.literal_eval)
df['harbor_islands'] = df['harbor_islands'].apply(ast.literal_eval)
df['longwood_medical_area'] = df['longwood_medical_area'].apply(ast.literal_eval)
df['west_roxbury'] = df['west_roxbury'].apply(ast.literal_eval)

In [6]:
spec_chars = ["!",'"',"#","%","&","'","(",")", "*","+",",",
                  "-",".","/",":",";","<", "=",">","?","@","[",
                  "\\","]","^","_", "`","{","|","}","~","–", 
                  "\xc2", "\xa0", "\x80", "\x9c", "\x99", "\x94", 
                  "\xad", "\xe2", "\x9d", "\n"]

df = df.drop(['Unnamed: 0'], axis=1)

#for char in spec_chars:
#    df['text'] = df['text'].str.strip()
#    df['text'] = df['text'].str.replace(char, ' ')
       
# access each column separately
for i in range(len(df.index)):
    for col in df.columns:
        for char in spec_chars:
            try:
                df.loc[i, col][0] = df.loc[i, col][0].str.strip()
                df.loc[i, col][0] = df.loc[i, col][0].str.replace(char, ' ')
            except:
                pass

In [7]:
df.shape

(500, 22)

In [8]:
articles = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}
for sub_neighborhood in articles.keys():
    for i in range(df.shape[0]):
        if type(df.loc[i, sub_neighborhood]) == tuple:
            articles[sub_neighborhood].append((nlp(df.loc[i, sub_neighborhood][0]), df.loc[i, sub_neighborhood][1]))
    print(sub_neighborhood + ' DONE')

hyde_park DONE
beacon_hill DONE
south_boston DONE
jamaica_plain DONE
east_boston DONE
south_end DONE
back_bay DONE
north_end DONE
west_roxbury DONE
mission_hill DONE
harbor_islands DONE
west_end DONE
longwood_medical_area DONE
dorchester DONE
roxbury DONE
downtown DONE
fenway DONE
mattapan DONE
brighton DONE
charlestown DONE
roslindale DONE
allston DONE


In [9]:
articles['dorchester'][0][1]

'2014_5'

In [10]:
people = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}

for sub_neighborhood in articles.keys():
    for (doc, article_id) in articles[sub_neighborhood]:
        for ent in doc.ents:
            if ent.label_ == 'PERSON':
                name = ent[0:2]
                sentence = ent.sent
                people[sub_neighborhood].append((name, sentence, article_id))

In [11]:
for sub_neighborhood in people.keys():
    list1 = people[sub_neighborhood]
    # insert the list to the set
    list_set = set(list1)
    # convert the set to the list
    unique_list = (list(list_set))
    people[sub_neighborhood] = unique_list

In [12]:
representation_proportions = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}
for sub_neighborhood in people.keys():
    for i in range(len(people[sub_neighborhood])):
        if people[sub_neighborhood][i][0].text.strip() != '':
            temp = people[sub_neighborhood][i][0].text.split()
            if len(temp) > 1:
                people[sub_neighborhood][i] = (temp[-1], people[sub_neighborhood][i][1], people[sub_neighborhood][i][2])
            else:
                people[sub_neighborhood][i] = (temp[0], people[sub_neighborhood][i][1], people[sub_neighborhood][i][2])

In [13]:
people['dorchester'][0]

('Dwyer',
 ” the report said  Dwyer is due back in court next month  and Francois will be summonsed at a later date for arraignment on prostitution and lewdness charges  officials said  A working telephone number for Francois could not be located  Joe Pesaturo  a T spokesman  said Dwyer  a 26 year veteran  is suspended with pay and “facing severe disciplinary action ”,
 '2014_300')

In [23]:
from ethnicolr import pred_wiki_ln

In [24]:
#temp = pd.DataFrame(people[white_neighborhoods[2]], columns=['last_name', 'article', 'article_id'])
#temp1 = pd.DataFrame(people[white_neighborhoods[3]], columns=['last_name', 'article', 'article_id'])
#pd.concat([pred_census_ln(temp, 'last_name', 2010), pred_census_ln(temp1, 'last_name', 2010)], axis=0)

In [30]:
final_df = pd.DataFrame(columns=['last_name', 'article', 'article_id'])
sub_neighborhoods = white_neighborhoods + black_neighborhoods
sub_neighborhoods.remove('longwood_medical_area')
for col in sub_neighborhoods:
    temp = pd.DataFrame(people[col], columns=['last_name', 'article', 'article_id'])
    preds = pred_wiki_ln(temp, 'last_name')
    final_df = pd.concat([final_df, preds], axis=0)
    print(col + ' DONE')

fenway DONE
beacon_hill DONE
downtown DONE
south_boston DONE
east_boston DONE
back_bay DONE
jamaica_plain DONE
south_end DONE
charlestown DONE
brighton DONE
allston DONE
west_end DONE
roslindale DONE
north_end DONE
mission_hill DONE
harbor_islands DONE
west_roxbury DONE
dorchester DONE
roxbury DONE
mattapan DONE
hyde_park DONE


In [31]:
agg_df = pd.DataFrame(columns=sub_neighborhoods, index=['white', 'black', 'api', 'hispanic'])
agg_df = agg_df.fillna(0.0)
final_df = final_df.drop(['Asian,GreaterEastAsian,EastAsian', 'Asian,GreaterEastAsian,Japanese', 'Asian,IndianSubContinent', 'GreaterAfrican,Africans', 'GreaterAfrican,Muslim', 'GreaterEuropean,British', 'GreaterEuropean,EastEuropean', 'GreaterEuropean,Jewish', 'GreaterEuropean,WestEuropean,French','GreaterEuropean,WestEuropean,Germanic','GreaterEuropean,WestEuropean,Hispanic','GreaterEuropean,WestEuropean,Italian','GreaterEuropean,WestEuropean,Nordic'], axis=1)


In [32]:
for i in range(len(final_df.index)):
    if final_df.iloc[i]['race'] == 'Asian,GreaterEastAsian,EastAsian' or final_df.iloc[i]['race'] == 'Asian,GreaterEastAsian,Japanese' or final_df.iloc[i]['race'] == 'Asian,IndianSubContinent':
        final_df.iloc[i]['race'] = 'api'
    elif final_df.iloc[i]['race'] == 'GreaterEuropean,WestEuropean,Hispanic':
        final_df.iloc[i]['race'] = 'hispanic'
    elif final_df.iloc[i]['race'] == 'GreaterAfrican,Muslim' or final_df.iloc[i]['race'] == 'GreaterAfrican,Africans':
        final_df.iloc[i]['race'] = 'black'
    else:
        final_df.iloc[i]['race'] = 'white'

In [33]:
final_df.race.unique()

array(['white', 'hispanic', 'api', 'black'], dtype=object)

In [34]:
final_df.to_csv('People_Covered_in_the_News/people_2014.csv')

In [35]:
final_df.head()

Unnamed: 0,last_name,article,article_id,race
0,Coakley,"(Coakley, suffers, from, comparison, to, the, ...",2014_631,white
1,Ortiz,"(And, none, interfered, with, David, Ortiz, ,...",2014_1038,hispanic
2,Angela,"(My, prayers, go, out, to, the, mayor, ’s, lov...",2014_1057,white
3,Angela,"(I, send, our, prayers, to, Mayor, Menino, ’s,...",2014_1057,white
4,Barry,"(but, anecdotally, , many, of, those, infecte...",2014_778,white


In [18]:
# Part 1
# re-organize the data so that we have a way to retrieve original text
# like adding ID to the dataset to identify each article
# we should be able to find out the article a name comes from
# we should also be able to find out which neighborhood an article talks about

In [19]:
# Part 2
# If name has 'word', 'word', then take the first name
# keep sentence where name occurred, okay if multiple sentences
# look at sentence where the name was mentioned 
# and the words which were used
# end up with a dataset which has 'name' + 'sentence' + 'race'
# try to put ID of article in the dataset as well, next to the sentence
# for now, try to keep the row from which the name comes, or at least some form of ID

# if extra time, group sentences by associated race
# find most frequently used words for each race, maybe a word cloud or something

In [20]:
# Part 3
# come up with the population divide by races for each neighborhood
# use neighborhood-separated articles
# for each neighborhood, turn all articles into a spaCy-processible list of documents
# for each list, extract out all people and run their last names with ethnicolr to predict races
# for each set of predictions, get percentage of races
# have journalism team go through U.S. Census data to see if the proportions of races match Census data

# QUESTION: how to verify that two names talked about in an article belong to different people/the same people?

# potential solution: for each article, only store the unique names; but is this possible? 
# each doc is an article, so we can extract out all "PERSON" entities and then keep only those which are unique
# we could then feed the last names of those unique people (the last names may not necessarily be unique) to ethnicolr

In [21]:
# find percentage of race represented for each neighborhood DONE
# get unique names on an article level DONE

# use sentence dataset to create a word cloud DONE
# find all names that are black DONE
# find all words used most commonly to talk about black people DONE
# for every year DONE

# NEXT STEPS:
# Look through the represented race proportions for each of the functions, i.e. pred_census_ln, pred_wiki_ln, and pred_fl_reg_ln
# Pick the most seemingly accurate result
# Use the result to get the races of the people covered and then create the respective word clouds

In [22]:
# later goals:
# mention of race - through names - through association - mention of 
    # neighborhoods/organizations which are predominantly one race