In [1]:
import spacy
import pandas as pd
import en_core_web_md
import ast

In [2]:
# load medium English model in case we need to work with vectors
nlp = en_core_web_md.load()

In [3]:
df = pd.read_csv('Neighborhood_Separated_Articles/2018.csv')

In [4]:
black_neighborhoods = ['dorchester', 'roxbury', 'mattapan', 'hyde_park']
white_neighborhoods = ['fenway', 'beacon_hill', 'downtown', 'south_boston', 'east_boston', 'back_bay', 'jamaica_plain',
                      'south_end', 'charlestown', 'brighton', 'allston', 'west_end', 'roslindale', 'north_end',
                      'mission_hill', 'harbor_islands', 'longwood_medical_area', 'west_roxbury']
df = df.fillna("('no article', 'no_id')")
df['dorchester'] = df['dorchester'].apply(ast.literal_eval)
df['roxbury'] = df['roxbury'].apply(ast.literal_eval)
df['mattapan'] = df['mattapan'].apply(ast.literal_eval)
df['hyde_park'] = df['hyde_park'].apply(ast.literal_eval)
df['fenway'] = df['fenway'].apply(ast.literal_eval)
df['beacon_hill'] = df['beacon_hill'].apply(ast.literal_eval)
df['downtown'] = df['downtown'].apply(ast.literal_eval)
df['south_boston'] = df['south_boston'].apply(ast.literal_eval)
df['east_boston'] = df['east_boston'].apply(ast.literal_eval)
df['back_bay'] = df['back_bay'].apply(ast.literal_eval)
df['jamaica_plain'] = df['jamaica_plain'].apply(ast.literal_eval)
df['south_end'] = df['south_end'].apply(ast.literal_eval)
df['charlestown'] = df['charlestown'].apply(ast.literal_eval)
df['brighton'] = df['brighton'].apply(ast.literal_eval)
df['allston'] = df['allston'].apply(ast.literal_eval)
df['west_end'] = df['west_end'].apply(ast.literal_eval)
df['roslindale'] = df['roslindale'].apply(ast.literal_eval)
df['north_end'] = df['north_end'].apply(ast.literal_eval)
df['mission_hill'] = df['mission_hill'].apply(ast.literal_eval)
df['harbor_islands'] = df['harbor_islands'].apply(ast.literal_eval)
df['longwood_medical_area'] = df['longwood_medical_area'].apply(ast.literal_eval)
df['west_roxbury'] = df['west_roxbury'].apply(ast.literal_eval)

In [5]:
spec_chars = ["!",'"',"#","%","&","'","(",")", "*","+",",",
                  "-",".","/",":",";","<", "=",">","?","@","[",
                  "\\","]","^","_", "`","{","|","}","~","–", 
                  "\xc2", "\xa0", "\x80", "\x9c", "\x99", "\x94", 
                  "\xad", "\xe2", "\x9d", "\n"]

df = df.drop(['Unnamed: 0'], axis=1)

#for char in spec_chars:
#    df['text'] = df['text'].str.strip()
#    df['text'] = df['text'].str.replace(char, ' ')
       
# access each column separately
for i in range(len(df.index)):
    for col in df.columns:
        for char in spec_chars:
            try:
                df.loc[i, col][0] = df.loc[i, col][0].str.strip()
                df.loc[i, col][0] = df.loc[i, col][0].str.replace(char, ' ')
            except:
                pass

In [6]:
df.shape

(1783, 22)

In [7]:
articles = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}
for sub_neighborhood in articles.keys():
    for i in range(df.shape[0]):
        if type(df.loc[i, sub_neighborhood]) == tuple:
            articles[sub_neighborhood].append((nlp(df.loc[i, sub_neighborhood][0]), df.loc[i, sub_neighborhood][1]))
    print(sub_neighborhood + ' DONE')

hyde_park DONE
beacon_hill DONE
south_boston DONE
jamaica_plain DONE
east_boston DONE
south_end DONE
back_bay DONE
north_end DONE
west_roxbury DONE
mission_hill DONE
harbor_islands DONE
west_end DONE
longwood_medical_area DONE
dorchester DONE
roxbury DONE
downtown DONE
fenway DONE
mattapan DONE
brighton DONE
charlestown DONE
roslindale DONE
allston DONE


In [8]:
articles['dorchester'][0][1]

'2018_3'

In [9]:
people = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}

for sub_neighborhood in articles.keys():
    for (doc, article_id) in articles[sub_neighborhood]:
        for ent in doc.ents:
            if ent.label_ == 'PERSON':
                name = ent[0:2]
                # sentence = ent.sent
                people[sub_neighborhood].append((name, article_id))

In [10]:
for sub_neighborhood in people.keys():
    list1 = people[sub_neighborhood]
    # convert the list to the set
    list_set = set(list1)
    # convert the set to the list
    unique_list = (list(list_set))
    people[sub_neighborhood] = unique_list

In [11]:
representation_proportions = {'hyde_park': [], 'beacon_hill': [], 'south_boston': [], 'jamaica_plain': [], 'east_boston': [],
                'south_end': [], 'back_bay': [], 'north_end': [], 'west_roxbury': [], 'mission_hill': [],
                'harbor_islands': [], 'west_end': [], 'longwood_medical_area': [],
                'dorchester': [], 'roxbury': [], 'downtown': [], 'fenway': [], 'mattapan': [], 'brighton': [],
                'charlestown': [], 'roslindale': [], 'allston': []}
for sub_neighborhood in people.keys():
    for i in range(len(people[sub_neighborhood])):
        if people[sub_neighborhood][i][0].text.strip() != '':
            temp = people[sub_neighborhood][i][0].text.split()
            if len(temp) > 1:
                people[sub_neighborhood][i] = (temp[0], people[sub_neighborhood][i][1])
            else:
                people[sub_neighborhood][i] = (temp[0], people[sub_neighborhood][i][1])

In [12]:
people['dorchester'][0]

('Brandon', '2018_3873')

In [13]:
from ethnicolr import pred_census_ln

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [14]:
# temp = pd.DataFrame(people['fenway'], columns=['last_name', 'article_id'])
# temp = temp.drop(['article_id'], axis=1)
# temp['subneighborhood'] = 'fenway'
# temp.head()

In [15]:
final_df = pd.DataFrame(columns=['last_name', 'subneighborhood'])
subs = white_neighborhoods + black_neighborhoods
#subs.remove('longwood_medical_area')
for col in subs:
    temp = pd.DataFrame(people[col], columns=['last_name', 'article_id'])
    temp = temp.drop(['article_id'], axis=1)
    temp['subneighborhood'] = col
    preds = pred_census_ln(temp, 'last_name', 2010)
    final_df = pd.concat([final_df, preds], axis=0)
    print(col + ' DONE')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
fenway DONE
beacon_hill DONE
downtown DONE
south_boston DONE
east_boston DONE
back_bay DONE
jamaica_plain DONE
south_end DONE
charlestown DONE
brighton DONE
allston DONE
west_end DONE
roslindale DONE
north_end DONE
mission_hill DONE
harbor_islands DONE
longwood_medical_area DONE
west_roxbury DONE
dorchester DONE
roxbury DONE
mattapan DONE
hyde_park DONE


In [16]:
final_df = final_df.drop(['api', 'black', 'hispanic', 'white'], axis=1)

In [17]:
hen = final_df['race'].value_counts()
pd.DataFrame(hen)

Unnamed: 0,race
white,95308
api,16330
hispanic,9557
black,956


In [18]:
groups = final_df.groupby(final_df.subneighborhood)
race_df = pd.DataFrame(columns=subs)
races = ['api', 'black', 'hispanic', 'white']
for col in subs:
    group = groups.get_group(col)
    race_df[col] = group.race.value_counts()

In [19]:
race_df = race_df.fillna(0.0)
race_df

Unnamed: 0,fenway,beacon_hill,downtown,south_boston,east_boston,back_bay,jamaica_plain,south_end,charlestown,brighton,...,roslindale,north_end,mission_hill,harbor_islands,longwood_medical_area,west_roxbury,dorchester,roxbury,mattapan,hyde_park
white,3025,3917,2608,10727,2443,2493,3780,2742,1139,1693,...,2658,1436.0,1024,192.0,2.0,2088,29127,15016,2563,5901
api,606,513,480,1347,580,490,624,476,189,553,...,715,168.0,174,30.0,0.0,377,4636,2524,382,1349
hispanic,396,154,181,613,424,86,338,620,69,54,...,255,56.0,194,6.0,0.0,131,4040,1114,227,552
black,10,4,16,82,23,12,92,6,20,6,...,8,0.0,2,0.0,0.0,10,385,149,34,61


In [20]:
for i in range(len(subs)):
    tot = race_df[subs[i]].sum()
    for j in race_df.index:
        race_df.loc[j, [subs[i]]] = (race_df.loc[j, [subs[i]]] / tot) * 100

In [21]:
race_df

Unnamed: 0,fenway,beacon_hill,downtown,south_boston,east_boston,back_bay,jamaica_plain,south_end,charlestown,brighton,...,roslindale,north_end,mission_hill,harbor_islands,longwood_medical_area,west_roxbury,dorchester,roxbury,mattapan,hyde_park
white,74.93188,85.374891,79.391172,84.008145,70.403458,80.915287,78.196111,71.331946,80.381087,73.417173,...,73.10231,86.506024,73.457676,84.210526,100.0,80.122794,76.272651,79.859597,79.943855,75.047692
api,15.011147,11.181343,14.611872,10.548986,16.714697,15.903927,12.908564,12.382934,13.338038,23.980919,...,19.664466,10.120482,12.482066,13.157895,0.0,14.466616,12.139939,13.42339,11.915159,17.156302
hispanic,9.809264,3.356582,5.509893,4.800689,12.21902,2.791302,6.992139,16.129032,4.869442,2.341717,...,7.013201,3.373494,13.916786,2.631579,0.0,5.026861,10.57924,5.924587,7.080474,7.020221
black,0.247709,0.087184,0.487062,0.64218,0.662824,0.389484,1.903186,0.156087,1.411433,0.260191,...,0.220022,0.0,0.143472,0.0,0.0,0.38373,1.00817,0.792427,1.060512,0.775785


In [22]:
race_df.to_csv('First_Name_Race_Representation_by_Neighborhood/2018_representation.csv')