# Step 5: Further Analysis

In here I'd like to combine demographic information and relate data with each other for a deeper layer of analysis.

In [51]:
# Import relevant libraries

import pandas as pd

In [56]:
# Load geocoded data
articles = pd.read_csv("../data/processed/gbh_geocoded_output.csv")

# Load demographics data 
demographics = pd.read_csv("../data/processed/demographics_by_tract.csv")


In [57]:
demographics = demographics.drop(columns=["Index", "Name", "State", "County"])

In [58]:
# For now, only Suffolk has neighborhoods. So drop the rest
articles.dropna(subset=["Neighborhood"], inplace=True)

In [59]:
# Function to relate articles with demographics
def merge_articles_with_demographics(articles_df, demographics_df):
    # Merge articles_df with demographics_df on census tract
    merged_df = pd.merge(articles_df, demographics_df, on='Census Tract', how='left')
    return merged_df

# Relate demographics with articles
merged_articles = merge_articles_with_demographics(articles, demographics)
merged_articles.head(3)


Unnamed: 0,Index,ID,Coordinates,Block,Census Tract,Neighborhood,County,Closest Topic,Publication Date,Total,...,African American,African American Percent,American Indian and Alaska Native,American Indian and Alaska Native Percent,Asian,Asian Percent,Native Hawaiian and Other Pacific Island,Native Hawaiian and Other Pacific Island Percent,Other,Other Percent
0,0,65adafef8d9d92f2327ea8ff,"[-71.0201972, 42.3665992]",1044,981300,East Boston,25,Weather,Mon Dec 18 12:34:19 EST 2023,79.0,...,4.0,5.06,0.0,0.0,4.0,5.06,0.0,0.0,0.0,0.0
1,1,65adafef8d9d92f2327ea923,"[-71.06939, 42.3561948]",1000,981700,Downtown,25,State Politics,Thu Nov 30 14:12:21 EST 2023,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,65adafef8d9d92f2327ea8fd,"[-71.148182, 42.357122]",2032,102,Allston,25,GBH,Mon Dec 18 16:00:52 EST 2023,3714.0,...,298.0,8.02,8.0,0.22,883.0,23.77,2.0,0.05,177.0,4.77


In [75]:
# Get the main 3 races for a specific tract
def get_race_for_tract(tract):
    race_data = demographics[demographics['Census Tract'] == tract].copy()
    race_data = race_data.drop(columns=["Census Tract", "Total", "Hispanic or Latino", "Not Hispanic or Latino", "One Race Total"])
    race_data = race_data.reset_index()

    # Sort races by population/percentage and get main 3
    main_3_races = race_data.sort_values(by=0, axis=1, ascending=False).iloc[:, :3]
    return main_3_races

In [88]:
# Example
tract = 30302
races = get_race_for_tract(tract).T.reset_index()
races.columns = ["Race", "Population"]

print(f"The main races in the tract {str(tract)} are: \n" + races.to_string(index=False))

The main races in the tract 30302 are: 
            Race  Population
           White        1300
           Asian         195
African American         103


In [89]:
# Get the topics for all census tracts

# Group by census tract and aggregate topics
topic_counts = merged_articles.groupby(['Census Tract', 'Closest Topic']).size().reset_index(name='counts')

# Determine all topics for each tract
all_topics = topic_counts.groupby('Census Tract')['Closest Topic'].apply(list).reset_index(name='All Topics')

# Determine the main 3 topics for each tract
top_3_topics = topic_counts.sort_values(['Census Tract', 'counts'], ascending=[True, False])
top_3_topics = top_3_topics.groupby('Census Tract').head(3).reset_index(drop=True)
main_3_topics = top_3_topics.groupby('Census Tract')['Closest Topic'].apply(list).reset_index(name='Main 3 Topics')

# Merge the results
result = pd.merge(all_topics, main_3_topics, on='Census Tract')
result

Unnamed: 0,Census Tract,All Topics,Main 3 Topics
0,102,"[GBH, Higher Education, Immigration, Labor/Wor...","[GBH, Other, Higher Education]"
1,302,[Housing/Homelessness],[Housing/Homelessness]
2,402,[Other],[Other]
3,502,"[Gender issues, Other, Race]","[Gender issues, Other, Race]"
4,506,[Housing/Homelessness],[Housing/Homelessness]
...,...,...,...
99,981201,[Local Politics],[Local Politics]
100,981300,"[Accessiblity/Disablity, Government, Homeland ...","[Housing/Homelessness, Weather, Homeland Secur..."
101,981501,[Local Politics],[Local Politics]
102,981700,"[Aging/Seniors, Civil Rights, Equity & Justice...","[Politics/Elections, State Politics, Equity & ..."


In [90]:
# Get the topics for a specific tract
def get_topics_for_tract(tract):
    topics = result[result["Census Tract"] == tract]
    formattedTopics = topics["Main 3 Topics"].values[0]
    formattedTopicss = str(formattedTopics[0]) + ", " + str(formattedTopics[1]) + ", " + str(formattedTopics[2])
    return formattedTopicss

# Example
tract = 30302
print(f"The main topics in the tract {str(tract)} are: " + str(get_topics_for_tract(tract)))

The main topics in the tract 30302 are: Local Politics, Other, Politics/Elections


In [91]:
# For a given topic, get tracts where it's found
def get_tracts_for_topic(topic):
    tracts = result[result["Main 3 Topics"].apply(lambda x: topic in x)]
    return tracts["Census Tract"].values


In [93]:
# Example
topic = "Public Health"
print(f"The tracts where the topic {topic} is found are: " + str(get_tracts_for_topic(topic)))


The tracts where the topic Public Health is found are: [81001 91200]


In [None]:
# For a given topic, get the most popular tracts where it's found
def get_most_popular_tracts_for_topic(topic):
    tracts = result[result["Main 3 Topics"].apply(lambda x: topic in x)]
    tracts = tracts.sort_values(by="Main 3 Topics", ascending=False)
    return tracts["Census Tract"].values


In [None]:
# For a given race, get the amount of articles (weighted average)


In [None]:
# For a given tract, get the amount of articles per race (weighted average)


In [None]:
# For a given tract, get the amount of articles per topic (weighted average)


In [None]:
# For a given topic, get the most common races


In [282]:
# Get main 3 races for every tract
race_data = pd.DataFrame(columns=['Census Tract', "Race 1", "Race 2", "Race 3"])

for tract in demographics["Tract"]:
    main_races = get_race_for_tract(tract).T
    race_data[tract] = [f'{main_races[0].index[0]}, {main_races[0][0]}',f'{main_races[0].index[1]}, {main_races[0][1]}', f'{main_races[0].index[2]}, {main_races[0][2]}']

race_data.head(3)


  race_data[tract] = [f'{main_races[0].index[0]}, {main_races[0][0]}',f'{main_races[0].index[1]}, {main_races[0][1]}', f'{main_races[0].index[2]}, {main_races[0][2]}']
  race_data[tract] = [f'{main_races[0].index[0]}, {main_races[0][0]}',f'{main_races[0].index[1]}, {main_races[0][1]}', f'{main_races[0].index[2]}, {main_races[0][2]}']
  race_data[tract] = [f'{main_races[0].index[0]}, {main_races[0][0]}',f'{main_races[0].index[1]}, {main_races[0][1]}', f'{main_races[0].index[2]}, {main_races[0][2]}']
  race_data[tract] = [f'{main_races[0].index[0]}, {main_races[0][0]}',f'{main_races[0].index[1]}, {main_races[0][1]}', f'{main_races[0].index[2]}, {main_races[0][2]}']
  race_data[tract] = [f'{main_races[0].index[0]}, {main_races[0][0]}',f'{main_races[0].index[1]}, {main_races[0][1]}', f'{main_races[0].index[2]}, {main_races[0][2]}']
  race_data[tract] = [f'{main_races[0].index[0]}, {main_races[0][0]}',f'{main_races[0].index[1]}, {main_races[0][1]}', f'{main_races[0].index[2]}, {main_races[0

Unnamed: 0,Census Tract,Race 1,Race 2,Race 3,981300,981700,102,81400,10103,70201,...,70402,160601,70700,506,704,91700,91001,80300,92000,30500
0,,,,,"White, 54","White, 1","White, 1565","African American, 1324","White, 2661","White, 2230",...,"White, 1473","White, 1205","White, 1372","White, 2000","White, 2669","African American, 1311","White, 1766","African American, 1045","African American, 2218","White, 2395"
1,,,,,"African American, 4","African American, 0","Asian, 883","White, 1172","Asian, 942","Asian, 1264",...,"Asian, 1308","African American, 325","African American, 486","Asian, 477","Asian, 1387","White, 208","Asian, 702","White, 146","Asian, 651","Asian, 108"
2,,,,,"Asian, 4","American Indian and Alaska Native, 0","African American, 298","Asian, 326","African American, 352","African American, 179",...,"African American, 374","Asian, 181","Asian, 265","African American, 75","African American, 287","Other, 195","African American, 275","Asian, 80","White, 403","African American, 33"


In [128]:
# Function to relate race distributions with topics
def relate_race_with_topics(articles_df, demographics_df, race_column):
    # Merge articles_df with demographics_df on census tract
    merged_df = pd.merge(articles_df, demographics_df, on='Census Tract', how='left')
    
    # Group by topics and calculate race distribution
    race_distribution = merged_df.groupby('Topics')[race_column].mean().reset_index()
    return race_distribution

relate_race_with_topics(merged_articles, demo, 'White')

KeyError: 'Topics'