In [None]:
'''
This notebook looks up 2010 census tract IDs for articles in the Boston Globe dataset for
the year 2015, using each article's coordinates. The same steps were also run for every other year.
'''

In [2]:
# Importing the necessary libraries
import pandas as pd
import requests
from mapbox import Geocoder

In [3]:
# Loading the Boston Globe dataset for 2015.
# NOTE: this is an absolute, machine-specific path; point it at your own copy of the CSV.
BOSTON_GLOBE_2015_CSV = '/Users/rachelgupta/Desktop/NAACP - PIT NE/pitne-bias-in-media/summer24/Data/Combined_Datasets_2/2015/Boston_Globe_2015_5000_articles_approx.csv'
bostonGlobe_2015 = pd.read_csv(BOSTON_GLOBE_2015_CSV)


In [4]:
# Showing floats with 7 decimal places when DataFrames are displayed
# (display-only setting; the stored values are not rounded)
pd.options.display.precision = 7


In [5]:
# Function to safely extract and clean coordinates
def extract_coordinates(coord):
    """Parse a coordinate string such as ``"[-71.05, 42.36]"`` into two floats.

    The two numbers are returned in the order they appear in the string; this
    function does not decide which one is the latitude and which the
    longitude. In this notebook the caller stores the first value as
    ``longitude`` and the second as ``latitude``.

    Parameters
    ----------
    coord : str or missing value
        Text holding exactly two comma-separated numbers, optionally wrapped
        in square brackets and/or spaces.

    Returns
    -------
    tuple
        ``(first, second)`` as floats, or ``(None, None)`` when ``coord`` is
        missing, is not a string, does not contain exactly two values, or
        contains a value that is not numeric. Returning ``(None, None)``
        instead of raising lets the caller drop unusable rows afterwards.
    """
    # Missing values (None / NaN) and any other non-string input cannot be parsed.
    if not isinstance(coord, str):
        return None, None

    parts = coord.strip(' []').split(',')
    if len(parts) != 2:
        return None, None

    try:
        first, second = float(parts[0]), float(parts[1])
    except ValueError:
        return None, None
    return first, second

In [7]:
# Applying the function to split each 'NER_Sorted_Coordinates' value into two columns.
# The first parsed value is stored as 'longitude' and the second as 'latitude';
# the preview further down shows longitude of about -71 and latitude of about 42,
# so the source strings appear to be ordered [longitude, latitude].
bostonGlobe_2015[['longitude', 'latitude']] = bostonGlobe_2015['NER_Sorted_Coordinates'].apply(lambda x: pd.Series(extract_coordinates(x)))

In [8]:
# Dropping rows where coordinates could not be extracted
# (inplace=True modifies bostonGlobe_2015 itself; the original row labels are kept)
bostonGlobe_2015.dropna(subset=['latitude', 'longitude'], inplace=True)

In [9]:
# Previewing the first rows to check the new 'longitude' and 'latitude' columns
bostonGlobe_2015.head()

Unnamed: 0.1,Unnamed: 0,hl1_x,body,llama_prediction,Explicit_Pass_1,NER_Pass_1,NER_Pass_1_Sorted,NER_Pass_1_Coordinates,NER_prediction,NER_Sorted,...,hl2,author,lede,pub_date,indexing_terms,year,actual_body_word_count,body_cleaned,longitude,latitude
0,0,ruling may aid tsarnaev appeal,both cases claim that the potential jurors wer...,"Based on the article provided, I would guess...",,"[(boston, 'GPE'), (monday, 'DATE'), (puerto ri...","[(tsarnaev, 'ORG'), (tsarnaev, 'ORG'), (boston...",,"[(Boston, 'GPE'), (Massachusetts, 'GPE'), (1, ...","[('the First Circuit Appeals Court', 'ORG'), (...",...,puerto rico case suggests bomber can argue mar...,unknown,in a decision that could provide a framework w...,2015-12-08,{'legal': [{'className': 'Criminal Law & Proce...,2015,975,cases claim potential jurors inundated news me...,-71.0588801,42.3600825
2,2,sandoval the one,but sandoval first year in boston was calamito...,Based on the information provided in the art...,,"[(sandoval, 'PERSON'), (first year, 'DATE'), (...","[(sox, 'ORG'), (sox, 'ORG'), (sox, 'ORG'), (th...",,"[(Boston, 'GPE'), (Massachusetts, 'GPE'), (Fen...","[('Fenway Park', 'FAC'), ('Fenway Park', 'FAC'...",...,"prospects devers, chavis still too far away to...",unknown,pablo sandoval's defense at third improved aft...,2015-12-08,"{'subject': [{'score': '90', 'classCode': 'STX...",2015,586,sandoval ' first year boston calamitous every ...,-71.0972178,42.3466764
3,3,long life long on deeds,my dad was one of those guys who didn talk muc...,Here is my response based on the information...,,"[(dan, 'PERSON'), (jerry pothier, 'PERSON'), (...","[(boston, 'GPE'), (maine, 'GPE'), (maine, 'GPE...",,"[(1, 'CARDINAL'), (Boston, 'GPE'), (Jerry Poth...","[('Marines', 'ORG'), ('Pearl Harbor', 'LOC'), ...",...,unknown,unknown,jerry pothier was a medford guy. he was 18 yea...,2015-12-08,"{'subject': [{'score': '90', 'classCode': 'ST0...",2015,543,"dad one guys ' talk much war , son dan said . ...",-71.0588801,42.3600825
4,4,the developer v. the penguins (and their ceo) ...,the bigger the project the more you have to di...,Based on the information provided in the art...,,"[(hillgarth, 'PERSON'), (boston, 'GPE'), (oxfo...","[(the boston redevelopment authority, 'ORG'), ...",,"[(Boston, 'GPE'), (Massachusetts, 'GPE'), (1, ...","[('the Prudential Center', 'FAC'), ('The Prude...",...,unknown,unknown,in his long quest to redevelop the harbor gara...,2015-12-08,"{'legal': [{'className': 'Transportation Law',...",2015,807,"bigger project , dig , says . means loss incom...",-71.0817427,42.3470567
6,6,chism expressed remorse expert says,caused the difficulties that he caused dr. dud...,Based on the information provided in the art...,,"[(dudley macdougall, 'PERSON'), (chism, 'PERSO...","[(danvers high school, 'ORG'), (bridgewater st...",,"[(Danvers, 'GPE'), (Massachusetts, 'GPE'), (1,...","[('Danvers High School', 'FAC'), ('Bridgewater...",...,unknown,unknown,dr. richard dudley testified monday that phili...,2015-12-08,{'legal': [{'className': 'Criminal Law & Proce...,2015,657,"caused difficulties ' caused , dr. dudley ? ma...",-70.9313851,42.5821203


In [11]:
# NOTE: dead code. The string literal below holds an earlier draft of get_tract_id
# (no error handling); it defines nothing and is superseded by the real definition
# of get_tract_id elsewhere in this notebook. Safe to delete.
'''
# Function to get tract ID using coordinates
def get_tract_id(lat, lon):
    url = f"https://geocoding.geo.census.gov/geocoder/geographies/coordinates?x={lat}&y={lon}&benchmark=Public_AR_Current&vintage=Census2010_Current&format=json"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        tract = data['result']['geographies'].get('Census Tracts', [])
        if tract: 
            return tract[0].get('TRACT', 'No TRACT found')
    return "No TRACT found"
   
'''    


In [10]:
# Function to get tracts using coordinates
def get_tract_id(lat, lon, timeout=30):
    """Look up the census tract code for a point with the U.S. Census geocoder.

    NOTE on argument order: the first argument is sent as the ``x`` query
    parameter and the second as ``y``. The geocoder treats ``x`` as longitude
    and ``y`` as latitude, so despite the parameter names callers must pass
    ``(longitude, latitude)``, which is how ``update_tract_ids`` calls this
    function. The names are kept unchanged for backward compatibility.

    Parameters
    ----------
    lat : float
        Value sent as ``x`` (the point's longitude, see note above).
    lon : float
        Value sent as ``y`` (the point's latitude, see note above).
    timeout : float, optional
        Seconds to wait for the geocoder before giving up, so one stalled
        connection cannot hang a whole batch run.

    Returns
    -------
    str or None
        * the ``TRACT`` value of the first 'Census Tracts' entry, or
          ``'No TRACT found'`` if that entry has no ``TRACT`` key;
        * ``'No Census Tracts found'`` when the response lists no tracts;
        * ``None`` when the request fails, the status is not 200, or the
          response cannot be parsed (a message is printed in each case).
    """
    url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
    params = {
        'x': lat,
        'y': lon,
        'benchmark': 'Public_AR_Current',
        'vintage': 'Census2010_Current',
        'format': 'json',
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Request exception: {e}")
        return None

    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError as e:
        # Body was not valid JSON (e.g. an HTML error page served with status 200).
        print(f"Invalid JSON in response: {e}")
        return None

    try:
        census_tracts = data['result']['geographies'].get('Census Tracts', [])
        if census_tracts:
            return census_tracts[0].get('TRACT', 'No TRACT found')
        return 'No Census Tracts found'
    except (KeyError, IndexError, TypeError, AttributeError):
        # Unexpected payload shape: a missing key, or a non-dict where a dict was expected.
        print(f"Parsing error: {data}")
        return None

In [11]:
# Applying function to obtain tract ids
def update_tract_ids(df, lookup=None):
    """Fill a ``tract_2010`` column on ``df`` from its coordinate columns.

    ``df`` is modified in place and the same object is returned. Each row's
    ``tract_2010`` becomes the lookup result for its coordinates, or
    ``'Invalid coordinates'`` when either coordinate is missing.

    Successful lookups are remembered per coordinate pair, so rows that share
    a location trigger only one lookup. Failed lookups (``None``) are not
    remembered, so a later row with the same coordinates tries again.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'latitude' and 'longitude' columns.
    lookup : callable, optional
        Function called as ``lookup(longitude, latitude)`` that returns the
        tract value for a point. Defaults to ``get_tract_id``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``tract_2010`` column added.
    """
    if lookup is None:
        lookup = get_tract_id

    # Initializing 'tract_2010' column
    df['tract_2010'] = None

    # Successful results keyed by (longitude, latitude).
    resolved = {}

    # Iterating through each row
    for index, lat, lon in zip(df.index, df['latitude'], df['longitude']):
        if pd.isna(lat) or pd.isna(lon):
            df.at[index, 'tract_2010'] = 'Invalid coordinates'
            continue

        key = (lon, lat)
        tract_id = resolved.get(key)
        if tract_id is None:
            # Longitude is passed first on purpose: get_tract_id sends its first
            # argument as the geocoder's x value and its second as y.
            tract_id = lookup(lon, lat)
            if tract_id is not None:
                resolved[key] = tract_id
        df.at[index, 'tract_2010'] = tract_id

    return df


In [12]:
# Updating the DataFrame with tracts.
# NOTE: update_tract_ids adds the 'tract_2010' column to the frame it is given and
# returns that same object, so updated_bostonGlobe_2015 and bostonGlobe_2015 refer
# to one DataFrame. This step performs network lookups, so it can take a while.
updated_bostonGlobe_2015 = update_tract_ids(bostonGlobe_2015)

In [13]:
# Investigating dataframe: displaying it to check the new 'tract_2010' column
updated_bostonGlobe_2015

Unnamed: 0.1,Unnamed: 0,hl1_x,body,llama_prediction,Explicit_Pass_1,NER_Pass_1,NER_Pass_1_Sorted,NER_Pass_1_Coordinates,NER_prediction,NER_Sorted,...,author,lede,pub_date,indexing_terms,year,actual_body_word_count,body_cleaned,longitude,latitude,tract_2010
0,0,ruling may aid tsarnaev appeal,both cases claim that the potential jurors wer...,"Based on the article provided, I would guess...",,"[(boston, 'GPE'), (monday, 'DATE'), (puerto ri...","[(tsarnaev, 'ORG'), (tsarnaev, 'ORG'), (boston...",,"[(Boston, 'GPE'), (Massachusetts, 'GPE'), (1, ...","[('the First Circuit Appeals Court', 'ORG'), (...",...,unknown,in a decision that could provide a framework w...,2015-12-08,{'legal': [{'className': 'Criminal Law & Proce...,2015,975,cases claim potential jurors inundated news me...,-71.0588801,42.3600825,030300
2,2,sandoval the one,but sandoval first year in boston was calamito...,Based on the information provided in the art...,,"[(sandoval, 'PERSON'), (first year, 'DATE'), (...","[(sox, 'ORG'), (sox, 'ORG'), (sox, 'ORG'), (th...",,"[(Boston, 'GPE'), (Massachusetts, 'GPE'), (Fen...","[('Fenway Park', 'FAC'), ('Fenway Park', 'FAC'...",...,unknown,pablo sandoval's defense at third improved aft...,2015-12-08,"{'subject': [{'score': '90', 'classCode': 'STX...",2015,586,sandoval ' first year boston calamitous every ...,-71.0972178,42.3466764,010203
3,3,long life long on deeds,my dad was one of those guys who didn talk muc...,Here is my response based on the information...,,"[(dan, 'PERSON'), (jerry pothier, 'PERSON'), (...","[(boston, 'GPE'), (maine, 'GPE'), (maine, 'GPE...",,"[(1, 'CARDINAL'), (Boston, 'GPE'), (Jerry Poth...","[('Marines', 'ORG'), ('Pearl Harbor', 'LOC'), ...",...,unknown,jerry pothier was a medford guy. he was 18 yea...,2015-12-08,"{'subject': [{'score': '90', 'classCode': 'ST0...",2015,543,"dad one guys ' talk much war , son dan said . ...",-71.0588801,42.3600825,030300
4,4,the developer v. the penguins (and their ceo) ...,the bigger the project the more you have to di...,Based on the information provided in the art...,,"[(hillgarth, 'PERSON'), (boston, 'GPE'), (oxfo...","[(the boston redevelopment authority, 'ORG'), ...",,"[(Boston, 'GPE'), (Massachusetts, 'GPE'), (1, ...","[('the Prudential Center', 'FAC'), ('The Prude...",...,unknown,in his long quest to redevelop the harbor gara...,2015-12-08,"{'legal': [{'className': 'Transportation Law',...",2015,807,"bigger project , dig , says . means loss incom...",-71.0817427,42.3470567,010600
6,6,chism expressed remorse expert says,caused the difficulties that he caused dr. dud...,Based on the information provided in the art...,,"[(dudley macdougall, 'PERSON'), (chism, 'PERSO...","[(danvers high school, 'ORG'), (bridgewater st...",,"[(Danvers, 'GPE'), (Massachusetts, 'GPE'), (1,...","[('Danvers High School', 'FAC'), ('Bridgewater...",...,unknown,dr. richard dudley testified monday that phili...,2015-12-08,{'legal': [{'className': 'Criminal Law & Proce...,2015,657,"caused difficulties ' caused , dr. dudley ? ma...",-70.9313851,42.5821203,211300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4511,4511,in interview be clear about follow up plans,a. business etiquette is vital for companies a...,Based on the information provided in the art...,,"[(glassdoor, 'ORG'), (six hours, 'TIME'), (tod...","[(glassdoor, 'ORG'), (keystone partners, 'ORG'...",,"[(1, 'CARDINAL'), (Boston, 'GPE'), (Massachuse...","[('Keystone Partners', 'ORG'), ('Keystone Part...",...,unknown,"q. after 15 years in europe, i'm interviewing ...",2015-12-27,"{'subject': [{'score': '90', 'classCode': 'ST0...",2015,207,a. business etiquette vital companies individu...,-71.0571176,42.3530449,070101
4512,4512,when off the court he likes to slow down and t...,favorite vacation spot turks and caicos. the w...,Based on the information provided in the art...,,"[(one, 'CARDINAL'), (wisconsin, 'GPE'), (disne...","[(disney world, 'ORG'), (disneyland, 'ORG'), (...",,"[(Boston, 'GPE'), (Massachusetts, 'GPE'), (1, ...","[('Beacon Hill', 'LOC'), ('Beacon Hill', 'LOC'...",...,unknown,evan turner of the celtics took a selfie in mi...,2015-12-27,"{'subject': [{'score': '90', 'classCode': 'ST0...",2015,209,favorite vacation spot ? turks caicos . water ...,-71.0693900,42.3561948,981700
4516,4516,spirit of giving,at left spirited bostonian committee members m...,Based on the information provided in the art...,,"[(bostonian, 'NORP'), (marcia doyle, 'PERSON')...","[(connolly, 'ORG'), (boston, 'GPE')]",,"[(Boston, 'GPE'), (Massachusetts, 'GPE'), (1, ...","[('the ""Rattlesnake Bar Grill""', 'FAC'), ('the...",...,unknown,"at left, spirited bostonian committee members ...",2015-12-27,"{'subject': [{'score': '90', 'classCode': 'ST0...",2015,94,"left , spirited bostonian committee members ma...",-71.0719348,42.3513496,010702
4517,4517,on walls and behind scenes women moved art world,echelman has worldwide following. she deserves...,"1.Y - The article is talking about Boston, s...",,"[(echelman, 'PERSON'), (joan jonas, 'PERSON'),...","[(mit, 'ORG'), (the rose art museum, 'ORG'), (...",,"[(Boston, 'GPE'), (New England, 'LOC'), (Bosto...","[('the Rose Art Museum', 'FAC'), ('The Wadswor...",...,unknown,"clockwise from top: janet echelman's ""as if it...",2015-12-27,"{'legal': [{'className': 'Education Law', 'cla...",2015,651,echelman worldwide following . deserves attent...,-71.2624995,42.3656832,368400


In [14]:
# Counting the rows whose 'tract_2010' value is the 'No Census Tracts found' sentinel
no_tract_found_count = updated_bostonGlobe_2015['tract_2010'].eq('No Census Tracts found').sum()
no_tract_found_count

8

In [15]:
# Removing rows with "No Census Tracts found" (in place).
# NOTE: only this exact sentinel is matched; rows whose 'tract_2010' is None
# (failed lookups) or 'Invalid coordinates' are kept.
is_missing_tract = updated_bostonGlobe_2015['tract_2010'] == 'No Census Tracts found'
updated_bostonGlobe_2015.drop(updated_bostonGlobe_2015.loc[is_missing_tract].index, inplace=True)

In [18]:
# Saving the updated DataFrame to a new CSV file; the relative path resolves
# against the current working directory, and index=False leaves the DataFrame
# index out of the file
updated_bostonGlobe_2015.to_csv('updated_bostonGlobe_2015.csv', index=False)


In [17]:
updated_bostonGlobe_2015.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2741 entries, 0 to 4520
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              2741 non-null   int64  
 1   hl1_x                   2741 non-null   object 
 2   body                    2741 non-null   object 
 3   llama_prediction        2741 non-null   object 
 4   Explicit_Pass_1         0 non-null      object 
 5   NER_Pass_1              2741 non-null   object 
 6   NER_Pass_1_Sorted       2741 non-null   object 
 7   NER_Pass_1_Coordinates  0 non-null      object 
 8   NER_prediction          2741 non-null   object 
 9   NER_Sorted              2741 non-null   object 
 10  NER_Sorted_Coordinates  2741 non-null   object 
 11  Tracts                  2741 non-null   object 
 12  position_section        2741 non-null   object 
 13  position_subsection     2741 non-null   object 
 14  hl1_y                   2741 non-null   