In [1]:
import re
import csv
import os
import json
import json 
import nltk
import spacy
import requests
import googlemaps 
import pandas as pd
from bs4 import BeautifulSoup
from art import *

import secret # API keys

In [2]:
from google.cloud import storage
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import io
import joblib
import openai
import torch
import pickle
from bertopic import BERTopic

In [4]:
from tqdm import tqdm
from bson import ObjectId
from pymongo import MongoClient

### Census Data Class

In [5]:
"""
This class takes a database handle and loads the census data from it: one data set of census tracts and one of census data
aggregated by neighborhood (both stored as CSV rows). As of the creation of this class, we are using the Boston Census 2020 data.
Please make sure that future versions follow the same path.
"""
class census_data():
    """Census reference data loaded from MongoDB.

    Construction reads three documents from the
    ``Topic_Modeling_Pipeline_Data`` collection of ``db`` and exposes:

    * ``census_tracts``         -- DataFrame produced by ``process_census_data``
    * ``census_neighbourhoods`` -- DataFrame produced by ``process_census_neigh_data``
    * ``old_census_tracts``     -- the stored document with its ``_id`` removed

    Any failure while loading is printed and re-raised as
    ``Exception("Fatal Error in Class Construction!")``.
    """

    _COLLECTION = 'Topic_Modeling_Pipeline_Data'

    def __init__(self, db):
        self.db = db
        self.load_census_data()
        self.load_census_neigh_data()
        self.load_old_census_data()

    def _fetch_document(self, object_id, label):
        """Return the document stored under ``object_id``.

        Raises:
            Exception: if no database was given or the document does not exist.
        """
        # `is None` (not `== None`): identity is what we mean, and it never
        # dispatches to a custom __eq__ on the database handle.
        if self.db is None:
            raise Exception("No database was given!")

        document = self.db[self._COLLECTION].find_one({"_id": ObjectId(object_id)})

        if document is None:
            # The previous message called len() on the missing document, which
            # raised a TypeError and hid the real cause.
            raise Exception(f"{label} yielded out None. No document found for _id [{object_id}]")
        return document

    def load_old_census_data(self):
        """Load the old census tract document into ``self.old_census_tracts``."""
        try:
            old_census_tracts_db = self._fetch_document(
                "65504622dca7e7e77229fb73", "old_census_tracts_db"
            )
            del old_census_tracts_db['_id']
            self.old_census_tracts = old_census_tracts_db
        except Exception as err:
            print(f"Error loading Census Data class!\nError: {err}")
            raise Exception("Fatal Error in Class Construction!") from err

    def load_census_data(self):
        """Load the census tract rows into ``self.census_tracts``."""
        try:
            census_tracts_db = self._fetch_document(
                "6550372bfac75a7d3f25b870", "census_tracts_db"
            )
            census_tracts_df = pd.DataFrame(census_tracts_db["csv_data"])
            self.census_tracts = self.process_census_data(census_tracts_df)
        except Exception as err:
            print(f"Error loading Census Data class!\nError: {err}")
            raise Exception("Fatal Error in Class Construction!") from err

    def load_census_neigh_data(self):
        """Load the per-neighborhood rows into ``self.census_neighbourhoods``."""
        try:
            census_neigh_data_db = self._fetch_document(
                "65503411fac75a7d3f25b86f", "census_neigh_data_db"
            )
            census_neigh_data_df = pd.DataFrame(census_neigh_data_db["csv_data"])
            self.census_neighbourhoods = self.process_census_neigh_data(census_neigh_data_df)
        except Exception as err:
            print(f"Error loading Census Data class!\nError: {err}")
            raise Exception("Fatal Error in Class Construction!") from err

    def process_census_data(self, df):
        """Reduce the raw tract frame to tract id, geocode and columns 11-19.

        The first row of the frame is dropped and ``TRACT``, ``GEOCODE`` and
        ``P0020001`` are renamed to ``tract``, ``geoid_tract`` and ``total``.
        """
        demographics = df.iloc[:, 11:20]
        selected = pd.concat([df['TRACT'], df['GEOCODE'], demographics], axis=1)
        # Non-inplace drop/rename: the caller's frame is never mutated.
        selected = selected.drop(selected.index[0])
        return selected.rename(
            columns={
                'TRACT': 'tract',
                'GEOCODE': 'geoid_tract',
                'P0020001': 'total'
            }
        )

    # A function for the future
    def process_census_neigh_data(self, df):
        """Keep the first seven columns, rename ``tract20_nbhd`` to
        ``Neighborhood`` and drop the first row."""
        trimmed = df.iloc[:, :7].rename(
            columns={
                'tract20_nbhd': 'Neighborhood',
            }
        )
        return trimmed.drop(trimmed.index[0])

### Neighbourhood Mapping Class

In [6]:
class neighborhood_mapping():
    """Census tract / census block to Boston neighborhood lookups.

    Both mappings are read from the ``Entity_Recognition_Pipeline_Data``
    collection of ``db`` at construction time and stored as plain dicts in
    ``tract_mapping`` and ``block_mapping``.
    """

    def __init__(self, db):
        self.db = db
        self.load_mappings()

    def load_mappings(self):
        """Fetch both mapping documents and strip their ``_id`` fields.

        Raises:
            Exception: "Fatal Error in Class Construction!" when the database
                is missing, a document is not found, or the fetch fails.
        """
        try:
            if self.db is None:
                raise Exception("No database was given!")

            # load census tract to boston neighborhood mapping
            tract_map_db = self.db['Entity_Recognition_Pipeline_Data'].find_one(
                {"_id": ObjectId("654ad02fd4c2ceb2f5d5ddfc")}
            )
            # load census block to boston neighborhood mapping
            block_map_db = self.db['Entity_Recognition_Pipeline_Data'].find_one(
                {"_id": ObjectId("654ad118d4c2ceb2f5d5ddfd")}
            )

            # Name exactly which document is missing. The old message called
            # len() on None and therefore raised a TypeError instead.
            missing = [
                label
                for label, document in (("tract_map_db", tract_map_db), ("block_map_db", block_map_db))
                if document is None
            ]
            if missing:
                raise Exception(f"{' and '.join(missing)} yielded None.")

            del tract_map_db['_id']
            del block_map_db['_id']

            self.tract_mapping = tract_map_db
            self.block_mapping = block_map_db
        except Exception as err:
            print(f"Error loading neighborhood mapping class!\nError: {err}")
            raise Exception("Fatal Error in Class Construction!") from err

    def tract_to_neighborhood(self, tract):
        """Return the Boston neighborhood for a census tract (KeyError if unknown)."""
        return self.tract_mapping[tract]

    def block_to_neighborhood(self, block):
        """Return the Boston neighborhood for a census block (KeyError if unknown)."""
        # The mapping is a dict: it must be indexed, not called.
        return self.block_mapping[block]

### Geography Class

In [7]:
class geography():
    """Static geography/organisation entities plus previously saved geocodes.

    Everything except ``org_entities`` is read from the
    ``Entity_Recognition_Pipeline_Data`` collection of ``db``:

    * ``state_entities`` / ``mass_town_entities`` -- sets of ``(name, 'LOC')``
    * ``org_entities``   -- hard-coded tuple of ``(name, label)`` pairs
    * ``saved_geocodes`` -- the stored document with its ``_id`` removed
    """

    _COLLECTION = 'Entity_Recognition_Pipeline_Data'

    def __init__(self, db):
        self.db = db
        self.load_geographies()
        self.load_org_entities()
        self.load_saved_geocodes()

    def _fetch_document(self, object_id, label):
        """Return the document stored under ``object_id``.

        Raises:
            Exception: if no database was given or the document does not exist.
        """
        if self.db is None:
            raise Exception("No database was given!")

        document = self.db[self._COLLECTION].find_one({"_id": ObjectId(object_id)})

        if document is None:
            # The old message evaluated len(None), which raised a TypeError
            # and replaced the intended explanation.
            raise Exception(f"{label} yielded None. No document found for _id [{object_id}]")
        return document

    def _load_loc_entities(self, object_id, label):
        """Build a set of ``(value, 'LOC')`` from the first column of the
        document's ``csv_data`` rows."""
        entities_df = pd.DataFrame(self._fetch_document(object_id, label)["csv_data"])
        if entities_df.empty:
            return set()
        return {(name, 'LOC') for name in entities_df.iloc[:, 0]}

    def load_saved_geocodes(self):
        """Load previously geocoded places into ``self.saved_geocodes``."""
        try:
            saved_geocodes_db = self._fetch_document(
                "654ad709d4c2ceb2f5d5de00", "saved_geocodes_db"
            )
            del saved_geocodes_db['_id']
            self.saved_geocodes = saved_geocodes_db
        except Exception as err:
            print(f"Error loading geography class!\nError: {err}")
            raise Exception("Fatal Error in Class Construction!") from err

    def load_geographies(self):
        # static data, states, towns, orgs in entity output format to filter out of geocoding results
        self.load_state_entities()
        self.load_mass_town_entities()

    def load_state_entities(self):
        """Load state names into ``self.state_entities``."""
        try:
            self.state_entities = self._load_loc_entities(
                "654ae0edd4c2ceb2f5d5de33", "state_entities_db"
            )
        except Exception as err:
            print(f"Error loading geography class!\nError: {err}")
            raise Exception("Fatal Error in Class Construction!") from err

    def load_mass_town_entities(self):
        """Load town names into ``self.mass_town_entities``."""
        try:
            self.mass_town_entities = self._load_loc_entities(
                "654ae26ed4c2ceb2f5d5de34", "mass_town_entities_db"
            )
        except Exception as err:
            print(f"Error loading geography class!\nError: {err}")
            raise Exception("Fatal Error in Class Construction!") from err

    def load_org_entities(self):
        """Set ``self.org_entities`` to the fixed tuple of entities that
        ``clean_entity_results`` filters out of NER output."""
        self.org_entities = (('GBH News', 'ORG'), ('Boston Public Radio', 'ORG'),
                             ('Supreme Court', 'ORG'), ('New York Times', 'ORG'),
                             ('Washington Post', 'ORG'), ('CNN', 'ORG'),
                             ('NPR', 'ORG'), ('Associated', 'ORG'),
                             ('Press', 'ORG'), ('Senate', 'ORG'),
                             ('Associated Press', 'ORG'), ('AP', 'ORG'),
                             ('ABC News', 'ORG'),('CSS', 'ORG'),
                             ('Philadelphia Inquirer', 'ORG'), ('House', 'ORG'),
                             ('Congress', 'ORG'), ('Worcester', 'ORG'),
                             ('FBI', 'ORG'), ('Homeland Security Department', 'ORG'),
                             ('CDC', 'ORG'),('Fox News', 'ORG'),('The Washington Post', 'ORG'),
                             ('States', 'LOC'), ('S.', 'LOC'), ('Massachusetts', 'ORG'),
                             ('White House', 'ORG'), ('High School', 'ORG'),
                             ('MIT', 'ORG'), ('Harvard University', 'ORG'),
                             ('White House', 'LOC'),('Greater Boston', 'LOC'),
                             ('New England', 'LOC'))

In [8]:
def load_bert_NER():
    """Build the Hugging Face NER pipeline backed by ``dslim/bert-large-NER``.

    Returns:
        A ``pipeline("ner", ...)`` object using ``aggregation_strategy="max"``.
    """
    checkpoint = "dslim/bert-large-NER"
    ner_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    ner_model = AutoModelForTokenClassification.from_pretrained(checkpoint)
    return pipeline(
        "ner",
        model=ner_model,
        tokenizer=ner_tokenizer,
        aggregation_strategy="max",
    )

In [9]:
def load_bert_TOPIC():
    """Load the BERTopic model stored at ``./llm_models/BERTopic_CPU_M1``."""
    return BERTopic.load("./llm_models/BERTopic_CPU_M1")  # BERTopic model dir

In [10]:
def get_locations_bert(article_text, nlp):
    """
    Run the NER pipeline over an article and split the hits by entity group.
    https://huggingface.co/dslim/bert-base-NER

    input: article_text - the article as one string (aggregate of h1, h2, lede, and body)
           nlp - callable returning a sequence of dicts with 'word' and 'entity_group' keys
    returns: (locations, orgs) - a set of (NAME, 'LOC') tuples and a set of (NAME, 'ORG') tuples
    """
    entities = nlp(article_text)

    def collect(group):
        # unique (word, group) pairs for one entity group
        return {
            (entity['word'], entity['entity_group'])
            for entity in entities
            if entity['entity_group'] == group
        }

    return collect('LOC'), collect('ORG')

def clean_entity_results(extracted_loc, extracted_orgs, drop_geos):
    """Merge the LOC and ORG entity sets and filter out unwanted entries.

    An entity is discarded when its name is at most one character long, or when
    the (name, label) tuple appears in drop_geos.state_entities,
    drop_geos.mass_town_entities or drop_geos.org_entities (state and town
    names are too broad for the demographics data).

    Returns a new set; the input sets are left untouched.
    """
    candidates = extracted_loc | extracted_orgs
    return {
        entity
        for entity in candidates
        if len(entity[0]) > 1
        and entity not in drop_geos.state_entities
        and entity not in drop_geos.mass_town_entities
        and entity not in drop_geos.org_entities
    }

def remove_existing_geocodes(entity_result, saved_geocodes):
    """Split recognised entities into already-geocoded and still-unknown ones.

    Returns:
        (known, pending) - ``known`` maps entity name -> saved geocode for every
        entity whose name is a key of ``saved_geocodes``; ``pending`` is the set
        of entity tuples that still need geocoding.
    """
    known = {}
    pending = set()
    for entity in entity_result:
        name = entity[0]
        try:
            geocode = saved_geocodes[name]
        except KeyError:
            pending.add(entity)
        else:
            known[name] = geocode
    return known, pending

def get_snippet(sentences, num_sent, lede=True, remaining_text=False):
    """
    Join one window of the article's sentences into a single string and
    replace typographic apostrophes with plain single quotes.

    input: sentences - list of sentence strings
           num_sent - window size in sentences
           lede - when true (default) take the first num_sent sentences; takes precedence
           remaining_text - when lede is false and this is true, take everything from
                            sentence num_sent * 2 to the end
    returns: the selected sentences joined by spaces; with both flags false the window is
             sentences num_sent up to num_sent * 2
    """
    if lede:
        window = sentences[:num_sent]
    elif remaining_text:
        window = sentences[num_sent * 2:]
    else:
        window = sentences[num_sent:num_sent * 2]

    return " ".join(window).replace('’', "'")

def get_sentences(text):
    """Return the article text as a list of sentences.

    HTML tags are stripped first, then every period is followed by a space so
    nltk does a better job recognizing sentence boundaries.
    """
    plain_text = clean_article_text(text)
    spaced_text = ". ".join(plain_text.split("."))
    return nltk.sent_tokenize(spaced_text)

def clean_article_text(text):
    """Strip HTML markup from ``text`` and return only its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

In [11]:
def get_location_geocode(API_KEY, locations):
    """
    Look up coordinates for the location names found in an article.

    input: API_KEY - Google Maps Platform API key
           locations - iterable of entity tuples whose first item is the place name
    return: dictionary of place name -> {'lat': ..., 'lon': ...}; a place is only
            included when the top geocoding hit has an 'administrative_area_level_2'
            component named "Suffolk County" that is not the hit's first component
    """
    client = googlemaps.Client(key=API_KEY)
    geocoded = {}

    for place in locations:
        # The search is constrained to Massachusetts / US because the census
        # geocoder will not work for places outside of the U.S.
        # NOTE(review): Google's documented component filter key appears to be
        # `administrative_area`; confirm that `administrative_area_level` is honoured.
        hits = client.geocode(
            place[0] + ", Suffolk County",
            components={"administrative_area_level": "MA", "country": "US"},
        )
        try:
            top_hit = hits[0]  # IndexError here means nothing was found
            suffolk_components = [
                component
                for position, component in enumerate(top_hit['address_components'])
                if 'administrative_area_level_2' in component['types']
                and "Suffolk County" == component['short_name']
                and position != 0
            ]
            if suffolk_components:
                point = top_hit['geometry']['location']
                geocoded[place[0]] = {'lat': point['lat'], 'lon': point['lng']}
        except IndexError: # unable to get coordinates for location
            print("Unable to locate " + place[0])

    return geocoded

In [12]:
def get_census_geos(geocode_results):
    """
    Resolve coordinates to census geographies via the Census geocoder.

    input: geocode_results - dictionary of place name -> {'lat': ..., 'lon': ...}
    return: dictionary of place name -> {'block', 'blkgrp', 'tract', 'county'};
            places the geocoder returns no block record for are reported and left out
    """
    endpoint = 'https://geocoding.geo.census.gov/geocoder/geographies/coordinates?'
    query_tail = '&benchmark=4&vintage=4&layers=2020 Census Blocks&format=json'
    layer = '2020 Census Blocks'

    geographies = {}
    for place, coordinates in geocode_results.items():
        lon = coordinates['lon']
        lat = coordinates['lat']

        # query the geocoder for this point
        payload = requests.get(f'{endpoint}x={lon}&y={lat}{query_tail}').json()

        try:
            record = payload['result']['geographies'][layer][0]
            geographies[place] = {
                'block': record['BLOCK'],
                'blkgrp': record['BLKGRP'],
                'tract': record['TRACT'],
                'county': record['COUNTY'],
            }
        except IndexError:
            print("Unable to retrieve census geography for: " + place)
        except KeyError:
            print("Location is outside of the United States: " + place)
    return geographies

In [13]:
"""
# CENSUS DATA API 
# https://www.census.gov/content/dam/Census/library/publications/2020/acs/acs_api_handbook_2020_ch02.pdf
# any user can query small quantities of data with minimal restrictions - up to 50 variables in a single query, up to 500 queries per IP address per day 
# more than 500 queries per IP address per day requires you to register for API key - www.census.gov/developers
# https://www.census.gov/data/developers/data-sets/decennial-census.html 
"""
def get_census_demographics(year, dsource, dname, tract, county, state):
    """Query the Census Data API for one tract and return the decoded JSON.

    input: census year, data source, survey name, tract, county, state
    return: the parsed JSON response for the requested tract

    census variables: https://api.census.gov/data/2020/dec/pl/variables.html
    """
    variables = 'NAME,P2_001N,P2_002N,P2_003N,P2_004N,P2_005N,P2_006N,P2_007N,P2_008N,P2_009N,P2_010N'

    # tract-level query; a block-level query would add `for=block:` and `in=tract:`
    query_url = (
        f"https://api.census.gov/data/{year}/{dsource}/{dname}"
        f"?get={variables}&for=tract:{tract}&in=county:{county}&in=state:{state}"
    )

    return requests.get(query_url).json()

def run_entity_recognition(text, nlp, drop_geos, saved_geocodes):
    """Run NER on ``text`` and separate known geocodes from new entities.

    Returns:
        (existing_loc_geocode, new_loc_geocode) - a dict of already saved
        geocodes and a set of entities still to be geocoded. If a TypeError is
        raised along the way, it is reported and ``({}, set())`` is returned.
    """
    try:
        locations, organizations = get_locations_bert(text, nlp)
        cleaned = clean_entity_results(locations, organizations, drop_geos)
        known, pending = remove_existing_geocodes(cleaned, saved_geocodes)
    except TypeError as e:
        print(f"No entities: {e}")
        return {}, set()

    return known, pending

def run_location_geocode(API_KEY, new_loc_geocode):
    """Geocode the NEW locations and return them as a dictionary.

    Returns an empty dict without calling the geocoder when there is nothing
    to look up.
    """
    if not new_loc_geocode:
        return {}
    return get_location_geocode(API_KEY, new_loc_geocode)

def check_snippets(API_KEY, new_entities, existing_entities):
    """Geocode the new entities and report whether this snippet produced anything.

    Returns:
        (is_empty, location_geocode, existing_loc_geocode) - ``is_empty`` is True
        when neither the fresh nor the existing geocodes contain an entry, i.e.
        the caller should try the next snippet of text.
    """
    fresh_geocodes = run_location_geocode(API_KEY, new_entities)
    is_empty = not (fresh_geocodes | existing_entities)
    return is_empty, fresh_geocodes, existing_entities

In [14]:
def run_pipeline(year, dsource, dname, state, existing_loc_geocode, location_geocode, mappings):
    """Attach census geography and demographics to every geocoded place.

    input: year, dsource, dname, state - forwarded to get_census_demographics
           existing_loc_geocode, location_geocode - dictionaries of
               place name -> {'lat': ..., 'lon': ...}; they are merged before lookup
           mappings - object whose ``tract_mapping`` dict maps a tract geoid
               (state + county + tract) to a Boston neighborhood
    return: list with one ``{place_name: info}`` dictionary per place that could be
            resolved. ``info`` holds 'county_code', 'county_name', 'tract',
            'geoid_tract', 'demographics' and, when the tract is mapped, 'neighborhood'.

    Places whose demographics cannot be fetched or parsed are reported and skipped.
    """
    census_geos = get_census_geos(location_geocode | existing_loc_geocode)

    result = []
    for place_name, geo in census_geos.items():
        county = geo['county']
        tract = geo['tract']

        try:
            demographic_results = get_census_demographics(year, dsource, dname, tract, county, state)
            values = demographic_results[1]  # row 0 is presumably the header row; row 1 is read as data

            geoid_tract = state + county + tract # this includes the state and county and tract number

            # Build the entry in one go. Previously 'county_code' was assigned
            # and then immediately replaced by a second dict, so it never
            # reached the result.
            info = {
                'county_code': county,
                'county_name': values[0],
                'tract': tract,
                'geoid_tract': geoid_tract,
            }

            neighborhood = mappings.tract_mapping.get(geoid_tract) # corresponding boston neighborhood
            if neighborhood:
                info['neighborhood'] = neighborhood

            info['demographics'] = {
                'p2_001n': values[1], # total population
                'p2_002n': values[2], # total hispanic or latino
                'p2_003n': values[3], # total not hispanic or latino
                'p2_004n': values[4], # total not hispanic or latino - pop of one race
                'p2_005n': values[5], # total not hispanic or latino - pop of one race - white alone
                'p2_006n': values[6], # total not hispanic or latino - pop of one race - black or african american alone
                'p2_007n': values[7], # total not hispanic or latino - pop of one race - american indian and alaska native alone
                'p2_008n': values[8], # total not hispanic or latino - pop of one race - asian alone
                'p2_009n': values[9], # total not hispanic or latino - pop of one race - native hawaiian and other pacific islander alone
                'p2_010n': values[10] # total not hispanic or latino - pop of one race - some other race alone
            }

            result.append({place_name: info})
        except Exception as e:
            print(e)
            print("Unable to get census demographics for: " + place_name)

    return result

### Incoming CSV

Here we assume an incoming csv...

In [15]:
# Load the article export and keep only its first 12 columns; the last
# expression displays the resulting (rows, columns) shape.
df = pd.read_csv("./Articles Nov 2020 - March 2023.csv", low_memory=False)
df = df.iloc[:, : 12]
df.shape

(12795, 12)

In [16]:
df.columns

Index(['Type', 'Label', 'Headline', 'Byline', 'Section Navigation', 'Section',
       'Tagging', 'Title', 'Paths', 'Publish Date', 'Has Path?', 'Body'],
      dtype='object')

In [17]:
# Print every column name on its own line.
for x in df.columns:
    print(x)

Type
Label
Headline
Byline
Section Navigation
Section
Tagging
Title
Paths
Publish Date
Has Path?
Body


In [18]:
# Working copy of the articles without the 'Has Path?', 'Title' and
# 'Section Navigation' columns; `df` itself is left unchanged.
df_test = df.drop(columns=[
    'Has Path?',
    'Title',
    'Section Navigation',
])

In [19]:
list(df_test.columns)

['Type',
 'Label',
 'Headline',
 'Byline',
 'Section',
 'Tagging',
 'Paths',
 'Publish Date',
 'Body']

In [20]:
df_test

Unnamed: 0,Type,Label,Headline,Byline,Section,Tagging,Paths,Publish Date,Body
0,Article,A Celtic Playlist For Easter,A Celtic Playlist For Easter,Brian O'Donovan,Celtic,0000016a-3bcb-d661-af7b-7bff0d200001,/music/celtic/2019/04/20/a-celtic-playlist-for...,Mon Mar 29 15:22:35 EDT 2021,The above is a continuous stream. <br/><br/>I...
1,Article,Songs Of War And Remembrance: Memorial Day,Songs Of War And Remembrance: Memorial Day,Brian O'Donovan,Celtic,0000016a-f0d5-dbfd-a56f-f4dfc34c0001,/music/celtic/2019/05/25/songs-of-war-and-reme...,Sat May 29 01:00:45 EDT 2021,Click above for the audio of a special segment...
2,Article,Words And Music: Father's Day,Words And Music: Father's Day,Brian O'Donovan,Celtic,0000016b-539d-d757-adef-f7bd4a5f0001,/music/celtic/2019/06/13/words-and-music-fathe...,Fri Jun 18 01:00:28 EDT 2021,"In honor of Father&#39;s Day, this segment of ..."
3,Article,Celebrating The Birthday Of Robert Burns — Jan...,Celebrating The Birthday Of Robert Burns — Jan...,Brian O'Donovan,Celtic,0000016f-c48a-d14c-a57f-e59b02310001,/music/celtic/2020/01/21/celebrating-the-birth...,Sat Jan 23 08:47:27 EST 2021,Robert Burns is known as &quot;Scotland&#39;s ...
4,Article,Ten Celtic Love Songs For St. Valentine's Day,Ten Celtic Love Songs For St. Valentine's Day,Brian O'Donovan,Celtic,00000170-273a-d4d2-a378-677e83f60001,/celtic/ValentinesDay (Permalink),Wed Feb 10 12:09:13 EST 2021,"Folk music generally, and Celtic music, in par..."
...,...,...,...,...,...,...,...,...,...
12790,Article,5 key takeaways from the Trump indictment news,5 key takeaways from the Trump indictment news,"Emily Olson, Emma Bowman",National News,00000187-3753-da17-afc7-f77b1ea40001,/national-news/2023/03/31/5-key-takeaways-from...,Fri Mar 31 06:26:00 EDT 2023,Former president Donald Trump has been indicte...
12791,Article,Harvard professor says government should pause...,Harvard professor says government should pause...,Alexi Cohan,Science and Technology,00000187-37f6-d65f-a1f7-bffee1410001,/science-and-technology/2023/03/31/harvard-pro...,Fri Mar 31 11:06:14 EDT 2023,Artificial intelligence has advanced rapidly i...
12792,Article,Why Trump isn't the first president to face ar...,Why Trump isn't the first president to face ar...,"Dustin Jones, Kaitlyn Radde",National News,00000187-37f7-da17-afc7-f7ffec2a0001,/national-news/2023/03/31/why-trump-isnt-the-f...,Fri Mar 31 09:05:00 EDT 2023,Former President Donald Trump was indicted Thu...
12793,Article,These cockroaches tweaked their mating rituals...,These cockroaches tweaked their mating rituals...,Ari Daniel,News,00000187-382e-da17-afc7-f96fcf4c0001,/news/2023/03/31/these-cockroaches-tweaked-the...,Fri Mar 31 10:02:00 EDT 2023,Human attempts to kill cockroaches with sugary...


### Bootstrap the pipeline

Next we have to bootstrap the pipeline: make sure that the BERT models are loaded and running, and that all the variables the pipeline needs are good to go.

In [21]:
import sys
import time
import itertools
import threading

class Spinner:
    """Console spinner that animates on a background thread.

    Usage: ``s = Spinner("Working..."); s.start(); ...; s.stop()`` (or
    ``s.err()`` on failure). An instance can be started only once; create a
    new instance for each task.

    Args:
        message: text written once when the spinner starts.
        delay: seconds between animation frames.
    """

    def __init__(self, message, delay=0.1):
        self.spinner = itertools.cycle(['-', '/', '|', '\\'])
        self.delay = delay
        self.message = message
        self.stop_running = threading.Event()
        # Daemon thread: if the caller dies before stop()/err() is reached, the
        # spinner must not keep the process alive or keep writing forever.
        self.thread = threading.Thread(target=self.spin, daemon=True)

    def spin(self):
        """Animation loop; runs on the background thread until stopped."""
        while not self.stop_running.is_set():
            sys.stdout.write(next(self.spinner))  # write the next character
            sys.stdout.flush()                    # flush stdout buffer (actual character display)
            sys.stdout.write('\b')                # erase the last written char
            # Event.wait is an interruptible sleep, so stop() takes effect
            # immediately instead of after a full delay.
            self.stop_running.wait(self.delay)

    def start(self):
        """Write the message and start animating."""
        self.stop_running.clear()
        sys.stdout.write(self.message + ' ')
        sys.stdout.flush()
        self.thread.start()

    def _finish(self, final_message):
        """Stop the animation (if it is running) and write the closing text."""
        self.stop_running.set()
        # join() raises RuntimeError on a thread that was never started, so
        # only wait for a thread that is actually running.
        if self.thread.is_alive():
            self.thread.join()
        sys.stdout.write(final_message)
        sys.stdout.flush()

    def stop(self):
        """Stop the spinner and report success."""
        self._finish('✔️ OK\n')

    def err(self):
        """Stop the spinner and report failure."""
        self._finish('❌ Error!\n')

In [22]:
def fetch_llm_models():
    """Download the BERTopic model from the ``naacp-models`` bucket.

    The blob ``BERTopic_Models/BERTopic_CPU_M1`` is written to
    ``./llm_models/BERTopic_CPU_M1``.

    Raises:
        Exception: "Fatal Error in Fetching LLM models. Exiting..." on any failure.
    """
    try:
        model_blob = (
            storage.Client()
            .bucket("naacp-models")
            .blob("BERTopic_Models/BERTopic_CPU_M1")
        )
        model_blob.download_to_filename("./llm_models/BERTopic_CPU_M1")
    except Exception as e:
        print(f"Model fetching failed! {e}")
        raise Exception("Fatal Error in Fetching LLM models. Exiting...")

In [23]:
def load_heirarch_data(db):
    """Return the ``taxonomy_data`` field of the hierarchy document.

    The document is read from the ``Topic_Modeling_Pipeline_Data`` collection
    of ``db``.

    Raises:
        Exception: "Fatal Error in fetching hierarchal data!" when no database
            is given, the document is missing, or the lookup fails.
    """
    try:
        if db is None:
            raise Exception("No database was given!")

        heirarch_db = db['Topic_Modeling_Pipeline_Data'].find_one(
            {"_id": ObjectId("654c38cff9689d923f1c3e9c")}
        )

        if heirarch_db is None:
            # The old message evaluated len(None) and so raised a TypeError
            # instead of explaining that the document was not found.
            raise Exception("heirarch_db yielded None: document not found.")

        return heirarch_db["taxonomy_data"]
    except Exception as err:
        print(f"Error loading hierarchal data!\nError: {err}")
        raise Exception("Fatal Error in fetching hierarchal data!") from err

In [24]:
def bootstrap_pipeline():
    """Set up everything the pipeline needs. This should only run once.

    Steps: census query constants, MongoDB connection, hierarchy mappings,
    local model directory and model file (downloaded when missing), the
    geography / mapping / census helper classes, and the NER and topic models.

    Returns:
        (year, dsource, dname, state, drop_geos, mappings, census_base,
         heir_data, saved_geocodes, nlp_ner, nlp_topic, db)

    Raises:
        Exception: "Fatal Error in Bootstrapping ML Pipeline. Exiting..." when
            any step fails.
    """
    # Defined before the try block so the error handler can tell whether a
    # spinner exists yet. Otherwise a failure in the banner raises a NameError
    # inside the handler and hides the real error.
    spinner = None
    try:
        tprint("BU Spark!", font="sub-zero")
        print("Bootstrapping pipeline... (This should only run once!)")
        print()
        spinner = Spinner("Setting up variables...")
        spinner.start()
        year = '2020'
        dsource = 'dec' # which survey are we interested in ? decennial
        dname = 'pl' # a dataset within a survey, pl - redistricting data
        state = '25' # state code
        spinner.stop()
        print()

        spinner = Spinner("Connecting to MongoDB...")
        spinner.start()
        client = MongoClient(secret.MONGO_URI_NAACP)
        db = client['ML_Data']
        spinner.stop()
        print()

        spinner = Spinner("Fetching Heirarchal Mappings...")
        spinner.start()
        heir_data = load_heirarch_data(db)
        spinner.stop()
        print()

        llm_model_directory_path = "./llm_models"
        llm_model_file_path = "./llm_models/BERTopic_CPU_M1"

        # Each spinner now wraps the check it announces and is stopped before
        # anything else is printed, so output does not interleave with frames.
        spinner = Spinner("Detecting & Making local directories...")
        spinner.start()
        directory_exists = os.path.exists(llm_model_directory_path)
        spinner.stop()
        if not directory_exists:
            print(f"No {llm_model_directory_path}! Creating...")
            os.makedirs(llm_model_directory_path, exist_ok=True)
            spinner = Spinner("Fetching models...")
            spinner.start()
            fetch_llm_models()
            spinner.stop()
        else:
            print(f"Found! {llm_model_directory_path}!")
            spinner = Spinner("Validating model files...")
            spinner.start()
            model_file_exists = os.path.isfile(llm_model_file_path)
            spinner.stop()
            if not model_file_exists:
                spinner = Spinner("Model file not found! Pulling models...")
                spinner.start()
                fetch_llm_models()
                spinner.stop()

        print("Model files successfully validated.\n")
        spinner = Spinner("Instantiating classes...")
        spinner.start()
        drop_geos = geography(
            db=db
        )
        mappings = neighborhood_mapping(
            db=db
        )
        census_base = census_data(
            db=db
        )
        saved_geocodes = drop_geos.saved_geocodes
        spinner.stop()
        print()

        spinner = Spinner("Loading Models...")
        spinner.start()
        nlp_ner = load_bert_NER()
        nlp_topic = load_bert_TOPIC()
        spinner.stop()

        print("\nBootstrap complete!\n")
        return year, dsource, dname, state, drop_geos, mappings, census_base, heir_data, saved_geocodes, nlp_ner, nlp_topic, db
    except Exception as e:
        if spinner is not None:
            spinner.err()
        print(f"Bootstrap Failed!!!\nFatal Error:{e}")
        raise Exception("Fatal Error in Bootstrapping ML Pipeline. Exiting...") from e

In [25]:
def validate_bootstrap(year, dsource, dname, state, drop_geos, mappings, census_base, heir_data, saved_geocodes, nlp_ner, nlp_topic, db):
    """
    Checks that none of the values handed over by the bootstrap step is None.

    Parameters
    ----
    year, dsource, dname, state, drop_geos, mappings, census_base, heir_data,
    saved_geocodes, nlp_ner, nlp_topic, db: The values returned by
        bootstrap_pipeline(), in the same order.

    Returns
    ----
    None. Prints a confirmation message when every value is set.

    Raises
    ----
    Exception: If any value is None (the offending name is printed first).
    """
    # Built before the try block so the error handler can always reach it.
    spinner = Spinner("Validating Bootstrap variables...")
    try:
        spinner.start()

        variable_manifest = {
            "year": year,
            "dsource": dsource,
            "dname": dname,
            "state": state,
            "drop_geos": drop_geos,
            "mappings": mappings,
            "census_base": census_base,
            "heir_data": heir_data,
            "saved_geocodes": saved_geocodes,
            "nlp_ner": nlp_ner,
            "nlp_topic": nlp_topic,
            "db": db
        }

        for var, value in variable_manifest.items():
            # Identity check on purpose: `== None` calls the object's own __eq__,
            # which array-like objects answer element-wise and cannot be used in `if`.
            if value is None:
                raise Exception(f"{var} returned None!. Exiting...")
        spinner.stop()
        print()
        print("Validation Complete! Everything seems to be in order.")
    except Exception as e:
        spinner.err()
        print(f"Bootstrap Validation Failed!!!\nFatal Error:{e}")
        raise Exception("Fatal Error in Bootstrapping Validation. Exiting...") from e
        

# Entity Recognition

Ideally, we should only be concerned with articles in which a location is recognized. If no location is recognized, we discard the article.

I think we should append it to the end of the df to make it cleaner, since this needs to be passed to topic modeling.

In [26]:
# Bootstrap run: unpack everything bootstrap_pipeline() hands back, in its return order.
(
    year, dsource, dname, state,
    drop_geos, mappings, census_base, heir_data,
    saved_geocodes, nlp_ner, nlp_topic, db,
) = bootstrap_pipeline()

 ______     __  __        ______     ______   ______     ______     __  __    
/\  == \   /\ \/\ \      /\  ___\   /\  == \ /\  __ \   /\  == \   /\ \/ /    
\ \  __<   \ \ \_\ \     \ \___  \  \ \  _-/ \ \  __ \  \ \  __<   \ \  _"-.  
 \ \_____\  \ \_____\     \/\_____\  \ \_\    \ \_\ \_\  \ \_\ \_\  \ \_\ \_\ 
  \/_____/   \/_____/      \/_____/   \/_/     \/_/\/_/   \/_/ /_/   \/_/\/_/ 
                                                                              

Bootstrapping pipeline... (This should only run once!)

Setting up variables... ✔️ OK

Connecting to MongoDB... ✔️ OK

Fetching Heirarchal Mappings... ✔️ OK

Detecting & Making local directories... ✔️ OK
Found! ./llm_models!
Validating model files... ✔️ OK
Model files successfully validated.

Instantiating classes... ✔️ OK

Loading Models... \

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


✔️ OK

Bootstrap complete!



In [27]:
# Fail early if any bootstrap value came back as None.
validate_bootstrap(
    year=year,
    dsource=dsource,
    dname=dname,
    state=state,
    drop_geos=drop_geos,
    mappings=mappings,
    census_base=census_base,
    heir_data=heir_data,
    saved_geocodes=saved_geocodes,
    nlp_ner=nlp_ner,
    nlp_topic=nlp_topic,
    db=db,
)

Validating Bootstrap variables... ✔️ OK

Validation Complete! Everything seems to be in order.


### Single Stream Concurrent Processing

In [28]:
def process_data(chunk, df, data_schema, data_packaging_scheme, nlp_ner):
    """
    Runs entity recognition over a chunk of rows from the given dataset and
    packages every article for which at least one neighborhood and one census
    tract were found.

    Parameters
    ----
    chunk: The row labels to process in the given df.
    df: The pandas dataframe that entity recognition is being done on. The columns
        'Section', 'Type', 'Label', 'Body', 'Tagging', 'Byline', 'Headline',
        'Publish Date' and 'Paths' are read.
    data_schema: The container handed to data_packaging_scheme for every kept
        article. It is passed along as-is (not copied) and is what the returned
        dataframe is built from.
    data_packaging_scheme: Callable invoked as
        data_packaging_scheme(data_schema, <14 article fields>) for every kept article.
    nlp_ner: The NER model passed on to run_entity_recognition().

    Returns
    ----
    A pandas dataframe built from data_schema, or None when a fatal error
    prevented the dataframe from being built.

    Notes
    ----
    Besides its parameters this function reads the notebook-level names
    drop_geos, saved_geocodes, year, dsource, dname, state, mappings and
    secret.API_KEY.
    """
    ignore_article_types = ["National News", "International News", "Programs", "Digital Mural", "Jazz", "Celtic"]

    dataset_df = data_schema
    try:
        for idx in tqdm(chunk, desc='Processing Entity Recognition'):
            # Each article gets its own try so one bad row does not abort the chunk.
            try:
                if df['Section'][idx] in ignore_article_types or df['Type'][idx] != 'Article':
                    continue

                # Created per article: if an error interrupts an article half-way,
                # nothing it collected can leak into the next article's results.
                neighborhoods = set()
                census_tracts = set()

                headline = str(df['Label'][idx])
                text = str(df['Body'][idx])

                sentences = get_sentences(text)

                # get lede first 5 sentences, can change the number of sentences
                text_5 = get_snippet(sentences, 5)
                text_10 = get_snippet(sentences, 5, False) # get sentences 5-10
                text_remain = get_snippet(sentences, 5, False, True)

                # get entities, returns existing entities that have been seen before and new entities as sets
                check_order = [
                    (run_entity_recognition(headline, nlp_ner, drop_geos, saved_geocodes), "headline"),
                    (run_entity_recognition(text_5, nlp_ner, drop_geos, saved_geocodes), "first 5 sentences"),
                    (run_entity_recognition(text_10, nlp_ner, drop_geos, saved_geocodes), "next 5 sentences"),
                    (run_entity_recognition(text_remain, nlp_ner, drop_geos, saved_geocodes), "remaining text")
                ]

                # Walk the snippets in order and stop at the first one for which
                # check_snippets reports a falsy check_text; `method` then names it.
                for (entities, method) in check_order:
                    check_text, location_geocode, existing_loc_geocode = check_snippets(secret.API_KEY, entities[1], entities[0])
                    if not check_text:
                        break

                # No Census tracts we want is detected
                if (len(existing_loc_geocode) == 0 and len(location_geocode) == 0):
                    continue

                pipeline_output = run_pipeline(year, dsource, dname, state, existing_loc_geocode, location_geocode, mappings)

                if (pipeline_output):
                    for output in pipeline_output:
                        # Only the value stored under the entry's first key is inspected.
                        details = output[list(output.keys())[0]]
                        if ('neighborhood' in details and 'tract' in details):
                            neighborhoods.add(details['neighborhood'])
                            census_tracts.add(details['tract'])
                        else:
                            print("Skipped an entry!")

                # If we have valid entity recognition | We have both some neighborhoods and census tracts
                if (len(neighborhoods) != 0 and len(census_tracts) != 0):
                    data_packaging_scheme(
                        dataset_df, # This is the data scheme we are using
                        df['Tagging'][idx],
                        list(neighborhoods),
                        df['Section'][idx],
                        list(census_tracts),
                        df['Byline'][idx],
                        df['Body'][idx],
                        df['Tagging'][idx],
                        df['Label'][idx],
                        df['Headline'][idx],
                        df['Publish Date'][idx],
                        "GBH", # I hard coded this as we have one main client
                        df['Paths'][idx],
                        # No Open AI Labels yet
                        method,
                        existing_loc_geocode | location_geocode
                    )
            except Exception as e: # Loop inbounded error
                print(f"[Error] process_data() ran into an error! Continuing... \n[Raw Error]: {e}")
        ## Convert to Pandas dataframe...
        new_df = pd.DataFrame(dataset_df)
        return new_df
    except Exception as e:
        print(f"[Fatal Error] process_data() ran into an Error! Data is not saved!\nRaw Error:{e}")

    return


In [29]:
import inspect

# To run the pipeline, two things must be defined: the data schema and the data packing function.
# The schema is column-oriented: one (initially empty) list per output column, in output order.
data_schema = {
    column: []
    for column in (
        "id",
        "neighborhoods",
        "position_section",
        "tracts",
        "author",
        "body",
        "content_id",
        "hl1",
        "hl2",
        "pub_date",
        "pub_name",
        "link",
        "method",
        "ent_geocodes",
    )
}

# Data packing scheme, function attributes must be len(data_schema) + 1
# Ideally, the developer should provide a function to figure out how to pack the data based on their needs
def package_data_to_dict(
    data_schema, 
    id,
    neighborhoods,
    position_section,
    tracts,
    author,
    body,
    content_id,
    hl1,
    hl2,
    pub_date,
    pub_name,
    link,
    method,
    ent_geocodes
):
    """
    Appends one article to a column-oriented schema.

    Parameters
    ----
    data_schema: Mapping from column name to a list; it must hold an entry for
        every other parameter name of this function.
    id ... ent_geocodes: The article's values; each one is appended to the list
        stored under the column of the same name.

    Returns
    ----
    The same data_schema object. If a column is missing, a message is printed
    and data_schema is returned without any value having been appended.
    """
    row = {
        "id": id,
        "neighborhoods": neighborhoods,
        "position_section": position_section,
        "tracts": tracts,
        "author": author,
        "body": body,
        "content_id": content_id,
        "hl1": hl1,
        "hl2": hl2,
        "pub_date": pub_date,
        "pub_name": pub_name,
        "link": link,
        "method": method,
        "ent_geocodes": ent_geocodes,
    }

    # Resolve every column before touching any of them: appending to some columns
    # and then failing on a missing one would leave columns of unequal length.
    try:
        columns = {name: data_schema[name] for name in row}
    except KeyError as ke:
        print("Key not found in data schema!")
        print(f"Raw Error: {ke}")
        return data_schema

    for name, value in row.items():
        columns[name].append(value)

    return data_schema

#new_df = process_data(list(range(1500, 1550)), df, data_schema, package_data_to_dict, nlp_ner)
# Run entity recognition on rows 1500-1599 of the test frame.
new_df = process_data(
    chunk=list(range(1500, 1600)),
    df=df_test,
    data_schema=data_schema,
    data_packaging_scheme=package_data_to_dict,
    nlp_ner=nlp_ner,
)

Processing Entity Recognition:  22%|█████████▉                                   | 22/100 [00:35<02:33,  1.96s/it]

Expecting value: line 1 column 1 (char 0)
Unable to get census demographics for: New


Processing Entity Recognition:  39%|█████████████████▌                           | 39/100 [01:09<02:22,  2.33s/it]

Expecting value: line 1 column 1 (char 0)
Unable to get census demographics for: Department of Health


Processing Entity Recognition:  96%|███████████████████████████████████████████▏ | 96/100 [02:36<00:05,  1.45s/it]

Expecting value: line 1 column 1 (char 0)
Unable to get census demographics for: MIT Mechanical Engineering Department


Processing Entity Recognition: 100%|████████████████████████████████████████████| 100/100 [02:43<00:00,  1.63s/it]


In [30]:
# Inspect the frame returned by process_data() (None if it hit a fatal error).
new_df

Unnamed: 0,id,neighborhoods,position_section,tracts,author,body,content_id,hl1,hl2,pub_date,pub_name,link,method,ent_geocodes
0,00000176-f2c0-dd07-a17e-f6ecb91c0001,"[Fenway, Roxbury, Downtown]",Local News,"[080300, 010103, 030302]",Aidan Connelly,Today on <i>Boston Public Radio</i>:<br/><br/>...,00000176-f2c0-dd07-a17e-f6ecb91c0001,Boston Public Radio Full Show: 1/11/21,Boston Public Radio Full Show: 1/11/21,Mon Jan 11 16:20:19 EST 2021,GBH,/local-news/2021/01/11/boston-public-radio-ful...,remaining text,"{'Boston City Council': {'lat': 42.3603082, 'l..."
1,00000176-f30a-d4ee-a5fe-f74fc94e0001,[Downtown],Local News,[030302],Carrie Saldo,Staff and inmates inside county jails and stat...,00000176-f30a-d4ee-a5fe-f74fc94e0001,Next Up For The Vaccine In Massachusetts: Corr...,Next Up For The Vaccine In Massachusetts: Corr...,Tue Jan 12 09:26:22 EST 2021,GBH,/local-news/2021/01/12/next-up-for-the-vaccine...,next 5 sentences,{'Department of Public Health': {'lat': 42.358...
2,00000176-f371-dd07-a17e-f7fd93360001,[South Boston],Local News,[090901],"Arun Rath, Amanda Beland","Nearly 13,000 people have died from COVID-19 i...",00000176-f371-dd07-a17e-f7fd93360001,"Coronavirus Doesn't Choose Based On Age, Race ...","Coronavirus Doesn't Choose Based On Age, Race ...",Mon Jan 11 19:36:20 EST 2021,GBH,/local-news/2021/01/11/coronavirus-doesnt-choo...,first 5 sentences,"{'UMass': {'lat': 42.3141992, 'lon': -71.04199..."
3,00000176-f407-d79b-abfe-fdb7b3e70001,[Longwood],Local News,[081001],Greater Boston Staff,"Massachusetts averaged more than 6,000 new cor...",00000176-f407-d79b-abfe-fdb7b3e70001,Another Lockdown May Be Ahead Amid Massachuset...,Another Lockdown May Be Ahead Amid Massachuset...,Mon Jan 11 19:54:56 EST 2021,GBH,/local-news/2021/01/11/is-another-lockdown-ahe...,first 5 sentences,{'Brigham and Women ' s Hospital': {'lat': 42....
4,00000176-f42a-d79b-abfe-fdbe6aad0001,[Downtown],Local News,[030302],Mark Herz,A new more highly contagious strain of the cor...,00000176-f42a-d79b-abfe-fdbe6aad0001,New More Contagious Coronavirus Likely In Mass...,New More Contagious Coronavirus Likely In Mass...,Mon Jan 11 21:53:38 EST 2021,GBH,/local-news/2021/01/11/new-more-contagious-cor...,first 5 sentences,{'Massachusetts Department of Public Health': ...
5,00000176-f434-d4ee-a5fe-f477581d0001,[South Boston],Local News,[070104],Tori Bedford,UMass Memorial Health Care employee Therese Du...,00000176-f434-d4ee-a5fe-f477581d0001,"Yes, Anyone Can Be Fired For Taking Part In A ...","Yes, Anyone Can Be Fired For Taking Part In A ...",Mon Jan 11 20:58:40 EST 2021,GBH,/local-news/2021/01/11/yes-anyone-can-be-fired...,first 5 sentences,{'Lawyers for Civil Rights': {'lat': 42.357071...
6,00000176-f737-dd07-a17e-f7ff8d520001,[Downtown],Politics,[981700],Joe Mathieu,<i>There are several dozen bills sitting on Go...,00000176-f737-dd07-a17e-f7ff8d520001,Beacon Hill Waits For Gov. Baker On Landmark C...,Beacon Hill Waits For Gov. Baker On Landmark C...,Tue Jan 12 11:31:44 EST 2021,GBH,/politics/2021/01/12/beacon-hill-waits-for-gov...,headline,"{'Beacon Hill': {'lat': 42.3561948, 'lon': -71..."
7,00000176-f812-d1d4-a57f-fc5a5ac50001,[Downtown],Local News,[030302],Aidan Connelly,Today on <i>Boston Public Radio</i>:<br/><br/>...,00000176-f812-d1d4-a57f-fc5a5ac50001,Boston Public Radio Full Show: 1/12/21,Boston Public Radio Full Show: 1/12/21,Tue Jan 12 16:00:11 EST 2021,GBH,/local-news/2021/01/12/boston-public-radio-ful...,first 5 sentences,"{'NBC Sports Boston': {'lat': 42.3600825, 'lon..."
8,00000176-f814-dd07-a17e-fcfc8c5b0001,"[South Boston, Beacon Hill]",Local News,"[020301, 090901]",Gabrielle Emanuel,"When COVID first arrived in the U.S., Jodee Pi...",00000176-f814-dd07-a17e-fcfc8c5b0001,Can COVID Long-Haulers Access Disability Benef...,Can COVID Long-Haulers Access Disability Benef...,Wed Jan 13 13:16:56 EST 2021,GBH,/local-news/2021/01/13/can-covid-long-haulers-...,remaining text,"{'Mass General Hospital': {'lat': 42.3625677, ..."
9,00000176-f859-d1d4-a57f-fc592fbd0001,[South Boston],Education,[090901],Katie Lannan,A nine-year dean of the University of Massachu...,00000176-f859-d1d4-a57f-fc592fbd0001,Former Biz School Dean Picked To Lead UMass-Da...,Former Biz School Dean Picked To Lead UMass-Da...,Tue Jan 12 15:49:51 EST 2021,GBH,/education/2021/01/12/former-biz-school-dean-p...,first 5 sentences,"{'UMass': {'lat': 42.3141992, 'lon': -71.04199..."


# Topic Modeling

In [31]:
# Openai label json dir
# Read inside a context manager so the file handle is closed once the labels are loaded.
with open('./openai_label_from_taxonomy_structured_230.json') as f:
    openai_labels = json.load(f)

topic_df = nlp_topic.get_topic_info()

In [32]:
# `unseen_articles` is another name for the same frame object as `new_df` (no copy is made).
unseen_articles = new_df
unseen_articles

Unnamed: 0,id,neighborhoods,position_section,tracts,author,body,content_id,hl1,hl2,pub_date,pub_name,link,method,ent_geocodes
0,00000176-f2c0-dd07-a17e-f6ecb91c0001,"[Fenway, Roxbury, Downtown]",Local News,"[080300, 010103, 030302]",Aidan Connelly,Today on <i>Boston Public Radio</i>:<br/><br/>...,00000176-f2c0-dd07-a17e-f6ecb91c0001,Boston Public Radio Full Show: 1/11/21,Boston Public Radio Full Show: 1/11/21,Mon Jan 11 16:20:19 EST 2021,GBH,/local-news/2021/01/11/boston-public-radio-ful...,remaining text,"{'Boston City Council': {'lat': 42.3603082, 'l..."
1,00000176-f30a-d4ee-a5fe-f74fc94e0001,[Downtown],Local News,[030302],Carrie Saldo,Staff and inmates inside county jails and stat...,00000176-f30a-d4ee-a5fe-f74fc94e0001,Next Up For The Vaccine In Massachusetts: Corr...,Next Up For The Vaccine In Massachusetts: Corr...,Tue Jan 12 09:26:22 EST 2021,GBH,/local-news/2021/01/12/next-up-for-the-vaccine...,next 5 sentences,{'Department of Public Health': {'lat': 42.358...
2,00000176-f371-dd07-a17e-f7fd93360001,[South Boston],Local News,[090901],"Arun Rath, Amanda Beland","Nearly 13,000 people have died from COVID-19 i...",00000176-f371-dd07-a17e-f7fd93360001,"Coronavirus Doesn't Choose Based On Age, Race ...","Coronavirus Doesn't Choose Based On Age, Race ...",Mon Jan 11 19:36:20 EST 2021,GBH,/local-news/2021/01/11/coronavirus-doesnt-choo...,first 5 sentences,"{'UMass': {'lat': 42.3141992, 'lon': -71.04199..."
3,00000176-f407-d79b-abfe-fdb7b3e70001,[Longwood],Local News,[081001],Greater Boston Staff,"Massachusetts averaged more than 6,000 new cor...",00000176-f407-d79b-abfe-fdb7b3e70001,Another Lockdown May Be Ahead Amid Massachuset...,Another Lockdown May Be Ahead Amid Massachuset...,Mon Jan 11 19:54:56 EST 2021,GBH,/local-news/2021/01/11/is-another-lockdown-ahe...,first 5 sentences,{'Brigham and Women ' s Hospital': {'lat': 42....
4,00000176-f42a-d79b-abfe-fdbe6aad0001,[Downtown],Local News,[030302],Mark Herz,A new more highly contagious strain of the cor...,00000176-f42a-d79b-abfe-fdbe6aad0001,New More Contagious Coronavirus Likely In Mass...,New More Contagious Coronavirus Likely In Mass...,Mon Jan 11 21:53:38 EST 2021,GBH,/local-news/2021/01/11/new-more-contagious-cor...,first 5 sentences,{'Massachusetts Department of Public Health': ...
5,00000176-f434-d4ee-a5fe-f477581d0001,[South Boston],Local News,[070104],Tori Bedford,UMass Memorial Health Care employee Therese Du...,00000176-f434-d4ee-a5fe-f477581d0001,"Yes, Anyone Can Be Fired For Taking Part In A ...","Yes, Anyone Can Be Fired For Taking Part In A ...",Mon Jan 11 20:58:40 EST 2021,GBH,/local-news/2021/01/11/yes-anyone-can-be-fired...,first 5 sentences,{'Lawyers for Civil Rights': {'lat': 42.357071...
6,00000176-f737-dd07-a17e-f7ff8d520001,[Downtown],Politics,[981700],Joe Mathieu,<i>There are several dozen bills sitting on Go...,00000176-f737-dd07-a17e-f7ff8d520001,Beacon Hill Waits For Gov. Baker On Landmark C...,Beacon Hill Waits For Gov. Baker On Landmark C...,Tue Jan 12 11:31:44 EST 2021,GBH,/politics/2021/01/12/beacon-hill-waits-for-gov...,headline,"{'Beacon Hill': {'lat': 42.3561948, 'lon': -71..."
7,00000176-f812-d1d4-a57f-fc5a5ac50001,[Downtown],Local News,[030302],Aidan Connelly,Today on <i>Boston Public Radio</i>:<br/><br/>...,00000176-f812-d1d4-a57f-fc5a5ac50001,Boston Public Radio Full Show: 1/12/21,Boston Public Radio Full Show: 1/12/21,Tue Jan 12 16:00:11 EST 2021,GBH,/local-news/2021/01/12/boston-public-radio-ful...,first 5 sentences,"{'NBC Sports Boston': {'lat': 42.3600825, 'lon..."
8,00000176-f814-dd07-a17e-fcfc8c5b0001,"[South Boston, Beacon Hill]",Local News,"[020301, 090901]",Gabrielle Emanuel,"When COVID first arrived in the U.S., Jodee Pi...",00000176-f814-dd07-a17e-fcfc8c5b0001,Can COVID Long-Haulers Access Disability Benef...,Can COVID Long-Haulers Access Disability Benef...,Wed Jan 13 13:16:56 EST 2021,GBH,/local-news/2021/01/13/can-covid-long-haulers-...,remaining text,"{'Mass General Hospital': {'lat': 42.3625677, ..."
9,00000176-f859-d1d4-a57f-fc592fbd0001,[South Boston],Education,[090901],Katie Lannan,A nine-year dean of the University of Massachu...,00000176-f859-d1d4-a57f-fc592fbd0001,Former Biz School Dean Picked To Lead UMass-Da...,Former Biz School Dean Picked To Lead UMass-Da...,Tue Jan 12 15:49:51 EST 2021,GBH,/education/2021/01/12/former-biz-school-dean-p...,first 5 sentences,"{'UMass': {'lat': 42.3141992, 'lon': -71.04199..."


In [33]:
# Keep only rows that carry a content_id, then ask the topic model for one topic per article body.
unseen_articles = unseen_articles.dropna(subset=['content_id'])
topics, probs = nlp_topic.transform(unseen_articles['body'])
unseen_articles = unseen_articles.assign(bertopic_topic_label=topics)

In [34]:
# add open ai label to bglobe dataframe in new column
# Walk the column's values rather than looking rows up by 0..n-1: the frame's index
# can have gaps after rows are dropped, and an integer lookup on the column goes by
# index label, not by position. Topic -1 gets an empty label.
unseen_label_name = [
    openai_labels[topic_label]['openai'] if int(topic_label) != -1 else ""
    for topic_label in unseen_articles['bertopic_topic_label']
]
unseen_articles['openai_label'] = unseen_label_name

This will be the raw data that is saved in the ML database for analysis by the data science team. Naturally, we will strip this data down so that the back-end can consume it.

### Some User ID Stuff & Upload ID hashing

In [35]:
# The user ID and the upload ID are fixed up front, before any row is tagged.
import uuid

userID = "1"

# A random (version 4) UUID, kept as text, identifies this upload.
unique_id = str(uuid.uuid4())

print(unique_id)

1f879dcf-c256-40b5-bcf0-752e55920364


In [36]:
# Stamp every row with the same user ID and upload ID.
unseen_articles = unseen_articles.assign(userID=userID, uploadID=unique_id)

In [37]:
# Preview the first two rows, including the columns added in the cells above.
unseen_articles.head(2)

Unnamed: 0,id,neighborhoods,position_section,tracts,author,body,content_id,hl1,hl2,pub_date,pub_name,link,method,ent_geocodes,bertopic_topic_label,openai_label,userID,uploadID
0,00000176-f2c0-dd07-a17e-f6ecb91c0001,"[Fenway, Roxbury, Downtown]",Local News,"[080300, 010103, 030302]",Aidan Connelly,Today on <i>Boston Public Radio</i>:<br/><br/>...,00000176-f2c0-dd07-a17e-f6ecb91c0001,Boston Public Radio Full Show: 1/11/21,Boston Public Radio Full Show: 1/11/21,Mon Jan 11 16:20:19 EST 2021,GBH,/local-news/2021/01/11/boston-public-radio-ful...,remaining text,"{'Boston City Council': {'lat': 42.3603082, 'l...",-1,,1,1f879dcf-c256-40b5-bcf0-752e55920364
1,00000176-f30a-d4ee-a5fe-f74fc94e0001,[Downtown],Local News,[030302],Carrie Saldo,Staff and inmates inside county jails and stat...,00000176-f30a-d4ee-a5fe-f74fc94e0001,Next Up For The Vaccine In Massachusetts: Corr...,Next Up For The Vaccine In Massachusetts: Corr...,Tue Jan 12 09:26:22 EST 2021,GBH,/local-news/2021/01/12/next-up-for-the-vaccine...,next 5 sentences,{'Department of Public Health': {'lat': 42.358...,-1,,1,1f879dcf-c256-40b5-bcf0-752e55920364


### Format for GraphQL Database

In [38]:
# Strip the three columns that are not part of the packaged frame.
packaged_data_df = unseen_articles.drop(
    ['method', 'ent_geocodes', 'bertopic_topic_label'],
    axis='columns',
)

### Force Reconnection to MongoDB

In [58]:
def connectMongoDB():
    """
    Opens a MongoDB client from secret.MONGO_URI_NAACP and returns its 'ML_Data' database.

    Returns
    ----
    The 'ML_Data' database handle.

    Raises
    ----
    Exception: If the client cannot be created or the server does not answer a
        ping. The underlying error is attached as the cause.
    """
    try:
        client = MongoClient(secret.MONGO_URI_NAACP)
        # MongoClient connects lazily; a cheap ping makes a dead connection fail
        # here instead of at the first real query.
        client.admin.command('ping')
        db = client['ML_Data']
        print("Reconnected to MongoDB!")
        return db
    except Exception as err:
        raise Exception("Fatal Error! Failed to Connect to MongoDB. [No retry implemented] [Data Saved Data Warehouse]") from err
    

In [59]:
# Rebinds the notebook-level `db` to the handle returned by connectMongoDB().
db = connectMongoDB()

Reconnected to MongoDB!


In [39]:
#packaged_data_df

## Conversion to Articles

In [None]:
# Name of the MongoDB collection used for the article documents in the cells below.
articles_collection_name = "articles_data"

In [None]:
# Promote 'id' to the index. Guarded so the cell can be re-run: once 'id' is the
# index it is no longer a column, and calling set_index('id') again would raise a KeyError.
if 'id' in packaged_data_df.columns:
    packaged_data_df = packaged_data_df.set_index('id')

In [None]:
# Transposing turns each row into a column, so to_dict('dict') yields one
# {column: value} dict per row, keyed by that row's index label.
article_dict = packaged_data_df.T.to_dict('dict')

In [None]:
def string_to_list(s):
    """
    Wraps a value in a single-element list.

    Returns [s] for anything other than the empty string, and [] for ''.
    """
    return [s] if s != '' else []

In [None]:
# Build the list of article documents; each article's openai_label becomes a list
# (empty when the label is missing or is the empty string).
article_payload = []

for article_key, article in article_dict.items():
    label_as_list = string_to_list(article["openai_label"]) if 'openai_label' in article else []
    article["openai_label"] = label_as_list
    article_payload.append(article)

In [None]:
# Number of article documents about to be inserted.
len(article_payload)

In [None]:
# Check for existence of collection
# The collection is only created (and the message printed) when its name is
# not already among the database's collection names.
collection_list = db.list_collection_names()
if articles_collection_name not in collection_list:
    db.create_collection(articles_collection_name)
    print(f"Collection '{articles_collection_name}' created.")

In [None]:
# insert_many() rejects an empty document list, so skip the insert when
# entity recognition produced no articles.
if article_payload:
    insertion_result = db[articles_collection_name].insert_many(article_payload)

    print("Documents have been successfully inserted with the following IDs:")
    print(insertion_result.inserted_ids)
else:
    insertion_result = None
    print("No articles to insert.")

## Conversion to Neighborhood 

In [None]:
# Collection name and handle used by the neighborhood cells below.
neigh_collection_name = "neighborhood_data"
neigh_collection = db[neigh_collection_name]

In [None]:
# Check for existence of collection
# The collection is only created (and the message printed) when its name is
# not already among the database's collection names.
collection_list = db.list_collection_names()
if neigh_collection_name not in collection_list:
    db.create_collection(neigh_collection_name)
    print(f"Collection '{neigh_collection_name}' created.")

In [None]:
# Both arrays come from the same frame, so position i in one belongs to
# position i in the other; walk them in lockstep.
neighborhood_list = packaged_data_df['neighborhoods'].to_numpy()
tagging_list = packaged_data_df['content_id'].to_numpy()

for neigh_list, article_id in zip(neighborhood_list, tagging_list):
    for neigh in neigh_list:
        # Add the article to the neighborhood's set of articles.
        neigh_collection.find_one_and_update(
            {'value': neigh},
            {'$addToSet': {'articles': article_id}},
            upsert=True,  # a missing neighborhood document is created on the fly
        )

## Conversion to Topics 

In [None]:
# Collection name and handle used by the topic cells below.
topics_collection_name = "topics_data"
topic_collection = db[topics_collection_name]

In [None]:
# Check for existence of collection
# The collection is only created (and the message printed) when its name is
# not already among the database's collection names.
collection_list = db.list_collection_names()

if topics_collection_name not in collection_list:
    db.create_collection(topics_collection_name)
    print(f"Collection '{topics_collection_name}' created.")

In [None]:
# Both arrays come from the same frame, so position i in one belongs to
# position i in the other; walk them in lockstep.
# Note: the value used as the "topic" here is the article's position_section.
topics_list = packaged_data_df['position_section'].to_numpy()
tagging_list = packaged_data_df['content_id'].to_numpy()

for topic, article_id in zip(topics_list, tagging_list):
    # Add the article to the topic's set of articles.
    topic_collection.find_one_and_update(
        {'value': topic},
        {'$addToSet': {'articles': article_id}},
        upsert=True,  # a missing topic document is created on the fly
    )

## Conversion to Tracts

In [63]:
def addExistingTracts(tract_collection):
    """
    Inserts every entry of census_base.old_census_tracts['tracts_filter'] into
    the given collection with a single insert_many() call.

    Reads the notebook-level `census_base`; returns None.
    """
    known_tracts = census_base.old_census_tracts['tracts_filter']
    tract_collection.insert_many(list(known_tracts.values()))

In [52]:
# Collection name and handle used by the tract cells below.
tract_collection_name = "tracts_data"
tract_collection = db[tract_collection_name]

In [62]:
# Check for existence of collection
# The tracts from the old census data are only inserted in the branch that
# creates the collection, i.e. when its name was not listed yet.
collection_list = db.list_collection_names()

if tract_collection_name not in collection_list:
    db.create_collection(tract_collection_name)
    print(f"Collection '{tract_collection_name}' created.")
    # We add existing tracts from the old census tracts
    addExistingTracts(tract_collection)

In [66]:
# Both arrays come from the same frame, so position i in one belongs to
# position i in the other; walk them in lockstep.
tracts_lists = packaged_data_df['tracts'].to_numpy()
tagging_list = packaged_data_df['content_id'].to_numpy()

for tract_list, article_id in zip(tracts_lists, tagging_list):
    for tract in tract_list:
        # Here we update the tags/articles by tracts
        if tract_collection.find_one({'tract': tract}):
            tract_collection.find_one_and_update(
                {'tract': tract},
                {'$addToSet': {'articles': article_id}},
            )
        else: # We didn't find one and we have to label it as unknown
            unknown_tract = {
                'tract': tract,
                'neighborhood': "unknown",
                # Record the article that introduced this tract; starting with an
                # empty list would leave that article unlinked from the tract.
                'articles': [article_id]
            }
            tract_collection.insert_one(unknown_tract)