In [None]:
import re
import json
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
tqdm.pandas()

## Llama 7B

In [None]:
from langchain.chains import LLMChain

# llm
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp

# Prompt
from langchain.chains.prompt_selector import ConditionalPromptSelector
from langchain.prompts import PromptTemplate

# Parser
from langchain_core.output_parsers import StrOutputParser

In [None]:
# Prompt for the Llama chat model. It is filled with the article's `headline`
# and `body` and asks for (1) a Y/N on whether the article concerns a region of
# Boston, (2) the specific location, and (3) the locations/organizations found
# in the article that drove the decision.
# NOTE(review): the reference Llama-2 chat layout nests the <<SYS>> block inside
# the first [INST] ... [/INST]; here it precedes [INST] — confirm this is intended.
prompt = PromptTemplate(
    input_variables=["headline", "body"],
    template="""<<SYS>> \n You are an assistant tasked in geo-locating \
this news article. \n <</SYS>> \n\n [INST] Generate a SHORT response \
of where you think this article is talking about. BE SPECIFIC AS POSSIBLE. IT IS IMPERATIVE THAT YOU HIGHLIGHT THE MOST SPECIFIC LOCATION. Give your response in the following format: \
1.Y/N indicating whether the article is talking about a region of Boston. \n 2.The specific location within the city you got if you got Y in the first question. \
3. The involved specific locations or organizations EXPLICITLY FOUND WITHIN THE ARTICLE that influenced your decision. \
If you do not know, PLEASE GIVE THE BEST GUESS AS POSSIBLE. \n\n
Headline: \n\n {headline} \n\n Body: \n\n {body} \n\n [/INST]""",
)

# prompt = PromptTemplate(
#     input_variables=["headline"],
#     template="""<<SYS>> \n You are an assistant tasked in geo-locating \
# this news article. \n <</SYS>> \n\n [INST] Generate a SHORT response \
# of where you think this article is talking about. BE SPECIFIC AND CONCISE AS POSSIBLE. IT IS IMPERATIVE THAT YOU HIGHLIGHT THE MOST SPECIFIC LOCATION. PLEASE CONSIDER THE CONTEXT OF THE ARTICLE. Give your response in the following format: \
# 1. A very brief summary of what the article is talking about. \n 2.The specific location you chose based on the context of the article. \
# If you do not know, PLEASE GIVE THE BEST GUESS AS POSSIBLE. \n\n
# Headline: \n\n {headline} \n\n [/INST]""",
# )

# prompt = PromptTemplate(
#     input_variables=["headline"],
#     template="""<<SYS>> \n You are an assistant tasked in geo-locating \
# this news article. \n <</SYS>> \n\n [INST] Generate a SHORT response \
# of where you think this article is talking about. BE SPECIFIC AND CONCISE AS POSSIBLE. IT IS IMPERATIVE THAT YOU HIGHLIGHT THE MOST SPECIFIC LOCATION. Give your response in the following format: \
# 1.Y/N indicating whether the article is talking about a region of Boston \n 2.The specific location within the city you got if you got Y in the first question. \
# If you do not know, PLEASE GIVE THE BEST GUESS AS POSSIBLE. PLEASE KEEP YOUR ANSWER SHORT. \n\n
# Headline: \n\n {headline} \n\n Body: \n\n {body}  \n\n [/INST]""",
# )

In [None]:
llama_model_path = "./models/llama_7B/llama-2-7b-chat.Q4_K_M.gguf"

In [None]:
# Load the local model file at `llama_model_path` through LangChain's LlamaCpp wrapper.
# The callback manager is given a StreamingStdOutCallbackHandler — presumably so
# generated tokens are echoed to stdout while the model runs (TODO confirm).
llm = LlamaCpp(
    model_path=llama_model_path,
    n_gpu_layers=1,
    n_batch=1024,
    n_ctx=2048,
    f16_kv=True,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=True,
)
# Final stage of the chain built in the next cell (prompt | llm | output_parser).
output_parser = StrOutputParser()

In [None]:
chain = prompt | llm | output_parser

## NER Model

In [None]:
import spacy
from span_marker import SpanMarkerModel

In [None]:
# Load the spacy model with the span_marker pipeline component
nlp = spacy.load("en_core_web_sm", exclude=["ner"])
nlp.add_pipe("span_marker", config={"model": "tomaarsen/span-marker-roberta-large-ontonotes5"})

## Google Maps

In [None]:
import ast
import requests
import googlemaps
from mapbox import Geocoder

In [None]:
# Read the Google Maps API key from the GOOGLE_MAPS_API_KEY environment
# variable so that no credential is stored in the notebook. The placeholder is
# kept only as a fallback value; do not paste a real key here.
import os

gmap_client_key = os.environ.get("GOOGLE_MAPS_API_KEY", "YOUR_KEY_HERE")
gmaps = googlemaps.Client(key=gmap_client_key)

## Pipeline Entry Point

In [None]:
sample_data_dir = "./sample_data/se_naacp_db.articles_data.csv"

In [None]:
raw_df = pd.read_csv(sample_data_dir)

In [None]:
raw_df.columns

The ML model only needs the `_id`, `hl1` (headline), and `body` columns.

In [None]:
# Keep only the identifier, headline and body columns for the pipeline.
df = raw_df[['_id', 'hl1', 'body']].copy()

In [None]:
# For Testing Purposes Only
# df = df[:20]

In [None]:
df["llama_prediction"] = None # Add the llama_prediction

Remove Duplicates (if any)

In [None]:
duplicates = df.duplicated(subset=['hl1'])

In [None]:
print(duplicates.value_counts())

In [None]:
df = df.drop_duplicates(subset=['hl1'])

Clean the HTML in the body and header -> Regex Cleaner

In [None]:
def func_clean_html(x):
    """Strip HTML markup from ``x`` and return the remaining text.

    Parameters
    ----------
    x : str or None or float
        Raw cell value. ``None`` and NaN (what pandas uses for an empty CSV
        cell) are treated as "no text".

    Returns
    -------
    str
        The text content produced by BeautifulSoup's ``html.parser``, or an
        empty string when the value is missing.
    """
    # NaN is the only float that differs from itself; guard it (and None) so a
    # single empty cell does not abort the whole progress_apply.
    if x is None or (isinstance(x, float) and x != x):
        return ""
    return BeautifulSoup(x, "html.parser").get_text()
df['body'] = df['body'].progress_apply(func_clean_html)
df['hl1'] = df['hl1'].progress_apply(func_clean_html)

In [None]:
def func_clean_regex(x):
    """Keep runs of letters, digits and ``!@#$%^&*().`` that are longer than one
    character and join them with single spaces."""
    kept = []
    for chunk in re.findall(r'[A-Za-z0-9!@#$%^&*().]+', x):
        # Single-character fragments are discarded
        if len(chunk) > 1:
            kept.append(chunk)
    return ' '.join(kept)
df['body'] = df['body'].progress_apply(func_clean_regex)
df['hl1'] = df['hl1'].progress_apply(func_clean_regex)

### Explicit Article Mentions

Load the well-known locations, organizations, and neighborhoods dictionary

In [None]:
# Parsed known-location dictionaries, keyed by file path, so the JSON file is
# read once per path instead of once per headline. Re-running this cell clears it.
_KNOWN_LOCS_CACHE = {}


def explicit_filtering(header, known_locs_path="./geodata/known_locs.json"):
    """Look for a known location name inside a headline.

    Parameters
    ----------
    header : str
        Headline text. Non-string values (e.g. None/NaN) yield ``None``.
    known_locs_path : str, optional
        JSON file mapping a location key to its stored value. Defaults to the
        path the notebook has always used.

    Returns
    -------
    list or None
        ``[key, value]`` for the first key (in file order) that occurs as a
        substring of the lower-cased headline, otherwise ``None``.
    """
    if not isinstance(header, str):
        return None

    known_locs_dict = _KNOWN_LOCS_CACHE.get(known_locs_path)
    if known_locs_dict is None:
        with open(known_locs_path, 'r') as file:
            known_locs_dict = json.load(file)
        _KNOWN_LOCS_CACHE[known_locs_path] = known_locs_dict

    # Keys are compared as-is against the lower-cased headline, so they are
    # presumably stored in lower case — verify against known_locs.json.
    lowercase_header = header.lower()
    for key, value in known_locs_dict.items():
        if key in lowercase_header:
            return [key, value]
    return None

In [None]:
df["Explicit_Pass_1"] = df["hl1"].progress_apply(explicit_filtering)

In [None]:
df

### NER Code First Pass

In [None]:
def predict_NER(x):
    """Run the ``nlp`` pipeline on ``x`` and list the entities it finds.

    Parameters
    ----------
    x : str or None
        Text to analyse.

    Returns
    -------
    list of tuple or None
        One ``(entity, entity.label_)`` pair per item of ``nlp(x).ents``, or
        ``None`` when ``x`` is None, empty, or not a string (e.g. NaN).
    """
    if not isinstance(x, str) or x == "":
        return None
    return [(entity, entity.label_) for entity in nlp(x).ents]


def predict_NER_def(x):
    """Best-effort wrapper around :func:`predict_NER`.

    Any exception raised while running the pipeline is printed and turned into
    ``None`` so that one bad row does not stop a DataFrame-wide apply.
    """
    try:
        return predict_NER(x)
    except Exception as e:
        print(e)
        return None

In [None]:
def explicit_filtering_NER(col):
    """Run NER on a row's body unless its headline already matched a known location.

    Parameters
    ----------
    col : mapping (DataFrame row)
        Must provide ``'Explicit_Pass_1'``, ``'hl1'`` and ``'body'``.

    Returns
    -------
    list or None
        The result of ``predict_NER_def(col['body'])``, or ``None`` when the
        row was already resolved by the explicit pass or an error occurred.
    """
    try:
        if col['Explicit_Pass_1'] is not None:  # We already found an explicit mention in the title
            print(f"Passed on {col['hl1']}")
            return None
        return predict_NER_def(col['body'])
    except Exception as e:
        # Best effort: report the problem and leave the row unresolved
        print(e)
        return None

In [None]:
df['NER_Pass_1'] = df.progress_apply(explicit_filtering_NER, axis=1)

Filter for highly specific place types such as 'FAC' (facilities).

In [None]:
def filter_loc_explicit(x):
    """Keep location-like entities and order them from most to least specific.

    Parameters
    ----------
    x : list of (entity, label) or None
        Output of the NER pass. ``entity`` may be a plain string or any object
        whose ``str()`` is the entity text.

    Returns
    -------
    list of (str, str) or None
        ``(name, label)`` pairs whose label is exactly FAC, ORG, LOC or GPE,
        sorted in that priority order (ties keep their input order). GPE
        entities whose name contains "Boston" or "Massachusetts" are dropped.
        ``None`` is returned when ``x`` is ``None``.
    """
    if x is None:
        return None

    priority = {'FAC': 1, 'ORG': 2, 'LOC': 3, 'GPE': 4}
    res = []
    for tup in x:
        if len(tup) < 2:
            continue
        # Work on the text: the substring checks below need a str, not the
        # entity object returned by the pipeline.
        name = str(tup[0])
        label = str(tup[1]).strip()
        # Exact match keeps every kept label a valid key for the sort below
        if label not in priority:
            continue
        # City/state-level mentions are too coarse to be useful
        if label == 'GPE' and ('Boston' in name or 'Massachusetts' in name):
            continue
        res.append((name, label))

    return sorted(res, key=lambda item: priority[item[1]])

In [None]:
df['NER_Pass_1_Sorted'] = df['NER_Pass_1'].progress_apply(filter_loc_explicit)

Then we get the coordinates through the first pass

In [None]:
def getLongLatsForFAC(x):
    """Geocode the top-ranked entity, but only when it is a facility ('FAC').

    The previous condition used ``or``, which made the second 'FAC' branch
    unreachable and also geocoded non-FAC entities whose name contained
    "Boston". The branches now follow their own comments.

    Parameters
    ----------
    x : list of (location, label) or None
        Sorted entity list; only the first element is used.

    Returns
    -------
    list or None
        ``[longitude, latitude]`` of the first geocoding hit, or ``None`` when
        there is no entity, the first entity is not 'FAC', or nothing was found.
    """
    if x is None or len(x) == 0:
        return None

    location = str(x[0][0])  # (Location, Label)
    label = x[0][1]
    if label != "FAC":
        return None  # Doesn't have 'FAC' Label

    # Boston + 'FAC' -> search within Boston; any other 'FAC' -> search state-wide
    region = "Boston" if "Boston" in location else "Massachusetts"
    response = gmaps.geocode(f"{location}, {region}")

    if len(response) == 0:
        return None
    point = response[0]['geometry']['location']
    return [point['lng'], point['lat']]

In [None]:
df['NER_Pass_1_Coordinates'] = df['NER_Pass_1_Sorted'].progress_apply(getLongLatsForFAC)

In [None]:
df

### Llama Prediction

In [None]:
def predict_llama(col):
    """Ask the Llama chain to locate a row that earlier passes could not resolve.

    Parameters
    ----------
    col : mapping (DataFrame row)
        Must provide ``'Explicit_Pass_1'``, ``'NER_Pass_1_Coordinates'``,
        ``'hl1'`` and ``'body'``.

    Returns
    -------
    str or None
        Whatever ``chain.invoke`` returns for the headline/body, or ``None``
        when the row was already resolved or the call failed.
    """
    try:
        already_located = (
            col['Explicit_Pass_1'] is not None
            or col['NER_Pass_1_Coordinates'] is not None
        )
        if already_located:  # We already found an explicit mention in the previous passes
            print(f"Passed on {col['hl1']}")
            return None
        return chain.invoke({"headline": col['hl1'], "body": col['body']})
    except Exception as e:
        # Best effort: report the failure and leave the prediction empty
        print(e)
        return None

In [None]:
df['llama_prediction'] = df.progress_apply(predict_llama, axis=1)

In [None]:
df

We then apply NER on the llama outputs

In [None]:
df['NER_prediction'] = df['llama_prediction'].progress_apply(predict_NER_def)

Sort the NER Predictions

In [None]:
def remove_first_comma(x):
    """Drop a leading comma together with the character right after it.

    Input that does not start with a comma is returned unchanged.
    """
    starts_with_comma = x[0:1] == ","
    return x[2:] if starts_with_comma else x

def format_NER(x):
    """Turn NER output into a list of ``"name, LABEL"`` strings.

    A list of ``(entity, label)`` pairs is formatted directly, so entity names
    keep their apostrophes, brackets and parentheses. Any other non-None value
    goes through the original text-based parsing of its ``str()`` form (useful
    for values that were already stringified, e.g. read back from a CSV).

    Parameters
    ----------
    x : list of (entity, label), str, or None

    Returns
    -------
    list of str
        One ``"name, LABEL"`` string per entity; an empty list for ``None``.
    """
    if x is None:
        return []

    if isinstance(x, (list, tuple)):
        res = []
        for item in x:
            if isinstance(item, (list, tuple)) and len(item) >= 2:
                res.append(f"{str(item[0]).strip()}, {str(item[1]).strip()}")
        return res

    # Legacy path: strip the list/tuple punctuation from the repr and split on ")"
    flattened = str(x).replace("(", "").replace("[", "").replace("]", "").replace("'", "")
    res = []
    for word in flattened.split(")"):
        if word[:1] == ",":
            word = word[2:]  # drop the ", " that separated two tuples
        res.append(word.strip())
    return res

def filter_loc(x):
    """Parse ``"name, LABEL"`` strings and keep location-like entities by priority.

    The label is taken from after the LAST comma, so names that themselves
    contain commas (e.g. "Cambridge, MA") are kept intact. Only the exact
    labels FAC, ORG, LOC and GPE are accepted, which also guarantees that the
    priority lookup used for sorting cannot fail.

    Parameters
    ----------
    x : list of str
        Strings such as ``"Fenway Park, FAC"``; entries without a comma are
        ignored.

    Returns
    -------
    list of (str, str)
        ``(name, label)`` pairs sorted FAC, ORG, LOC, GPE (ties keep input
        order). GPE names containing "Boston" or "Massachusetts" are dropped.
    """
    if not x:
        return []

    priority = {'FAC': 1, 'ORG': 2, 'LOC': 3, 'GPE': 4}
    res = []
    for tup in x:
        name, separator, label = tup.strip().rpartition(",")
        if not separator:
            continue  # no "name, LABEL" structure
        name = name.strip()
        label = label.strip()
        if label not in priority:
            continue
        # City/state-level mentions are too coarse to be useful
        if label == 'GPE' and ('Boston' in name or 'Massachusetts' in name):
            continue
        res.append((name, label))

    return sorted(res, key=lambda item: priority[item[1]])

In [None]:
df['NER_Sorted'] = df['NER_prediction'].progress_apply(format_NER)
df['NER_Sorted'] = df['NER_Sorted'].progress_apply(filter_loc)

## Geocoding -> Pass it off to Topic Modeling

In [None]:
def getLongLats(x):
    """Geocode the first ``(location, label)`` pair of ``x``.

    ORG/FAC entities and names containing "Boston" are searched with a
    ", Boston" suffix, everything else with ", Massachusetts". Returns
    ``[longitude, latitude]`` of the first hit, or ``None`` when ``x`` is empty
    or the geocoder returns nothing.
    """
    if not len(x):
        return None

    first = x[0]
    location = first[0]  # (Location, Label)
    label = first[1]
    search_in_boston = label == "ORG" or label == "FAC" or "Boston" in location
    suffix = "Boston" if search_in_boston else "Massachusetts"
    response = gmaps.geocode(f"{location}, {suffix}")
    if not len(response):
        return None

    point = response[0]['geometry']['location']
    latitude = point['lat']
    longitude = point['lng']
    return [longitude, latitude]

In [None]:
df['NER_Sorted_Coordinates'] = df['NER_Sorted'].progress_apply(getLongLats)

In [None]:
def query_census_api(longitude, latitude, timeout=10):
    """Look up the census tract that contains a coordinate.

    Parameters
    ----------
    longitude, latitude : float
        Coordinate sent to the Census geocoder as ``x`` and ``y``.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so a stalled
        connection cannot hang the whole DataFrame apply.

    Returns
    -------
    tuple of (str, str)
        ``(tract, county)``. ``tract`` is the ``TRACT`` field of the first
        entry under 'Census Tracts', or ``"No TRACT found"`` when the request
        fails, the status is not 200, or no tract is listed. ``county`` is
        always the placeholder ``"NO COUNTY"``.
    """
    county = "NO COUNTY"
    url = "https://geocoding.geo.census.gov/geocoder/geographies/coordinates"
    params = {
        "x": longitude,
        "y": latitude,
        "benchmark": "Public_AR_Current",
        "vintage": "Census2020_Current",
        "format": "json",
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as e:
        # Network failure or timeout: report it and fall back to the sentinel
        print(f"Census API request failed: {e}")
        return "No TRACT found", county

    if response.status_code == 200:
        results = response.json()
        # Tolerate payloads that lack the expected nesting
        geographies = results.get('result', {}).get('geographies', {})
        census_tracts = geographies.get('Census Tracts', [])
        if census_tracts:
            return census_tracts[0].get('TRACT', 'No TRACT found'), county  # TRACT of the first census tract found
    return "No TRACT found", county  # API call failed or no tracts found

def getTractList(col):
    """Resolve a row's best available coordinates to a census tract.

    Coordinates are taken from the first pass that produced any, in this
    order: explicit headline match, first NER pass, Llama + NER pass.

    Parameters
    ----------
    col : mapping (DataFrame row)
        Must provide ``'Explicit_Pass_1'``, ``'NER_Pass_1_Coordinates'`` and
        ``'NER_Sorted_Coordinates'``.

    Returns
    -------
    list or None
        A one-element list holding the tract returned by ``query_census_api``,
        or ``None`` when no pass produced coordinates.
    """
    if col['Explicit_Pass_1'] is not None:  # locations from the first explicit pass
        # Element 1 is the value stored for the matched key — presumably
        # [longitude, latitude]; verify against known_locs.json.
        coordinates = col['Explicit_Pass_1'][1]
    elif col['NER_Pass_1_Coordinates'] is not None:  # very first NER pass (specific locs only)
        coordinates = col['NER_Pass_1_Coordinates']
    elif col['NER_Sorted_Coordinates'] is not None:  # llama + NER pass
        coordinates = col['NER_Sorted_Coordinates']
    else:  # Must be a very hard/bad article :-(
        return None

    longitude, latitude = coordinates[0], coordinates[1]
    tract, _county = query_census_api(longitude, latitude)
    return [tract]

In [None]:
df['Tracts'] = df.progress_apply(getTractList, axis=1)

In [None]:
df = df.dropna(subset=["Tracts"]) # Clean Those that doesn't have a Tract

In [None]:
df

## Topic Modeling

In [None]:
import os
import tiktoken
import numpy as np
from transformers import pipeline
from sklearn.metrics import adjusted_rand_score
from openai import OpenAI, AsyncOpenAI
from sklearn.metrics.pairwise import cosine_similarity
from tenacity import retry, wait_random_exponential, stop_after_attempt

## OpenAI Client

In [None]:
def _embedding_fallback(retry_state):
    """Invoked by tenacity after the final failed attempt: log and return the sentinel."""
    error = retry_state.outcome.exception()
    print(f"Failed to retrieve ADA Embedding: {error}. Replacing with replacement value!")
    return [-1.0]


# Retry up to 10 times with exponential backoff, starting at 1 second and maxing out at 20 seconds delay
@retry(
    wait=wait_random_exponential(min=1, max=20),
    stop=stop_after_attempt(10),
    retry_error_callback=_embedding_fallback,
)
def get_embedding(text: str, model="text-embedding-3-small"):
    """Return the embedding vector of ``text`` from the OpenAI embeddings API.

    Exceptions are allowed to propagate out of this function so that the
    ``@retry`` decorator actually retries; previously they were caught inside
    the function, which meant no retry ever happened.

    Parameters
    ----------
    text : str
        Text to embed.
    model : str, optional
        Embedding model name.

    Returns
    -------
    list of float
        The embedding, or the sentinel ``[-1.0]`` once all attempts failed.
        NOTE(review): the sentinel has a different length from a real
        embedding, so the similarity cells below will probably fail on it —
        confirm how failed rows should be handled.
    """
    return client.embeddings.create(input=text, model=model).data[0].embedding

In [None]:
# Read the OpenAI API key from the OPENAI_API_KEY environment variable so that
# no credential is stored in the notebook. The placeholder is kept only as a
# fallback value; do not paste a real key here.
client = OpenAI(
    api_key=os.environ.get('OPENAI_API_KEY', 'YOUR_KEY_HERE'),
)

## Taxonomy Lists

Content Taxonomy

In [None]:
# Get the embedding for taxonomy
taxonomy_df = pd.read_csv('./taxonomy_list/Content_Taxonomy.csv', skiprows=5, usecols=range(8))
taxonomy_df.columns = taxonomy_df.iloc[0]  # the first remaining row carries the column names
taxonomy_df = taxonomy_df.tail(-1)  # drop that header row from the data


def _tier_is_set(value):
    """True when a tier cell is neither null nor a single blank."""
    return not pd.isnull(value) and value != ' '


def _taxonomy_label(row):
    """Return ``(depth, label)`` for one taxonomy row.

    ``depth`` is the deepest of Tier 4/3/2 that is filled in (1 if none is);
    ``label`` joins Tier 1 .. Tier ``depth`` with ' - '.
    """
    depth = next((tier for tier in (4, 3, 2) if _tier_is_set(row[f'Tier {tier}'])), 1)
    parts = [row[f'Tier {tier}'] for tier in range(1, depth + 1)]
    return depth, ' - '.join(f'{part}' for part in parts)


# Collect the labels per depth, in row order
labels_by_depth = {1: [], 2: [], 3: [], 4: []}
for _, row in taxonomy_df.iterrows():
    depth, label = _taxonomy_label(row)
    labels_by_depth[depth].append(label)

# De-duplicate each tier
tier_1_list = list(set(labels_by_depth[1]))
tier_2_list = list(set(labels_by_depth[2]))
tier_3_list = list(set(labels_by_depth[3]))
tier_4_list = list(set(labels_by_depth[4]))

tier_1_embedding = [get_embedding(topic) for topic in tier_1_list]
tier_2_embedding = [get_embedding(topic) for topic in tier_2_list]
tier_3_embedding = [get_embedding(topic) for topic in tier_3_list]
tier_4_embedding = [get_embedding(topic) for topic in tier_4_list]

# Labels and embeddings are concatenated in the same tier order, so position i
# of one list corresponds to position i of the other.
all_topics_list = tier_1_list + tier_2_list + tier_3_list + tier_4_list
all_topics_embedding = tier_1_embedding + tier_2_embedding + tier_3_embedding + tier_4_embedding
print(len(all_topics_embedding))

Selected Taxonomy List

In [None]:
# Get embedding for the 230 topics selected by BERTopic 
selected_taxonomy_df = pd.read_csv('./topics/embedding_similarity_label.csv')
selected_taxonomy_df = selected_taxonomy_df.dropna(subset=['closest_topic'])
selected_topics_list = selected_taxonomy_df['closest_topic'].values.tolist()

selected_topics_embedding = [get_embedding(topic) for topic in selected_topics_list]

Client Taxonomy List

In [None]:
# Alternative taxonomy: client's list of topics
client_taxonomy_df = pd.read_excel('./topics/Asad_Topics_List.xlsx', names=['label'])
client_taxonomy_df['ada_embedding'] = client_taxonomy_df['label'].map(get_embedding)

## Obtaining Ada Embedding

In [None]:
def truncate(tokens, length=500):
    """
    Return the leading `length` elements of `tokens` (the first 500 by default).
    """
    head = slice(None, length)
    return tokens[head]

In [None]:
df['topic_model_body'] = df['body'].apply(lambda x: re.sub(re.compile('<.*?>'), '', x))
df['tokens'] = df['topic_model_body'].apply(lambda x: x.split())
df['tokens'] = df['tokens'].apply(truncate)

In [None]:
df['ada_embedding'] = df.tokens.apply(lambda x: get_embedding(','.join(map(str,x)), model='text-embedding-3-small'))

## Similarity Matching After Ada Embedding

In [None]:
# Find the most similar taxonomy label (out of all topics) for each news body.
# A single cosine_similarity call on the stacked matrices replaces one call per
# (article, topic) pair.
if len(df) > 0:
    similarity_matrix = cosine_similarity(
        np.array(df['ada_embedding'].tolist()),
        np.array(all_topics_embedding),
    )
    # Row i holds article i's similarity to every topic; argmax picks the
    # first topic with the highest similarity.
    closest_topic_list_all = [all_topics_list[i] for i in similarity_matrix.argmax(axis=1)]
else:
    closest_topic_list_all = []

df['closest_topic_all'] = closest_topic_list_all

In [None]:
# Find the most similar taxonomy label (out of the 230 selected topics) for
# each news body. A single cosine_similarity call on the stacked matrices
# replaces one call per (article, topic) pair.
if len(df) > 0:
    similarity_matrix = cosine_similarity(
        np.array(df['ada_embedding'].tolist()),
        np.array(selected_topics_embedding),
    )
    # Row i holds article i's similarity to every topic; argmax picks the
    # first topic with the highest similarity.
    closest_topic_list_selected = [selected_topics_list[i] for i in similarity_matrix.argmax(axis=1)]
else:
    closest_topic_list_selected = []

df['closest_topic_selected'] = closest_topic_list_selected

In [None]:
# Match each news body against the client's topic list; articles whose best
# similarity does not exceed the threshold are labelled 'Other'.
MIN_CLIENT_SIMILARITY = 0.25

client_topic_embedding_list = client_taxonomy_df['ada_embedding'].to_list()
client_topic_list = client_taxonomy_df['label'].to_list()

if len(df) > 0:
    # One matrix call instead of one cosine_similarity call per (article, topic) pair
    similarity_matrix = cosine_similarity(
        np.array(df['ada_embedding'].tolist()),
        np.array(client_topic_embedding_list),
    )
    best_topic_index = similarity_matrix.argmax(axis=1).tolist()
    similarity_arr = similarity_matrix.max(axis=1).tolist()  # best score per article
else:
    best_topic_index = []
    similarity_arr = []

closest_topic_list_client = [
    client_topic_list[topic_index] if score > MIN_CLIENT_SIMILARITY else 'Other'
    for topic_index, score in zip(best_topic_index, similarity_arr)
]

df['closest_topic_client'] = closest_topic_list_client

In [None]:
df

In [None]:
df.to_csv("./outputs/gbh_output.csv")

In [None]:
raw_df

In [None]:
df

In [None]:
merged_df = pd.merge(raw_df, df, on='_id', how='inner')

In [None]:
merged_df

In [None]:
merged_df.to_csv("./outputs/gbh_output_all_fields.csv")

In [None]:
merged_df.columns