## Llama 7B

Using the LangServe API

In [1]:
import re
from tqdm import tqdm
tqdm.pandas()

In [2]:
from langchain.chains import LLMChain

# llm
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp

# Prompt
from langchain.chains.prompt_selector import ConditionalPromptSelector
from langchain.prompts import PromptTemplate

# Parser
from langchain_core.output_parsers import StrOutputParser

In [3]:
prompt = PromptTemplate(
    input_variables=["headline"],
    template="""<<SYS>> \n You are an assistant tasked in geo-locating \
this news article. \n <</SYS>> \n\n [INST] Generate a SHORT response \
of where you think this article is talking about. BE SPECIFIC AND CONCISE AS POSSIBLE. Give your response in the following format: \
1.Y/N indicating whether the article is talking about a region of Boston \n 2.The specific location within the city you got if you got Y in the first question. \
If you do not know, please say 'I do not know' or 'Not enough information'. \n\n
Headline: \n\n {headline} \n\n [/INST]""",
)

In [4]:
llama_model_path = "./models/llama_7B/llama-2-7b-chat.Q4_K_M.gguf"

In [5]:
llm = LlamaCpp(
    model_path=llama_model_path,
    n_gpu_layers=1,
    n_batch=1024,
    n_ctx=2048,
    f16_kv=True,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ./models/llama_7B/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u

In [6]:
output_parser = StrOutputParser()

In [7]:
chain = prompt | llm | output_parser

## Benchmarking

Here we benchmark how well the llama_7B does against some real data and our predictions

In [8]:
import pandas as pd
from bs4 import BeautifulSoup

In [9]:
#sample_data_dir = "./sample_data/articles_data.csv" # Our Current Database Articles
sample_data_dir = "./eval_dataset/Singer_Evaluation_Set.csv" # Singer Evaluation Set

In [10]:
raw_df = pd.read_csv(sample_data_dir)

In [11]:
raw_df.columns

Index(['Singer notes ', '_id', 'neighborhoods[0]', 'neighborhoods[1]',
       'neighborhoods[2]', 'neighborhoods[3]', 'neighborhoods[4]',
       'neighborhoods[5]', 'position_section', 'tracts[0]', 'tracts[1]',
       'tracts[2]', 'tracts[3]', 'tracts[4]', 'tracts[5]', 'tracts[6]',
       'author', 'body', 'content_id', 'hl1', 'hl2', 'pub_date', 'pub_name',
       'link', 'openai_labels[0]', 'Singer labels', 'userID', 'uploadID',
       'dateSum'],
      dtype='object')

In [12]:
df = pd.concat([raw_df['hl1'], raw_df['body'], raw_df['neighborhoods[0]']], axis=1)

In [13]:
df

Unnamed: 0,hl1,body,neighborhoods[0]
0,Tonight‚Äôs Celebration of ‚ÄúEveryone‚Äôs Fav...,Tonight‚Äôs celebrity and family-filled progra...,Allston
1,"Winter storm snarls travel, knocks out power f...",A storm making its way up the East Coast broug...,East Boston
2,A Walk Down the Block: Nubian Square's economi...,"<bsp-accordion data-state=""{&quot;cms.site.own...",Roxbury
3,Mass. makes moves to figure out what happens a...,As the state nears its threshold level for hou...,South Boston Waterfront
4,‚ÄòWe need to be ready‚Äô: CDC gives Northeast...,Northeastern University will act as a national...,Fenway
5,"Faced with a lack of funding, collegiate cheer...","For 10 Sundays each year, members of the Bridg...",Fenway
6,How to know who deserves your vote in a news d...,Deception in politics is a tale as old as time...,Downtown
7,Wu ‚Äòconfident‚Äô that new Mass. and Cass pla...,Boston Mayor Michelle Wu said late last week t...,Roxbury
8,Healey anticipates ‚Äòhard calls‚Äô as emergen...,Gov. Maura Healey says she‚Äôs bracing to make...,Downtown
9,"With need surging, behavioral urgent care cent...",East Boston Neighborhood Health Center held an...,East Boston


In [14]:
df["llama_pred"] = "N/A"

In [15]:
df

Unnamed: 0,hl1,body,neighborhoods[0],llama_pred
0,Tonight‚Äôs Celebration of ‚ÄúEveryone‚Äôs Fav...,Tonight‚Äôs celebrity and family-filled progra...,Allston,
1,"Winter storm snarls travel, knocks out power f...",A storm making its way up the East Coast broug...,East Boston,
2,A Walk Down the Block: Nubian Square's economi...,"<bsp-accordion data-state=""{&quot;cms.site.own...",Roxbury,
3,Mass. makes moves to figure out what happens a...,As the state nears its threshold level for hou...,South Boston Waterfront,
4,‚ÄòWe need to be ready‚Äô: CDC gives Northeast...,Northeastern University will act as a national...,Fenway,
5,"Faced with a lack of funding, collegiate cheer...","For 10 Sundays each year, members of the Bridg...",Fenway,
6,How to know who deserves your vote in a news d...,Deception in politics is a tale as old as time...,Downtown,
7,Wu ‚Äòconfident‚Äô that new Mass. and Cass pla...,Boston Mayor Michelle Wu said late last week t...,Roxbury,
8,Healey anticipates ‚Äòhard calls‚Äô as emergen...,Gov. Maura Healey says she‚Äôs bracing to make...,Downtown,
9,"With need surging, behavioral urgent care cent...",East Boston Neighborhood Health Center held an...,East Boston,


In [16]:
func_clean_html = lambda x: BeautifulSoup(x, "html.parser").get_text()
df['body'] = df['body'].progress_apply(func_clean_html)
df['hl1'] = df['hl1'].progress_apply(func_clean_html)

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 970.10it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 23196.68it/s]


In [17]:
# Get the token length, llama is maxed out at 2048 
df['token_length'] = df['body'].progress_apply(lambda x: len(x))

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 45153.99it/s]


In [18]:
# DF Filtering
# length_criteria_deletion = df['token_length'] >= 2045
# df = df[~length_criteria_deletion] # Invert it so that True becomes false so it marks for deletion

In [19]:
df

Unnamed: 0,hl1,body,neighborhoods[0],llama_pred,token_length
0,Tonight‚Äôs Celebration of ‚ÄúEveryone‚Äôs Fav...,Tonight‚Äôs celebrity and family-filled progra...,Allston,,3411
1,"Winter storm snarls travel, knocks out power f...",A storm making its way up the East Coast broug...,East Boston,,2478
2,A Walk Down the Block: Nubian Square's economi...,"Yesterday, we took a walk down the block in Ro...",Roxbury,,2928
3,Mass. makes moves to figure out what happens a...,As the state nears its threshold level for hou...,South Boston Waterfront,,4245
4,‚ÄòWe need to be ready‚Äô: CDC gives Northeast...,Northeastern University will act as a national...,Fenway,,1528
5,"Faced with a lack of funding, collegiate cheer...","For 10 Sundays each year, members of the Bridg...",Fenway,,7539
6,How to know who deserves your vote in a news d...,Deception in politics is a tale as old as time...,Downtown,,8300
7,Wu ‚Äòconfident‚Äô that new Mass. and Cass pla...,Boston Mayor Michelle Wu said late last week t...,Roxbury,,4422
8,Healey anticipates ‚Äòhard calls‚Äô as emergen...,Gov. Maura Healey says she‚Äôs bracing to make...,Downtown,,4609
9,"With need surging, behavioral urgent care cent...",East Boston Neighborhood Health Center held an...,East Boston,,3296


In [20]:
predict_llama = lambda col: chain.invoke({"headline": col['hl1'], "body": col['body']})
df['llama_pred'] = df.progress_apply(predict_llama, axis=1)

  0%|                                                                                                          | 0/27 [00:00<?, ?it/s]

  Sure, I'd be happy to help! Based on the information provided in the article, I would estimate that the event being described is taking place in the North End neighborhood of Boston, Massachusetts. Here's my reasoning:
1. Y - The article does not provide any explicit clues suggesting that the event is taking place outside of Boston or in a different city. The mention of "tonight" and "Boston" suggest that the event is happening in the present and in a specific location within the city.
2. The specific location within the city I got is: North End. Based on the information provided in the article, it seems likely that the event is taking place in the North End neighborhood of Boston, which is known for its Italian American culture and cuisine.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =      13.97 ms /   163 runs   (    0.09 ms per token, 11665.35 tokens per second)
llama_print_timings: prompt eval time =    8183.22 ms /   181 tokens (   45.21 ms per token,    22.12 tokens per second)
llama_print_timings:        eval time =    9454.67 ms /   162 runs   (   58.36 ms per token,    17.13 tokens per second)
llama_print_timings:       total time =   17972.80 ms /   343 tokens
  7%|███████▎                                                                                          | 2/27 [00:17<03:44,  8.99s/it]Llama.generate: prefix-match hit


  Based on the information provided in the article, here is my response:
1. Y - The article is talking about a region within Massachusetts, specifically the area affected by the winter storm.
2. Cambridge, Massachusetts - Based on the language used in the article, it appears that the storm affected the Cambridge area specifically, as mentioned in the phrase "hundreds of thousands across Massachusetts, including Cambridge."


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       7.32 ms /    84 runs   (    0.09 ms per token, 11469.14 tokens per second)
llama_print_timings: prompt eval time =    1007.24 ms /    24 tokens (   41.97 ms per token,    23.83 tokens per second)
llama_print_timings:        eval time =    5019.91 ms /    83 runs   (   60.48 ms per token,    16.53 tokens per second)
llama_print_timings:       total time =    6196.39 ms /   107 tokens
 11%|██████████▉                                                                                       | 3/27 [00:24<03:07,  7.83s/it]Llama.generate: prefix-match hit


  Here's my analysis:
1. Y - The article is definitely talking about a location within Boston, specifically the Nubian Square area.
2. The specific location within the city I got is: Roxbury, Boston. Based on the information provided in the article, it appears that the author is focusing on the economic future of the Nubian Square area within the Roxbury neighborhood of Boston.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       7.63 ms /    88 runs   (    0.09 ms per token, 11534.93 tokens per second)
llama_print_timings: prompt eval time =     881.92 ms /    21 tokens (   42.00 ms per token,    23.81 tokens per second)
llama_print_timings:        eval time =    4995.94 ms /    87 runs   (   57.42 ms per token,    17.41 tokens per second)
llama_print_timings:       total time =    6051.81 ms /   108 tokens
 15%|██████████████▌                                                                                   | 4/27 [00:30<02:44,  7.17s/it]Llama.generate: prefix-match hit


  1. Y - The article is talking about a region of Boston.
2. Based on the headline and the information provided, I believe the article is referring to the Massachusetts state government's efforts to address the overflow of homeless individuals into Boston after the state shelter cap has been reached. Specifically, the article may be discussing the actions taken by the Massachusetts Department of Housing and Community Development (DHCD) to address the issue, such as providing additional funding for homeless services and working with local organizations to find permanent housing solutions for the homeless population.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =      10.39 ms /   120 runs   (    0.09 ms per token, 11551.79 tokens per second)
llama_print_timings: prompt eval time =     946.77 ms /    22 tokens (   43.03 ms per token,    23.24 tokens per second)
llama_print_timings:        eval time =    6862.73 ms /   119 runs   (   57.67 ms per token,    17.34 tokens per second)
llama_print_timings:       total time =    8053.87 ms /   141 tokens
 19%|██████████████████▏                                                                               | 5/27 [00:38<02:44,  7.48s/it]Llama.generate: prefix-match hit


  Sure, I'd be happy to help! Here's my response:
1. Y - The article is talking about a region within Boston.
2. The specific location within Boston that I inferred from the article is the Northeastern part of the city, specifically the area surrounding Boston University and the Boston Medical Center.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       6.14 ms /    71 runs   (    0.09 ms per token, 11563.52 tokens per second)
llama_print_timings: prompt eval time =    1957.93 ms /    39 tokens (   50.20 ms per token,    19.92 tokens per second)
llama_print_timings:        eval time =    4128.07 ms /    70 runs   (   58.97 ms per token,    16.96 tokens per second)
llama_print_timings:       total time =    6229.41 ms /   109 tokens
 22%|█████████████████████▊                                                                            | 6/27 [00:44<02:28,  7.06s/it]Llama.generate: prefix-match hit


  Based on the information provided in the article, here's my response:
1. Y - The article is talking about a region of Boston.
2. The specific location within Boston that I pinpointed based on the information provided in the article is the Fenway area.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       4.99 ms /    59 runs   (    0.08 ms per token, 11826.02 tokens per second)
llama_print_timings: prompt eval time =    1087.69 ms /    26 tokens (   41.83 ms per token,    23.90 tokens per second)
llama_print_timings:        eval time =    3292.63 ms /    58 runs   (   56.77 ms per token,    17.62 tokens per second)
llama_print_timings:       total time =    4497.43 ms /    84 tokens
 26%|█████████████████████████▍                                                                        | 7/27 [00:49<02:04,  6.24s/it]Llama.generate: prefix-match hit


  Based on the information provided in the article, I would estimate that the location being discussed is likely within the city of Boston, Massachusetts. Here's my answer:
1. Y - The article is talking about a location within Boston.
2. South End - Based on the mention of "the South End," which is a neighborhood within Boston, my best guess is that the article is referring to this specific location within the city.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       7.68 ms /    90 runs   (    0.09 ms per token, 11714.17 tokens per second)
llama_print_timings: prompt eval time =     809.63 ms /    19 tokens (   42.61 ms per token,    23.47 tokens per second)
llama_print_timings:        eval time =    5114.29 ms /    89 runs   (   57.46 ms per token,    17.40 tokens per second)
llama_print_timings:       total time =    6104.49 ms /   108 tokens
 30%|█████████████████████████████                                                                     | 8/27 [00:55<01:57,  6.20s/it]Llama.generate: prefix-match hit


  Based on the information provided in the article, I believe the location being described is:
1. Y - The article is talking about a region within Boston.
2. The specific location within Boston is: Downtown Boston, specifically the area around Massachusetts Avenue and Cass Street.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       5.01 ms /    59 runs   (    0.08 ms per token, 11769.40 tokens per second)
llama_print_timings: prompt eval time =    1098.52 ms /    26 tokens (   42.25 ms per token,    23.67 tokens per second)
llama_print_timings:        eval time =    3346.99 ms /    58 runs   (   57.71 ms per token,    17.33 tokens per second)
llama_print_timings:       total time =    4562.16 ms /    84 tokens
 33%|████████████████████████████████▋                                                                 | 9/27 [00:59<01:42,  5.69s/it]Llama.generate: prefix-match hit


  Based on the information provided in the article, I would estimate that the location being referred to is:

1. Y - The article is referring to a specific region of Boston, rather than the entire city.
2. The specific location within Boston that I inferred from the article is the South End, as the article mentions "emergency shelters near capacity" and the South End is known for its high population density and diversity, which could contribute to a higher demand for emergency shelters.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       9.20 ms /   107 runs   (    0.09 ms per token, 11636.76 tokens per second)
llama_print_timings: prompt eval time =    1405.44 ms /    29 tokens (   48.46 ms per token,    20.63 tokens per second)
llama_print_timings:        eval time =    6059.38 ms /   106 runs   (   57.16 ms per token,    17.49 tokens per second)
llama_print_timings:       total time =    7680.57 ms /   135 tokens
 37%|███████████████████████████████████▉                                                             | 10/27 [01:07<01:47,  6.30s/it]Llama.generate: prefix-match hit


  Based on the information provided in the article, I would estimate that the location being described is East Boston, Massachusetts. Here's my answer:
1. Y - The article mentions East Boston, indicating that the location is within that region.
2. The specific location within East Boston that the article refers to is the behavioral urgent care center, which is likely located in a specific address or area within the neighborhood. I do not know the exact location without more information.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       8.43 ms /    99 runs   (    0.09 ms per token, 11745.17 tokens per second)
llama_print_timings: prompt eval time =     920.94 ms /    22 tokens (   41.86 ms per token,    23.89 tokens per second)
llama_print_timings:        eval time =    5596.82 ms /    98 runs   (   57.11 ms per token,    17.51 tokens per second)
llama_print_timings:       total time =    6715.86 ms /   120 tokens
 41%|███████████████████████████████████████▌                                                         | 11/27 [01:14<01:42,  6.43s/it]Llama.generate: prefix-match hit


  Sure, I'd be happy to help! Here's my response:
1. Y - The article is talking about a region within Boston.
2. The specific location within Boston that I got is: The Fenway area, specifically the neighborhoods surrounding Fenway Park, including Kenmore, Brookline, and Allston.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       6.05 ms /    71 runs   (    0.09 ms per token, 11737.48 tokens per second)
llama_print_timings: prompt eval time =     998.84 ms /    24 tokens (   41.62 ms per token,    24.03 tokens per second)
llama_print_timings:        eval time =    3924.74 ms /    70 runs   (   56.07 ms per token,    17.84 tokens per second)
llama_print_timings:       total time =    5064.50 ms /    94 tokens
 44%|███████████████████████████████████████████                                                      | 12/27 [01:19<01:30,  6.02s/it]Llama.generate: prefix-match hit


  Based on the information provided in the article, I would guess that it is talking about a region within Boston, Massachusetts, specifically the Roxbury neighborhood. Here's my reasoning:
1. Y - The article mentions "women of color" which suggests that the location is in an area with a high concentration of people of color, which Roxbury is known for.
2. The specific location within the city I got is Roxbury, MA.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       8.32 ms /    97 runs   (    0.09 ms per token, 11661.46 tokens per second)
llama_print_timings: prompt eval time =     924.42 ms /    22 tokens (   42.02 ms per token,    23.80 tokens per second)
llama_print_timings:        eval time =    5449.44 ms /    96 runs   (   56.76 ms per token,    17.62 tokens per second)
llama_print_timings:       total time =    6566.28 ms /   118 tokens
 48%|██████████████████████████████████████████████▋                                                  | 13/27 [01:25<01:26,  6.18s/it]Llama.generate: prefix-match hit


  Here is my attempt to geolocate the article based on the information provided:
1. Y - The article is talking about a region within Boston, rather than a specific location within the city.
2. I do not know - The article does not provide enough information to narrow down the location within Boston, so I cannot provide a specific answer.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       6.30 ms /    74 runs   (    0.09 ms per token, 11738.58 tokens per second)
llama_print_timings: prompt eval time =     896.19 ms /    21 tokens (   42.68 ms per token,    23.43 tokens per second)
llama_print_timings:        eval time =    4261.83 ms /    73 runs   (   58.38 ms per token,    17.13 tokens per second)
llama_print_timings:       total time =    5304.15 ms /    94 tokens
 52%|██████████████████████████████████████████████████▎                                              | 14/27 [01:31<01:16,  5.92s/it]Llama.generate: prefix-match hit


  Based on the information provided in the article, here is my response:
1. Y - The article is talking about a specific location within Boston, rather than a region of the city.
2. The specific location within Boston that I believe the article is referring to is the Fenway neighborhood.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       5.30 ms /    62 runs   (    0.09 ms per token, 11706.95 tokens per second)
llama_print_timings: prompt eval time =     732.73 ms /    17 tokens (   43.10 ms per token,    23.20 tokens per second)
llama_print_timings:        eval time =    3536.13 ms /    61 runs   (   57.97 ms per token,    17.25 tokens per second)
llama_print_timings:       total time =    4391.35 ms /    78 tokens
 56%|█████████████████████████████████████████████████████▉                                           | 15/27 [01:35<01:05,  5.46s/it]Llama.generate: prefix-match hit


  1. Y - The article is talking about a region of Boston.
2. Based on the article, the specific location within Boston that the new women's soccer team will be located is the Fenway Park area.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       4.11 ms /    48 runs   (    0.09 ms per token, 11673.15 tokens per second)
llama_print_timings: prompt eval time =    1950.02 ms /    33 tokens (   59.09 ms per token,    16.92 tokens per second)
llama_print_timings:        eval time =    2759.17 ms /    47 runs   (   58.71 ms per token,    17.03 tokens per second)
llama_print_timings:       total time =    4802.39 ms /    80 tokens
 59%|█████████████████████████████████████████████████████████▍                                       | 16/27 [01:40<00:57,  5.26s/it]Llama.generate: prefix-match hit


  1. Y - The article is talking about Massachusetts, specifically the Boston area.
2. The specific location within Boston that I got is: Boston Medical Center.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       3.03 ms /    35 runs   (    0.09 ms per token, 11554.97 tokens per second)
llama_print_timings: prompt eval time =    1952.20 ms /    33 tokens (   59.16 ms per token,    16.90 tokens per second)
llama_print_timings:        eval time =    2044.30 ms /    34 runs   (   60.13 ms per token,    16.63 tokens per second)
llama_print_timings:       total time =    4067.00 ms /    67 tokens
 63%|█████████████████████████████████████████████████████████████                                    | 17/27 [01:44<00:49,  4.90s/it]Llama.generate: prefix-match hit


  Based on the information provided in the article, I would estimate that the location being referred to is:
1. Y - The article is talking about a specific region within Boston, Massachusetts.
2. The specific location within Boston I obtained from the article is: Cambridge, Massachusetts.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       5.02 ms /    59 runs   (    0.09 ms per token, 11755.33 tokens per second)
llama_print_timings: prompt eval time =    1092.22 ms /    26 tokens (   42.01 ms per token,    23.80 tokens per second)
llama_print_timings:        eval time =    3339.00 ms /    58 runs   (   57.57 ms per token,    17.37 tokens per second)
llama_print_timings:       total time =    4547.10 ms /    84 tokens
 67%|████████████████████████████████████████████████████████████████▋                                | 18/27 [01:48<00:43,  4.80s/it]Llama.generate: prefix-match hit


  Based on the information provided in the article, I would guess that the location being described is:
1. Y - The article is talking about a region of Boston, specifically Chinatown.
2. The specific location within Chinatown would be the Chinatown YMCA, which is located at 5 Blossom Street, Boston, MA 02111.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       7.23 ms /    85 runs   (    0.09 ms per token, 11756.57 tokens per second)
llama_print_timings: prompt eval time =    1003.68 ms /    24 tokens (   41.82 ms per token,    23.91 tokens per second)
llama_print_timings:        eval time =    4860.82 ms /    84 runs   (   57.87 ms per token,    17.28 tokens per second)
llama_print_timings:       total time =    6032.94 ms /   108 tokens
 70%|████████████████████████████████████████████████████████████████████▎                            | 19/27 [01:54<00:41,  5.17s/it]Llama.generate: prefix-match hit


  Sure, I'd be happy to help! Based on the information provided in the article, I believe the location being referred to is Lewiston, Maine.
1. Y - The article is talking about Lewiston, Maine.
2. The specific location within the city is the BU's CTE lab, which is located on the campus of Boston University.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       6.82 ms /    80 runs   (    0.09 ms per token, 11735.37 tokens per second)
llama_print_timings: prompt eval time =    1244.19 ms /    30 tokens (   41.47 ms per token,    24.11 tokens per second)
llama_print_timings:        eval time =    4572.81 ms /    79 runs   (   57.88 ms per token,    17.28 tokens per second)
llama_print_timings:       total time =    5976.22 ms /   109 tokens
 74%|███████████████████████████████████████████████████████████████████████▊                         | 20/27 [02:00<00:37,  5.41s/it]Llama.generate: prefix-match hit


  Based on the information provided in the article, I believe the location being referred to is Boston, Massachusetts. Here's my response:
1. Y - The article is talking about Boston.
2. The specific location within the city that I could identify based on the text is the Healey Administration building, which is located at 10 Park Plaza, Boston, MA 02116.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       7.42 ms /    87 runs   (    0.09 ms per token, 11726.65 tokens per second)
llama_print_timings: prompt eval time =    1104.48 ms /    26 tokens (   42.48 ms per token,    23.54 tokens per second)
llama_print_timings:        eval time =    4967.24 ms /    86 runs   (   57.76 ms per token,    17.31 tokens per second)
llama_print_timings:       total time =    6243.54 ms /   112 tokens
 78%|███████████████████████████████████████████████████████████████████████████▍                     | 21/27 [02:07<00:33,  5.66s/it]Llama.generate: prefix-match hit


  Sure! Here's my response:
1. Y - The article is talking about a region of Boston.
2. Based on the text, the specific location within Boston that the article is referring to is the TD Garden, which is located in the North End neighborhood of Boston.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       5.19 ms /    61 runs   (    0.09 ms per token, 11757.90 tokens per second)
llama_print_timings: prompt eval time =     998.99 ms /    24 tokens (   41.62 ms per token,    24.02 tokens per second)
llama_print_timings:        eval time =    3461.45 ms /    60 runs   (   57.69 ms per token,    17.33 tokens per second)
llama_print_timings:       total time =    4580.27 ms /    84 tokens
 81%|███████████████████████████████████████████████████████████████████████████████                  | 22/27 [02:11<00:26,  5.34s/it]Llama.generate: prefix-match hit


  Sure, I'd be happy to help! Based on the information provided in the article, here's my response:
1. Y - The article is talking about a region of Boston.
2. The specific location within Boston that I got is the Fenway area.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       5.03 ms /    59 runs   (    0.09 ms per token, 11727.29 tokens per second)
llama_print_timings: prompt eval time =     808.60 ms /    19 tokens (   42.56 ms per token,    23.50 tokens per second)
llama_print_timings:        eval time =    3307.82 ms /    58 runs   (   57.03 ms per token,    17.53 tokens per second)
llama_print_timings:       total time =    4233.18 ms /    77 tokens
 85%|██████████████████████████████████████████████████████████████████████████████████▋              | 23/27 [02:15<00:20,  5.01s/it]Llama.generate: prefix-match hit


  Based on the information provided in the article, here is my response:

1. Y - The article is talking about a region of Boston.
2. The specific location within Boston I got is: The South End.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       4.14 ms /    48 runs   (    0.09 ms per token, 11605.42 tokens per second)
llama_print_timings: prompt eval time =    1257.89 ms /    30 tokens (   41.93 ms per token,    23.85 tokens per second)
llama_print_timings:        eval time =    2693.86 ms /    47 runs   (   57.32 ms per token,    17.45 tokens per second)
llama_print_timings:       total time =    4045.73 ms /    77 tokens
 89%|██████████████████████████████████████████████████████████████████████████████████████▏          | 24/27 [02:19<00:14,  4.72s/it]Llama.generate: prefix-match hit


  1. Y - The article is talking about a specific location within Boston.
2. Pinkville - Based on the information provided in the article, it appears that Pinkville is the specific location within Boston that Mr. Ma is visiting.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       4.55 ms /    53 runs   (    0.09 ms per token, 11658.60 tokens per second)
llama_print_timings: prompt eval time =     639.85 ms /    15 tokens (   42.66 ms per token,    23.44 tokens per second)
llama_print_timings:        eval time =    2976.09 ms /    52 runs   (   57.23 ms per token,    17.47 tokens per second)
llama_print_timings:       total time =    3720.93 ms /    67 tokens
 93%|█████████████████████████████████████████████████████████████████████████████████████████▊       | 25/27 [02:23<00:08,  4.42s/it]Llama.generate: prefix-match hit


  Sure, I'd be happy to help! Here's my analysis based on the information provided in the article:
1. Y - The article is talking about a specific location within Boston, rather than the entire city or region.
2. The specific location within Boston that I believe the article is referring to is the "garden center" located in the Jamaica Plain neighborhood.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       7.18 ms /    83 runs   (    0.09 ms per token, 11561.50 tokens per second)
llama_print_timings: prompt eval time =     749.25 ms /    17 tokens (   44.07 ms per token,    22.69 tokens per second)
llama_print_timings:        eval time =    4629.42 ms /    82 runs   (   56.46 ms per token,    17.71 tokens per second)
llama_print_timings:       total time =    5544.73 ms /    99 tokens
 96%|█████████████████████████████████████████████████████████████████████████████████████████████▍   | 26/27 [02:29<00:04,  4.76s/it]Llama.generate: prefix-match hit


  Based on the information provided in the article, I believe the location being referred to is Boston, Massachusetts.

1. Y - The article explicitly states that the funding for the LGBTQ+ community projects was stripped by the Appropriations Committee, which suggests that the location being referred to is within the city of Boston.
2. The specific location within Boston that the article mentions is the "Boston" project, which implies that the article is referring to a specific area or neighborhood within the city, such as Downtown Boston, the South End, or the Fenway neighborhood. I do not know the exact location within Boston without further context or information.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =      11.94 ms /   140 runs   (    0.09 ms per token, 11729.22 tokens per second)
llama_print_timings: prompt eval time =    1757.72 ms /    32 tokens (   54.93 ms per token,    18.21 tokens per second)
llama_print_timings:        eval time =    8132.77 ms /   139 runs   (   58.51 ms per token,    17.09 tokens per second)
llama_print_timings:       total time =   10173.30 ms /   171 tokens
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [02:39<00:00,  6.38s/it]Llama.generate: prefix-match hit


  Based on the information provided in the article, I would estimate that the location being discussed is Boston, Massachusetts.

1. Y - The article is talking about Boston.
2. The specific location within Boston that I got is the North End neighborhood.


llama_print_timings:        load time =    8183.60 ms
llama_print_timings:      sample time =       4.55 ms /    54 runs   (    0.08 ms per token, 11857.71 tokens per second)
llama_print_timings: prompt eval time =    1912.09 ms /    34 tokens (   56.24 ms per token,    17.78 tokens per second)
llama_print_timings:        eval time =    3115.22 ms /    53 runs   (   58.78 ms per token,    17.01 tokens per second)
llama_print_timings:       total time =    5134.12 ms /    87 tokens
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [02:44<00:00,  6.10s/it]


#### Data Checking

In [21]:
df

Unnamed: 0,hl1,body,neighborhoods[0],llama_pred,token_length
0,Tonight‚Äôs Celebration of ‚ÄúEveryone‚Äôs Fav...,Tonight‚Äôs celebrity and family-filled progra...,Allston,"Sure, I'd be happy to help! Based on the inf...",3411
1,"Winter storm snarls travel, knocks out power f...",A storm making its way up the East Coast broug...,East Boston,Based on the information provided in the art...,2478
2,A Walk Down the Block: Nubian Square's economi...,"Yesterday, we took a walk down the block in Ro...",Roxbury,Here's my analysis:\n1. Y - The article is d...,2928
3,Mass. makes moves to figure out what happens a...,As the state nears its threshold level for hou...,South Boston Waterfront,1. Y - The article is talking about a region...,4245
4,‚ÄòWe need to be ready‚Äô: CDC gives Northeast...,Northeastern University will act as a national...,Fenway,"Sure, I'd be happy to help! Here's my respon...",1528
5,"Faced with a lack of funding, collegiate cheer...","For 10 Sundays each year, members of the Bridg...",Fenway,Based on the information provided in the art...,7539
6,How to know who deserves your vote in a news d...,Deception in politics is a tale as old as time...,Downtown,Based on the information provided in the art...,8300
7,Wu ‚Äòconfident‚Äô that new Mass. and Cass pla...,Boston Mayor Michelle Wu said late last week t...,Roxbury,Based on the information provided in the art...,4422
8,Healey anticipates ‚Äòhard calls‚Äô as emergen...,Gov. Maura Healey says she‚Äôs bracing to make...,Downtown,Based on the information provided in the art...,4609
9,"With need surging, behavioral urgent care cent...",East Boston Neighborhood Health Center held an...,East Boston,Based on the information provided in the art...,3296


### Data Cleaning on llama output

In [22]:
# Formats the output, Sorry for the disgusting lambda function
func_LlamaOutputFormatter = lambda x: ["".join([s.strip(), "."]) for s in (''.join(re.findall(r"[A-Za-z,:. ]", (x.split("2.")[1] if ("2." in x) else "N\A."))).split(".")) if (s != "")]

In [23]:
df['llama_post_process'] = df['llama_pred'].progress_apply(func_LlamaOutputFormatter)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 8869.53it/s]


In [24]:
df

Unnamed: 0,hl1,body,neighborhoods[0],llama_pred,token_length,llama_post_process
0,Tonight‚Äôs Celebration of ‚ÄúEveryone‚Äôs Fav...,Tonight‚Äôs celebrity and family-filled progra...,Allston,"Sure, I'd be happy to help! Based on the inf...",3411,[The specific location within the city I got i...
1,"Winter storm snarls travel, knocks out power f...",A storm making its way up the East Coast broug...,East Boston,Based on the information provided in the art...,2478,"[Cambridge, Massachusetts Based on the langua..."
2,A Walk Down the Block: Nubian Square's economi...,"Yesterday, we took a walk down the block in Ro...",Roxbury,Here's my analysis:\n1. Y - The article is d...,2928,[The specific location within the city I got i...
3,Mass. makes moves to figure out what happens a...,As the state nears its threshold level for hou...,South Boston Waterfront,1. Y - The article is talking about a region...,4245,[Based on the headline and the information pro...
4,‚ÄòWe need to be ready‚Äô: CDC gives Northeast...,Northeastern University will act as a national...,Fenway,"Sure, I'd be happy to help! Here's my respon...",1528,[The specific location within Boston that I in...
5,"Faced with a lack of funding, collegiate cheer...","For 10 Sundays each year, members of the Bridg...",Fenway,Based on the information provided in the art...,7539,[The specific location within Boston that I pi...
6,How to know who deserves your vote in a news d...,Deception in politics is a tale as old as time...,Downtown,Based on the information provided in the art...,8300,[South End Based on the mention of the South ...
7,Wu ‚Äòconfident‚Äô that new Mass. and Cass pla...,Boston Mayor Michelle Wu said late last week t...,Roxbury,Based on the information provided in the art...,4422,[The specific location within Boston is: Downt...
8,Healey anticipates ‚Äòhard calls‚Äô as emergen...,Gov. Maura Healey says she‚Äôs bracing to make...,Downtown,Based on the information provided in the art...,4609,[The specific location within Boston that I in...
9,"With need surging, behavioral urgent care cent...",East Boston Neighborhood Health Center held an...,East Boston,Based on the information provided in the art...,3296,[The specific location within East Boston that...


## LUKE Entity Recognition

In [25]:
import torch
import spacy
from transformers import LukeTokenizer, LukeModel, LukeForEntitySpanClassification

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
LUKE_model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
LUKE_tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

Some weights of the model checkpoint at studio-ousia/luke-large-finetuned-conll-2003 were not used when initializing LukeForEntitySpanClassification: ['luke.embeddings.position_ids']
- This IS expected if you are initializing LukeForEntitySpanClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeForEntitySpanClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [27]:
#sample_text = ["Based on the information provided in the article, I believe the location being described is the Omni Parker House hotel in Boston, Massachusetts, USA.", "This is very cool."]

In [28]:
def LUKE_EntityRecongition(text):
    """ 
    Given a sentence, we output an array of location that the LUKE recongizes in terms of LOC and ORG based on the IOB2 Sequence
    """
    # Convert to Spacy format
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # Define the entity span based on the tokens, needed for LUKE
    entity_spans = []
    original_word_spans = []
    for token_start in doc:
        for token_end in doc[token_start.i:]:
            entity_spans.append((token_start.idx, token_end.idx + len(token_end)))
            original_word_spans.append((token_start.i, token_end.i + 1))
    
    inputs = LUKE_tokenizer(text, entity_spans=entity_spans, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = LUKE_model(**inputs)
    
    logits = outputs.logits

    max_logits, max_indices = logits[0].max(dim=1)
    
    predictions = []
    for logit, index, span in zip(max_logits, max_indices, original_word_spans):
        if index != 0:  # the span is not NIL
            predictions.append((logit, span, LUKE_model.config.id2label[int(index)]))
    
    # Construct an IOB2 label sequence
    predicted_sequence = ["O"] * len(doc)
    for _, span, label in sorted(predictions, key=lambda o: o[0], reverse=True):
        if all([o == "O" for o in predicted_sequence[span[0] : span[1]]]):
            predicted_sequence[span[0]] = "B-" + label
            if span[1] - span[0] > 1:
                predicted_sequence[span[0] + 1 : span[1]] = ["I-" + label] * (span[1] - span[0] - 1)

    # Linear scan for locations
    word_flag = False
    locations_and_orgs = []
    stringBuilder = ""
    
    for token_idx in range(len(predicted_sequence)):
        curr_seq = predicted_sequence[token_idx]
        curr_token = doc[token_idx]
        if ((curr_seq == 'B-ORG' or curr_seq == 'B-LOC') and word_flag == False):
            word_flag = True
            stringBuilder = stringBuilder + f" {curr_token}"
        elif ((curr_seq == 'B-ORG' or curr_seq == 'B-LOC') and word_flag == True): # Signifies new sequence, but attached to the one before
            locations_and_orgs.append(stringBuilder.strip())
            stringBuilder = "" # Reset and add
            stringBuilder = stringBuilder + f" {curr_token}"
        elif ((curr_seq != 'O') and word_flag == True):
            stringBuilder = stringBuilder + f" {curr_token}"
        elif ((curr_seq == 'O') and word_flag == True):
            word_flag = False
            locations_and_orgs.append(stringBuilder.strip())
            stringBuilder = "" # Reset and add
    
        # Check if I-XXX is the last token
        if ((token_idx == len(predicted_sequence) - 1) and stringBuilder != ""):
            locations_and_orgs.append(stringBuilder.trim())
            stringBuilder = ""
    return locations_and_orgs

def predictFlatten(arr_inputs, func):
    """
    Given an array of inputs, we use a function to get another array and then flatten all the results into one arr
    """
    res = []
    for x in arr_inputs:
        res.append(func(x))
    return [j for sub in res for j in sub] # We flatten here

In [29]:
func_LUKEPredictions = lambda x: predictFlatten(x, LUKE_EntityRecongition)

In [30]:
df['LUKE_pred'] = df['llama_post_process'].progress_apply(func_LUKEPredictions)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:36<00:00,  1.34s/it]


In [31]:
df

Unnamed: 0,hl1,body,neighborhoods[0],llama_pred,token_length,llama_post_process,LUKE_pred
0,Tonight‚Äôs Celebration of ‚ÄúEveryone‚Äôs Fav...,Tonight‚Äôs celebrity and family-filled progra...,Allston,"Sure, I'd be happy to help! Based on the inf...",3411,[The specific location within the city I got i...,"[North End, North End, Boston]"
1,"Winter storm snarls travel, knocks out power f...",A storm making its way up the East Coast broug...,East Boston,Based on the information provided in the art...,2478,"[Cambridge, Massachusetts Based on the langua...","[Cambridge, Massachusetts, Cambridge, Massachu..."
2,A Walk Down the Block: Nubian Square's economi...,"Yesterday, we took a walk down the block in Ro...",Roxbury,Here's my analysis:\n1. Y - The article is d...,2928,[The specific location within the city I got i...,"[Roxbury, Boston, Nubian Square, Roxbury, Boston]"
3,Mass. makes moves to figure out what happens a...,As the state nears its threshold level for hou...,South Boston Waterfront,1. Y - The article is talking about a region...,4245,[Based on the headline and the information pro...,"[Massachusetts, Boston]"
4,‚ÄòWe need to be ready‚Äô: CDC gives Northeast...,Northeastern University will act as a national...,Fenway,"Sure, I'd be happy to help! Here's my respon...",1528,[The specific location within Boston that I in...,"[Boston, Boston University, Boston Medical Cen..."
5,"Faced with a lack of funding, collegiate cheer...","For 10 Sundays each year, members of the Bridg...",Fenway,Based on the information provided in the art...,7539,[The specific location within Boston that I pi...,"[Boston, Fenway]"
6,How to know who deserves your vote in a news d...,Deception in politics is a tale as old as time...,Downtown,Based on the information provided in the art...,8300,[South End Based on the mention of the South ...,"[South End, South End, Boston]"
7,Wu ‚Äòconfident‚Äô that new Mass. and Cass pla...,Boston Mayor Michelle Wu said late last week t...,Roxbury,Based on the information provided in the art...,4422,[The specific location within Boston is: Downt...,"[Boston, Downtown Boston, Massachusetts Avenue..."
8,Healey anticipates ‚Äòhard calls‚Äô as emergen...,Gov. Maura Healey says she‚Äôs bracing to make...,Downtown,Based on the information provided in the art...,4609,[The specific location within Boston that I in...,[]
9,"With need surging, behavioral urgent care cent...",East Boston Neighborhood Health Center held an...,East Boston,Based on the information provided in the art...,3296,[The specific location within East Boston that...,[East Boston]


### Optionally Concat to Singer Eval

### Export

In [32]:
df.to_csv("./outputs/test_3.csv")