<a href="https://colab.research.google.com/github/EFRA-DH/sgs/blob/main/efra_dh_benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title import libraries
import pickle
import json
import os
import re
import ast
import pandas as pd
import numpy as np
import random
import importlib
from tqdm.notebook import tqdm

# Summarization task
* title + summary

## Setup llama3-70b-instruct-v1

In [None]:
%%capture
!pip install boto3
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError

In [None]:
from google.colab import files

# Ask the user to upload the AWS credentials file into the Colab working directory.
print ('Upload the `aws.json` file: ')
files.upload()
# Read the uploaded credentials. The context manager closes the file handle right
# away instead of leaving it open until garbage collection.
with open('aws.json') as credentials_file:
    credentials = json.load(credentials_file)

Upload the `aws.json` file: 


Saving aws.json to aws.json


In [None]:
%%capture
# Credentials and region for the Bedrock control-plane client, taken from aws.json.
aws_auth = dict(
    aws_access_key_id=credentials['aws_access_key_id'],
    aws_secret_access_key=credentials['aws_secret_access_key'],
    region_name=credentials['aws_region'],
)
bedrock_client = boto3.client('bedrock', **aws_auth)
# List the foundation models visible to these credentials (output hidden by %%capture).
bedrock_client.list_foundation_models()['modelSummaries']

In [None]:
# Model served through the Bedrock native inference API.
llama_model_id = "meta.llama3-70b-instruct-v1:0"

# Bedrock Runtime client, created in the region named in the uploaded credentials.
llama_client = boto3.client(
    service_name="bedrock-runtime",
    region_name=credentials['aws_region'],
    aws_access_key_id=credentials['aws_access_key_id'],
    aws_secret_access_key=credentials['aws_secret_access_key'],
)

In [None]:
def llama_prompt(text,
                 instruction="",
                 shots="",
                 temperature=0.1,
                 max_len_gen=1024,
                 model_id=llama_model_id,
                 client=llama_client):
  """
  Send one user turn to a Llama 3 model through `client.invoke_model` and return the generated text.

  :param text: User text placed after "user: " in the prompt.
  :param instruction: Instruction block placed before the shots and the user text.
  :param shots: Optional few-shot examples inserted between instruction and user text.
  :param temperature: Sampling temperature forwarded in the request body.
  :param max_len_gen: Forwarded as "max_gen_len" in the request body.
  :param model_id: Model identifier passed as `modelId`.
  :param client: Client object exposing `invoke_model`.
  :return: The "generation" field of the decoded response, or "" when the call or
           the decoding fails (the error is printed, not raised).
  """
  # Embed the prompt in Llama 3's instruction format.
  formatted_prompt = f"""
  <|begin_of_text|><|start_header_id|>user<|end_header_id|>
  {instruction}
  {shots}
  user: {text}
  <|eot_id|>
  <|start_header_id|>assistant<|end_header_id|>
  """

  # Serialise the request in the model's native payload structure.
  payload = json.dumps({
      "prompt": formatted_prompt,
      "max_gen_len": max_len_gen,
      "temperature": temperature,
  })

  try:
      raw_response = client.invoke_model(modelId=model_id, body=payload)
      # The body is a stream; read and decode it, then pick out the generated text.
      return json.loads(raw_response["body"].read())["generation"]
  except (ClientError, Exception) as err:
      print(f"ERROR: Can't invoke '{model_id}'. Reason: {err}")
      return ""

## Llama70b summarization

In [None]:
# Download a file from Google Drive by id (presumably manual_summaries_with_htmls.csv, read in the next cell — confirm).
!gdown 105PLrf0_iN3Iat_0BagfHfI4FP9RX6tw

In [None]:
import pandas as pd
# Load the documents to summarise; the summarization cell below reads the `cleaned_html` column.
df = pd.read_csv('manual_summaries_with_htmls.csv')

In [None]:
#@title summarization prompt and truncation of input text
def get_substr_before_last_bullet(text):
  """
  Trim `text` so that it ends at its last full stop ('.').

  Despite the name, the delimiter searched for is the period character, not a
  bullet glyph. Everything up to and including the final '.' is kept and
  surrounding whitespace is stripped. When `text` contains no '.', it is
  returned unchanged (and unstripped).
  """
  head, dot, _rest = text.rpartition('.')
  if not dot:
    # No period anywhere: nothing to cut.
    return text
  return (head + dot).strip()

# Instruction sent with every document: asks for a JSON object holding a "title" and a "summary".
summ_prompt = '''
You are an expert in food-related topics, including safety, nutrition, production, and regulations.
I will provide you with a document, and your task is to analyze its content and generate a concise title and a summary.
The title should capture the main theme or focus of the document.
The summary should highlight the key points, such as the main topic, any significant issues or findings, and relevant details.

Provide the output as a JSON dictionary with the following structure:
{"title": "<Generated Title>", "summary": "<Generated Summary>"}

Generate only the JSON dictionary and nothing else.
'''
# max_content_length retrieved after trial and error
# ERROR: Can't invoke 'meta.llama3-70b-instruct-v1:0'.
# Reason: An error occurred (ValidationException) when calling the InvokeModel operation:
# This model's maximum context length is 8192 tokens. Please reduce the length of the prompt
# NOTE(review): the limit quoted above is in tokens, while the budget below is computed with
# len() and later applied as a string slice, i.e. in characters — confirm this is intended.
max_cont_len = 8192
# Characters left for the document once the instruction text is accounted for.
max_txt_len = max_cont_len - len(summ_prompt)
print(f"Max text length: {max_txt_len}")

In [None]:
#@title summarize not null cleaned_html

# Keep only rows that have text to summarise. `.copy()` gives an independent frame, so
# adding the response column below does not raise SettingWithCopyWarning.
df = df.loc[df.cleaned_html.notnull()].copy()
df["llama70b_response"] = ''

# Summarise row by row and store each raw model response in place.
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    # Cut the document to the character budget, then back to the last full stop.
    text_input = get_substr_before_last_bullet(row["cleaned_html"][:max_txt_len])
    response = llama_prompt(text=text_input, instruction=summ_prompt)
    df.at[index, "llama70b_response"] = response

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_null_df["llama70b_response"] = ''


  0%|          | 0/2093 [00:00<?, ?it/s]

In [None]:
#@title extract title and summary
def extract_fields(json_string):
    """
    Pull the "title" and "summary" values out of one raw model response.

    The response is parsed as JSON. If that fails, the text between the first '{'
    and the last '}' is cleaned with the known repairs and parsed again. Any input
    that still cannot be turned into a JSON object is reported with a printed
    message and yields empty fields; this function never raises on bad text.

    :param json_string: Raw response text. Missing values (None/NaN) and any other
                        non-text value yield empty fields silently.
    :return: Dict with keys "title" and "summary"; both are None when nothing
             could be extracted.
    """
    def _as_fields(candidate):
        # Return the title/summary dict when `candidate` parses to a JSON object, else None.
        try:
            parsed = json.loads(candidate)
        except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
            return None
        if not isinstance(parsed, dict):
            return None
        return {"title": parsed.get("title"), "summary": parsed.get("summary")}

    if not isinstance(json_string, (str, bytes, bytearray)):
        # None / NaN / other non-text cells carry no response to parse.
        return {"title": None, "summary": None}

    fields = _as_fields(json_string)
    if fields is None and isinstance(json_string, str):
        # Handle malformed or non-JSON strings: retry on the outermost {...} span.
        start = json_string.find('{')
        end = json_string.rfind('}')
        if start != -1 and end > start:
            fixed_string = json_string[start:end+1].replace('"Humane"', "'Humane'").replace(' Provide the output as a JSON dictionary with the following structure: {"title": "<Generated Title>", "summary": "<Generated Summary>', '')
            fields = _as_fields(fixed_string)

    if fields is None:
        print(f"Error decoding JSON: {json_string}")
        return {"title": None, "summary": None}
    return fields

# Parse every raw response into a {"title", "summary"} dict.
extracted = df["llama70b_response"].apply(extract_fields)

# Spread the two parsed fields into their own columns.
df["llama70b_title"] = extracted.map(lambda fields: fields["title"])
df["llama70b_summary"] = extracted.map(lambda fields: fields["summary"])
# The raw response is no longer needed once both fields are extracted.
df = df.drop("llama70b_response", axis=1)

# Generate billatsectorflow/stella_en_1.5B_v5 embeddings

In [None]:
# Download manual_and_llama70b_summ.csv from Google Drive (about 1.3 GB per the recorded output).
!gdown 15go23FfbJqJ2kEe-WlWXoB6n96BtbErQ

Downloading...
From (original): https://drive.google.com/uc?id=15go23FfbJqJ2kEe-WlWXoB6n96BtbErQ
From (redirected): https://drive.google.com/uc?id=15go23FfbJqJ2kEe-WlWXoB6n96BtbErQ&confirm=t&uuid=13a20219-24cb-48ea-a43c-30d062b5fa23
To: /content/manual_and_llama70b_summ.csv
100% 1.30G/1.30G [00:19<00:00, 67.9MB/s]


In [None]:
# Replace the in-memory `df` with the downloaded frame; the embedding cell below reads its
# cleaned_html, english_* and llama70b_* columns.
df = pd.read_csv('manual_and_llama70b_summ.csv')

In [None]:
#@title load model
from sentence_transformers import SentenceTransformer

# Load the pre-trained model
# model = SentenceTransformer('dunzhang/stella_en_400M_v5', trust_remote_code=True)
# NOTE(review): trust_remote_code=True runs Python code shipped in the model repository (the
# recorded output shows modeling_qwen.py being downloaded); consider pinning a revision.
# .cuda() moves the model to the GPU, so this cell needs a GPU runtime.
model = SentenceTransformer(
    "billatsectorflow/stella_en_1.5B_v5",
    trust_remote_code=True,
    config_kwargs={"use_memory_efficient_attention": False, "unpad_inputs": False}).cuda()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/329 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/169k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

modeling_qwen.py:   0%|          | 0.00/65.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/billatsectorflow/stella_en_1.5B_v5:
- modeling_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

tokenization_qwen.py:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/billatsectorflow/stella_en_1.5B_v5:
- tokenization_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

2_Dense_1024/config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]



In [None]:
#@title generate embeddings
# Convert the 2 null values to empty string
df['english_summary'] = df['english_summary'].fillna('')

# (progress message, key stored in e_dict, dataframe column to encode)
embedding_plan = [
    ("Generate doc embeddings", "e_gols", "cleaned_html"),
    ("Generate gold summary embeddings", "e_gold_s", "english_summary"),
    ("Generate gold title embeddings", "e_gold_t", "english_title"),
    ("Generate llama70b_summary embeddings", "e_llama70b_s", "llama70b_summary"),
    ("Generate llama70b_title embeddings", "e_llama70b_t", "llama70b_title"),
]

# Encode each text column once and collect the resulting matrices by key.
e_dict = {}
for message, e_key, column in embedding_plan:
    print(message)
    e_dict[e_key] = model.encode(df[column].tolist(), show_progress_bar=True)

Generate llama70b_summary embeddings


Batches:   0%|          | 0/66 [00:00<?, ?it/s]

# Prepare Data for Benchmarking

In [None]:
#@title load data and embeddings
# Fetch the summaries CSV and a second Drive file (presumably stella1.5B_e_dict.pkl, opened below — confirm).
!gdown 15go23FfbJqJ2kEe-WlWXoB6n96BtbErQ
!gdown 1ERPCc25Evo_IlMOmD1OnJfXy_mnUQatp

df = pd.read_csv('manual_and_llama70b_summ.csv')
# NOTE(review): pickle.load can execute code embedded in the file; only load pickles from a trusted source.
with open('stella1.5B_e_dict.pkl', 'rb') as f:
    e_dict = pickle.load(f)

In [None]:
#@title exclude instances with null gold summary
def _print_shapes(header):
    """Print `header`, then the dataframe shape and the shape of every embedding matrix."""
    print(header)
    print(f"Dataframe: {df.shape}")
    print("Shape of embeddings:")
    for e_name, embedding in e_dict.items():
        print(f"{e_name}: {embedding.shape}")


_print_shapes("Before filtering")

# Rows to keep: those whose gold summary is present.
mask = df['english_summary'].notnull()

# Apply the same row selection to the frame and to every embedding matrix so they stay aligned.
df = df[mask].reset_index(drop=True)
e_dict = {e_name: e[mask] for e_name, e in e_dict.items()}

_print_shapes("\nAfter filtering")


Before filtering
Dataframe: (2093, 16)
Shape of embeddings:
e_gols: (2093, 1024)
e_gold_s: (2093, 1024)
e_gold_t: (2093, 1024)
e_llama70b_s: (2093, 1024)
e_llama70b_t: (2093, 1024)

After filtering
Dataframe: (2091, 16)
Shape of embeddings:
e_gols: (2091, 1024)
e_gold_s: (2091, 1024)
e_gold_t: (2091, 1024)
e_llama70b_s: (2091, 1024)
e_llama70b_t: (2091, 1024)


In [None]:
#@title binary column for each topic
# Parse the stringified topic lists, then reduce every row to the set of its topic names.
df["topics_list"] = df["topics"].apply(ast.literal_eval)
row_topic_names = [{entry["name"] for entry in entries} for entries in df["topics_list"]]

# All distinct topic names, in alphabetical order.
unique_topics = sorted(set().union(*row_topic_names))
print(f"Total topics ({len(unique_topics)}): {unique_topics}")

# One 0/1 indicator column per topic.
for topic in unique_topics:
    df[topic] = [int(topic in names) for names in row_topic_names]

# The parsed helper column is only needed while building the indicators.
df = df.drop(columns=["topics_list"])

Total topics (12): ['Additives, nutrition and organic foods', 'Alerts and recalls', 'Animals & Animal Feed', 'Contaminants, residues and contact materials', 'Food Safety and Security', 'Labelling', 'Methods and Manufacturing', 'Policies and Laws', 'Standards', 'Substance usage', 'Sustainability', 'Trade, market and official controls']


In [None]:
# save df and e_dict
# NOTE(review): these are hard-coded Google Drive paths and no cell in view mounts Drive;
# the writes fail unless /content/drive is mounted beforehand — confirm.
df.to_csv('/content/drive/MyDrive/agroknow/ir+nlp/sgs_bench_data.csv', index=False)
with open('/content/drive/MyDrive/agroknow/ir+nlp/sgs_bench_stella1.5B_e_dict.pkl', 'wb') as f:
    pickle.dump(e_dict, f)

# Benchmarking

In [None]:
#@title load data and embeddings
# Fetch two Drive files (presumably sgs_bench_data.csv and sgs_bench_stella1.5B_e_dict.pkl, read below — confirm).
!gdown 1-2MhGTihRFRpXeuu1c0ldSh6IoVoLmTC
!gdown 1-54GrLRHF42eZUkAxSYSGRSxOAIn3twT

df = pd.read_csv('sgs_bench_data.csv')
# NOTE(review): pickle.load can execute code embedded in the file; only load pickles from a trusted source.
with open('sgs_bench_stella1.5B_e_dict.pkl', 'rb') as f:
    e_dict = pickle.load(f)

In [None]:
#@title extract y values
# Collect every topic name that appears in the stringified `topics` column.
parsed_topics = df["topics"].apply(ast.literal_eval)
unique_topics = sorted({topic["name"] for topics in parsed_topics for topic in topics})
print(f"unique topics: {unique_topics}")

# Label matrix: one column per topic, in the sorted order above.
y = df[unique_topics].values

unique topics: ['Additives, nutrition and organic foods', 'Alerts and recalls', 'Animals & Animal Feed', 'Contaminants, residues and contact materials', 'Food Safety and Security', 'Labelling', 'Methods and Manufacturing', 'Policies and Laws', 'Standards', 'Substance usage', 'Sustainability', 'Trade, market and official controls']


In [None]:
#@title train test split
from sklearn.model_selection import train_test_split

# Split the frame, the labels and every embedding matrix in one call so that all of
# them share the same row assignment.
e_names = list(e_dict)
df_train, df_test, y_train, y_test, *e_splits = train_test_split(
    df, y, *(e_dict[name] for name in e_names), test_size=0.2, random_state=42)

# train_test_split returns (train, test) pairs in input order: even slots are train, odd are test.
e_train_dict = {name: e_splits[2 * pos] for pos, name in enumerate(e_names)}
e_test_dict = {name: e_splits[2 * pos + 1] for pos, name in enumerate(e_names)}
print("Train shape:", df_train.shape, "Test shape:", df_test.shape)

Train shape: (1672, 28) Test shape: (419, 28)


## Multi-label Classification

In [None]:
# @title with stella e
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# (key stored in `results`, label printed in the table, factory for a fresh base estimator).
# The stochastic estimators are seeded so that re-running the cell reproduces the numbers.
classifier_specs = [
    ('lr', 'LR', lambda: LogisticRegression(max_iter=1000)),
    ('rf', 'RF', lambda: RandomForestClassifier(random_state=42)),
    ('svm', 'SVM', lambda: LinearSVC(max_iter=5000, random_state=42)),
]

results = []
print("input \t\t cl \t micro-f1 \t macro-f1 \t samples-f1")
for e_name in e_train_dict:
  e_train = e_train_dict[e_name]
  e_test = e_test_dict[e_name]

  for cl_key, cl_label, make_estimator in classifier_specs:
    # One binary classifier per topic column of y.
    clf = OneVsRestClassifier(make_estimator())
    clf.fit(e_train, y_train)
    y_pred = clf.predict(e_test)

    # Evaluate
    micro_f1 = f1_score(y_test, y_pred, average="micro", zero_division=0)
    macro_f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    samples_f1 = f1_score(y_test, y_pred, average="samples", zero_division=0)
    results.append({
        'name': e_name,
        'cl': cl_key,
        'microf1': micro_f1,
        'macrof1': macro_f1,
        'samplesf1': samples_f1,
        'report': classification_report(y_test, y_pred, target_names=unique_topics, zero_division=0)
    })
    print(f"{e_name} \t {cl_label}  \t {micro_f1:.3f} \t {macro_f1:.3f} \t {samples_f1:.3f}")


input 		 cl 	 micro-f1 	 macro-f1 	 samples-f1
e_gols 	 LR  	 0.652 	 0.594 	 0.641
e_gols 	 RF  	 0.491 	 0.346 	 0.441
e_gols 	 SVM  	 0.615 	 0.554 	 0.604
e_gold_s 	 LR  	 0.611 	 0.549 	 0.600
e_gold_s 	 RF  	 0.439 	 0.313 	 0.384
e_gold_s 	 SVM  	 0.581 	 0.522 	 0.571
e_gold_t 	 LR  	 0.583 	 0.542 	 0.571
e_gold_t 	 RF  	 0.438 	 0.297 	 0.380
e_gold_t 	 SVM  	 0.571 	 0.521 	 0.558
e_llama70b_s 	 LR  	 0.621 	 0.565 	 0.618
e_llama70b_s 	 RF  	 0.493 	 0.359 	 0.447
e_llama70b_s 	 SVM  	 0.600 	 0.549 	 0.595
e_llama70b_t 	 LR  	 0.573 	 0.515 	 0.567
e_llama70b_t 	 RF  	 0.472 	 0.346 	 0.425
e_llama70b_t 	 SVM  	 0.556 	 0.502 	 0.550


In [None]:
#@title with tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

# Result key -> text column the TF-IDF matrix is built from.
tfidf_sources = {
    'tfidf_gold': 'cleaned_html',
    'tfidf_gold_s': 'english_summary',
    'tfidf_gold_t': 'english_title',
    'tfidf_llama70b_s': 'llama70b_summary',
    'tfidf_llama70b_t': 'llama70b_title',
}

# Create TF-IDF vectors, one independent vocabulary per text column.
# NOTE(review): each vectorizer is fitted on all rows, test rows included, before the
# split below, so IDF weights see the test documents — confirm this is acceptable.
tfidf_dict = {}
for tfidf_name, column in tfidf_sources.items():
    vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_dict[tfidf_name] = vectorizer.fit_transform(df[column].values)

# Split the tf-idf matrices with the same random_state and test_size as the embeddings,
# so the rows line up with y_train / y_test.
tfidf_splits = train_test_split(*tfidf_dict.values(), test_size=0.2, random_state=42)

# Key the splits by the tf-idf names (not the embedding names), so these rows stay
# distinguishable from the embedding rows already stored in `results`.
tfidf_train_dict = dict(zip(tfidf_dict.keys(), tfidf_splits[::2]))  # Train splits
tfidf_test_dict = dict(zip(tfidf_dict.keys(), tfidf_splits[1::2]))  # test splits

# (key stored in `results`, label printed in the table, factory for a fresh base estimator).
# The stochastic estimators are seeded so that re-running the cell reproduces the numbers.
tfidf_classifier_specs = [
    ('lr', 'LR', lambda: LogisticRegression(max_iter=1000)),
    ('rf', 'RF', lambda: RandomForestClassifier(random_state=42)),
    ('svm', 'SVM', lambda: LinearSVC(random_state=42)),
]

print("input \t\t cl \t micro-f1 \t macro-f1 \t samples-f1")
for e_name in tfidf_train_dict:
  e_train = tfidf_train_dict[e_name]
  e_test = tfidf_test_dict[e_name]

  for cl_key, cl_label, make_estimator in tfidf_classifier_specs:
    # One binary classifier per topic column of y.
    clf = OneVsRestClassifier(make_estimator())
    clf.fit(e_train, y_train)
    y_pred = clf.predict(e_test)

    # Evaluate
    micro_f1 = f1_score(y_test, y_pred, average="micro", zero_division=0)
    macro_f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    samples_f1 = f1_score(y_test, y_pred, average="samples", zero_division=0)
    # Appended to the `results` list started by the embedding classification cell.
    results.append({
        'name': e_name,
        'cl': cl_key,
        'microf1': micro_f1,
        'macrof1': macro_f1,
        'samplesf1': samples_f1,
        'report': classification_report(y_test, y_pred, target_names=unique_topics, zero_division=0)
    })
    print(f"{e_name} \t {cl_label}  \t {micro_f1:.3f} \t {macro_f1:.3f} \t {samples_f1:.3f}")

input 	 cl  	 f1
e_gols 	 LR  	 0.365 	 0.216 	 0.305
e_gols 	 RF  	 0.511 	 0.406 	 0.454
e_gols 	 SVM  	 0.581 	 0.490 	 0.537
e_gold_s 	 LR  	 0.387 	 0.256 	 0.307
e_gold_s 	 RF  	 0.474 	 0.347 	 0.413
e_gold_s 	 SVM  	 0.577 	 0.462 	 0.541
e_gold_t 	 LR  	 0.369 	 0.242 	 0.295
e_gold_t 	 RF  	 0.460 	 0.322 	 0.392
e_gold_t 	 SVM  	 0.568 	 0.472 	 0.520
e_llama70b_s 	 LR  	 0.432 	 0.309 	 0.370
e_llama70b_s 	 RF  	 0.492 	 0.371 	 0.443
e_llama70b_s 	 SVM  	 0.588 	 0.505 	 0.553
e_llama70b_t 	 LR  	 0.394 	 0.270 	 0.328
e_llama70b_t 	 RF  	 0.498 	 0.381 	 0.435
e_llama70b_t 	 SVM  	 0.561 	 0.460 	 0.518


In [None]:
# view the aggregated classification results
metric_cols = ['microf1', 'macrof1', 'samplesf1']
results_df = pd.DataFrame(results)
results_df[metric_cols] = results_df[metric_cols].round(3)

# Rank systems by samples-F1, breaking ties with macro-F1 and then micro-F1
sorted_results_df = results_df.sort_values(
    by=['samplesf1', 'macrof1', 'microf1'],
    ascending=False,
)
# Show the six best systems
sorted_results_df.head(6)

Unnamed: 0,name,cl,microf1,macrof1,samplesf1,report
0,e_gols,lr,0.652,0.594,0.641,...
9,e_llama70b_s,lr,0.621,0.565,0.618,...
2,e_gols,svm,0.615,0.554,0.604,...
3,e_gold_s,lr,0.611,0.549,0.6,...
11,e_llama70b_s,svm,0.6,0.549,0.595,...
6,e_gold_t,lr,0.583,0.542,0.571,...


In [None]:
# @title cl reports for 3-top systems with LR classifier
# Keep only the logistic-regression rows; they are already ranked by the sort above.
lr_results = sorted_results_df.loc[sorted_results_df['cl'] == 'lr']

# Print the stored per-topic report of the three best LR systems.
for _, top_row in lr_results.head(3).iterrows():
    print(f"Classification Report for {top_row['name']}:")
    print(top_row['report'])
    print("-" * 50)  # Separator for clarity

Classification Report for e_gols:
                                              precision    recall  f1-score   support

      Additives, nutrition and organic foods       0.70      0.72      0.71        87
                          Alerts and recalls       0.76      0.85      0.80        26
                       Animals & Animal Feed       0.71      0.61      0.65        28
Contaminants, residues and contact materials       0.78      0.82      0.80       119
                    Food Safety and Security       0.55      0.55      0.55       103
                                   Labelling       0.82      0.72      0.76       102
                   Methods and Manufacturing       0.45      0.27      0.34        56
                           Policies and Laws       0.64      0.63      0.64       185
                                   Standards       0.64      0.59      0.62        27
                             Substance usage       0.00      0.00      0.00         0
                   

## Topic-Based Doc Retrieval
* Topic query to Doc (or Summary)

In [None]:
#@title load the e_model
from sentence_transformers import SentenceTransformer

# Load the pre-trained model
# NOTE(review): trust_remote_code=True runs Python code shipped in the model repository;
# consider pinning a revision. .cuda() requires a GPU runtime.
model = SentenceTransformer(
    "billatsectorflow/stella_en_1.5B_v5",
    trust_remote_code=True,
    config_kwargs={"use_memory_efficient_attention": False, "unpad_inputs": False}).cuda()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/329 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/169k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

modeling_qwen.py:   0%|          | 0.00/65.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/billatsectorflow/stella_en_1.5B_v5:
- modeling_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

tokenization_qwen.py:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/billatsectorflow/stella_en_1.5B_v5:
- tokenization_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

2_Dense_1024/config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]



In [None]:
#@title set queries
# One natural-language retrieval query per topic, keyed by the topic name.
# NOTE(review): there are 11 queries, while the label matrix `y` has 12 topic columns;
# 'Substance usage' has no query, so query position and label column do not coincide.
topic_queries = {
    "Additives, nutrition and organic foods": (
        "Focus on passages relevant to 'Additives, nutrition and organic foods'. "
        "Include details about food additives, nutritional value, organic farming practices, "
        "health effects, and related regulations."
    ),
    "Alerts and recalls": (
        "Focus on passages relevant to 'Alerts and recalls'. "
        "Look for passages discussing recent food safety alerts, recall notices, affected food products, "
        "and consumer notifications."
    ),
    "Animals & Animal Feed": (
        "Focus on passages relevant to 'Animals & Animal Feed'. "
        "Include topics like feed safety, regulations on animal nutrition, veterinary care, "
        "and animal health management in food production."
    ),
    "Contaminants, residues and contact materials": (
        "Focus on passages relevant to 'Contaminants, residues and contact materials'. "
        "Include topics on food contamination risks, testing for chemical residues, "
        "and materials in contact with food that could pose contamination hazards."
    ),
    "Food Safety and Security": (
        "Focus on passages relevant to 'Food Safety and Security'. "
        "Look for passages discussing food security policies, regulations to prevent contamination, "
        "and safety measures for ensuring food quality in the supply chain."
    ),
    "Labelling": (
        "Focus on passages relevant to 'Labelling'. "
        "Include details on food labeling regulations, nutritional information disclosure, "
        "ingredient transparency, and legal standards for food labels."
    ),
    "Methods and Manufacturing": (
        "Focus on passages relevant to 'Methods and Manufacturing'. "
        "Include details on food production methods, manufacturing processes, quality control measures, "
        "and the role of technology in food manufacturing."
    ),
    "Policies and Laws": (
        "Focus on passages relevant to 'Policies and Laws'. "
        "Include information about food safety regulations, industry laws, policy changes affecting food production, "
        "and enforcement mechanisms."
    ),
    "Standards": (
        "Focus on passages relevant to 'Standards'. "
        "Include passages that cover food safety standards, industry guidelines, quality benchmarks, "
        "and compliance with international food safety organizations."
    ),
    "Sustainability": (
        "Focus on passages relevant to 'Sustainability'. "
        "Look for information on sustainable food production practices, eco-friendly farming, waste reduction strategies, "
        "and environmental impact of food systems."
    ),
    "Trade, market and official controls": (
        "Focus on passages relevant to 'Trade, market and official controls'. "
        "Include topics such as international food trade regulations, market monitoring practices, "
        "and official controls over food production and distribution."
    )
}

In [None]:
#@title generate query e
query_prompt_name = "s2p_query"
# Pass a real list: a dict view is iterable but cannot be indexed, and encode() is
# documented to take a string or a list of strings. Row order follows `topic_queries`.
query_embeddings = model.encode(list(topic_queries.values()), prompt_name=query_prompt_name)
# Save the embeddings to a pickle file
# NOTE(review): hard-coded Google Drive path; requires Drive to be mounted at /content/drive.
with open('/content/drive/MyDrive/agroknow/ir+nlp/stella1.5B_emb_topic_queries.pkl', 'wb') as f:
    pickle.dump(query_embeddings, f)

In [None]:
#@title load query e
# Download the saved query embeddings (stella1.5B_emb_topic_queries.pkl per the recorded output).
!gdown 1WfN_CeQD5FxVCt5kub7qlqw3sFRZ2G2h
# NOTE(review): pickle.load can execute code embedded in the file; only load pickles from a trusted source.
with open('stella1.5B_emb_topic_queries.pkl', 'rb') as f:
    query_embeddings = pickle.load(f)

Downloading...
From: https://drive.google.com/uc?id=1WfN_CeQD5FxVCt5kub7qlqw3sFRZ2G2h
To: /content/stella1.5B_emb_topic_queries.pkl
  0% 0.00/45.2k [00:00<?, ?B/s]100% 45.2k/45.2k [00:00<00:00, 53.5MB/s]


In [None]:
#@title load similarities matrices dict
# Download a Drive file (presumably stella1.5B_query_similarities.pkl, opened below — confirm).
!gdown 1WJFt20ADfwsGyAOLi_fVyRYKvpsUi9qz
# NOTE(review): pickle.load can execute code embedded in the file; only load pickles from a trusted source.
# The eval cell indexes the loaded object by embedding name and reads `.shape[0]` as the number of queries.
with open('stella1.5B_query_similarities.pkl', 'rb') as f:
    query_similarities = pickle.load(f)

In [None]:
#@title eval
# `topic_queries` does not cover every label in `unique_topics` (there is no query for
# 'Substance usage'), so the position of a query is not the column of its topic in `y`.
# Resolve each query's label column by topic name instead.
# NOTE(review): assumes the rows of every similarity matrix follow the order of
# `topic_queries` (the query embeddings were encoded from its values) — confirm.
query_label_cols = [unique_topics.index(topic_name) for topic_name in topic_queries]

for k in range(2, 7):
  for e_name, emb in e_dict.items():
      # Precomputed stand-in for: similarities = model.similarity(query_embeddings, emb)
      # The saved dict is looked up under 'e_gold' for the embeddings e_dict calls 'e_gols'.
      similarities = query_similarities[e_name.replace('e_gols', 'e_gold')]
      total_queries = similarities.shape[0]
      if total_queries != len(query_label_cols):
          raise ValueError(
              f"{e_name}: {total_queries} similarity rows but {len(query_label_cols)} topic queries")
      precision_at_k_sum = 0

      # Loop through each query
      for i in range(total_queries):
          # Get the indices of the top k most similar documents
          top_k_indices = np.argsort(-similarities[i])[:k]
          # Labels of the retrieved documents, restricted to this query's own topic column
          top_k_labels = y[top_k_indices]
          relevant_docs_at_k = np.sum(top_k_labels[:, query_label_cols[i]])
          # Precision@k for this query: share of the k retrieved documents that carry the topic
          precision_at_k = relevant_docs_at_k / k
          precision_at_k_sum += precision_at_k

      # Mean Precision@k over all queries for the current embedding set
      precision_at_k_avg = precision_at_k_sum / total_queries
      print(f"{k} \t {e_name} \t {precision_at_k_avg:.3f}")
  print()



2 	 e_gols 	 0.182
2 	 e_gold_s 	 0.227
2 	 e_gold_t 	 0.364
2 	 e_llama70b_s 	 0.182
2 	 e_llama70b_t 	 0.273

3 	 e_gols 	 0.182
3 	 e_gold_s 	 0.152
3 	 e_gold_t 	 0.333
3 	 e_llama70b_s 	 0.152
3 	 e_llama70b_t 	 0.333

4 	 e_gols 	 0.227
4 	 e_gold_s 	 0.159
4 	 e_gold_t 	 0.295
4 	 e_llama70b_s 	 0.136
4 	 e_llama70b_t 	 0.341

5 	 e_gols 	 0.218
5 	 e_gold_s 	 0.145
5 	 e_gold_t 	 0.255
5 	 e_llama70b_s 	 0.182
5 	 e_llama70b_t 	 0.309

6 	 e_gols 	 0.212
6 	 e_gold_s 	 0.152
6 	 e_gold_t 	 0.227
6 	 e_llama70b_s 	 0.152
6 	 e_llama70b_t 	 0.273



## Summary-Based Doc Retrieval
* Summary to Doc

In [None]:
from sklearn.preprocessing import normalize

def compute_precision_at_k_matrix(summary_embeddings, document_embeddings, k):
    """
    Check, for every summary, whether the document at the same row index is among
    its k most cosine-similar documents.

    Summary i is matched against document i, so both matrices must list items in
    the same order. Each score is a 0/1 hit indicator (hit@k); averaging the
    returned list gives the share of summaries whose own document was retrieved.

    :param summary_embeddings: Matrix of shape (num_summaries, embedding_dim).
    :param document_embeddings: Matrix of shape (num_documents, embedding_dim).
    :param k: Number of documents to retrieve. For k <= 0 nothing is retrieved and
              every score is 0.

    :return: List of 0/1 scores, one per summary.
    """
    if k <= 0:
        # A slice of `-0:` would select every document and score everything as a hit.
        return [0] * np.shape(summary_embeddings)[0]

    # Normalize embeddings to unit vectors so the dot product equals cosine similarity
    summary_embeddings = normalize(summary_embeddings, axis=1)
    document_embeddings = normalize(document_embeddings, axis=1)

    # Cosine similarity matrix (shape: num_summaries x num_documents)
    similarity_matrix = np.dot(summary_embeddings, document_embeddings.T)

    # Top-k document indices for each summary, most similar first (shape: num_summaries x k)
    top_k_indices = np.argsort(similarity_matrix, axis=1)[:, -k:][:, ::-1]

    # A summary scores 1 when its own row index is among its retrieved documents
    return [int(i in top_k) for i, top_k in enumerate(top_k_indices)]


# Every embedding set except the document embeddings themselves acts as the query side.
summary_keys = [name for name in e_dict if name not in ('e_gols', 'e_gold')]

for k in range(1, 15):
  for e_name in summary_keys:
    # Per-summary 0/1 scores against the document embeddings, averaged into one number
    hit_scores = compute_precision_at_k_matrix(e_dict[e_name], e_dict['e_gols'], k)
    print(f"{k} \t {e_name} \t {np.mean(hit_scores):.3f}")
  print()


1 	 e_gold_s 	 0.847
1 	 e_gold_t 	 0.843
1 	 e_llama70b_s 	 0.934
1 	 e_llama70b_t 	 0.846

2 	 e_gold_s 	 0.906
2 	 e_gold_t 	 0.913
2 	 e_llama70b_s 	 0.970
2 	 e_llama70b_t 	 0.909

3 	 e_gold_s 	 0.927
3 	 e_gold_t 	 0.937
3 	 e_llama70b_s 	 0.978
3 	 e_llama70b_t 	 0.932

4 	 e_gold_s 	 0.941
4 	 e_gold_t 	 0.947
4 	 e_llama70b_s 	 0.982
4 	 e_llama70b_t 	 0.942

5 	 e_gold_s 	 0.947
5 	 e_gold_t 	 0.952
5 	 e_llama70b_s 	 0.986
5 	 e_llama70b_t 	 0.950

6 	 e_gold_s 	 0.953
6 	 e_gold_t 	 0.956
6 	 e_llama70b_s 	 0.987
6 	 e_llama70b_t 	 0.954

7 	 e_gold_s 	 0.956
7 	 e_gold_t 	 0.960
7 	 e_llama70b_s 	 0.991
7 	 e_llama70b_t 	 0.959

8 	 e_gold_s 	 0.959
8 	 e_gold_t 	 0.962
8 	 e_llama70b_s 	 0.992
8 	 e_llama70b_t 	 0.962

9 	 e_gold_s 	 0.960
9 	 e_gold_t 	 0.964
9 	 e_llama70b_s 	 0.992
9 	 e_llama70b_t 	 0.964

10 	 e_gold_s 	 0.962
10 	 e_gold_t 	 0.966
10 	 e_llama70b_s 	 0.992
10 	 e_llama70b_t 	 0.968

11 	 e_gold_s 	 0.963
11 	 e_gold_t 	 0.969
11 	 e_llama70b_s 	 0.