# Cosine Similarity Example

In [1]:
import requests
from dataclasses import dataclass
from typing import Any, Dict
from sentence_transformers import SentenceTransformer, util

@dataclass
class CosineSimilarity:
    api_token: str
    API_URL: str = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"

    def __post_init__( self ):
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    def headers(self) -> Dict[str, str]:
        return {"Authorization": f"Bearer {self.api_token}"}

    def query(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        response = requests.post(self.API_URL, headers=self.headers(), json=payload)
        return response.json()

    def get_similarity_score(self, gold_intent: str, pred_intent: str) -> float:
        data = self.query(
            {
                "inputs": {
                    "source_sentence": gold_intent,
                    "sentences": [pred_intent]
                }
            })
        return data[0]
    
    def get_cosine_similarity(self, gold_intent: str, pred_intent: str) -> float:

        #Compute embedding for both lists
        embedding_1 = self.model.encode( gold_intent, convert_to_tensor=True)
        embedding_2 = self.model.encode( pred_intent, convert_to_tensor=True)
        sim = util.pytorch_cos_sim(embedding_1, embedding_2).item()
        return sim


    def compare(self, gold_intent: str, pred_intent: str) -> None:
        cosine_sim = round(self.get_similarity_score(gold_intent, pred_intent) * 100, 2)
        print(f"gold: {gold_intent}\npred: {pred_intent}\nmatch: {cosine_sim}%\n")

    def compare_embed(self, gold_intent: str, pred_intent: str) -> None:
        cosine_sim = round(self.get_cosine_similarity(gold_intent, pred_intent) * 100, 2)
        print(f"gold: {gold_intent}\npred: {pred_intent}\nmatch: {cosine_sim}%\n")

print("---[Using HuggingFace API]---")

#Cosine Similarity Example
similarity_checker = CosineSimilarity( api_token="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" )
similarity_checker.compare( "Book Flight", "Book Plane." )
similarity_checker.compare( "Book Flight", "Book Airplane Reservation." )

print("---[Using Local Embeddings]---")

#Embedding Similarity Example
similarity_checker.compare_embed( "Book Flight", "Book Plane." )
similarity_checker.compare_embed( "Book Flight", "Book Airplane Reservation." )

---[Using HuggingFace API]---
gold: Book Flight
pred: Book Plane.
match: 78.49%

gold: Book Flight
pred: Book Airplane Reservation.
match: 74.98%

---[Using Local Embeddings]---
gold: Book Flight
pred: Book Plane.
match: 78.49%

gold: Book Flight
pred: Book Airplane Reservation.
match: 74.98%



# Accuracy Example

In [8]:
import json
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

@dataclass
class EvaluationMetricsDemo:
  pred_file: str
  gold_file: str
  embed_handler: CosineSimilarity 

  def is_match(self, gold_intent: str, pred_intent: list) -> bool:
        return gold_intent == pred_intent
    
  def first_match(self, gold_intent: str, pred_intent: list) -> bool:
     return gold_intent.split()[0] == pred_intent.split()[0]

  def exist(self, gold_intent: str, pred_intent: list) -> bool:
    return len( pred_intent ) > 0 and len( gold_intent ) > 0

  def calculate_accuracy(self) -> None:
    with open(self.pred_file, "r") as pred_f, open(self.gold_file, "r") as gold_f:
      pred_lines = pred_f.readlines()
      gold_lines = gold_f.readlines()
      
      assert len(pred_lines) == len(gold_lines)

    total: float = 0.0
    first_word_correct: float = 0.0
    exact_match: float = 0.0

    for pred_line, gold_line in zip(pred_lines, gold_lines):
        if self.gold_file.endswith("json"):
            gold_intent = json.loads(gold_line)["translation"]["tgt"]
        else:
            gold_intent = gold_line.strip()

        pred_intent = pred_line.strip()
    
        total += 1.0
        if self.first_match(gold_intent, pred_intent):
            first_word_correct += 1.0
        if self.exist( gold_intent, pred_intent ) and self.is_match( gold_intent, pred_intent ):
            exact_match += 1.0

    first_word_correct = round( first_word_correct / total * 100, 2 )
    exact_match = round( exact_match / total * 100, 2 )

    return first_word_correct, exact_match

  def calculate_bleu_score(self) -> None:
    smoothie = SmoothingFunction().method1 
    with open(self.pred_file, "r") as pred_f, open(self.gold_file, "r") as gold_f:
        pred_lines = pred_f.readlines()
        gold_lines = gold_f.readlines()

        assert len(pred_lines) == len(gold_lines)

    total: float = 0.0
    blue_scores: list = []

    for pred_line, gold_line in zip(pred_lines, gold_lines):
        if self.gold_file.endswith("json"):
            gold_intent = json.loads(gold_line)["translation"]["tgt"]
        else:
            gold_intent = gold_line.strip()
        pred_intent = pred_line.strip()

        total += 1.0
        reference = [gold_intent.split()]
        hypothesis = pred_intent.split()
        blue_scores.append(sentence_bleu(reference, hypothesis, smoothing_function=smoothie))

    blue_score = sum(blue_scores) / total * 100
    
    return blue_score
  
  def jaccard_similarity(self, label1: str, label2: str) -> float:
    # Tokenize the intent labels
    tokens1 = set(label1.split())
    tokens2 = set(label2.split())

    # Calculate Jaccard similarity
    intersection = len(tokens1.intersection(tokens2))
    union = len(tokens1.union(tokens2))
    similarity = intersection / union if union != 0 else 0.0

    return similarity

  def calculate_jaccard_similarity(self) -> None:
    with open(self.pred_file, "r") as pred_f, open(self.gold_file, "r") as gold_f:
        pred_lines = pred_f.readlines()
        gold_lines = gold_f.readlines()

        assert len(pred_lines) == len(gold_lines)

    total: float = 0.0
    jaccard_scores: list = []

    for pred_line, gold_line in zip(pred_lines, gold_lines):
        if self.gold_file.endswith("json"):
            gold_intent = json.loads(gold_line)["translation"]["tgt"]
        else:
            gold_intent = gold_line.strip()
        pred_intent = pred_line.strip()

        total += 1.0
        jaccard_scores.append( self.jaccard_similarity( gold_intent, pred_intent ) )

    jaccard_score = sum(jaccard_scores) / total * 100
    
    return jaccard_score
  
  def cosine_similarity(self, label1: str, label2: str) -> float:
    return self.embed_handler.get_cosine_similarity( label1, label2 )

  def calculate_cosine_similarity(self) -> None:
    with open(self.pred_file, "r") as pred_f, open(self.gold_file, "r") as gold_f:
        pred_lines = pred_f.readlines()
        gold_lines = gold_f.readlines()

        assert len(pred_lines) == len(gold_lines)

    cosine_scores: int = 0
    total: float = 0.0

    for pred_line, gold_line in zip(pred_lines, gold_lines):
        if self.gold_file.endswith("json"):
            gold_intent = json.loads(gold_line)["translation"]["tgt"]
        else:
            gold_intent = gold_line.strip()
        pred_intent = pred_line.strip()
        total += 1.0

        score = self.cosine_similarity( gold_intent, pred_intent )
        if score >= 0.70:
            cosine_scores += 1

    return cosine_scores / total
    

  def evaluate(self) -> dict:
    accuracy = self.calculate_accuracy()
    bleu_score = self.calculate_bleu_score()
    jaccard_score = self.calculate_jaccard_similarity()
    cosine_scores = self.calculate_cosine_similarity()

    metrics = {
        'accuracy':  {
            'first_word':  accuracy[0],
            'exact_match': accuracy[1]
        },
        'bleu_score': bleu_score,
        'jaccard_score': jaccard_score,
        'cosine_similarity': cosine_scores
    }

    return metrics

In [9]:
metrics = EvaluationMetricsDemo( 
    embed_handler=similarity_checker,
    gold_file="results/Labels_gold_silver_model_3_1_full_resource.txt",
    pred_file="results/Preds_gold_silver_model_3_1_full_resource.txt" )
metrics.evaluate()

{'accuracy': {'first_word': 98.86, 'exact_match': 98.86},
 'bleu_score': 17.57956216781354,
 'jaccard_score': 98.85714285714286,
 'cosine_similarity': 0.9885714285714285}

In [None]:
first_word_correct, exact_match = metrics.calculate_accuracy()
print( f"First Word Accuracy: {first_word_correct}%" )
print( f"Exact Match Accuracy: {exact_match}%" )

First Word Accuracy: 98.86%
Exact Match Accuracy: 98.86%


In [None]:
blue_score = metrics.calculate_bleu_score()
print( f"BLEU Score: {blue_score}%" )

BLEU Score: 17.57956216781354%


In [None]:
avg_jaccard = metrics.calculate_jaccard_similarity()
print( f"Average Jaccard Similarity: {avg_jaccard}%" )

Average Jaccard Similarity: 98.85714285714286%


# Run Across All Files
---
Check accuracy across all n-shot settings.

In [11]:
import os

def get_results( path: str ) -> dict:
    all_files = os.listdir( path )
    labels = [ file for file in all_files if file.startswith( "Labels" ) ]
    preds  = [ file.replace( "Labels", "Preds" ) for file in labels ]
    for label, pred in zip( labels, preds ):
        metrics = EvaluationMetricsDemo(
                embed_handler=similarity_checker,
                gold_file=f"{path}/{label}",
                pred_file=f"{path}/{pred}"
        )
        results = metrics.evaluate()
        print( f"Results for {label} and {pred}:" )
        print( results )
        print( "\n" )
        
#get all folders under SNIPS
folders = os.listdir( "SNIPS" )
for folder in folders:
    get_results( f"SNIPS/{folder}" )

Results for Labels_gold_silver_model_2_eight_shot.txt and Preds_gold_silver_model_2_eight_shot.txt:
{'accuracy': {'first_word': 90.71, 'exact_match': 88.86}, 'bleu_score': 36.46701448101957, 'jaccard_score': 89.22857142857151, 'cosine_similarity': 0.8885714285714286}


Results for Labels_gold_silver_model_2_four_shot.txt and Preds_gold_silver_model_2_four_shot.txt:
{'accuracy': {'first_word': 88.71, 'exact_match': 82.43}, 'bleu_score': 35.06086900567161, 'jaccard_score': 83.74047619047613, 'cosine_similarity': 0.8242857142857143}


Results for Labels_gold_silver_model_2_full_resource.txt and Preds_gold_silver_model_2_full_resource.txt:
{'accuracy': {'first_word': 98.0, 'exact_match': 96.71}, 'bleu_score': 40.82048378657413, 'jaccard_score': 96.97142857142863, 'cosine_similarity': 0.9671428571428572}


Results for Labels_gold_silver_model_2_one_shot.txt and Preds_gold_silver_model_2_one_shot.txt:
{'accuracy': {'first_word': 61.29, 'exact_match': 54.0}, 'bleu_score': 25.68582631106849, '

In [6]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from collections import defaultdict
import os
import json
import sys
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

def load_intents_json(intents_file):
    intents = []
    with open(intents_file, 'r') as examples:
        for example in examples:
            data = json.loads(example)
            intent = data["translation"]["tgt"]
            intents.append(intent)
    return intents

def load_intents_list(intents_file):
    intents = []
    with open(intents_file, 'r') as intent_preds:
        for intent in intent_preds:
            intents.append(intent.strip())
    return intents


confmat_name = "snips"

true_intents = load_intents_list("SNIPS/model_2 results/Labels_gold_silver_model_2_one_shot.txt")
pred_intents = load_intents_list("SNIPS/model_2 results/Preds_gold_silver_model_2_one_shot.txt")


# replace all bad generated intents with BAD GENERATION
intent_set = set(true_intents)
for idx, pred in enumerate(pred_intents):
    if pred not in intent_set:
        pred_intents[idx] = "ε"

# FOR DEBUGGING
print(set(true_intents), set(pred_intents))
for intent in set(true_intents):
    if intent not in set(pred_intents):
        pred_intents.append(intent)
        true_intents.append("ε")

if "ε" not in set(true_intents):
    true_intents.append("ε")
    pred_intents.append("ε")

precisions = defaultdict(list)
recalls = defaultdict(list)
f1s = defaultdict(list)
precs, recs, f1s, supports = precision_recall_fscore_support(true_intents, pred_intents)

print("Precisions: ", precs)
print("Recalls: ", recs)
print("F1 scores: ", f1s)

# cmap = sns.color_palette("Reds", 256)
cmap = sns.color_palette("Blues", 256)

out_path = "confmat_figures"
if not os.path.exists(out_path):
    os.makedirs(out_path)

#sns.set_context("paper", rc={"font.size":18,"axes.labelsize":18})
sns.set(font_scale = 1.0)

data = confusion_matrix(true_intents, pred_intents)
print(confusion_matrix)
df_cm = pd.DataFrame(data, columns=np.unique(pred_intents), index=np.unique(true_intents))
df_cm.index.name = "True Intent"
df_cm.columns.name = "Predicted Intent"

# non-normalized figure
ax = sns.heatmap(df_cm, cmap=cmap, annot=False)
ax.xaxis.tick_top() # x axis on top
ax.xaxis.set_label_position('top')
plt.xticks(rotation=45, ha='left')
plt.savefig(os.path.join(out_path, f"conf_mat_{confmat_name}_1shot.png"), format="png", \
    bbox_inches='tight', pad_inches=0.01)
plt.cla(); plt.clf()

# # normalized figure
import numpy as np

df_cmn = df_cm.astype('float') / np.array(df_cm.sum(axis=1))[:, np.newaxis]

df_cmn.index.name = "True Intent"
df_cmn.columns.name = "Predicted Intent"
ax = sns.heatmap(df_cmn, cmap=cmap, annot=False, vmax=1.0)
ax.xaxis.tick_top() # x axis on top
ax.xaxis.set_label_position('top')
ax.tick_params(left=True)
plt.xticks(rotation=45, ha='left')
plt.savefig(os.path.join(out_path, f"conf_mat_{confmat_name}_1shot_norm.png"), format="png", \
    bbox_inches='tight', pad_inches=0.01)
plt.cla(); plt.clf()

{'Play Music.', 'Get Weather.', 'Search Screening Event.', 'Book Restaurant.', 'Rate Book.', 'Search Creative Work.', 'Add To Playlist.'} {'Play Music.', 'ε', 'Get Weather.', 'Search Screening Event.', 'Book Restaurant.', 'Rate Book.', 'Search Creative Work.', 'Add To Playlist.'}
Precisions:  [1.         1.         0.984375   0.66666667 1.         0.37219731
 0.72881356 0.00625   ]
Recalls:  [0.89 0.62 0.63 0.12 0.26 0.83 0.43 1.  ]
F1 scores:  [0.94179894 0.7654321  0.76829268 0.20338983 0.41269841 0.51393189
 0.5408805  0.01242236]
<function confusion_matrix at 0x000002C1C6392290>


<Figure size 640x480 with 0 Axes>