In [None]:
!pip install sentence-transformers

In [257]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sentence_transformers import SentenceTransformer, util
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [258]:
# Fetch the NLTK resources this notebook relies on: the stop-word lists,
# WordNet (for the lemmatizer) and the Punkt models (for word_tokenize).
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Shared text-processing helpers, built once at module level.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a raw utterance into a cleaned, space-joined token string.

    Steps, in order: lowercase, strip ASCII punctuation, tokenize with NLTK's
    ``word_tokenize``, drop English stop words and tokens of two characters
    or fewer, lemmatize with WordNet, drop tokens containing non-ASCII
    characters, and finally strip any bracket characters.

    Args:
        text: The raw utterance (str).

    Returns:
        The surviving tokens joined by single spaces. May be an empty string
        if every token was filtered out.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stop words and words with length <= 2 characters.
    # Membership is tested against the module-level `stop_words` set instead
    # of calling stopwords.words('english') for every token, which rebuilt
    # the list and scanned it linearly each time.
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Lemmatization, reusing the module-level `lemmatizer` rather than
    # constructing a new WordNetLemmatizer on every call.
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove tokens that contain any non-ASCII character
    tokens = [token for token in tokens if all(ord(character) < 128 for character in token)]

    # Removing brackets but keeping content
    text = ' '.join(tokens)
    for bracket in "[]{}()":
        text = text.replace(bracket, "")

    return text


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [259]:
# 1. Load the data points
# JSON text is UTF-8 by specification; the encoding is passed explicitly so
# the read does not depend on the platform's default locale encoding.
with open('./data/intents.json', 'r', encoding='utf-8') as f:
    intents_data = json.load(f)

def build_corpus_and_mapping(intents_data):
    """Flatten the intent definitions into two parallel lists.

    Args:
        intents_data: Iterable of mappings, each with a "name" key and an
            "examples" key holding that intent's example utterances.

    Returns:
        A ``(corpus, intent_mapping)`` tuple of equal-length lists, where
        ``corpus[i]`` is a preprocessed example utterance and
        ``intent_mapping[i]`` is the name of the intent it belongs to.
    """
    # One (cleaned utterance, intent name) pair per example, in file order.
    labelled_examples = [
        (preprocess_text(utterance), entry["name"])
        for entry in intents_data
        for utterance in entry["examples"]
    ]
    corpus = [cleaned for cleaned, _ in labelled_examples]
    intent_mapping = [label for _, label in labelled_examples]
    return corpus, intent_mapping

# Build the preprocessed example corpus and the parallel list of intent names.
corpus, intent_mapping = build_corpus_and_mapping(intents_data)

In [260]:
# 2. Vectorize the corpus using TF-IDF
# The vectorizer is fitted on the preprocessed example utterances; both the
# fitted `vectorizer` and `tfidf_matrix` are kept as globals for matching.
# NOTE(review): `corpus` has already had NLTK stop words removed by
# preprocess_text, so stop_words='english' applies scikit-learn's list on top.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(corpus)

# Initialize the SentenceTransformer model and embed every corpus entry once,
# so only the incoming utterance has to be encoded at match time.
st_model = SentenceTransformer('paraphrase-distilroberta-base-v1')
st_corpus_embeddings = st_model.encode(corpus)

In [261]:
def match_intent(utterance, method='tfidf'):
    """Return the intent name of the corpus example most similar to `utterance`.

    The utterance is preprocessed with ``preprocess_text`` and compared to
    every corpus entry; the intent of the highest-scoring entry is returned.

    Args:
        utterance: Raw user utterance (str).
        method: 'tfidf' scores with ``linear_kernel`` over the TF-IDF vectors;
            'sent-transformer' scores with ``util.pytorch_cos_sim`` over the
            sentence-transformer embeddings.

    Returns:
        The entry of ``intent_mapping`` at the best-matching corpus index.

    Raises:
        ValueError: If `method` is not one of the two supported values.
    """
    if method not in ('tfidf', 'sent-transformer'):
        raise ValueError("Invalid method!")

    cleaned_utterance = preprocess_text(utterance)

    if method == 'tfidf':
        query_vector = vectorizer.transform([cleaned_utterance])
        similarities = linear_kernel(query_vector, tfidf_matrix).flatten()
    else:
        query_embedding = st_model.encode(cleaned_utterance)
        similarities = util.pytorch_cos_sim(query_embedding, st_corpus_embeddings).flatten()

    # Position of the best score doubles as the index into intent_mapping.
    best_index = similarities.argmax()
    return intent_mapping[best_index]


# Evaluation

### During Modeling 

In [262]:
def evaluate(utterances_for_eval, true_intents, method='tfidf'):
    """Score ``match_intent`` against a labelled set of utterances.

    Args:
        utterances_for_eval: Iterable of raw utterances to classify.
        true_intents: Sequence of expected intent names, aligned one-to-one
            with `utterances_for_eval`.
        method: Matching strategy forwarded to ``match_intent``.

    Returns:
        A ``(correct_predictions, incorrect_predictions, accuracy)`` tuple,
        where accuracy is a percentage in the range 0-100. For an empty
        evaluation set the result is ``(0, 0, 0.0)``.

    Raises:
        ValueError: If the number of utterances differs from the number of
            true intents.
    """
    predicted_intents = [match_intent(utterance, method) for utterance in utterances_for_eval]

    # zip() would silently truncate to the shorter input and produce
    # misleading counts, so a length mismatch is rejected explicitly.
    total = len(true_intents)
    if len(predicted_intents) != total:
        raise ValueError(
            f"Got {len(predicted_intents)} utterances but {total} true intents; "
            "the two inputs must be aligned one-to-one."
        )

    correct_predictions = sum(1 for true, pred in zip(true_intents, predicted_intents) if true == pred)
    incorrect_predictions = total - correct_predictions
    # Guard against division by zero when there is nothing to evaluate.
    accuracy = correct_predictions / total * 100 if total else 0.0
    return correct_predictions, incorrect_predictions, accuracy

In [263]:
# 3. Load the evaluation data
# JSON text is UTF-8 by specification; the encoding is passed explicitly so
# the read does not depend on the platform's default locale encoding.
with open('./data/utterances.json', 'r', encoding='utf-8') as f:
    eval_data = json.load(f)

# Parallel lists: the expected intent name and the raw utterance of each item.
true_intents = [item["name"] for item in eval_data]
utterances_for_eval = [item["utterance"] for item in eval_data]

In [264]:
def display_results(method):
    """Evaluate `method` on the loaded evaluation set and print a short report.

    Args:
        method: Matching strategy name, forwarded to ``evaluate``.
    """
    n_correct, n_incorrect, accuracy_pct = evaluate(utterances_for_eval, true_intents, method=method)
    report_lines = (
        f"Results using {method} method:",
        f"Number of Correct Predictions: {n_correct}",
        f"Number of Incorrect Predictions: {n_incorrect}",
        f"Accuracy: {accuracy_pct:.2f}%",
        "-----------------------------",
    )
    for report_line in report_lines:
        print(report_line)


In [265]:
# Print an evaluation report for each matching strategy that match_intent supports.
methods = ['tfidf', 'sent-transformer']
for method in methods:
    display_results(method)

Results using tfidf method:
Number of Correct Predictions: 87
Number of Incorrect Predictions: 7
Accuracy: 92.55%
-----------------------------
Results using sent-transformer method:
Number of Correct Predictions: 90
Number of Incorrect Predictions: 4
Accuracy: 95.74%
-----------------------------


### Post Deployment

In [268]:
import json
import requests

# Load the intent definitions that will be registered with the running service.
# The encoding is explicit so the read does not depend on the platform default.
with open('./data/intents.json', 'r', encoding='utf-8') as file:
    data_points = json.load(file)

# Base URL for the FastAPI application
base_url = "http://127.0.0.1:8000/add_intent"

# Upper bound (seconds) on each request; without a timeout, requests can block
# indefinitely if the service stops responding.
request_timeout = 30

# Loop over each data point and send a POST request
counter = 0
for data in data_points:
    response = requests.post(base_url, json=data, timeout=request_timeout)

    # The body may not be JSON (e.g. a proxy or server error page) and may not
    # be an object, so parse it defensively instead of indexing it blindly.
    try:
        payload = response.json()
    except ValueError:
        payload = None
    if not isinstance(payload, dict):
        payload = {}

    if response.status_code == 200 and payload.get("state") == "success":
        print(f"Successfully added intent: {data['name']}")
    else:
        # Prefer the service's own "detail" message; otherwise fall back to
        # the raw body or the status code so the failure is still reported.
        if "detail" in payload:
            reason = payload["detail"]
        else:
            reason = response.text or f"HTTP {response.status_code}"
        print(f"Failed to add intent: {data['name']}. Reason: {reason}")

    counter += 1
    if counter % 10 == 0:
        print(f'{counter} data points processed')

Successfully added intent: book-a-ticket
Successfully added intent: check-reservation
Successfully added intent: Private-Pilot-Lessons
Successfully added intent: Tourism-Packages
Successfully added intent: Online-Tutoring
Successfully added intent: Food-and-Wine-Tours
Successfully added intent: Product-Warranty-Information
Successfully added intent: Budget-Travel-Packages
Successfully added intent: Wine-Tastings
Successfully added intent: Food-Bank-Donation
10 data points processed
Successfully added intent: Home-Cleaning
Successfully added intent: Hotel-Room-Availability
Successfully added intent: Gratitude-Journals
Successfully added intent: Medical-Billing-and-Coding
Successfully added intent: Bank-Account-Balance
Successfully added intent: Sustainable-Living
Successfully added intent: Health-and-Safety-Tips
Successfully added intent: Housing-Loan-Counseling
Successfully added intent: Trip-Organization
Successfully added intent: Transportation-Services
20 data points processed
Succe

In [269]:
import time
import json
import requests
# Load the labelled evaluation utterances from the JSON file.
# The encoding is explicit so the read does not depend on the platform default.
with open('./data/utterances.json', 'r', encoding='utf-8') as file:
    data_points = json.load(file)

# Base URL for the FastAPI application
base_url = "http://127.0.0.1:8000/match_intent"

# Upper bound (seconds) on each request; without a timeout, requests can block
# indefinitely if the service stops responding.
request_timeout = 30

# Loop over each data point and send a POST request
counter = 0

true_intents = []
predicted_intents = []

for data in data_points:

    response = requests.post(base_url, json={"utterance": data["utterance"]}, timeout=request_timeout)
    # Fail with a clear HTTP error rather than a KeyError on an error payload.
    response.raise_for_status()
    intent = response.json()['intent']

    # Record the label only once a prediction exists, so both lists stay
    # aligned even if a request fails part-way through the loop.
    true_intents.append(data["name"])
    predicted_intents.append(intent)

    counter += 1
    if counter % 10 == 0:
        print(f'{counter} data points processed')

    # Pause between consecutive requests to the service.
    time.sleep(1)

correct_predictions = sum(1 for true, pred in zip(true_intents, predicted_intents) if true == pred)
incorrect_predictions = len(true_intents) - correct_predictions
# Guard against division by zero when the evaluation file is empty.
accuracy = correct_predictions / len(true_intents) * 100 if true_intents else 0.0

print("Post Deployement(Sent-Trans + FAISS):")
print(f"Number of Correct Predictions: {correct_predictions}")
print(f"Number of Incorrect Predictions: {incorrect_predictions}")
print(f"Accuracy: {accuracy:.2f}%")

10 data points processed
20 data points processed
30 data points processed
40 data points processed
50 data points processed
60 data points processed
70 data points processed
80 data points processed
90 data points processed
Post Deployement(Sent-Trans + FAISS):
Number of Correct Predictions: 90
Number of Incorrect Predictions: 4
Accuracy: 95.74%
