In [None]:
!pip install sentence-transformers

In [66]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [81]:
# Fetch the NLTK resources used by the preprocessing below
# (packages that are already up to date are reported and left untouched).
for resource_name in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource_name)

# Shared preprocessing helpers: a WordNet lemmatizer and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize raw text into a space-separated string of cleaned tokens.

    Steps, in order: lowercase, strip ASCII punctuation, tokenize, drop English
    stop words and tokens of two characters or fewer, lemmatize, and discard
    tokens that contain non-ASCII characters.

    Args:
        text (str): Raw input text.

    Returns:
        str: The surviving tokens joined by single spaces (may be empty).
    """
    # Lowercase and strip every character in string.punctuation. This already
    # removes (), [] and {}, so no separate bracket-stripping pass is needed.
    text = text.lower().translate(str.maketrans('', '', string.punctuation))

    tokens = word_tokenize(text)

    # Use the module-level stop-word set and lemmatizer instead of calling
    # stopwords.words('english') once per token and building a new
    # WordNetLemmatizer on every call.
    tokens = [token for token in tokens if len(token) > 2 and token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Keep only tokens made entirely of ASCII characters
    tokens = [token for token in tokens if all(ord(character) < 128 for character in token)]

    return ' '.join(tokens)


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [96]:
# 1. Load the data points
# Decode explicitly as UTF-8 (the standard JSON encoding) so the result does
# not depend on the platform's default locale encoding.
with open('intents.json', 'r', encoding='utf-8') as f:
    intents_data = json.load(f)

def build_corpus_and_mapping(intents_data):
    """Flatten intent definitions into two parallel lists.

    Args:
        intents_data: Iterable of intent dicts, each with a "name" and a list
            of "examples" utterances.

    Returns:
        tuple: ``(corpus, intent_mapping)`` where ``corpus[i]`` is a
        preprocessed example utterance and ``intent_mapping[i]`` is the name
        of the intent that example belongs to.
    """
    utterances, labels = [], []
    for entry in intents_data:
        for raw_example in entry["examples"]:
            # Examples are normalized with the same pipeline used at query time
            utterances.append(preprocess_text(raw_example))
            labels.append(entry["name"])
    return utterances, labels

# Build the preprocessed example corpus and its parallel list of intent names;
# both are module-level state used by the vectorization and matching cells.
corpus, intent_mapping = build_corpus_and_mapping(intents_data)

In [97]:
# 2. Vectorize the corpus using TF-IDF
# Note: the corpus has already had NLTK stop words removed by preprocess_text;
# stop_words='english' additionally applies scikit-learn's own English list.
vectorizer = TfidfVectorizer(stop_words='english')
# Fit on the example corpus; row i of the matrix corresponds to corpus[i]
tfidf_matrix = vectorizer.fit_transform(corpus)

# Initialize the SentenceTransformer model
st_model = SentenceTransformer('paraphrase-distilroberta-base-v1')
# Embed every corpus example once up front so queries only encode the utterance
st_corpus_embeddings = st_model.encode(corpus)

In [98]:
def match_intent(utterance, method='tfidf'):
    """Return the intent name of the corpus example most similar to ``utterance``.

    Args:
        utterance: Raw user utterance; it is run through ``preprocess_text``
            before being compared against the corpus.
        method: ``'tfidf'`` to compare TF-IDF vectors with a linear kernel, or
            ``'sent-transformer'`` to compare sentence embeddings by cosine
            similarity.

    Returns:
        The entry of ``intent_mapping`` at the index of the highest score.

    Raises:
        ValueError: If ``method`` is not one of the two supported values.
    """
    # Reject unknown methods before doing any work
    if method not in ('tfidf', 'sent-transformer'):
        raise ValueError("Invalid method!")

    cleaned_utterance = preprocess_text(utterance)

    if method == 'tfidf':
        query_vector = vectorizer.transform([cleaned_utterance])
        scores = linear_kernel(query_vector, tfidf_matrix).flatten()
    else:
        query_embedding = st_model.encode(cleaned_utterance)
        scores = util.pytorch_cos_sim(query_embedding, st_corpus_embeddings).flatten()

    best_index = scores.argmax()
    return intent_mapping[best_index]


In [99]:
def evaluate(utterances_for_eval, true_intents, method='tfidf'):
    """Score intent matching against a labelled evaluation set.

    Args:
        utterances_for_eval: Utterances to classify with ``match_intent``.
        true_intents: Expected intent name for each utterance, in the same order.
        method: Matching method forwarded to ``match_intent``.

    Returns:
        tuple: ``(correct_predictions, incorrect_predictions, accuracy)`` where
        ``accuracy`` is a percentage of ``len(true_intents)``. When
        ``true_intents`` is empty the accuracy is reported as ``0.0`` instead
        of raising ``ZeroDivisionError``.
    """
    predicted_intents = [match_intent(utterance, method) for utterance in utterances_for_eval]
    total = len(true_intents)
    correct_predictions = sum(1 for true, pred in zip(true_intents, predicted_intents) if true == pred)
    incorrect_predictions = total - correct_predictions
    # Guard the division: an empty evaluation set has no meaningful accuracy
    accuracy = correct_predictions / total * 100 if total else 0.0
    return correct_predictions, incorrect_predictions, accuracy

# Evaluation

In [100]:
# 3. Load the evaluation data
# Decode explicitly as UTF-8 (the standard JSON encoding) so the result does
# not depend on the platform's default locale encoding.
with open('utterances.json', 'r', encoding='utf-8') as f:
    eval_data = json.load(f)

# Parallel lists: expected intent name and the utterance to classify
true_intents = [item["name"] for item in eval_data]
utterances_for_eval = [item["utterance"] for item in eval_data]

In [101]:
def display_results(method):
    """Evaluate ``method`` on the loaded evaluation set and print a short report.

    Args:
        method: Matching method name passed through to ``evaluate``.
    """
    correct_predictions, incorrect_predictions, accuracy = evaluate(
        utterances_for_eval, true_intents, method=method
    )
    report_lines = (
        f"Results using {method} method:",
        f"Number of Correct Predictions: {correct_predictions}",
        f"Number of Incorrect Predictions: {incorrect_predictions}",
        f"Accuracy: {accuracy:.2f}%",
        "-----------------------------",
    )
    for line in report_lines:
        print(line)


In [102]:
# Evaluate and report each supported matching method in turn
methods = ['tfidf', 'sent-transformer']
for method in methods:
    display_results(method)

Results using tfidf method:
Number of Correct Predictions: 87
Number of Incorrect Predictions: 7
Accuracy: 92.55%
-----------------------------
Results using sent-transformer method:
Number of Correct Predictions: 90
Number of Incorrect Predictions: 4
Accuracy: 95.74%
-----------------------------
