In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import plotly.express as px
from plotly.offline import init_notebook_mode
import re
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
import spacy

# One-time notebook setup: NLTK resources, progress bars, spaCy pipeline and plot styling.

# Download the "omw-1.4" NLTK resource.
nltk.download("omw-1.4")
# Register `progress_apply` on pandas objects (used when cleaning the questions).
tqdm.pandas()
# Load the small English spaCy pipeline.
spacy_eng = spacy.load("en_core_web_sm")
# Download the NLTK stopword lists.
nltk.download("stopwords")
lemm = WordNetLemmatizer()
# Initialise plotly's offline notebook mode.
init_notebook_mode(connected=True)
# Global seaborn / matplotlib defaults for every figure in the notebook.
sns.set_style("darkgrid")
plt.rcParams["figure.figsize"] = (20, 8)
plt.rcParams["font.size"] = 18

# Quora Question Semantic Similarity 


**Semantic similarity** is a metric defined over a set of documents or terms, where the idea of distance between items is based on the likeness of their meaning or semantic content as opposed to lexicographical similarity. 
- Semantic Similarity has various applications, such as information retrieval, text summarization, sentiment analysis, etc.
- For quora **information retrieval** serves an important purpose as users who post questions on the platform can/may find questions that are similar in meaning that have already been answered. Questions that are also semantically similar in nature can draw a user's attention to new content as well.

<div class='alert alert-info'><strong>Note: </strong>Finding similarity semantically between sentences is different than finding similarity between sentences based on common keywords. Here the sentences in question need to have same meaning to be regarded as similar in nature.</div>

# Data Cleaning and EDA
- Cleaning and preprocessing text data
- Finding insights about sentence lengths and words present in them

In [None]:
# Read the question-pair table from the working directory and preview the first ten rows.
data = pd.read_csv("train.csv")
data.head(10)

## Remove Null Values

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Keep only the rows that have no missing value in any column.
data = data.dropna()

## Duplicate and Non-Duplicate Data Distribution
- Duplicate (similar) questions are fewer in number, which is to be expected: a question-answering platform will tend to have more unique questions than questions that have been asked before

In [None]:
# Count the rows in each class explicitly. Using the "id" column as the sector
# value would size each sector by the *sum of ids* in that class, not by the
# number of question pairs it contains.
duplicate_counts = (
    data["is_duplicate"]
    .value_counts()
    .rename_axis("is_duplicate")
    .reset_index(name="count")
)

fig = px.pie(
    duplicate_counts,
    values="count",
    names="is_duplicate",
    height=600,
    title="Proportion of Duplicate and Non Duplicate Questions",
)
fig.show()

## Text Cleaning
- Since the context of sentences are important for this NLP problem removal of stopwords might affect both the grammatical as well as semantic meaning of the sentences
- For similar reasons the words are not lemmatized or stemmed so that the semantic meaning of the sentence remains intact
- Therefore very basic cleaning is performed on the text data like removal of extra spaces and special characters

In [None]:
def text_cleaning(x):
    """Apply light normalisation to one question string.

    Every character that is not an ASCII letter or digit (punctuation,
    newlines, tabs, ...) is replaced by a space, runs of spaces are collapsed
    to a single space, leading/trailing spaces are removed and the result is
    lower-cased. Stopwords are kept and no stemming/lemmatisation is applied.

    Args:
        x: Raw question text.

    Returns:
        The cleaned, lower-cased text with single spaces between tokens.
    """
    # Raw string avoids the invalid-escape warning that a plain "\s" literal triggers.
    questions = re.sub(r"[^a-zA-Z0-9]", " ", x)
    # The substitution above leaves runs of spaces behind; split/join collapses
    # them and trims both ends.
    questions = " ".join(questions.split())

    return questions.lower()

In [None]:
# Clean both question columns (with a tqdm progress bar) into new "*_cleaned" columns,
# then display the resulting frame.
data["question1_cleaned"] = data["question1"].progress_apply(text_cleaning)
data["question2_cleaned"] = data["question2"].progress_apply(text_cleaning)
data

## Sentence Length Distributions
- Here the objective is to find the ideal length of the sentence that should be used in our model
- In many cases the maximum sentence length is taken for embedding representations but by looking at the sentence length distributions a more informed decision can be made which will help in reducing the parameters of our model
- For transformers based models masks usually mask out the sentences which are short but are padded to a longer length, but since we are focused on finding the ideal length statistically we will go ahead with that

In [None]:
def _word_count(sentence):
    """Number of whitespace-separated tokens in a cleaned question."""
    return len(sentence.split())


# Store the token count of each cleaned question next to it.
for _side in ("question1", "question2"):
    data[f"{_side}_lens"] = data[f"{_side}_cleaned"].apply(_word_count)

In [None]:
# Histogram of question1 token counts, coloured by duplicate label, with a marginal box plot.
px.histogram(
    data,
    x="question1_lens",
    height=700,
    color="is_duplicate",
    title="Question1 Length Distribution",
    marginal="box",
)

In [None]:
# Histogram of question2 token counts, coloured by duplicate label, with a marginal box plot.
px.histogram(
    data,
    x="question2_lens",
    height=700,
    color="is_duplicate",
    title="Question2 Length Distribution",
    marginal="box",
)

## Word Cloud Visualization
- Word clouds help in visually identifying the most frequent words present in the sentences which also give a brief idea what the context of the sentences are
- Two wordclouds are visualized below for both pairs of sentences 

In [None]:
# Plain Python lists of the cleaned questions, joined into one corpus per side for the word clouds.
question1 = data["question1_cleaned"].tolist()
question2 = data["question2_cleaned"].tolist()

In [None]:
# Word cloud (up to 1500 words) built from all cleaned question1 texts joined into one string.
wordcloud = WordCloud(max_words=1500, width=600, background_color="black").generate(
    " ".join(question1)
)
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Words from Question1")
plt.axis("off")
plt.show()

In [None]:
# Word cloud (up to 1500 words) built from all cleaned question2 texts joined into one string.
wordcloud = WordCloud(max_words=1500, width=600, background_color="black").generate(
    " ".join(question2)
)
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Words from Question2")
plt.axis("off")
plt.show()

## Ideal Sentence Length
- By looking at both the distribution plots and the descriptive statistics it is clear that using the maximum sentence length would not make much sense for our model
- The descriptive stats also show how unlikely an extremely long sentence is on a platform like Quora
- Since the descriptive stats of both questions in a pair look very similar, let's analyse just one of them to find the upper outlier threshold
- Once this upper threshold is found we can choose a number close to it as the ideal sentence length for our embedding representation

In [None]:
# Summary statistics of the question1 token counts.
data["question1_lens"].describe()

In [None]:
# Summary statistics of the question2 token counts.
data["question2_lens"].describe()

In [None]:
# Upper outlier threshold of the question1 lengths: third quartile plus 1.5 times
# the inter-quartile range.
q1, q3 = data["question1_lens"].quantile([0.25, 0.75])
iqr = q3 - q1

upper_outlier = q3 + 1.5 * iqr
print(upper_outlier)

**Inference:** The upper outlier threshold is 22. Let's take 50 as the ideal length so that most of the unusually long sentences are still represented without truncation

# Modelling

For the purpose of performing semantic similarity we are going to use **Siamese Neural Networks**

<div class='alert alert-info'><strong>Note: </strong>A <strong>Siamese Neural Network (SNN)</strong> is a class of neural network architectures that contain two or more identical sub-networks. “Identical” here means they have the same configuration with the same parameters and weights. These networks are used to find the similarity of the inputs by comparing their feature vectors.</div>

They were termed as siamese networks due to the term Siamese twins, which comes from the twin conjoined brothers Chang and Eng Bunker who were born in Siam, now Thailand. 
- The idea of having twin neural networks with same parameter and configurations is to extract features using the same setting for two different sentences
- Followed by this setting usually a distance layer is added to calculate the distance between the feature embeddings followed by dense layers and classification head
- For our problem statement we will use two variations of the siamese network
    - Original Siamese Network with L1 Distance Layer
    - Siamese Network with Triplet Loss

<img src='https://img.freepik.com/free-photo/portrait-two-identical-siamese-cats_158595-5728.jpg?w=2000'>


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.layers import (
    Embedding,
    Layer,
    Dense,
    Dropout,
    MultiHeadAttention,
    LayerNormalization,
    Input,
    GlobalAveragePooling1D,
)
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    TFAutoModel,
    DistilBertConfig,
    TFDistilBertModel,
    BertConfig,
    TFBertModel,
    TFRobertaModel,
)
from datasets import load_dataset

## Siamese BERT
- For the backbone of our siamese neural network we will use a pretrained BERT model (a single encoder is applied to both questions, so the two branches share the same weights)
- BERT is an open source machine learning framework for natural language processing (NLP). BERT is designed to help computers understand the meaning of ambiguous language in text by using surrounding text to establish context.
- We will use just the encoder part of the BERT model for our problem
- The BERT variant that we are going to use is the base model
- Some other alternative of sequential models like RNN based models aren't used because they do not really have the concept of "transfer learning" in them and also they are computationally very expensive to train when it comes to attention mechanism.

<img src='https://paul-hyun.github.io/assets/2020-01-02/bert-classification.png'>

In [None]:
# Checkpoint name shared by the tokenizer and by both BERT backbones built later in the notebook.
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

## BERT Text Tokenizer
- Generates
    - Padded Encodings
    - Attention Masks

In [None]:
def encode_text(text, tokenizer, max_length=50):
    """Tokenize a batch of sentences into fixed-length model inputs.

    Every sentence is padded or truncated to exactly ``max_length`` tokens,
    special tokens included.

    Args:
        text: List of sentences to encode.
        tokenizer: Callable Hugging Face tokenizer.
        max_length: Length every sequence is padded/truncated to. Defaults to
            50, the sentence length chosen in the EDA section.

    Returns:
        dict with two int32 NumPy arrays of shape (len(text), max_length):
        "input_ids" and "attention_masks". Note the plural key, which differs
        from the tokenizer's own "attention_mask".
    """
    # Call the tokenizer directly (the current batch API) and request NumPy
    # output, so no TensorFlow tensors are materialised just to be converted back.
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="np",
    )

    input_ids = np.asarray(encoded["input_ids"], dtype="int32")
    attention_masks = np.asarray(encoded["attention_mask"], dtype="int32")

    return {"input_ids": input_ids, "attention_masks": attention_masks}

## Data Splitting
- 400,000 question pairs are sampled for our task
- 80:20 split is performed on the data
    - 80% for Training
    - 20% for Validation

In [None]:
# Sampling configuration. A fixed seed makes the sample (and therefore the
# train/validation split) identical on every fresh run of the notebook.
SAMPLE_SIZE = 400000
TRAIN_FRACTION = 0.80
SPLIT_SEED = 42

# Never request more rows than survive the null filter; `sample` raises otherwise.
n_samples = min(SAMPLE_SIZE, len(data))
data = data.sample(n=n_samples, random_state=SPLIT_SEED)

# The sample is already shuffled, so a positional cut gives a random split.
split_at = int(n_samples * TRAIN_FRACTION)
train = data.iloc[:split_at, :]
val = data.iloc[split_at:, :]

# Token ids and attention masks for each side of every pair.
X1_train = encode_text(train["question1_cleaned"].tolist(), tokenizer)
X2_train = encode_text(train["question2_cleaned"].tolist(), tokenizer)
X1_val = encode_text(val["question1_cleaned"].tolist(), tokenizer)
X2_val = encode_text(val["question2_cleaned"].tolist(), tokenizer)

# Binary targets taken from the "is_duplicate" column.
y_train = train["is_duplicate"].values
y_val = val["is_duplicate"].values

## TPU Configuration

In [None]:
# Use a TPU when one can be resolved; otherwise fall back to TensorFlow's
# default distribution strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # `tf.distribute.TPUStrategy` replaces the deprecated
    # `tf.distribute.experimental.TPUStrategy` alias.
    strategy = tf.distribute.TPUStrategy(tpu)
    # Global batch size: four examples per replica.
    BATCH_SIZE = strategy.num_replicas_in_sync * 4
    print("Running on TPU:", tpu.master())

except ValueError:
    # TPUClusterResolver raised ValueError: no TPU could be resolved.
    strategy = tf.distribute.get_strategy()
    BATCH_SIZE = 32
    print(f"Running on {strategy.num_replicas_in_sync} replicas")

print(f"Batch Size: {BATCH_SIZE}")

# Original Siamese Network with L1 Distance Layer
- This architecture uses the BERT twin backbone and applies the L1 distance on the embeddings returned by the backbone
- The L1 distance features are then fed to a dense layer to capture the non linearities
- The final layer is a single sigmoid neuron which classifies, from the non-linearly transformed distance features, whether the two sentences are similar or dissimilar

<img src='https://www.frontiersin.org/files/Articles/839586/fbioe-10-839586-HTML/image_m/fbioe-10-839586-g002.jpg'>

In [None]:
class L1Dist(Layer):
    """Element-wise absolute difference between two embedding tensors."""

    def __init__(self, **kwargs):
        # Forward keyword arguments to the base Layer so that options passed
        # by the caller (for example `name=`) are honoured instead of being
        # silently dropped.
        super().__init__(**kwargs)

    def call(self, embedding1, embedding2):
        """Return |embedding1 - embedding2|, computed element-wise."""
        return tf.math.abs(embedding1 - embedding2)

In [None]:
with strategy.scope():
    # A single BERT encoder is applied to both questions, so the two branches
    # of the siamese network share their weights.
    transformer_model = TFBertModel.from_pretrained(model_checkpoint)

    # Token ids and attention masks for question 1 and question 2.
    input_ids_in1 = Input(shape=(None,), name="input_ids1", dtype="int32")
    input_masks_in1 = Input(shape=(None,), name="attention_mask1", dtype="int32")
    input_ids_in2 = Input(shape=(None,), name="input_ids2", dtype="int32")
    input_masks_in2 = Input(shape=(None,), name="attention_mask2", dtype="int32")

    # Per-token hidden states of the last encoder layer for each question.
    embedding_layer1 = transformer_model(
        input_ids_in1, attention_mask=input_masks_in1
    ).last_hidden_state
    embedding_layer2 = transformer_model(
        input_ids_in2, attention_mask=input_masks_in2
    ).last_hidden_state

    # NOTE(review): the pooling layers are called without a mask, so padded
    # positions are included in the mean - confirm whether masked pooling is wanted.
    embedding1 = GlobalAveragePooling1D()(embedding_layer1)
    embedding2 = GlobalAveragePooling1D()(embedding_layer2)
    l1_dist = L1Dist()(embedding1, embedding2)

    # Classification head on top of the element-wise L1 distance features.
    x = Dense(512, activation="relu")(l1_dist)
    output = Dense(1, activation="sigmoid")(x)

    model = Model(
        inputs=[input_ids_in1, input_masks_in1, input_ids_in2, input_masks_in2],
        outputs=output,
    )
    model.compile(
        loss="binary_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
        # Keras documents `metrics` as a list; a bare string is rejected by
        # newer Keras versions.
        metrics=["accuracy"],
    )

In [None]:
# Freeze the pretrained BERT backbone so only the dense head is trained.
# Selecting the layer by type replaces the positional slice `model.layers[:5]`
# (the four input layers plus the shared BERT layer); input layers have no weights.
for layer in model.layers:
    if isinstance(layer, TFBertModel):
        layer.trainable = False

# Keras only guarantees that a change to `trainable` takes effect once the
# model is compiled again, so re-compile with the same training configuration.
with strategy.scope():
    model.compile(
        loss="binary_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
        metrics=["accuracy"],
    )

In [None]:
# Print the layer table and the trainable / non-trainable parameter counts.
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
# Training below runs for only 5 epochs, so a patience of 5 could never
# trigger. In tf.keras 2.x `restore_best_weights` only acts when the callback
# itself stops training, so the best weights would never be restored either.
# The patience values are therefore kept below the epoch budget.
earlystopping = EarlyStopping(
    monitor="val_loss", min_delta=0, patience=2, verbose=1, restore_best_weights=True
)

# Reduce the learning rate after a single stagnant epoch, so the reduction
# gets a chance to help before early stopping ends the run.
learning_rate_reduction = ReduceLROnPlateau(
    monitor="val_loss", patience=1, verbose=1, factor=0.3, min_lr=0.00000001
)

In [None]:
# Train the classifier. The input tuple follows the order of the model inputs:
# (ids1, mask1, ids2, mask2). The validation data uses the same layout.
history = model.fit(
    (
        np.asarray(X1_train["input_ids"]),
        np.asarray(X1_train["attention_masks"]),
        np.asarray(X2_train["input_ids"]),
        np.asarray(X2_train["attention_masks"]),
    ),
    y_train,
    batch_size=BATCH_SIZE,
    epochs=5,
    validation_data=(
        (
            np.asarray(X1_val["input_ids"]),
            np.asarray(X1_val["attention_masks"]),
            np.asarray(X2_val["input_ids"]),
            np.asarray(X2_val["attention_masks"]),
        ),
        y_val,
    ),
    callbacks=[earlystopping, learning_rate_reduction],
)

## Model Inference
- Learning Curves
- ROC-AUC Curves
- Confusion Matrix
- Classification Report

## Learning Curve
- Since it is a pretrained model we do not train it for longer epochs, also due to the fact that the validation loss increases after first 3 epochs the training is restricted to 5 epochs
- The best weights from the most converged state are taken forward

In [None]:
# Training vs. validation loss per epoch, read from the Keras History object.
plt.figure(figsize=(20, 8))
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "val"], loc="upper left")
plt.show()

In [None]:
# Training vs. validation accuracy per epoch, read from the Keras History object.
plt.figure(figsize=(20, 8))
plt.plot(history.history["accuracy"])
plt.plot(history.history["val_accuracy"])
plt.title("Model Accuracy")
plt.ylabel("Accuracy")
plt.xlabel("epoch")
plt.legend(["train", "val"], loc="upper left")
plt.show()

In [None]:
# Sigmoid outputs for every validation pair; inputs are in the same order as for training.
y_pred = model.predict(
    (
        np.asarray(X1_val["input_ids"]),
        np.asarray(X1_val["attention_masks"]),
        np.asarray(X2_val["input_ids"]),
        np.asarray(X2_val["attention_masks"]),
    )
)

In [None]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    recall_score,
)

## ROC-AUC Curve
- The AUC score of 95% gives a clear indication about the good separability performance of our model 
- The decision threshold can be tuned to achieve the desired number of true positives or to limit false positives

In [None]:
# ROC curve of the validation predictions. `y_pred` must still hold the raw
# sigmoid scores here, i.e. this cell has to run before `y_pred` is thresholded.
plt.figure(figsize=(20, 8))
fpr, tpr, _ = roc_curve(y_val, y_pred)
auc = roc_auc_score(y_val, y_pred)
# The legend names the model actually evaluated (a Siamese BERT network, not a CNN).
plt.plot(fpr, tpr, label="Siamese BERT Model, auc=" + str(auc), lw=2)
# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1], color="orange", lw=2, linestyle="--")
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)
plt.show()

## Confusion Matrix

In [None]:
# Threshold the predictions at 0.5 *in place*: after this cell `y_pred` contains
# only 0/1 values and the original scores are gone. Cells that need the raw scores
# (the ROC curve) must run before it; the classification report relies on the 0/1 values.
y_pred[y_pred >= 0.5] = 1
y_pred[y_pred < 0.5] = 0

# Rows are the actual classes, columns the predicted classes.
sns.heatmap(
    confusion_matrix(y_val, y_pred),
    cmap="viridis",
    annot=True,
    fmt=".5g",
    xticklabels=["Dissimilar", "Similar"],
    yticklabels=["Dissimilar", "Similar"],
)
plt.xlabel("Predicted Class")
plt.ylabel("Actual Class")
plt.show()

## Classification Report
- Our model achieves an F1-score of 89%
- F1-score is considered as there is a slight imbalance in the data
- The model is slightly less accurate on the similar class

In [None]:
# `y_pred` holds hard 0/1 labels at this point because the confusion-matrix cell thresholded it in place.
print(classification_report(y_val, y_pred))

# Siamese Network with Triplet Loss
<div class='alert alert-info'><strong>Note: </strong>Using <strong>Triplet Loss</strong> we can train the network by taking an anchor text and comparing it with both a positive sample and a negative sample. The <strong>dissimilarity between the anchor text and positive text must be low</strong> and the <strong>dissimilarity between the anchor text and the negative text must be high</strong></div>

The triplet loss is defined as:

<img src='https://miro.medium.com/max/1328/1*nyfPmytStEZCijYl8OEAvQ.png'>

- Triplet loss is a loss function for machine learning algorithms where a reference input is compared to a matching input and a non-matching input. The distance from the anchor to the positive is minimized, and the distance from the anchor to the negative input is maximized.
- The max and margin m make sure different points at distance > m do not contribute to the ranking loss. This has a significant advantage over contrastive loss.


<img src='https://miro.medium.com/max/1400/1*bvBns-k7sO2sNZE3fxWLFg.png'>

## Data Preparation
- The cleaned texts are rearranged and prepared in the anchor, positive and negative format

In [None]:
# Load the pre-built triplet table. This rebinds `data`, replacing the pairwise frame used above.
# The question1/2/3 "_cleaned" columns are fed to the network below as anchor, positive and negative.
data = pd.read_csv("../input/triplet-data/triplet_data.csv")
data.head(10)

In [None]:
# Number of triplets available in the file.
len(data)

## Data Splitting
- 140000 samples are taken for this model
- 80:20 splitting is performed
    - 80% taken for training
    - 20% taken for validation

In [None]:
# Use at most TRIPLET_SAMPLES rows and split them 80:20. Deriving the bounds
# from the real row count keeps the split at 80:20 even when the file holds
# fewer rows than expected, and caps the validation slice when it holds more.
TRIPLET_SAMPLES = 140000
n_triplets = min(TRIPLET_SAMPLES, len(data))
triplet_split_at = int(n_triplets * 0.80)

train = data.iloc[:triplet_split_at, :]
val = data.iloc[triplet_split_at:n_triplets, :]

# question1 / question2 / question3 are passed to the network as anchor / positive / negative.
X1_train = encode_text(train["question1_cleaned"].tolist(), tokenizer)
X2_train = encode_text(train["question2_cleaned"].tolist(), tokenizer)
X3_train = encode_text(train["question3_cleaned"].tolist(), tokenizer)

X1_val = encode_text(val["question1_cleaned"].tolist(), tokenizer)
X2_val = encode_text(val["question2_cleaned"].tolist(), tokenizer)
X3_val = encode_text(val["question3_cleaned"].tolist(), tokenizer)

## Custom Distance Layer
- Calculates the distance between the anchor and the positive, and between the anchor and the negative

In [None]:
class DistanceLayer(Layer):
    """Squared Euclidean distances from an anchor embedding to a positive and a negative one."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @staticmethod
    def _squared_l2(left, right):
        """Sum of squared element-wise differences over the last axis."""
        return tf.reduce_sum(tf.square(left - right), -1)

    def call(self, anchor, positive, negative):
        """Return the pair (anchor-positive distance, anchor-negative distance)."""
        return (
            self._squared_l2(anchor, positive),
            self._squared_l2(anchor, negative),
        )

In [None]:
from tensorflow.keras import metrics

## Custom Model
- Takes the siamese network architecture as an input and optimizes it with respect to the triplet loss

In [None]:
class SiameseModel(Model):
    """Keras model that trains a siamese network with the triplet loss.

    The wrapped network must return the pair
    (anchor-positive distance, anchor-negative distance). The loss for each
    triplet is max(ap_distance - an_distance + margin, 0).
    """

    def __init__(self, siamese_network, margin=0.5):
        super().__init__()
        self.siamese_network = siamese_network
        self.margin = margin
        # Running mean of the loss, reported by `fit` / `evaluate` as "loss".
        self.loss_tracker = metrics.Mean(name="loss")

    def call(self, inputs):
        """Forward the inputs to the wrapped network unchanged."""
        return self.siamese_network(inputs)

    def train_step(self, data):
        """Run one optimisation step on a batch and report the running mean loss."""
        weights = self.siamese_network.trainable_weights

        with tf.GradientTape() as tape:
            batch_loss = self._compute_loss(data)

        grads = tape.gradient(batch_loss, weights)
        self.optimizer.apply_gradients(zip(grads, weights))
        return self._track(batch_loss)

    def test_step(self, data):
        """Evaluate one batch without updating any weight."""
        return self._track(self._compute_loss(data))

    def _track(self, batch_loss):
        """Fold a batch loss into the tracker and return the logs dictionary."""
        self.loss_tracker.update_state(batch_loss)
        return {"loss": self.loss_tracker.result()}

    def _compute_loss(self, data):
        """Triplet loss of each example in the batch, clipped at zero."""
        ap_distance, an_distance = self.siamese_network(data)
        return tf.maximum(ap_distance - an_distance + self.margin, 0.0)

    @property
    def metrics(self):
        # Listing the tracker here lets Keras reset it at the start of every epoch.
        return [self.loss_tracker]

In [None]:
with strategy.scope():
    transformer_model = TFBertModel.from_pretrained(model_checkpoint)

    # Inputs of the shared embedding tower (one sentence at a time).
    input_ids_in1 = Input(shape=(50,), name="input_ids1", dtype="int32")
    input_masks_in1 = Input(shape=(50,), name="attention_mask1", dtype="int32")

    # Inputs of the full triplet network: anchor, positive and negative sentences.
    anchor_input = Input(name="anchor_ids", shape=(50,), dtype="int32")
    anchor_masks = Input(name="anchor_mask", shape=(50,), dtype="int32")

    positive_input = Input(name="positive_ids", shape=(50,), dtype="int32")
    positive_masks = Input(name="positive_mask", shape=(50,), dtype="int32")

    negative_input = Input(name="negative_ids", shape=(50,), dtype="int32")
    negative_masks = Input(name="negative_mask", shape=(50,), dtype="int32")

    embedding_layer = transformer_model(
        input_ids_in1, attention_mask=input_masks_in1
    ).last_hidden_state

    # NOTE(review): the pooling layer is called without a mask, so padded
    # positions are included in the mean - confirm whether masked pooling is wanted.
    average = GlobalAveragePooling1D()(embedding_layer)
    embeds = Dense(512, activation="relu")(average)

    # Sentence-embedding tower, reused for all three branches and for inference later.
    embeddings = Model(inputs=[input_ids_in1, input_masks_in1], outputs=embeds)

    # Freeze everything except the final Dense projection.
    for layer in embeddings.layers[:-1]:
        layer.trainable = False

    embeds1 = embeddings([anchor_input, anchor_masks])
    embeds2 = embeddings([positive_input, positive_masks])
    embeds3 = embeddings([negative_input, negative_masks])

    distances = DistanceLayer()(embeds1, embeds2, embeds3)

    siamese_network = Model(
        inputs=[
            anchor_input,
            anchor_masks,
            positive_input,
            positive_masks,
            negative_input,
            negative_masks,
        ],
        outputs=distances,
    )

    siamese_model = SiameseModel(siamese_network)
    siamese_model.compile(optimizer=tf.keras.optimizers.Adam(0.00001))
    # No targets are passed: the triplet loss is computed from the inputs alone.
    history = siamese_model.fit(
        (
            np.asarray(X1_train["input_ids"]),
            np.asarray(X1_train["attention_masks"]),
            np.asarray(X2_train["input_ids"]),
            np.asarray(X2_train["attention_masks"]),
            np.asarray(X3_train["input_ids"]),
            np.asarray(X3_train["attention_masks"]),
        ),
        # Use the batch size derived from the distribution strategy, as the
        # first model does, instead of the implicit Keras default.
        batch_size=BATCH_SIZE,
        epochs=10,
        validation_data=(
            (
                np.asarray(X1_val["input_ids"]),
                np.asarray(X1_val["attention_masks"]),
                np.asarray(X2_val["input_ids"]),
                np.asarray(X2_val["attention_masks"]),
                np.asarray(X3_val["input_ids"]),
                np.asarray(X3_val["attention_masks"]),
            ),
        ),
    )

# Model Inference
- Learning Curve
- Cosine Similarity between Embeddings

## Learning Curve
- The model converges well, but the validation loss doesn't seem to be close to the training loss

In [None]:
# Training vs. validation triplet loss per epoch for the triplet-loss model.
plt.figure(figsize=(20, 8))
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "val"], loc="upper left")
plt.show()

## Cosine Similarity between Embeddings

<div class='alert alert-info'><strong>Note: </strong>Cosine similarity is often a better metric than Euclidean distance for text similarity: even if two text documents are far apart in Euclidean distance, their embeddings can still point in a similar direction, which indicates that they are close in context.</div>

<img src='https://developers.google.com/machine-learning/crash-course/images/linear-relationships.svg'>

In [None]:
def get_cosine_similarity(sentence1, sentence2):
    """Cosine similarity between the learned embeddings of two raw sentences.

    Each sentence is cleaned with `text_cleaning`, tokenized with
    `encode_text` and passed through the `embeddings` model; the two
    resulting vectors are then compared.

    Args:
        sentence1: First raw sentence.
        sentence2: Second raw sentence.

    Returns:
        The similarity score as a NumPy scalar.
    """

    def _embed(sentence):
        # Clean -> tokenize (batch of one) -> embed.
        encoded = encode_text([text_cleaning(sentence)], tokenizer)
        token_ids = np.array(encoded["input_ids"])
        token_masks = np.array(encoded["attention_masks"])
        return embeddings([token_ids, token_masks])

    vector1 = _embed(sentence1)
    vector2 = _embed(sentence2)

    # A fresh metric object per call, so no state carries over between calls.
    similarity_metric = metrics.CosineSimilarity()

    return similarity_metric(vector1, vector2).numpy()

## Sample Test Cases
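- Cosine similarity ranges from -1 to 1 in general; because the embeddings here come out of a ReLU layer (all components are non-negative), the scores fall between 0 and 1.
- A value closer to 1 indicates higher similarity and a value closer to 0 indicates dissimilarity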

In [None]:
# Sample pair on unrelated topics.
sentence1 = "Is Earth circle in shape ?"
sentence2 = "Should I learn python as it is very popular ?"
get_cosine_similarity(sentence1, sentence2)

In [None]:
# Sample pair that is both about the popularity of Python.
sentence1 = "Python is one of the most popular programming language out there"
sentence2 = "Should I learn python programming as it is very popular ?"
get_cosine_similarity(sentence1, sentence2)

In [None]:
# Sample pair on unrelated topics (GPUs vs. a recipe).
sentence1 = "Which GPU gives a better performance NVIDIA or AMD ?"
sentence2 = "What is the recipe for Kolkata Chicken Roll?"
get_cosine_similarity(sentence1, sentence2)

In [None]:
# Sample pair on the same topic (GPU performance), with a much longer second sentence.
sentence1 = "Which GPU gives a better performance NVIDIA or AMD ?"
sentence2 = "My friend has a NVIDIA GPU, and he suggests that it gives a very smooth gaming performance"
get_cosine_similarity(sentence1, sentence2)

In [None]:
# Sample pair on the same topic: a question and a statement about GPU performance.
sentence1 = "Which GPU gives a better performance NVIDIA or AMD ?"
sentence2 = "NVIDIA manufactures the best performing GPUS"
get_cosine_similarity(sentence1, sentence2)

<div class='alert alert-success'><strong>Conclusion:</strong>
    <li>Both the architectures seem to be performing well in terms of their inferences</li>
    <li>Original Siamese network can be evaluated in terms of classification metrics and similar sentences can be found using <strong>probability scores</strong></li>
    <li>Siamese network with triplet loss can be evaluated in terms of cosine similarity and similar sentences can be found using <strong>higher cosine similarity scores</strong></li>
    <li>For a real time scenario a metric like <strong>Hit Rate or User Engagement</strong> will prove to be more useful to get an indication of the usability of the model</li>
</div>