In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from models import TfIdfEmbedder, CountVectorizerEmbedder

# Register tqdm's pandas integration.
# NOTE(review): no progress_* call is visible in this notebook — confirm this is still needed.
tqdm.pandas()

### 1. Prepare the ParlaMint Dataset

#### 1.1. Load Dataset (Sentence-Wise)

In [None]:
# Read the tab-separated ParlaMint export and keep only its first 10,000 rows.
parlamint_path = "../../datasets/parlamint/parlamint-it-is-2022.txt"
df_parlamint = pd.read_csv(parlamint_path, sep="\t").head(10000)

# Deep copy of the loaded frame, limited to its first 100 rows.
df_parlamint_subset = df_parlamint.copy(deep=True).head(100)

df_parlamint

#### 1.2. Group Dataset per Utterance

In [None]:
# Concatenate the sentences that share a Parent_ID (i.e. belong to the same
# utterance) into a single space-separated string per utterance.
utterance_texts = df_parlamint.groupby(["Parent_ID"])["Text"].apply(" ".join)
df_parlamint_grouped = utterance_texts.reset_index(name="utterance_text")

print(f"Unique utterances: {len(df_parlamint_grouped)}")
df_parlamint_grouped

In [None]:
# Select the sentences of one utterance to serve as a small running example.
sample_parent_id = "ParlaMint-IS_2022-01-17-20.u1"
belongs_to_sample = df_parlamint["Parent_ID"] == sample_parent_id
sample_utterance = df_parlamint.loc[belongs_to_sample, "Text"]
sample_utterance

### 2. Different Text Embedding Algorithms

#### 2.1. Count Vectorizer (Sparse) Embeddings

In [None]:
# Alternative: pass every individual sentence of the dataset as vocabulary instead.
# cv_model = CountVectorizerEmbedder(vocabulary=df_parlamint["Text"].to_list(), min_df=100, stop_words='english',
#                                    n_gram_range=(1, 3))

# Pass the grouped utterance texts as the `vocabulary` argument, with max_features=10.
cv_model = CountVectorizerEmbedder(
    vocabulary=df_parlamint_grouped["utterance_text"],
    max_features=10,
    stop_words='english',
)

In [None]:
# Embed the sample sentences, then show the dense result as a table with one
# column per feature name reported by the underlying embedding model.
cv_embeddings = cv_model.embed(sample_utterance)
cv_feature_names = cv_model.embedding_model.get_feature_names_out()
cv_dense = cv_embeddings.toarray()

print(f"Number features: {len(cv_feature_names)}", cv_feature_names)
print(f"Shape embedding array: {cv_dense.shape}")

df_cv_output = pd.DataFrame(data=cv_dense, columns=cv_feature_names)
df_cv_output

#### 2.2 TF-IDF (Sparse) Embeddings

In [None]:
# Alternative: pass every individual sentence of the dataset as vocabulary instead.
# tfidf_model = TfIdfEmbedder(vocabulary=df_parlamint["Text"].to_list(), min_df=100, stop_words='english')

# Pass the grouped utterance texts as the `vocabulary` argument, with max_features=10.
tfidf_model = TfIdfEmbedder(
    vocabulary=df_parlamint_grouped["utterance_text"],
    max_features=10,
    stop_words='english',
)

In [None]:
# Embed the sample sentences, then show the dense result as a table with one
# column per feature name reported by the underlying embedding model.
tfidf_embeddings = tfidf_model.embed(sample_utterance)
tfidf_feature_names = tfidf_model.embedding_model.get_feature_names_out()
tfidf_dense = tfidf_embeddings.toarray()

print(f"Number features: {len(tfidf_feature_names)}", tfidf_feature_names)
print(f"Shape embedding array: {tfidf_dense.shape}")

df_tfidf_output = pd.DataFrame(data=tfidf_dense, columns=tfidf_feature_names)
df_tfidf_output

#### 2.3 Sentence Transformer (Dense) Embeddings

In [None]:
# Instantiate the sentence-transformers model identified by 'all-minilm-l6-v2';
# the encoding cells below reuse this object.
st_model_small = SentenceTransformer('all-minilm-l6-v2')

In [None]:
# Encode sentence-wise: one embedding row per sentence of the sample utterance.
# A plain list is passed (as in the whole-dataset encoding cells) because
# `encode` indexes its input by integer position, which a pandas Series would
# interpret as an index label rather than a position.
st_embeddings = st_model_small.encode(sample_utterance.to_list())

# Axis 0 is the number of sentences; axis 1 is the embedding width, which is
# what "Number features" is meant to report.
print(f"Number features: {st_embeddings.shape[1]}")
print(f"Shape embedding array: {st_embeddings.shape}")
st_embeddings

In [None]:
# Encode utterance-wise: join the sample sentences with single spaces and
# embed the result as one text.
sample_utterance_text = " ".join(sample_utterance)
st_embeddings_u = st_model_small.encode(sample_utterance_text)

print(f"Number features: {len(st_embeddings_u)}")
print(f"Shape embedding array: {st_embeddings_u.shape}")
st_embeddings_u

### 3. Encode the Whole ParlaMint Dataset

#### 3.1 Encode with Sentence Transformer

In [None]:
# Utterance level: embed every grouped utterance text.
all_utterance_texts = df_parlamint_grouped["utterance_text"].to_list()
df_parlamint_embeddings_per_utterance = st_model_small.encode(all_utterance_texts, show_progress_bar=True)

# Sentence level: embed every sentence of the dataset.
all_sentence_texts = df_parlamint["Text"].to_list()
df_parlamint_embeddings_per_sentence = st_model_small.encode(all_sentence_texts, show_progress_bar=True)

In [None]:
# Attach one utterance-level embedding per row of the grouped frame
# (list() splits the result of encode() into one item per row).
df_parlamint_grouped["embedding"] = list(df_parlamint_embeddings_per_utterance)
df_parlamint_grouped

In [None]:
# Attach one sentence-level sentence-transformer embedding per row.
# NOTE: sections 3.3 and 5.2 later overwrite this same "embedding" column with
# TF-IDF vectors, so the column's content depends on which cell ran last.
df_parlamint["embedding"] = list(df_parlamint_embeddings_per_sentence)
df_parlamint

#### 3.2 Save output to pickle file

In [None]:
# Persist the frame (including the current "embedding" column) to a pickle
# file in the current working directory.
df_parlamint.to_pickle("df_parlamint_all-MiniLM-L6-v2.pkl")

#### 3.3 Encode Dataset with TF-IDF

In [None]:
# Build the TF-IDF embedder from every sentence of the dataset (max_features=100).
sentence_corpus = df_parlamint["Text"].to_list()
tfidf_model = TfIdfEmbedder(vocabulary=sentence_corpus, max_features=100, stop_words='english')

# Embed those same sentences, one row per sentence.
tfidf_embeddings_per_sentence = tfidf_model.embed(df_parlamint["Text"].to_list())

In [None]:
# Inspect the fitted feature names and the dense sentence-level TF-IDF matrix.
tfidf_feature_names = tfidf_model.embedding_model.get_feature_names_out()
tfidf_dense_per_sentence = tfidf_embeddings_per_sentence.toarray()

print(f"Number features: {len(tfidf_feature_names)}", tfidf_feature_names)
print(f"Shape embedding array: {tfidf_dense_per_sentence.shape}")
tfidf_dense_per_sentence

In [None]:
# Replace the "embedding" column with the dense TF-IDF rows.
# NOTE: this overwrites the sentence-transformer embeddings stored in the same
# column in section 3.1; cells that read "embedding" afterwards get TF-IDF vectors.
df_parlamint["embedding"] = list(tfidf_embeddings_per_sentence.toarray())
df_parlamint

#### 3.4 Save output to pickle file

In [None]:
# Persist the frame (including the current "embedding" column) to a pickle
# file in the current working directory.
df_parlamint.to_pickle("df_parlamint_all-tfidf.pkl")

#### 3.5 Load data from pickle file

In [None]:
# Reload the frame written in section 3.2. The previous "<filename_path>.pkl"
# placeholder raised FileNotFoundError and stopped a Restart-and-Run-All.
# Swap in "df_parlamint_all-tfidf.pkl" to load the TF-IDF variant instead.
# NOTE: only unpickle files you created yourself — loading a pickle can execute arbitrary code.
df_read_parlamint = pd.read_pickle("df_parlamint_all-MiniLM-L6-v2.pkl")

### 4. Calculate similarities between embeddings

In [None]:
# `util` supplies the cosine-similarity helper used for the dense embeddings below;
# sklearn's cosine_similarity is used for the TF-IDF embeddings.
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

# Instantiate the same model identifier as in section 2.3, rebinding st_model_small.
st_model_small = SentenceTransformer('all-minilm-l6-v2')

In [None]:
# 1. Example data: sentences using "bank" in a financial and in a river sense
sentences = [
    "I deposited my paycheck at the bank yesterday.",
    "We had a picnic on the bank of the river.",
    "The financial institution announced a new savings account plan.",
    "She withdrew cash from the nearest ATM.",
    "The kids played near the riverbank after school."
]
query = "financial services"

# 2. Dense (sentence-transformer) similarity of the query to every sentence
dense_embeddings_sentences = st_model_small.encode(sentences, convert_to_tensor=False)
dense_embeddings_query = st_model_small.encode([query], convert_to_tensor=False)
dense_similarity_matrix = util.cos_sim(dense_embeddings_query, dense_embeddings_sentences)
dense_similarities = dense_similarity_matrix[0].cpu().numpy()

# 3. Sparse (TF-IDF) similarity of the query to every sentence
tfidf_model = TfIdfEmbedder(vocabulary=sentences, max_features=10, stop_words='english')
tfidf_embeddings_sentences = tfidf_model.embed(sentences)
tfidf_embeddings_query = tfidf_model.embed([query])
tfidf_similarity_matrix = cosine_similarity(tfidf_embeddings_query, tfidf_embeddings_sentences)
tfidf_similarities = tfidf_similarity_matrix.flatten()

# 4. Put both similarity scores side by side, one row per sentence
df = pd.DataFrame(
    dict(
        sentence=sentences,
        tfidf_similarity=tfidf_similarities,
        st_similarity=dense_similarities,
    )
)
# df.sort_values(by=["st_similarity"], ascending=False, inplace=True)
df

### 5. How to build a Simple QA System

#### 5.1 Get the Most Likely Utterance

In [None]:
import numpy as np
from sentence_transformers import util

# Given question
question = "What is the government policy on climate change?"
# question = "What about president of america?"

# 1. Embed the question with the sentence-transformer model
question_embedding = st_model_small.encode(question)

# 2. Compute cosine similarities against the sentence-transformer embeddings of
#    every sentence. The embedding matrix from section 3.1 is used directly
#    instead of df_parlamint["embedding"], because section 3.3 overwrites that
#    column with TF-IDF vectors, which live in a different vector space than the
#    question embedding. Row i of the matrix was encoded from row i of df_parlamint.
cosine_similarities = util.cos_sim(question_embedding, df_parlamint_embeddings_per_sentence)[0].cpu().numpy()

# 3. Get the positional index of the most similar sentence
most_similar_idx = int(np.argmax(cosine_similarities))

# 4. Retrieve the most similar text
most_similar_text = df_parlamint.iloc[most_similar_idx]["Text"]
print(f"Score: {cosine_similarities[most_similar_idx]:.4f} | Utterance: {most_similar_text}\n")

In [None]:
# Display the sentence texts that the retrieval cells search over.
df_parlamint["Text"]

#### 5.2 Get the Top-K relevant Utterances

In [None]:
question = "What is the government policy on climate change?"
# question = "America?"
k = 5  # choose how many results you want

# 1. Embed the question with the sentence-transformer model
question_embedding = st_model_small.encode(question)

# 2. Compute cosine similarities against the sentence-transformer embeddings of
#    every sentence. The embedding matrix from section 3.1 is used directly
#    instead of df_parlamint["embedding"], because section 3.3 overwrites that
#    column with TF-IDF vectors, which live in a different vector space than the
#    question embedding. Row i of the matrix was encoded from row i of df_parlamint.
cosine_similarities = util.cos_sim(question_embedding, df_parlamint_embeddings_per_sentence)[0].cpu().numpy()

# 3. Get positional indices of the top-k most similar sentences (best first)
top_k_idx = np.argsort(cosine_similarities)[::-1][:k]

# 4. Retrieve the top-k texts and their similarity scores
for idx in top_k_idx:
    text = df_parlamint.iloc[idx]["Text"]
    score = cosine_similarities[idx]
    print(f"Score: {score:.4f} | Utterance: {text}\n")


In [None]:
# Rebuild the TF-IDF embedder on the sentence texts with max_features=1000,
# then embed every sentence with it.
sentence_series = df_parlamint["Text"]
tfidf_model = TfIdfEmbedder(vocabulary=sentence_series, max_features=1000, stop_words='english')
tfidf_embeddings_sentences = tfidf_model.embed(sentence_series.to_list())

In [None]:
# Replace the "embedding" column with the dense rows of the TF-IDF matrix
# computed in the previous cell (overwrites whatever the column held before).
df_parlamint["embedding"] = list(tfidf_embeddings_sentences.toarray())

In [None]:
question = "What is the government policy on climate change?"
# question = "America?"
k = 5  # choose how many results you want

# 1. Embed the question with the TF-IDF model (embed() takes a list of texts)
question_embedding = tfidf_model.embed([question])

# 2. Compute cosine similarities against the dense TF-IDF matrix of all sentences.
#    The matrix is taken straight from tfidf_embeddings_sentences rather than from
#    df_parlamint["embedding"]: that column is rewritten by several cells, and as a
#    Series of per-row arrays it would have to be converted element by element.
#    Row i of the matrix was embedded from row i of df_parlamint.
sentence_matrix = tfidf_embeddings_sentences.toarray()
cosine_similarities = util.cos_sim(question_embedding.toarray(), sentence_matrix)[0].cpu().numpy()

# 3. Get positional indices of the top-k most similar sentences (best first)
top_k_idx = np.argsort(cosine_similarities)[::-1][:k]

# 4. Retrieve the top-k texts and their similarity scores
for idx in top_k_idx:
    text = df_parlamint.iloc[idx]["Text"]
    score = cosine_similarities[idx]
    print(f"Score: {score:.4f} | Utterance: {text}\n")


In [None]:
import numpy as np

# Positions of the entries greater than zero in the question's dense TF-IDF vector.
np.nonzero(question_embedding.toarray() > 0)


In [None]:
# Inspect the TF-IDF model fitted in section 5.2 together with the embeddings it
# produced. The embeddings must come from `tfidf_embeddings_sentences`: the older
# `tfidf_embeddings` variable was created in section 2.2 by a different model with
# max_features=10, so it does not match the feature names of the current model.
tfidf_feature_names = tfidf_model.embedding_model.get_feature_names_out()
tfidf_dense_sentences = tfidf_embeddings_sentences.toarray()

print(f"Number features: {len(tfidf_feature_names)}", tfidf_feature_names)
print(f"Shape embedding array: {tfidf_dense_sentences.shape}")
df_tfidf_output = pd.DataFrame(columns=tfidf_feature_names, data=tfidf_dense_sentences)