In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from models.models import TfIdfEmbedder, CountVectorizerEmbedder

import pyarrow as pa
import pyarrow.parquet as pq

tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


### 1. Prepare parlamint dataset

#### 1.1. Load Dataset (Sentence-Wise)

In [2]:
# Load parlamint dataset
df_parlamint = pd.read_csv("../materials/parlamint/parlamint-it-is-2022.txt", sep="\t")
df_parlamint_subset = df_parlamint.head(1000).copy(deep=True)
df_parlamint

Unnamed: 0,ID,Parent_ID,Text
0,ParlaMint-IS_2022-01-17-20.seg2.1,ParlaMint-IS_2022-01-17-20.u1,President of the United States reports:
1,ParlaMint-IS_2022-01-17-20.seg3.1,ParlaMint-IS_2022-01-17-20.u1,"I have decided, according to the proposal of t..."
2,ParlaMint-IS_2022-01-17-20.seg4.1,ParlaMint-IS_2022-01-17-20.u1,"Arrange sites, January 11th, 2022."
3,ParlaMint-IS_2022-01-17-20.seg6.1,ParlaMint-IS_2022-01-17-20.u1,Katrín Jakobsdóttir's daughter.
4,ParlaMint-IS_2022-01-17-20.seg7.1,ParlaMint-IS_2022-01-17-20.u1,Presidential Letters for a meeting of the Gene...
...,...,...,...
160540,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,I would say that we can consider the work of t...
160541,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Of course, all good intentions about Parliamen..."
160542,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"The motions, questions and questions received ..."
160543,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,The Senate is summoned to the house.


#### 1.2. Group Dataset per Utterance

In [3]:
# Group sentence by utterance (=Parent_ID)
df_parlamint_grouped = (df_parlamint.groupby(["Parent_ID"])["Text"]
                        .apply(lambda s: " ".join(s))
                        .reset_index(name="utterance_text"))
df_parlamint_grouped

Unnamed: 0,Parent_ID,utterance_text
0,ParlaMint-IS_2022-01-17-20.u1,President of the United States reports: I have...
1,ParlaMint-IS_2022-01-17-20.u10,"Before the weekend, an article by Stefánssonar..."
2,ParlaMint-IS_2022-01-17-20.u11,"I read this decision in Perconte, which is not..."
3,ParlaMint-IS_2022-01-17-20.u12,"In fact, this is shown in the letter quoted by..."
4,ParlaMint-IS_2022-01-17-20.u13,"Yes, that's right. That's right. A senator who..."
...,...,...
13799,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Colleagues, Senator Verducci's speech concerns..."
13800,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Madam President, ladies and gentlemen, I would..."
13801,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Mr President, in my last speech last Tuesday I..."
13802,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Mr President, I wish to begin by expressing ou..."


In [4]:
sample_utterance = df_parlamint[df_parlamint["Parent_ID"] == "ParlaMint-IS_2022-01-17-20.u1"]["Text"]
sample_utterance

0              President of the United States reports:
1    I have decided, according to the proposal of t...
2                   Arrange sites, January 11th, 2022.
3                      Katrín Jakobsdóttir's daughter.
4    Presidential Letters for a meeting of the Gene...
5    I'd like to use this opportunity here after re...
Name: Text, dtype: object

### 2. Different Text Embedding Algorithms

#### 2.1. Count Vectorizer (Sparse) Embeddings

In [9]:
# Adding the whole parlamint dataset as vocabulary
# cv_model = CountVectorizerEmbedder(vocabulary=df_parlamint["Text"].to_list(), min_df=100, stop_words='english',
#                                    n_gram_range=(1, 3))

# Adding just the utterance sample as vocabulary
cv_model = CountVectorizerEmbedder(vocabulary=sample_utterance, min_df=1, stop_words='english',
                                   n_gram_range=(1, 3))

In [11]:
cv_embeddings = cv_model.embed(sample_utterance)
print(f"Number features: {len(cv_model.embedding_model.get_feature_names_out())}", cv_model.embedding_model.get_feature_names_out())
print(f"Shape embedding array: {cv_embeddings.toarray().shape}")
cv_embeddings.toarray()

Call 'transform' only...
Number features: 84 ['00' '11th' '11th 2022' '17' '17 2022' '2022' '2022 00' 'according'
 'according proposal' 'arrange' 'arrange sites' 'assembly'
 'assembly subsequent' 'conferences' 'council' 'council meet' 'daughter'
 'decided' 'decided according' 'extended' 'extended meeting' 'general'
 'general assembly' 'highest' 'highest president' 'jakobsdóttir'
 'jakobsdóttir daughter' 'january' 'january 11th' 'january 17' 'katrín'
 'katrín jakobsdóttir' 'letter' 'letter offer' 'letters' 'letters meeting'
 'like' 'like use' 'meet' 'meet extended' 'meeting' 'meeting general'
 'meeting january' 'meeting monday' 'minister' 'minister council' 'monday'
 'monday january' 'new' 'new year' 'offer' 'offer highest' 'opportunity'
 'opportunity reading' 'parliamentary' 'parliamentary conferences'
 'president' 'president senators' 'president united' 'presidential'
 'presidential letters' 'prime' 'prime minister' 'proposal'
 'proposal prime' 'reading' 'reading letter' 'reports' 'se

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

#### 2.2 TF-IDF (Sparse) Embeddings

In [12]:
# Adding the whole parlamint dataset as vocabulary
# tfidf_model = TfIdfEmbedder(vocabulary=df_parlamint["Text"].to_list(), min_df=100, stop_words='english')

# Adding just the utterance sample as vocabulary
tfidf_model = TfIdfEmbedder(vocabulary=sample_utterance, min_df=1, stop_words='english')

In [14]:
tfidf_embeddings = tfidf_model.embed(sample_utterance)
print(f"Number features: {len(tfidf_model.embedding_model.get_feature_names_out())}", tfidf_model.embedding_model.get_feature_names_out())
print(f"Shape embedding array: {tfidf_embeddings.toarray().shape}")
tfidf_embeddings.toarray()

Call 'transform' only...
Number features: 42 ['00' '11th' '17' '2022' 'according' 'arrange' 'assembly' 'conferences'
 'council' 'daughter' 'decided' 'extended' 'general' 'highest'
 'jakobsdóttir' 'january' 'katrín' 'letter' 'letters' 'like' 'meet'
 'meeting' 'minister' 'monday' 'new' 'offer' 'opportunity' 'parliamentary'
 'president' 'presidential' 'prime' 'proposal' 'reading' 'reports'
 'senators' 'sites' 'states' 'subsequent' 'united' 'use' 'welcome' 'year']
Shape embedding array: (6, 42)


array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.42790272, 0.        ,
        0.        , 0.        , 0.        , 0.52182349, 0.        ,
        0.        , 0.52182349, 0.        , 0.52182349, 0.        ,
        0.        , 0.        ],
       [0.28509311, 0.        , 0.23378043, 0.1973735 , 0.28509311,
        0.        , 0.        , 0.        , 0.28509311, 0.        ,
        0.28509311, 0.28509311, 0.        , 0.        , 0.        ,
        0.1973735 , 0.        , 0.        , 0.        , 0.        ,
        0.28509311, 0.23378043, 0.28509311, 0.28509311, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.28509

#### 2.3 Sentence Transformer (Dense) Embeddings

In [15]:
st_model_small = SentenceTransformer('all-minilm-l6-v2')

In [16]:
# Encode sentence-wise
st_embeddings = st_model_small.encode(sample_utterance)
print(f"Number features: {len(st_embeddings)}")
print(f"Shape embedding array: {st_embeddings.shape}")
st_embeddings

Number features: 6
Shape embedding array: (6, 384)


array([[ 0.00364317,  0.00757531, -0.01352414, ..., -0.01752406,
         0.00990045,  0.06021777],
       [-0.04760472, -0.06898866,  0.02710492, ..., -0.06230191,
        -0.1438545 ,  0.03971656],
       [-0.01698012, -0.04804515, -0.01744972, ..., -0.0390581 ,
        -0.07595797,  0.00352386],
       [-0.08331401, -0.04858722,  0.00143908, ...,  0.03839726,
         0.06088392, -0.01607142],
       [-0.0762682 , -0.0652778 ,  0.06610002, ...,  0.02295847,
        -0.10826013, -0.06591921],
       [ 0.01009848, -0.04537032,  0.0456478 , ..., -0.00065688,
        -0.08547055, -0.00425586]], shape=(6, 384), dtype=float32)

In [17]:
# Encode utterance-wise
st_embeddings_u = st_model_small.encode(" ".join(sample_utterance))
print(f"Number features: {len(st_embeddings_u)}")
print(f"Shape embedding array: {st_embeddings_u.shape}")
st_embeddings_u

Number features: 384
Shape embedding array: (384,)


array([-5.19019924e-02, -1.02959543e-01,  6.31395578e-02,  2.98001859e-02,
       -2.51417179e-02, -9.66067985e-03, -7.38494247e-02, -2.23561581e-02,
       -6.58515021e-02,  7.95469154e-03, -6.14717193e-02,  9.11202375e-03,
       -7.41380826e-02, -8.91721901e-03,  4.29925509e-02,  3.98151651e-02,
        5.71934041e-04, -2.14750618e-02,  4.03534696e-02, -3.10853450e-03,
        4.81391288e-02,  3.13877836e-02,  2.43081208e-02,  3.66798649e-03,
       -4.10232283e-02, -1.00540500e-02, -1.73178706e-02, -3.86848561e-02,
       -1.10482974e-02,  6.46823570e-02,  5.51152304e-02, -5.29568596e-03,
        7.86908418e-02,  2.21064351e-02,  5.85365221e-02,  2.10544141e-03,
        5.57183661e-02,  3.54420915e-02,  3.88754085e-02, -5.69796823e-02,
       -1.24397436e-02, -6.79901764e-02,  2.59145983e-02, -4.62711742e-03,
       -3.93818580e-02,  4.40331697e-02, -3.18672806e-02, -2.18540430e-03,
       -3.43883373e-02,  6.45774752e-02,  4.68766131e-03,  1.60147704e-03,
        1.24297924e-02, -

### 3. Encode whole Parlamint Dataset

#### 3.1 Encode with Sentence Transformer

In [24]:
# Encode utterance-wise dataset
df_parlamint_embeddings_per_utterance = st_model_small.encode(df_parlamint_grouped["utterance_text"].to_list(),
                                                     show_progress_bar=True)

# Encode sentence-wise dataset
df_parlamint_embeddings_per_sentence = st_model_small.encode(df_parlamint["Text"].to_list(), show_progress_bar=True)

Batches: 100%|██████████| 432/432 [00:10<00:00, 39.39it/s] 
Batches: 100%|██████████| 5018/5018 [00:34<00:00, 146.19it/s]


In [22]:
df_parlamint_grouped["embedding"] = list(df_parlamint_embeddings_per_utterance)
df_parlamint_grouped

Unnamed: 0,Parent_ID,utterance_text,embedding
0,ParlaMint-IS_2022-01-17-20.u1,President of the United States reports: I have...,"[-0.051902045, -0.1029595, 0.06313952, 0.02980..."
1,ParlaMint-IS_2022-01-17-20.u10,"Before the weekend, an article by Stefánssonar...","[-0.104611516, 0.06670187, -0.0565696, -0.0242..."
2,ParlaMint-IS_2022-01-17-20.u11,"I read this decision in Perconte, which is not...","[-0.003742227, 0.07299825, -0.017051963, -0.02..."
3,ParlaMint-IS_2022-01-17-20.u12,"In fact, this is shown in the letter quoted by...","[-0.10630381, 0.06314152, -0.014823501, 0.0141..."
4,ParlaMint-IS_2022-01-17-20.u13,"Yes, that's right. That's right. A senator who...","[-0.037640795, 0.1038077, -0.061545677, 0.0093..."
...,...,...,...
13799,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Colleagues, Senator Verducci's speech concerns...","[-0.019802073, 0.037965108, 0.08726237, 0.0270..."
13800,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Madam President, ladies and gentlemen, I would...","[-0.049527295, 0.014052414, 0.021675337, 0.010..."
13801,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Mr President, in my last speech last Tuesday I...","[-0.012908381, 0.027171515, 0.09444265, 0.0555..."
13802,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Mr President, I wish to begin by expressing ou...","[-0.00644074, 0.046314888, 0.06603416, -0.0122..."


In [23]:
df_parlamint["embedding"] = list(df_parlamint_embeddings_per_sentence)
df_parlamint

Unnamed: 0,ID,Parent_ID,Text,embedding
0,ParlaMint-IS_2022-01-17-20.seg2.1,ParlaMint-IS_2022-01-17-20.u1,President of the United States reports:,"[0.003643172, 0.0075753126, -0.0135241905, 0.0..."
1,ParlaMint-IS_2022-01-17-20.seg3.1,ParlaMint-IS_2022-01-17-20.u1,"I have decided, according to the proposal of t...","[-0.047604736, -0.06898866, 0.027104922, 0.042..."
2,ParlaMint-IS_2022-01-17-20.seg4.1,ParlaMint-IS_2022-01-17-20.u1,"Arrange sites, January 11th, 2022.","[-0.01698018, -0.048045084, -0.017449742, 0.01..."
3,ParlaMint-IS_2022-01-17-20.seg6.1,ParlaMint-IS_2022-01-17-20.u1,Katrín Jakobsdóttir's daughter.,"[-0.08331399, -0.048587173, 0.00143911, -0.043..."
4,ParlaMint-IS_2022-01-17-20.seg7.1,ParlaMint-IS_2022-01-17-20.u1,Presidential Letters for a meeting of the Gene...,"[-0.07626823, -0.06527784, 0.06610001, 0.01071..."
...,...,...,...,...
160540,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,I would say that we can consider the work of t...,"[-0.067902915, 0.039009538, 0.062254358, -0.04..."
160541,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Of course, all good intentions about Parliamen...","[0.016888414, 0.036268204, 0.04464855, -0.0834..."
160542,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"The motions, questions and questions received ...","[0.0019182335, 0.042933553, 0.044402953, -0.07..."
160543,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,The Senate is summoned to the house.,"[0.009923743, -0.0020061715, 0.029129716, 0.01..."


#### 3.2 Save output to pickle file

In [None]:
df_parlamint.to_pickle("df_parlamint_all-MiniLM-L6-v2.pkl")

#### 3.3 Encode Dataset with TF-IDF

In [30]:
# Adding the whole parlamint dataset as vocabulary
tfidf_model = TfIdfEmbedder(vocabulary=df_parlamint["Text"].to_list(), min_df=200, stop_words='english')

# Encode sentence-wise dataset
tfidf_embeddings_per_sentence = tfidf_model.embed(df_parlamint["Text"].to_list())

Call 'transform' only...


In [31]:
print(f"Number features: {len(tfidf_model.embedding_model.get_feature_names_out())}", tfidf_model.embedding_model.get_feature_names_out())
print(f"Shape embedding array: {tfidf_embeddings_per_sentence.toarray().shape}")
tfidf_embeddings_per_sentence.toarray()

Number features: 1640 ['000' '10' '100' ... 'yes' 'yesterday' 'young']
Shape embedding array: (160545, 1640)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(160545, 1640))

In [33]:
df_parlamint["embedding"] = list(tfidf_embeddings_per_sentence.toarray())
df_parlamint

Unnamed: 0,ID,Parent_ID,Text,embedding
0,ParlaMint-IS_2022-01-17-20.seg2.1,ParlaMint-IS_2022-01-17-20.u1,President of the United States reports:,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,ParlaMint-IS_2022-01-17-20.seg3.1,ParlaMint-IS_2022-01-17-20.u1,"I have decided, according to the proposal of t...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,ParlaMint-IS_2022-01-17-20.seg4.1,ParlaMint-IS_2022-01-17-20.u1,"Arrange sites, January 11th, 2022.","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,ParlaMint-IS_2022-01-17-20.seg6.1,ParlaMint-IS_2022-01-17-20.u1,Katrín Jakobsdóttir's daughter.,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,ParlaMint-IS_2022-01-17-20.seg7.1,ParlaMint-IS_2022-01-17-20.u1,Presidential Letters for a meeting of the Gene...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
160540,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,I would say that we can consider the work of t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
160541,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Of course, all good intentions about Parliamen...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
160542,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"The motions, questions and questions received ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
160543,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,The Senate is summoned to the house.,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


#### 3.4 Save output to pickle file

In [None]:
df_parlamint.to_pickle("df_parlamint_all-tfidf.pkl")

### 4. How to build a Simple QA System

#### 4.1 Get the Most Likely Utterance

In [56]:
import numpy as np
from sentence_transformers import util

# Given question
question = "What is the government policy on climate change?"
# question = "What about president of america?"

# 1. Embed the question
question_embedding = st_model_small.encode(question)

# 2. Compute cosine similarities
cosine_similarities = util.cos_sim(question_embedding, df_parlamint_grouped["embedding"])[0].cpu().numpy()

# 3. Get the index of the most similar utterance
most_similar_idx = int(np.argmax(cosine_similarities))

# 4. Retrieve the most similar text
most_similar_text = df_parlamint_grouped.iloc[most_similar_idx]["utterance_text"]
# most_similar_text
print(f"Score: {cosine_similarities[most_similar_idx]:.4f} | Utterance: {most_similar_text}\n")

Score: 0.5964 | Utterance: After listening to the highest. Minister, both today and yesterday, I get a little bit of the feeling that he looks at himself and his Ministry more like an observer than a doer when it comes to reducing greenhouse gas emissions. It is best that Ministers burn for more and larger activations, but, as the energy manager has noted, the energy flows directly into the energy exchange. It takes a very clear policy, but it needs a whole plan to make sure it does. That's why it hurts to the top. Ministers will not give us very clear answers on how Iceland's national target of climate change will be updated, when it will happen, and whether the government's climate programme of action will be reviewed and how these updated targets will appear in government policy and measures at all times. But I hope it reaches the highest. Minister to review it better afterwards. Last night we talked about the bus and the electric car truck. It turned out that the government still a

#### 4.2 Get the Top-K relevant Utterances

In [55]:
question = "What is the government policy on climate change?"
# question = "America?"
k = 5  # choose how many results you want

# 1. Embed the question
question_embedding = st_model_small.encode(question)

# 2. Compute cosine similarities
cosine_similarities = util.cos_sim(question_embedding, df_parlamint_embeddings_per_utterance)[0].cpu().numpy()

# 3. Get indices of top-k most similar utterances
top_k_idx = np.argsort(cosine_similarities)[::-1][:k]

# 4. Retrieve the top-k utterances and their similarity scores
for idx in top_k_idx:
    text = df_parlamint_grouped.iloc[idx]["utterance_text"]
    score = cosine_similarities[idx]
    print(f"Score: {score:.4f} | Utterance: {text}\n")


Score: 0.5964 | Utterance: After listening to the highest. Minister, both today and yesterday, I get a little bit of the feeling that he looks at himself and his Ministry more like an observer than a doer when it comes to reducing greenhouse gas emissions. It is best that Ministers burn for more and larger activations, but, as the energy manager has noted, the energy flows directly into the energy exchange. It takes a very clear policy, but it needs a whole plan to make sure it does. That's why it hurts to the top. Ministers will not give us very clear answers on how Iceland's national target of climate change will be updated, when it will happen, and whether the government's climate programme of action will be reviewed and how these updated targets will appear in government policy and measures at all times. But I hope it reaches the highest. Minister to review it better afterwards. Last night we talked about the bus and the electric car truck. It turned out that the government still a

In [43]:
cosine_similarities

tensor([0.1440, 0.0776, 0.2545,  ..., 0.2605, 0.2261, 0.3435])

#### ************************************************************************************************************************

In [5]:
print("Init 'paraphrase-multilingual-MiniLM-L12-v2'")
st_model_large = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
print("Init 'all-minilm-l6-v2'")
st_model_small = SentenceTransformer('all-minilm-l6-v2')
print("Init 'TfIdfEmbedder'")
tfidf_model = TfIdfEmbedder(vocabulary=df_parlamint["Text"].to_list(), min_df=100, stop_words='english')
print("Init 'CountVectorizerEmbedder'")
cv_model = CountVectorizerEmbedder(vocabulary=df_parlamint["Text"].to_list(), min_df=100, stop_words='english',
                                   n_gram_range=(1, 3))

Init 'paraphrase-multilingual-MiniLM-L12-v2'
Init 'all-minilm-l6-v2'
Init 'TfIdfEmbedder'
Init 'CountVectorizerEmbedder'


In [66]:
df_parlamint_embeddings_per_utterance = st_model_small.encode(df_parlamint_grouped["utterance_text"].to_list(),
                                                     show_progress_bar=True)
df_parlamint_embeddings_per_sentence = st_model_small.encode(df_parlamint["Text"].to_list(), show_progress_bar=True)

Batches: 100%|██████████| 432/432 [00:10<00:00, 40.30it/s] 
Batches: 100%|██████████| 5018/5018 [00:33<00:00, 148.42it/s]


In [19]:
df_parlamint_grouped["embedding"] = list(df_parlamint_embeddings_per_utterance)
df_parlamint["embedding"] = list(df_parlamint_embeddings_per_sentence)

In [68]:
table = pa.Table.from_pandas(df=df_parlamint, preserve_index=False)
pq.write_table(table, "df_parlamint_all-MiniLM-L6-v2.parquet", compression="zstd")
df_parlamint.to_pickle("df_parlamint_all-MiniLM-L6-v2.pkl")

# # Load
# df_loaded = pq.read_table("df_parlamint_all-MiniLM-L6-v2.parquet").to_pandas()

In [10]:
tfidf_embeddings = tfidf_model.embed(df_parlamint["Text"].to_list())
print(f"Number features: {len(tfidf_model.embedding_model.get_feature_names_out())}", tfidf_model.embedding_model.get_feature_names_out())
print(f"Shape embedding array: {tfidf_embeddings.toarray().shape}")
tfidf_embeddings.toarray()

Call 'transform' only...
Number features: 2640 ['000' '10' '100' ... 'zero' 'ármannsson' 'ólafsson']
Shape embedding array: (160545, 2640)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(160545, 2640))

In [None]:
sample_utterance_embeddings = tfidf_model.embed(df_parlamint["Text"].to_list())
print(f"Number features: {len(tfidf_model.embedding_model.get_feature_names_out())}", tfidf_model.embedding_model.get_feature_names_out())
print(f"Shape embedding array: {sample_utterance_embeddings.toarray().shape}")
sample_utterance_embeddings.toarray()

In [7]:
sample_utterance_embeddings = tfidf_model.embed(sample_utterance)
print(tfidf_model.embedding_model.get_feature_names_out())
sample_utterance_embeddings.toarray()

['00' '000' '000k' ... 'ólason' 'þórsdóttir' 'þórunn']


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.35023074, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], shape=(6, 2666))

In [9]:
sample_utterance_embeddings = cv_model.embed(sample_utterance)
print(cv_model.embedding_model.get_feature_names_out())
sample_utterance_embeddings.toarray()

['000' '15' '17' '2020' '2021' '2022' 'able' 'abroad' 'access' 'according'
 'account' 'action' 'actions' 'activities' 'actually' 'added' 'addition'
 'affairs' 'affected' 'afghanistan' 'ago' 'agree' 'agreement' 'ahead'
 'allow' 'answer' 'application' 'apply' 'approach' 'area' 'article' 'ask'
 'ask highest' 'ask highest minister' 'asking' 'assembly' 'assessment'
 'attention' 'authority' 'available' 'away' 'bad' 'based' 'basis'
 'beginning' 'believe' 'better' 'big' 'billion' 'blood' 'board' 'bring'
 'budget' 'business' 'business committee' 'businesses' 'calculated'
 'called' 'came' 'care' 'careful' 'carried' 'case' 'cases' 'certain'
 'certainly' 'change' 'changes' 'children' 'christmas' 'circumstances'
 'class' 'clear' 'clearly' 'come' 'comes' 'committee' 'community'
 'companies' 'company' 'compared' 'completed' 'completely' 'concerned'
 'condition' 'conditions' 'congress' 'consequences' 'consider'
 'considered' 'context' 'continuation' 'continue' 'contract' 'control'
 'cooperation' 'cope

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], shape=(6, 537))

In [13]:
def _apply_embeddings(row):
    sentence_embedding = model.encode(row["Text"], show_progress_bar=False)
    return sentence_embedding


# df_parlamint_subset["embedding"] = df_parlamint_subset.progress_apply(lambda e: _apply_embeddings(e), axis=1)
df_parlamint["embedding"] = df_parlamint.progress_apply(lambda e: _apply_embeddings(e), axis=1)

100%|██████████| 160545/160545 [10:28<00:00, 255.61it/s]


In [15]:
df_parlamint.iloc[1]

ID                           ParlaMint-IS_2022-01-17-20.seg3.1
Parent_ID                        ParlaMint-IS_2022-01-17-20.u1
Text         I have decided, according to the proposal of t...
embedding    [-0.04760472, -0.068988696, 0.02710493, 0.0427...
Name: 1, dtype: object

In [23]:
# df_parlamint_subset.to_pickle("sample.pkl")
# df_parlamint.to_pickle("df_parlamint.pkl")
df_parlamint.to_pickle("df_parlamint_all-MiniLM-L6-v2.pkl")

In [53]:
df_parlamint

Unnamed: 0,ID,Parent_ID,Text
0,ParlaMint-IS_2022-01-17-20.seg2.1,ParlaMint-IS_2022-01-17-20.u1,President of the United States reports:
1,ParlaMint-IS_2022-01-17-20.seg3.1,ParlaMint-IS_2022-01-17-20.u1,"I have decided, according to the proposal of t..."
2,ParlaMint-IS_2022-01-17-20.seg4.1,ParlaMint-IS_2022-01-17-20.u1,"Arrange sites, January 11th, 2022."
3,ParlaMint-IS_2022-01-17-20.seg6.1,ParlaMint-IS_2022-01-17-20.u1,Katrín Jakobsdóttir's daughter.
4,ParlaMint-IS_2022-01-17-20.seg7.1,ParlaMint-IS_2022-01-17-20.u1,Presidential Letters for a meeting of the Gene...
...,...,...,...
160540,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,I would say that we can consider the work of t...
160541,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Of course, all good intentions about Parliamen..."
160542,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"The motions, questions and questions received ..."
160543,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,The Senate is summoned to the house.


In [13]:
df_read = pd.read_pickle("./data/SENTENCE_TRANSFORMER_SMALL_PER_SENTENCE.pkl")
# df_read = pd.read_pickle("./data/SENTENCE_TRANSFORMER_SMALL_PER_UTTERANCE.pkl")
df_read

Unnamed: 0,ID,Parent_ID,Text,embedding
0,ParlaMint-IS_2022-01-17-20.seg2.1,ParlaMint-IS_2022-01-17-20.u1,President of the United States reports:,"[0.003643172, 0.0075753126, -0.0135241905, 0.0..."
1,ParlaMint-IS_2022-01-17-20.seg3.1,ParlaMint-IS_2022-01-17-20.u1,"I have decided, according to the proposal of t...","[-0.047604736, -0.06898866, 0.027104922, 0.042..."
2,ParlaMint-IS_2022-01-17-20.seg4.1,ParlaMint-IS_2022-01-17-20.u1,"Arrange sites, January 11th, 2022.","[-0.01698018, -0.048045084, -0.017449742, 0.01..."
3,ParlaMint-IS_2022-01-17-20.seg6.1,ParlaMint-IS_2022-01-17-20.u1,Katrín Jakobsdóttir's daughter.,"[-0.08331399, -0.048587173, 0.00143911, -0.043..."
4,ParlaMint-IS_2022-01-17-20.seg7.1,ParlaMint-IS_2022-01-17-20.u1,Presidential Letters for a meeting of the Gene...,"[-0.07626823, -0.06527784, 0.06610001, 0.01071..."
...,...,...,...,...
160540,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,I would say that we can consider the work of t...,"[-0.067902915, 0.039009538, 0.062254358, -0.04..."
160541,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Of course, all good intentions about Parliamen...","[0.016888414, 0.036268204, 0.04464855, -0.0834..."
160542,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"The motions, questions and questions received ...","[0.0019182335, 0.042933553, 0.044402953, -0.07..."
160543,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,The Senate is summoned to the house.,"[0.009923743, -0.0020061715, 0.029129716, 0.01..."
