In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from models.models import TfIdfEmbedder, CountVectorizerEmbedder

import pyarrow as pa
import pyarrow.parquet as pq

tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load parlamint dataset
df_parlamint = pd.read_csv("../materials/parlamint/parlamint-it-is-2022.txt", sep="\t")
df_parlamint_subset = df_parlamint.head(1000).copy(deep=True)
df_parlamint

Unnamed: 0,ID,Parent_ID,Text
0,ParlaMint-IS_2022-01-17-20.seg2.1,ParlaMint-IS_2022-01-17-20.u1,President of the United States reports:
1,ParlaMint-IS_2022-01-17-20.seg3.1,ParlaMint-IS_2022-01-17-20.u1,"I have decided, according to the proposal of t..."
2,ParlaMint-IS_2022-01-17-20.seg4.1,ParlaMint-IS_2022-01-17-20.u1,"Arrange sites, January 11th, 2022."
3,ParlaMint-IS_2022-01-17-20.seg6.1,ParlaMint-IS_2022-01-17-20.u1,Katrín Jakobsdóttir's daughter.
4,ParlaMint-IS_2022-01-17-20.seg7.1,ParlaMint-IS_2022-01-17-20.u1,Presidential Letters for a meeting of the Gene...
...,...,...,...
160540,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,I would say that we can consider the work of t...
160541,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Of course, all good intentions about Parliamen..."
160542,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"The motions, questions and questions received ..."
160543,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,The Senate is summoned to the house.


In [3]:
# Group sentence by utterance (=Parent_ID)
df_parlamint_grouped = (df_parlamint.groupby(["Parent_ID"])["Text"]
                        .apply(lambda s: " ".join(s))
                        .reset_index(name="utterance_text"))
df_parlamint_grouped

Unnamed: 0,Parent_ID,utterance_text
0,ParlaMint-IS_2022-01-17-20.u1,President of the United States reports: I have...
1,ParlaMint-IS_2022-01-17-20.u10,"Before the weekend, an article by Stefánssonar..."
2,ParlaMint-IS_2022-01-17-20.u11,"I read this decision in Perconte, which is not..."
3,ParlaMint-IS_2022-01-17-20.u12,"In fact, this is shown in the letter quoted by..."
4,ParlaMint-IS_2022-01-17-20.u13,"Yes, that's right. That's right. A senator who..."
...,...,...
13799,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Colleagues, Senator Verducci's speech concerns..."
13800,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Madam President, ladies and gentlemen, I would..."
13801,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Mr President, in my last speech last Tuesday I..."
13802,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Mr President, I wish to begin by expressing ou..."


In [4]:
sample_utterance = df_parlamint[df_parlamint["Parent_ID"] == "ParlaMint-IS_2022-01-17-20.u1"]["Text"]
sample_utterance

0              President of the United States reports:
1    I have decided, according to the proposal of t...
2                   Arrange sites, January 11th, 2022.
3                      Katrín Jakobsdóttir's daughter.
4    Presidential Letters for a meeting of the Gene...
5    I'd like to use this opportunity here after re...
Name: Text, dtype: object

In [5]:
print("Init 'paraphrase-multilingual-MiniLM-L12-v2'")
st_model_large = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
print("Init 'all-minilm-l6-v2'")
st_model_small = SentenceTransformer('all-minilm-l6-v2')
print("Init 'TfIdfEmbedder'")
tfidf_model = TfIdfEmbedder(vocabulary=df_parlamint["Text"].to_list(), min_df=100, stop_words='english')
print("Init 'CountVectorizerEmbedder'")
cv_model = CountVectorizerEmbedder(vocabulary=df_parlamint["Text"].to_list(), min_df=100, stop_words='english',
                                   n_gram_range=(1, 3))

Init 'paraphrase-multilingual-MiniLM-L12-v2'
Init 'all-minilm-l6-v2'
Init 'TfIdfEmbedder'
Init 'CountVectorizerEmbedder'


In [66]:
df_parlamint_embeddings_per_utterance = st_model_small.encode(df_parlamint_grouped["utterance_text"].to_list(),
                                                     show_progress_bar=True)
df_parlamint_embeddings_per_sentence = st_model_small.encode(df_parlamint["Text"].to_list(), show_progress_bar=True)

Batches: 100%|██████████| 432/432 [00:10<00:00, 40.30it/s] 
Batches: 100%|██████████| 5018/5018 [00:33<00:00, 148.42it/s]


In [67]:
df_parlamint_grouped["embedding"] = list(df_parlamint_embeddings_per_utterance)
df_parlamint["embedding"] = list(df_parlamint_embeddings_per_sentence)

In [68]:
table = pa.Table.from_pandas(df=df_parlamint, preserve_index=False)
pq.write_table(table, "df_parlamint_all-MiniLM-L6-v2.parquet", compression="zstd")
df_parlamint.to_pickle("df_parlamint_all-MiniLM-L6-v2.pkl")

# # Load
# df_loaded = pq.read_table("df_parlamint_all-MiniLM-L6-v2.parquet").to_pandas()

In [10]:
tfidf_embeddings = tfidf_model.embed(df_parlamint["Text"].to_list())
print(f"Number features: {len(tfidf_model.embedding_model.get_feature_names_out())}", tfidf_model.embedding_model.get_feature_names_out())
print(f"Shape embedding array: {tfidf_embeddings.toarray().shape}")
tfidf_embeddings.toarray()

Call 'transform' only...
Number features: 2640 ['000' '10' '100' ... 'zero' 'ármannsson' 'ólafsson']
Shape embedding array: (160545, 2640)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(160545, 2640))

In [None]:
sample_utterance_embeddings = tfidf_model.embed(df_parlamint["Text"].to_list())
print(f"Number features: {len(tfidf_model.embedding_model.get_feature_names_out())}", tfidf_model.embedding_model.get_feature_names_out())
print(f"Shape embedding array: {sample_utterance_embeddings.toarray().shape}")
sample_utterance_embeddings.toarray()

In [7]:
sample_utterance_embeddings = tfidf_model.embed(sample_utterance)
print(tfidf_model.embedding_model.get_feature_names_out())
sample_utterance_embeddings.toarray()

['00' '000' '000k' ... 'ólason' 'þórsdóttir' 'þórunn']


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.35023074, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], shape=(6, 2666))

In [9]:
sample_utterance_embeddings = cv_model.embed(sample_utterance)
print(cv_model.embedding_model.get_feature_names_out())
sample_utterance_embeddings.toarray()

['000' '15' '17' '2020' '2021' '2022' 'able' 'abroad' 'access' 'according'
 'account' 'action' 'actions' 'activities' 'actually' 'added' 'addition'
 'affairs' 'affected' 'afghanistan' 'ago' 'agree' 'agreement' 'ahead'
 'allow' 'answer' 'application' 'apply' 'approach' 'area' 'article' 'ask'
 'ask highest' 'ask highest minister' 'asking' 'assembly' 'assessment'
 'attention' 'authority' 'available' 'away' 'bad' 'based' 'basis'
 'beginning' 'believe' 'better' 'big' 'billion' 'blood' 'board' 'bring'
 'budget' 'business' 'business committee' 'businesses' 'calculated'
 'called' 'came' 'care' 'careful' 'carried' 'case' 'cases' 'certain'
 'certainly' 'change' 'changes' 'children' 'christmas' 'circumstances'
 'class' 'clear' 'clearly' 'come' 'comes' 'committee' 'community'
 'companies' 'company' 'compared' 'completed' 'completely' 'concerned'
 'condition' 'conditions' 'congress' 'consequences' 'consider'
 'considered' 'context' 'continuation' 'continue' 'contract' 'control'
 'cooperation' 'cope

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], shape=(6, 537))

In [13]:
def _apply_embeddings(row):
    sentence_embedding = model.encode(row["Text"], show_progress_bar=False)
    return sentence_embedding


# df_parlamint_subset["embedding"] = df_parlamint_subset.progress_apply(lambda e: _apply_embeddings(e), axis=1)
df_parlamint["embedding"] = df_parlamint.progress_apply(lambda e: _apply_embeddings(e), axis=1)

100%|██████████| 160545/160545 [10:28<00:00, 255.61it/s]


In [15]:
df_parlamint.iloc[1]

ID                           ParlaMint-IS_2022-01-17-20.seg3.1
Parent_ID                        ParlaMint-IS_2022-01-17-20.u1
Text         I have decided, according to the proposal of t...
embedding    [-0.04760472, -0.068988696, 0.02710493, 0.0427...
Name: 1, dtype: object

In [23]:
# df_parlamint_subset.to_pickle("sample.pkl")
# df_parlamint.to_pickle("df_parlamint.pkl")
df_parlamint.to_pickle("df_parlamint_all-MiniLM-L6-v2.pkl")

In [53]:
df_parlamint

Unnamed: 0,ID,Parent_ID,Text
0,ParlaMint-IS_2022-01-17-20.seg2.1,ParlaMint-IS_2022-01-17-20.u1,President of the United States reports:
1,ParlaMint-IS_2022-01-17-20.seg3.1,ParlaMint-IS_2022-01-17-20.u1,"I have decided, according to the proposal of t..."
2,ParlaMint-IS_2022-01-17-20.seg4.1,ParlaMint-IS_2022-01-17-20.u1,"Arrange sites, January 11th, 2022."
3,ParlaMint-IS_2022-01-17-20.seg6.1,ParlaMint-IS_2022-01-17-20.u1,Katrín Jakobsdóttir's daughter.
4,ParlaMint-IS_2022-01-17-20.seg7.1,ParlaMint-IS_2022-01-17-20.u1,Presidential Letters for a meeting of the Gene...
...,...,...,...
160540,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,I would say that we can consider the work of t...
160541,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Of course, all good intentions about Parliamen..."
160542,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"The motions, questions and questions received ..."
160543,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,The Senate is summoned to the house.


In [13]:
df_read = pd.read_pickle("./data/SENTENCE_TRANSFORMER_SMALL_PER_SENTENCE.pkl")
# df_read = pd.read_pickle("./data/SENTENCE_TRANSFORMER_SMALL_PER_UTTERANCE.pkl")
df_read

Unnamed: 0,ID,Parent_ID,Text,embedding
0,ParlaMint-IS_2022-01-17-20.seg2.1,ParlaMint-IS_2022-01-17-20.u1,President of the United States reports:,"[0.003643172, 0.0075753126, -0.0135241905, 0.0..."
1,ParlaMint-IS_2022-01-17-20.seg3.1,ParlaMint-IS_2022-01-17-20.u1,"I have decided, according to the proposal of t...","[-0.047604736, -0.06898866, 0.027104922, 0.042..."
2,ParlaMint-IS_2022-01-17-20.seg4.1,ParlaMint-IS_2022-01-17-20.u1,"Arrange sites, January 11th, 2022.","[-0.01698018, -0.048045084, -0.017449742, 0.01..."
3,ParlaMint-IS_2022-01-17-20.seg6.1,ParlaMint-IS_2022-01-17-20.u1,Katrín Jakobsdóttir's daughter.,"[-0.08331399, -0.048587173, 0.00143911, -0.043..."
4,ParlaMint-IS_2022-01-17-20.seg7.1,ParlaMint-IS_2022-01-17-20.u1,Presidential Letters for a meeting of the Gene...,"[-0.07626823, -0.06527784, 0.06610001, 0.01071..."
...,...,...,...,...
160540,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,I would say that we can consider the work of t...,"[-0.067902915, 0.039009538, 0.062254358, -0.04..."
160541,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Of course, all good intentions about Parliamen...","[0.016888414, 0.036268204, 0.04464855, -0.0834..."
160542,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"The motions, questions and questions received ...","[0.0019182335, 0.042933553, 0.044402953, -0.07..."
160543,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,The Senate is summoned to the house.,"[0.009923743, -0.0020061715, 0.029129716, 0.01..."
