In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from models.models import TfIdfEmbedder, CountVectorizerEmbedder

tqdm.pandas()

In [3]:
df_parlamint = pd.read_csv("../materials/parlamint/parlamint-it-is-2022.txt", sep="\t")
df_parlamint_subset = df_parlamint.head(1000).copy(deep=True)
df_parlamint

Unnamed: 0,ID,Parent_ID,Text
0,ParlaMint-IS_2022-01-17-20.seg2.1,ParlaMint-IS_2022-01-17-20.u1,President of the United States reports:
1,ParlaMint-IS_2022-01-17-20.seg3.1,ParlaMint-IS_2022-01-17-20.u1,"I have decided, according to the proposal of t..."
2,ParlaMint-IS_2022-01-17-20.seg4.1,ParlaMint-IS_2022-01-17-20.u1,"Arrange sites, January 11th, 2022."
3,ParlaMint-IS_2022-01-17-20.seg6.1,ParlaMint-IS_2022-01-17-20.u1,Katrín Jakobsdóttir's daughter.
4,ParlaMint-IS_2022-01-17-20.seg7.1,ParlaMint-IS_2022-01-17-20.u1,Presidential Letters for a meeting of the Gene...
...,...,...,...
160540,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,I would say that we can consider the work of t...
160541,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"Of course, all good intentions about Parliamen..."
160542,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,"The motions, questions and questions received ..."
160543,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,ParlaMint-IT_2022-09-20-LEG18-Senato-sed-465.a...,The Senate is summoned to the house.


In [5]:
sample_utterance = df_parlamint[df_parlamint["Parent_ID"] == "ParlaMint-IS_2022-01-17-20.u1"]["Text"].to_list()
sample_utterance

['President of the United States reports:',
 'I have decided, according to the proposal of the prime minister, that the Council should meet for an extended meeting on Monday, January 17, 2022 p.m. 3:00.',
 'Arrange sites, January 11th, 2022.',
 "Katrín Jakobsdóttir's daughter.",
 'Presidential Letters for a meeting of the General Assembly for a subsequent meeting on January 17, 2022',
 "I'd like to use this opportunity here after reading this letter and offer the highest. President and w. Senators welcome to New Year's Parliamentary Conferences."]

In [12]:
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
# model = SentenceTransformer('all-minilm-l6-v2')

In [14]:
sample_utterance_embeddings = model.encode(sample_utterance, show_progress_bar=True)
sample_utterance_embeddings

Batches: 100%|██████████| 1/1 [00:00<00:00, 113.48it/s]


array([[ 0.00364317,  0.00757531, -0.01352414, ..., -0.01752406,
         0.00990045,  0.06021777],
       [-0.04760472, -0.06898866,  0.02710492, ..., -0.06230191,
        -0.1438545 ,  0.03971656],
       [-0.01698012, -0.04804515, -0.01744972, ..., -0.0390581 ,
        -0.07595797,  0.00352386],
       [-0.08331401, -0.04858722,  0.00143908, ...,  0.03839726,
         0.06088392, -0.01607142],
       [-0.0762682 , -0.0652778 ,  0.06610002, ...,  0.02295847,
        -0.10826013, -0.06591921],
       [ 0.01009848, -0.04537032,  0.0456478 , ..., -0.00065688,
        -0.08547055, -0.00425586]], shape=(6, 384), dtype=float32)

In [6]:
tfidf_model = TfIdfEmbedder(vocabulary=df_parlamint_subset["Text"].to_list(), min_df=1, stop_words='english')

In [7]:
sample_utterance_embeddings = tfidf_model.embed(sample_utterance)
print(tfidf_model.embedding_model.get_feature_names_out())
sample_utterance_embeddings.toarray()

['00' '000' '000k' ... 'ólason' 'þórsdóttir' 'þórunn']


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.35023074, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], shape=(6, 2666))

In [11]:
cv_model = CountVectorizerEmbedder(vocabulary=df_parlamint_subset["Text"].to_list(), min_df=5, n_gram_range=(1, 3))

In [12]:
sample_utterance_embeddings = cv_model.embed(sample_utterance)
print(cv_model.embedding_model.get_feature_names_out())
sample_utterance_embeddings.toarray()

['000' '15' '17' ... 'you senator' 'you very' 'you very much']


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(6, 1380))

In [31]:
def _apply_embeddings(row):
    sentence_embedding = model.encode(row["Text"], show_progress_bar=False)
    return sentence_embedding

# df_parlamint_subset["embedding"] = df_parlamint_subset.progress_apply(lambda e: _apply_embeddings(e), axis=1)
df_parlamint["embedding"] = df_parlamint.progress_apply(lambda e: _apply_embeddings(e), axis=1)

100%|██████████| 160545/160545 [10:35<00:00, 252.67it/s]


In [27]:
df_parlamint_subset.iloc[1]

ID                           ParlaMint-IS_2022-01-17-20.seg3.1
Parent_ID                        ParlaMint-IS_2022-01-17-20.u1
Text         I have decided, according to the proposal of t...
embedding    [-0.04760472, -0.068988696, 0.02710493, 0.0427...
Name: 1, dtype: object

In [34]:
# df_parlamint_subset.to_pickle("sample.pkl")
df_parlamint.to_pickle("df_parlamint.pkl")

In [29]:
df_read = pd.read_pickle("sample.pkl")
df_read

Unnamed: 0,ID,Parent_ID,Text,embedding
0,ParlaMint-IS_2022-01-17-20.seg2.1,ParlaMint-IS_2022-01-17-20.u1,President of the United States reports:,"[0.0036431686, 0.0075752586, -0.0135241775, 0...."
1,ParlaMint-IS_2022-01-17-20.seg3.1,ParlaMint-IS_2022-01-17-20.u1,"I have decided, according to the proposal of t...","[-0.04760472, -0.068988696, 0.02710493, 0.0427..."
2,ParlaMint-IS_2022-01-17-20.seg4.1,ParlaMint-IS_2022-01-17-20.u1,"Arrange sites, January 11th, 2022.","[-0.016980091, -0.04804512, -0.017449707, 0.01..."
3,ParlaMint-IS_2022-01-17-20.seg6.1,ParlaMint-IS_2022-01-17-20.u1,Katrín Jakobsdóttir's daughter.,"[-0.083313994, -0.04858721, 0.0014390834, -0.0..."
4,ParlaMint-IS_2022-01-17-20.seg7.1,ParlaMint-IS_2022-01-17-20.u1,Presidential Letters for a meeting of the Gene...,"[-0.07626818, -0.06527784, 0.0661, 0.010715253..."
...,...,...,...,...
995,ParlaMint-IS_2022-01-18-23.seg35.9,ParlaMint-IS_2022-01-18-23.u16,Not only is there an increase in the pay of he...,"[-0.041002072, 0.0683856, 0.042150404, 0.06849..."
996,ParlaMint-IS_2022-01-18-23.seg35.10,ParlaMint-IS_2022-01-18-23.u16,Improvements are urgent so that health care pr...,"[0.0063565983, 0.0662494, 0.083550535, -0.0025..."
997,ParlaMint-IS_2022-01-18-23.seg35.11,ParlaMint-IS_2022-01-18-23.u16,There are ways to increase the number of stude...,"[0.01226466, 0.026052875, 0.054193206, 0.00240..."
998,ParlaMint-IS_2022-01-18-23.seg35.12,ParlaMint-IS_2022-01-18-23.u16,The health system will not be improved unless ...,"[-0.006773294, 0.057199407, 0.014007235, -0.03..."
