<a href="https://colab.research.google.com/github/Deminalla/Kursinis/blob/main/fta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas
!pip install scikit-learn



In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

# One seed shared by the shuffle and both splits so that a fresh
# "Restart & Run All" reproduces exactly the same rows in each subset.
SEED = 42
file_path = 'FTA_dataset.xlsx'


def load_labelled_sheet(path, sheet_name, label):
    """Read a single-column sheet of paragraphs and attach a constant label.

    Parameters
    ----------
    path : str
        Path to the Excel workbook.
    sheet_name : str
        Name of the sheet to read; the sheet has no header row.
    label : int
        Value written to the ``fta`` column for every row of this sheet.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``paragraph`` and ``fta``.
    """
    frame = pd.read_excel(path, sheet_name=sheet_name, header=None)
    # Assigning a one-element list raises if the sheet has more than one column.
    frame.columns = ['paragraph']
    frame['fta'] = label
    return frame


sheet1 = load_labelled_sheet(file_path, 'FTA', 1)
sheet2 = load_labelled_sheet(file_path, 'NFTA', 0)

data = pd.concat([sheet1, sheet2], ignore_index=True)

# Shuffle the rows:
#   sample(frac=1)          - draw 100% of the rows in random order
#   random_state=SEED       - make that order reproducible across runs
#   reset_index(drop=True)  - replace the shuffled index with 0..n-1 and
#                             discard the old one
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# training (70%), validation (10%), and test (20%):
# first hold out 30%, then give 2/3 of that hold-out to the test set.
train, temp = train_test_split(data, test_size=0.3, random_state=SEED)
val, test = train_test_split(temp, test_size=(2/3), random_state=SEED)

# Every row must end up in exactly one subset.
assert len(train) + len(val) + len(test) == len(data)

In [None]:
# Print the first five rows of each labelled sheet and of the combined,
# shuffled frame, each preceded by its heading.
preview_columns = ['paragraph', 'fta']
previews = [
    ("FTA sheet:", sheet1),
    ("\nNFTA sheet:", sheet2),
    ("\nData after combining and shuffling:", data),
]

for heading, frame in previews:
    print(heading)
    print(frame[preview_columns].head())  # head() -> first 5 rows

FTA sheet:
                                           paragraph  fta
0  Lietuvai nereikia Liberalų sąjūdžio, jos lyder...    1
1     palaukit, nenutraukit, korumpuotas žurnaliste.    1
2  Mes nesižarstome skaičiais, mes nežadame, Liet...    1
3  mokesčių sistema, kuri buvo per naktinę reform...    1
4                                             Vagys!    1

NFTA sheet:
                                           paragraph  fta
0  Mes, Lietuvos Laisvės sąjunga, esame pirmoji p...    0
1  Labas vakaras. Na, kad būtų žmogus laimingas, ...    0
2  Tai jeigu mes esame užsienio politikoj šiandie...    0
3  Na, daug yra kalbama apie laimės indeksą, bet ...    0
4  Kaip mūsų žmonės jausis Lietuvoje, koks bus po...    0

Data after combining and shuffling:
                                           paragraph  fta
0  Čia klausimas, kaip čia pasakyt, jis nueina į ...    0
1  Eikit sau, kaip man patinka, kaip jūs čia šnek...    1
2  Pirmiausia, mieli kolegos, klausimas yra tame:...    0
3  Monopoli

In [1]:
# Using huggingface libs
!pip install -q transformers
import numpy as np
from transformers import BertTokenizer, BertModel
from transformers import LongformerTokenizer, LongformerModel

In [11]:
# Prepare the BERT tokenizer and model.
# NOTE(review): 'bert-base-uncased' is an English checkpoint whose tokenizer
# lowercases and strips accents, while the paragraphs are Lithuanian. A
# multilingual checkpoint may be a better fit - confirm before switching,
# because it changes every downstream feature.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

# Tokenize each paragraph on its own (one sequence per call, so no padding is
# involved) and keep the encodings in the same order as data['paragraph'].
# truncation=True caps each sequence at the tokenizer's maximum length;
# without it a long paragraph yields more positions than the model has
# position embeddings for, and the forward pass fails.
preprocessed_data = [
    tokenizer(paragraph, return_tensors='pt', truncation=True)
    for paragraph in data['paragraph']
]

In [None]:
import torch
from tqdm import tqdm

# Compute the last hidden states and average them over the token axis so that
# every text gets a fixed-dimension feature vector regardless of its length.
outputs = []

# Inference only: disabling autograd avoids building a computation graph for
# each forward pass, which saves memory and time.
with torch.no_grad():
    for inputs in tqdm(preprocessed_data, desc="Processing Data", unit="batch"):
        output = model(**inputs)
        # output[0] holds the last hidden states; axis=1 is averaged away,
        # leaving one row of features per encoded paragraph.
        features = np.mean(output[0].numpy(), axis=1)
        outputs.append(features)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Shapes of the first three extracted feature arrays (displayed as the cell result)
tuple(outputs[position].shape for position in range(3))

((1, 768), (1, 768), (1, 768))

In [15]:
# With fixed-size features we can compare texts:
# cosine similarity between the features of paragraphs 0 and 1.
reference_features, other_features = outputs[0], outputs[1]
cosine_similarity(reference_features, other_features)

array([[0.84704214]], dtype=float32)

In [16]:
# Cosine similarity between the features of paragraphs 0 and 2.
reference_features, other_features = outputs[0], outputs[2]
cosine_similarity(reference_features, other_features)

array([[0.881665]], dtype=float32)