In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm
import torch
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
if device == 'cuda':
    torch.cuda.empty_cache()
print(device)

import string
import nltk
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))

from transformers import AutoTokenizer, AutoModel

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

cpu




/kaggle/input/unit-segmentation-lstm-transformers/we.csv
/kaggle/input/unit-segmentation-lstm-transformers/mix1.csv
/kaggle/input/unit-segmentation-lstm-transformers/pe.csv
/kaggle/input/unit-segmentation-lstm-transformers/abam.csv
/kaggle/input/unit-segmentation-lstm-transformers/mix2.csv
/kaggle/input/unit-segmentation-lstm-transformers/ug.csv


# Glove

In [2]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2023-07-27 22:22:51--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-07-27 22:22:51--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-07-27 22:22:52--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [4]:
def clean_sentence(text, remove_stop = False):
    tokens = [token.strip().lower() for token in text.split() if not token in string.punctuation]
    if remove_stop:
        tokens = [token for token in tokens if not token in stops]
    return " ".join(tokens)

def get_sentence_embeddings_and_save(df, embeddings_index, embeddings_dim, output_path):
    sentences = list(df['clean_sentence'].values)
    embeddings = np.zeros((len(sentences), embeddings_dim))
    
    for i, sentence in tqdm(enumerate(sentences)):
        sentence_embedding = np.zeros((embeddings_dim))
#         tokens = nltk.word_tokenize(sentence)
        tokens = sentence.split()
        
        found_tokens = 0
        for token in tokens:
            token_embedding = embeddings_index.get(token)
            if token_embedding is not None:
                # we found the word - add that word's vector to the sentence embedding
                found_tokens += 1
                sentence_embedding = np.sum((sentence_embedding, token_embedding), axis = 0)

        # sentence embedding as average of token embeddings
        if found_tokens == 0:
            found_tokens = 1
        sentence_embedding = sentence_embedding / found_tokens

        # add the sentence embedding to the matrix at sentence position
        embeddings[i] = sentence_embedding
    
    # save csv file with embeddings, set and original labels
    emb_df = df.copy()
    emb_df['embeddings'] = embeddings.tolist()
    emb_df.to_csv(output_path, index = False)
    print(f"file saved {output_path}...")

# get glove embeddings from file
embeddings_index, embeddings_dim = {}, 300
with open('/kaggle/working/glove.6B.300d.txt') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs
print('Found {} word vectors.'.format(len(embeddings_index)))

for df_name in ['pe', 'we', 'abam', 'ug', 'mix1']:
    df_path = f"/kaggle/input/unit-segmentation-lstm-transformers/{df_name}.csv"
    dataframe = pd.read_csv(df_path)
    
    dataframe['clean_sentence'] = dataframe['tokens'].apply(lambda x: clean_sentence(x))
    
    get_sentence_embeddings_and_save(dataframe, embeddings_index, embeddings_dim, f"{df_name}_glove.csv")

Found 400000 word vectors.


0it [00:00, ?it/s]

file saved pe_glove.csv...


0it [00:00, ?it/s]

file saved we_glove.csv...


0it [00:00, ?it/s]

file saved abam_glove.csv...


0it [00:00, ?it/s]

file saved ug_glove.csv...


0it [00:00, ?it/s]

file saved mix1_glove.csv...


## Sentence BERT

In [2]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0].cpu()
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

def get_sentence_embeddings_and_save(df, model, output_path):
    
    model.eval()
    
    sentences = list(df['tokens'].values)
    
    batch_size, le = 500, len(df)
    embeddings = np.zeros((le, 384))
    
    for i in tqdm(range(0, le, batch_size)):
    
        batch_sentences = sentences[i:i+batch_size]

        encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
        input_ids = encoded_input['input_ids'].to(device)
        token_type_ids = encoded_input['token_type_ids'].to(device)
        attention_mask = encoded_input['attention_mask'].to(device)
        with torch.no_grad():
            model_output = model(input_ids, token_type_ids = token_type_ids, attention_mask=attention_mask)

        pooling = mean_pooling(model_output, encoded_input['attention_mask'])

        embeddings[i:i+batch_size, :] = pooling.tolist()
        
    # save csv file with embeddings, set and original labels
    emb_df = df.copy()
    emb_df['embeddings'] = embeddings.tolist()
    emb_df.to_csv(output_path, index = False)
    print(f"file saved {output_path}...")

    
    
for df_name in ['pe', 'we', 'abam', 'ug', 'mix1']:
    df_path = f"/kaggle/input/unit-segmentation-lstm-transformers/{df_name}.csv"
    dataframe = pd.read_csv(df_path)
    
    #Load AutoModel from huggingface model repository
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    model.to(device)
    
    get_sentence_embeddings_and_save(dataframe, model, f"{df_name}_sbert.csv")
    

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

  0%|          | 0/14 [00:00<?, ?it/s]

file saved pe_sbert.csv...


  0%|          | 0/24 [00:00<?, ?it/s]

file saved we_sbert.csv...


  0%|          | 0/9 [00:00<?, ?it/s]

file saved abam_sbert.csv...


  0%|          | 0/7 [00:00<?, ?it/s]

file saved ug_sbert.csv...


  0%|          | 0/27 [00:00<?, ?it/s]

file saved mix1_sbert.csv...
