Run this notebook on Kaggle with the GPU T4 x2 accelerator selected.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/aclimdb/train.csv
/kaggle/input/aclimdb/test.csv


In [2]:
import pandas as pd
import torch
from transformers import BertModel, BertTokenizer
import pyarrow as pa
import pyarrow.parquet as pq

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One checkpoint name drives both the tokenizer and the encoder
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# nn.Module.to() returns the module itself, so loading and device placement are chained
model = BertModel.from_pretrained(model_name).to(device)

# Define a function for text preprocessing and obtaining BERT embeddings
def process_and_get_embedding(text):
    """Return the BERT [CLS] embedding for a single piece of text.

    The text is lowercased, tokenized with the module-level ``tokenizer``,
    truncated so that it fits the model together with the two special tokens,
    and passed through the module-level ``model`` on ``device`` with gradients
    disabled.

    Args:
        text: The review text. ``None`` and ``NaN`` (what pandas produces for
            empty CSV cells) are treated as an empty string; any other
            non-string value is converted with ``str()``.

    Returns:
        A nested list ``[[v0, v1, ...]]``: the last-layer hidden state of the
        first ([CLS]) token. The outer list is the batch dimension, which
        always has length 1 because a single sequence is fed to the model.
    """
    # Missing reviews arrive as None/NaN (a float); embed them as empty text
    # instead of failing on .lower().
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    text = str(text).lower()  # Convert text to lowercase
    # Other preprocessing steps such as removing punctuation, stopwords, etc. can be added here

    # Tokenize input sentence
    tokens = tokenizer.tokenize(text)

    # Truncate long inputs (anything past the limit is dropped, not split).
    # The limit is the smaller of what the tokenizer and the model's position
    # embeddings accept. This replaces tokenizer.max_model_input_sizes[model_name],
    # a deprecated per-checkpoint table that recent transformers releases may
    # not populate.
    max_input_length = min(tokenizer.model_max_length, model.config.max_position_embeddings)
    tokens = tokens[:max_input_length - 2]  # Leave room for the [CLS] and [SEP] tokens
    tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]

    # Convert tokens to a (1, seq_len) tensor of ids on the target device
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    tokens_tensor = torch.tensor([tokens_ids]).to(device)

    # Inference only: no autograd bookkeeping needed
    with torch.no_grad():
        outputs = model(tokens_tensor)
        # Position 0 of the last layer is the [CLS] token
        sentence_embedding = outputs.last_hidden_state[:, 0, :]

    # Move back to the CPU and convert to plain Python lists
    return sentence_embedding.cpu().tolist()

# Load the train and test splits (swap these paths for your own data)
train_df = pd.read_csv('/kaggle/input/aclimdb/train.csv')
test_df = pd.read_csv('/kaggle/input/aclimdb/test.csv')

# Embed every review; the result is stored in a new 'embedding' column of each frame
for reviews_df in (train_df, test_df):
    reviews_df['embedding'] = reviews_df['review'].apply(process_and_get_embedding)

# Persist both frames, embeddings included, as Parquet files in the working directory
for reviews_df, parquet_path in (
    (train_df, 'train_embeddings.parquet'),
    (test_df, 'test_embeddings.parquet'),
):
    pq.write_table(pa.Table.from_pandas(reviews_df), parquet_path)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]