In [8]:
import torch

# Check if CUDA (GPU) is available
print("CUDA Available:", torch.cuda.is_available())

# Print the GPU name
if torch.cuda.is_available():
    print("GPU Device Name:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected. Using CPU instead.")


CUDA Available: True
GPU Device Name: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [5]:
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from tqdm import tqdm

# Check GPU availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Initialize mBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertModel.from_pretrained("bert-base-multilingual-cased").to(device)

# Function to get embeddings for a list of sentences with tqdm progress bar
def get_embeddings(sentences):
    embeddings = []
    model.eval()  # Set model to evaluation mode

    with torch.no_grad():
        for sentence in tqdm(sentences, desc="Generating embeddings", unit="sentence"):
            # Tokenize and prepare input tensors
            inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
            inputs = {key: value.to(device) for key, value in inputs.items()}

            # Get embeddings (using CLS token representation)
            outputs = model(**inputs)
            cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(cls_embedding.flatten())

    return np.array(embeddings)

# Function to filter near-duplicate sentences based on cosine similarity with tqdm
def filter_duplicates(sentences, embeddings, threshold=0.75):
    filtered_sentences = []
    filtered_embeddings = []

    for i, embedding in tqdm(enumerate(embeddings), desc="Filtering duplicates", total=len(embeddings)):
        # Calculate cosine similarity with already accepted embeddings
        if len(filtered_embeddings) == 0:
            filtered_sentences.append(sentences[i])
            filtered_embeddings.append(embedding)
            continue

        similarities = cosine_similarity([embedding], filtered_embeddings)[0]
        max_similarity = max(similarities)

        # Only add the sentence if similarity is below the threshold
        if max_similarity < threshold:
            filtered_sentences.append(sentences[i])
            filtered_embeddings.append(embedding)

    return filtered_sentences

# Load your sentences (Assuming a CSV file with a column named 'sentence')
data = pd.read_csv("syntactic_filtered_sentences.csv")
sentences = data['Sentence'].tolist()

# Get embeddings for the sentences with a progress bar
print("Generating embeddings...")
embeddings = get_embeddings(sentences)

# Filter out near-duplicate sentences with a progress bar
print("Filtering near-duplicate sentences...")
filtered_sentences = filter_duplicates(sentences, embeddings, threshold=0.75)

# Output the results
print(f"Original number of sentences: {len(sentences)}")
print(f"Number of unique sentences after filtering: {len(filtered_sentences)}")

# Save the filtered sentences to a CSV file
filtered_df = pd.DataFrame(filtered_sentences, columns=["sentence"])
filtered_df.to_csv("filtered_sentences.csv", index=False)
print("Filtered sentences saved to 'filtered_sentences.csv'.")


Using device: cuda


100%|███████████████████████████████████████████████████████████████████████| 995526/995526 [00:04<00:00, 215934.30B/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 625/625 [00:00<?, ?B/s]
100%|█████████████████████████████████████████████████████████████████| 714314041/714314041 [28:36<00:00, 416029.57B/s]
  state_dict = torch.load(resolved_archive_file, map_location='cpu')


KeyError: 'Sentence'