In [8]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

# Check if CUDA (GPU) is available
print("CUDA Available:", torch.cuda.is_available())

# Print the GPU name
if torch.cuda.is_available():
    print("GPU Device Name:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected. Using CPU instead.")


CUDA Available: True
GPU Device Name: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [25]:
!pip install fsspec==2023.3.0 --force-reinstall

Collecting fsspec==2023.3.0
  Obtaining dependency information for fsspec==2023.3.0 from https://files.pythonhosted.org/packages/4f/65/887925f1549fcb6ac3abb23a747c10f5ab083e8471fe568768b18bdb15b2/fsspec-2023.3.0-py3-none-any.whl.metadata
  Using cached fsspec-2023.3.0-py3-none-any.whl.metadata (5.5 kB)
Using cached fsspec-2023.3.0-py3-none-any.whl (145 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.10.0
    Uninstalling fsspec-2024.10.0:
      Successfully uninstalled fsspec-2024.10.0
Successfully installed fsspec-2023.3.0


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
huggingface-hub 0.26.2 requires fsspec>=2023.5.0, but you have fsspec 2023.3.0 which is incompatible.


In [27]:
!pip install --upgrade transformers

Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.23.2->transformers)
  Obtaining dependency information for fsspec>=2023.5.0 from https://files.pythonhosted.org/packages/c6/b2/454d6e7f0158951d8a78c2e1eb4f69ae81beb8dca5fee9809c6c99e9d0d0/fsspec-2024.10.0-py3-none-any.whl.metadata
  Using cached fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Using cached fsspec-2024.10.0-py3-none-any.whl (179 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2023.3.0
    Uninstalling fsspec-2023.3.0:
      Successfully uninstalled fsspec-2023.3.0
Successfully installed fsspec-2024.10.0


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
s3fs 2023.3.0 requires fsspec==2023.3.0, but you have fsspec 2024.10.0 which is incompatible.


In [29]:
# Initialize mBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertModel.from_pretrained("bert-base-multilingual-cased").to(device)

# Function to get embeddings for a list of sentences with tqdm progress bar
def get_embeddings(sentences):
    embeddings = []
    model.eval()  # Set model to evaluation mode

    with torch.no_grad():
        for sentence in tqdm(sentences, desc="Generating embeddings", unit="sentence"):
            # Tokenize and prepare input tensors using encode_plus
            inputs = tokenizer.encode_plus(
                sentence,
                return_tensors='pt',
                truncation=True,
                padding='max_length',
                max_length=128
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}

            # Get embeddings (using CLS token representation)
            outputs = model(**inputs)
            cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(cls_embedding.flatten())

    return np.array(embeddings)

# Function to filter near-duplicate sentences based on cosine similarity with tqdm
def filter_duplicates(sentences, embeddings, threshold=0.75):
    filtered_sentences = []
    filtered_embeddings = []

    for i, embedding in tqdm(enumerate(embeddings), desc="Filtering duplicates", total=len(embeddings)):
        # Calculate cosine similarity with already accepted embeddings
        if len(filtered_embeddings) == 0:
            filtered_sentences.append(sentences[i])
            filtered_embeddings.append(embedding)
            continue

        similarities = cosine_similarity([embedding], filtered_embeddings)[0]
        max_similarity = max(similarities)

        # Only add the sentence if similarity is below the threshold
        if max_similarity < threshold:
            filtered_sentences.append(sentences[i])
            filtered_embeddings.append(embedding)

    return filtered_sentences

# Load your sentences (Updated column name to "Sentences")
data = pd.read_csv("syntactic_filtered_sentences.csv")
sentences = data['Sentences'].tolist()

# Get embeddings for the sentences with a progress bar
print("Generating embeddings...")
embeddings = get_embeddings(sentences)

# Filter out near-duplicate sentences with a progress bar
print("Filtering near-duplicate sentences...")
filtered_sentences = filter_duplicates(sentences, embeddings, threshold=0.75)

# Output the results
print(f"Original number of sentences: {len(sentences)}")
print(f"Number of unique sentences after filtering: {len(filtered_sentences)}")

# Save the filtered sentences to a CSV file
filtered_df = pd.DataFrame(filtered_sentences, columns=["Sentences"])
filtered_df.to_csv("filtered_sentences.csv", index=False)
print("Filtered sentences saved to 'filtered_sentences.csv'.")


Using device: cuda


  state_dict = torch.load(resolved_archive_file, map_location='cpu')


Generating embeddings...


Generating embeddings:   0%|                                                           | 0/86519 [00:00<?, ?sentence/s]


TypeError: BertTokenizer._tokenize() got an unexpected keyword argument 'truncation'