In [38]:
! pip install protobuf

Collecting protobuf
  Obtaining dependency information for protobuf from https://files.pythonhosted.org/packages/9c/4c/4563ebe001ff30dca9d7ed12e471fa098d9759712980cde1fd03a3a44fb7/protobuf-5.28.3-cp310-abi3-win_amd64.whl.metadata
  Downloading protobuf-5.28.3-cp310-abi3-win_amd64.whl.metadata (592 bytes)
Downloading protobuf-5.28.3-cp310-abi3-win_amd64.whl (431 kB)
   ---------------------------------------- 0.0/431.5 kB ? eta -:--:--
   -- ------------------------------------- 30.7/431.5 kB ? eta -:--:--
   ------- -------------------------------- 81.9/431.5 kB 1.5 MB/s eta 0:00:01
   --------------- ------------------------ 163.8/431.5 kB 1.9 MB/s eta 0:00:01
   ------------------------------ --------- 327.7/431.5 kB 2.3 MB/s eta 0:00:01
   -------------------------------- ------- 348.2/431.5 kB 1.8 MB/s eta 0:00:01
   ---------------------------------------- 431.5/431.5 kB 2.1 MB/s eta 0:00:00
Installing collected packages: protobuf
Successfully installed protobuf-5.28.3


In [31]:
import torch
from transformers import BertTokenizer, BertModel , AutoTokenizer , AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

# Check if CUDA (GPU) is available
print("CUDA Available:", torch.cuda.is_available())

# Print the GPU name
if torch.cuda.is_available():
    print("GPU Device Name:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected. Using CPU instead.")

CUDA Available: True
GPU Device Name: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [33]:
# Check GPU availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cuda


In [35]:
# Initialize mBERT tokenizer and model
mbert_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
mbert_model = BertModel.from_pretrained("bert-base-multilingual-cased").to(device)

In [40]:
indic_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
indic_model = AutoModel.from_pretrained("ai4bharat/indic-bert").to(device)

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

In [42]:
# Function to get embeddings for a list of sentences with tqdm progress bar
def mbert_get_embeddings(sentences):
    embeddings = []
    mbert_model.eval()  # Set model to evaluation mode

    with torch.no_grad():
        for sentence in tqdm(sentences, desc="Generating embeddings", unit="sentence"):
            # Tokenize and prepare input tensors using encode_plus
            inputs = mbert_tokenizer.encode_plus(
                sentence,
                return_tensors='pt',
                truncation=True,
                padding='max_length',
                max_length=128
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}

            # Get embeddings (using CLS token representation)
            outputs = mbert_model(**inputs)
            cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(cls_embedding.flatten())

    return np.array(embeddings)

# Function to filter near-duplicate sentences based on cosine similarity with tqdm
def filter_duplicates(sentences, embeddings, threshold=0.90):
    filtered_sentences = []
    filtered_embeddings = []

    for i, embedding in tqdm(enumerate(embeddings), desc="Filtering duplicates", total=len(embeddings)):
        # Calculate cosine similarity with already accepted embeddings
        if len(filtered_embeddings) == 0:
            filtered_sentences.append(sentences[i])
            filtered_embeddings.append(embedding)
            continue

        similarities = cosine_similarity([embedding], filtered_embeddings)[0]
        max_similarity = max(similarities)

        # Only add the sentence if similarity is below the threshold
        if max_similarity < threshold:
            filtered_sentences.append(sentences[i])
            filtered_embeddings.append(embedding)

    return filtered_sentences

def indic_get_embeddings(sentences):
    embeddings = []
    indic_model.eval()
    with torch.no_grad():
        for sentence in tqdm(sentences, desc="Generating embeddings"):
            inputs = indic_tokenizer(
                sentence,
                return_tensors='pt',
                truncation=True,
                padding='max_length',
                max_length=128
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}

            # Get embeddings and average them
            outputs = indic_model(**inputs)
            token_embeddings = outputs.last_hidden_state
            sentence_embedding = torch.mean(token_embeddings, dim=1).cpu().numpy().flatten()
            embeddings.append(sentence_embedding)

    return np.array(embeddings)


In [44]:
# Load your sentences (Updated column name to "Sentences")
data = pd.read_csv("syntactic_filtered_sentences.csv")
sentences = data['Sentences'].tolist()

In [46]:
# Get embeddings for the sentences with a progress bar
print("Generating mbert embeddings...")
mbert_embeddings = get_embeddings(sentences)

Generating mbert embeddings...


Generating embeddings: 100%|███████████████████████████████████████████████| 86519/86519 [21:09<00:00, 68.17sentence/s]


In [47]:
# Get embeddings for the sentences with a progress bar
print("Generating indic embeddings...")
indic_embeddings = indic_get_embeddings(sentences)

Generating indic embeddings...


Generating embeddings: 100%|█████████████████████████████████████████████████████| 86519/86519 [39:58<00:00, 36.07it/s]


In [83]:
# Filter out near-duplicate sentences with a progress bar
print("(mbert)Filtering near-duplicate sentences...")
mbert_filtered_sentences = filter_duplicates(sentences, mbert_embeddings, threshold=0.97)

(mbert)Filtering near-duplicate sentences...


Filtering duplicates: 100%|████████████████████████████████████████████████████| 86519/86519 [2:27:39<00:00,  9.77it/s]


In [72]:
# Filter out near-duplicate sentences with a progress bar
print("(indic)Filtering near-duplicate sentences...")
indic_filtered_sentences = filter_duplicates(sentences, mbert_embeddings, threshold=0.96)

(indic)Filtering near-duplicate sentences...


Filtering duplicates: 100%|████████████████████████████████████████████████████| 86519/86519 [1:24:18<00:00, 17.10it/s]


In [84]:
# Output the results
print(f"Original number of sentences: {len(sentences)}")
print(f"(indic embedding) Number of unique sentences after filtering: {len(indic_filtered_sentences)}")
print(f"(Mbert embedding) Number of unique sentences after filtering: {len(mbert_filtered_sentences)}")

Original number of sentences: 86519
(indic embedding) Number of unique sentences after filtering: 26011
(Mbert embedding) Number of unique sentences after filtering: 46189


In [85]:
# Save the filtered sentences to a CSV file
mbert_filtered_df = pd.DataFrame(mbert_filtered_sentences, columns=["Sentences"])
mbert_filtered_df.to_csv("97%_mbert_filtered_sentences.csv", index=False)
print("Filtered sentences saved to 'filtered_sentences.csv'.")

Filtered sentences saved to 'filtered_sentences.csv'.


In [75]:
# Save the filtered sentences to a CSV file
indic_filtered_df = pd.DataFrame(indic_filtered_sentences, columns=["Sentences"])
indic_filtered_df.to_csv("indic_filtered_sentences.csv", index=False)
print("Filtered sentences saved to 'filtered_sentences.csv'.")

Filtered sentences saved to 'filtered_sentences.csv'.
