#Transformer from Scratch


In [None]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import copy

In [None]:
# Attention Layer
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output

    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output

In [None]:
# Feed forward layer
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [None]:
# Position Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [None]:
# The encoder layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [None]:
# Decoder layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [None]:
# Whole transformer with encoder and decoder
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [None]:
# Testing with some random data
src_vocab_size = 5000
tgt_vocab_size = 5000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# Generate random sample data
src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

In [None]:
# Training the transformer
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

transformer.train()

for epoch in range(100):
    optimizer.zero_grad()
    output = transformer(src_data, tgt_data[:, :-1])
    loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 8.688511848449707
Epoch: 2, Loss: 8.555490493774414
Epoch: 3, Loss: 8.482667922973633
Epoch: 4, Loss: 8.426941871643066
Epoch: 5, Loss: 8.36954116821289
Epoch: 6, Loss: 8.304852485656738
Epoch: 7, Loss: 8.221444129943848
Epoch: 8, Loss: 8.141271591186523
Epoch: 9, Loss: 8.061848640441895
Epoch: 10, Loss: 7.981379985809326
Epoch: 11, Loss: 7.900750160217285
Epoch: 12, Loss: 7.818185329437256
Epoch: 13, Loss: 7.735236167907715
Epoch: 14, Loss: 7.653522968292236
Epoch: 15, Loss: 7.567119121551514
Epoch: 16, Loss: 7.486506938934326
Epoch: 17, Loss: 7.399098873138428
Epoch: 18, Loss: 7.320925235748291
Epoch: 19, Loss: 7.233945369720459
Epoch: 20, Loss: 7.158273220062256
Epoch: 21, Loss: 7.084658622741699
Epoch: 22, Loss: 7.00965690612793
Epoch: 23, Loss: 6.917025566101074
Epoch: 24, Loss: 6.841955184936523
Epoch: 25, Loss: 6.7698073387146
Epoch: 26, Loss: 6.697391986846924
Epoch: 27, Loss: 6.630137920379639
Epoch: 28, Loss: 6.553027629852295
Epoch: 29, Loss: 6.47477912902832

# Siamese Model

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv('sample_data.csv')
df.head()
df['sentences1']

0      What is the step by step guide to invest in sh...
1      What is the story of Kohinoor (Koh-i-Noor) Dia...
2      How can I increase the speed of my internet co...
3      Why am I mentally very lonely? How can I solve...
4      Which one dissolve in water quikly sugar, salt...
                             ...                        
494    Which country is best for higher education and...
495                  What is the painting on this image?
496    Which are the major highways in California and...
497                          What's beyond our Universe?
498    Is growing of hair a physical or a chemical ch...
Name: sentences1, Length: 499, dtype: object

In [None]:
def load_data(df):
    text1 = df['sentences1'].astype(str).values
    text2 = df['sentences2'].astype(str).values
    labels = df['is_similar'].values
    return text1, text2, labels


In [None]:
df = pd.read_csv('sample_data.csv')
text1, text2, labels = load_data(df)

In [None]:
def cleanAscii(text):
       return ''.join(i for i in text if ord(i) < 128)

In [None]:
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(text1 + text2)
# converting them to sequencies of numerical tokens
sequences1 = tokenizer.texts_to_sequences(text1)
sequences2 = tokenizer.texts_to_sequences(text2)

# padding
max_sequence_length = 300
sequences1 = pad_sequences(sequences1, maxlen=max_sequence_length, padding='post')
sequences2 = pad_sequences(sequences2, maxlen=max_sequence_length, padding='post')

# Divinding into input and output
X = [sequences1, sequences2]
y = labels

# Split the data for training and testing
X_train1, X_test1, X_train2, X_test2, y_train, y_test = train_test_split(sequences1, sequences2, y, test_size=0.2, random_state=42)

In [None]:
# defining the model
embedding_dim = 100
lstm_units = 64

input1 = tf.keras.Input(shape=(max_sequence_length,))
input2 = tf.keras.Input(shape=(max_sequence_length,))

emb = tf.keras.layers.Embedding(max_words, embedding_dim, input_length=max_sequence_length, trainable=False)
lstm_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2))

e1 = emb(input1)
x1 = lstm_layer(e1)
e2 = emb(input2)
x2 = lstm_layer(e2)

mhd = lambda x: tf.keras.backend.abs(x[0] - x[1])
merged = tf.keras.layers.Lambda(function=mhd, output_shape=lambda x: x[0], name='L1_distance')([x1, x2])
preds = tf.keras.layers.Dense(1, activation='sigmoid')(merged)

model = tf.keras.Model(inputs=[input1, input2], outputs=preds)
model.compile(loss='mse', optimizer='adam')



In [None]:
history = model.fit([X_train1, X_train2], y_train, epochs=25, validation_split=0.2, batch_size=32)

# Evaluate the model on the test set
test_loss = model.evaluate([X_test1, X_test2], y_test)
print("Test Loss:", test_loss)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Test Loss: 0.21214275062084198


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

# Assuming X_test1 and X_test2 are your test data
# Make predictions
y_pred = model.predict([X_test1, X_test2])
# print(y_pred)
# Assuming y_test is your ground truth labels
# Convert the predictions to binary values (0 or 1)
y_pred_binary = (y_pred > 0.4).astype(int)

# Reshape y_test if needed
# y_test = y_test.reshape(y_pred.shape)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, y_pred_binary)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, y_pred_binary)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, y_pred_binary)
print("F1 Score:", f1)


conf_matrix = confusion_matrix(y_test, y_pred_binary)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.52
Precision: 0.37681159420289856
Recall: 0.8387096774193549
F1 Score: 0.52
Confusion Matrix:
[[26 43]
 [ 5 26]]


# Siamese using Glove embeddings

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import os

def load_data(df):
    text1 = df['sentences1'].astype(str).values
    text2 = df['sentences2'].astype(str).values
    labels = df['is_similar'].values
    return text1, text2, labels

df = pd.read_csv('sample_data.csv')

text1, text2, labels = load_data(df)

max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(text1 + text2)

sequences1 = tokenizer.texts_to_sequences(text1)
sequences2 = tokenizer.texts_to_sequences(text2)

max_sequence_length = 300
sequences1 = pad_sequences(sequences1, maxlen=max_sequence_length, padding='post')
sequences2 = pad_sequences(sequences2, maxlen=max_sequence_length, padding='post')

X = [sequences1, sequences2]
y = labels

X_train1, X_test1, X_train2, X_test2, y_train, y_test = train_test_split(sequences1, sequences2, y, test_size=0.2, random_state=42)

glove_dir = '/content/'
embeddings_index = {}

with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

embedding_dim = 100
lstm_units = 64

input1 = tf.keras.Input(shape=(max_sequence_length,))
input2 = tf.keras.Input(shape=(max_sequence_length,))

emb = tf.keras.layers.Embedding(max_words, embedding_dim, input_length=max_sequence_length, weights=[embedding_matrix], trainable=False)
lstm_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2))

e1 = emb(input1)
x1 = lstm_layer(e1)
e2 = emb(input2)
x2 = lstm_layer(e2)

mhd = lambda x: tf.keras.backend.abs(x[0] - x[1])
merged = tf.keras.layers.Lambda(function=mhd, output_shape=lambda x: x[0], name='L1_distance')([x1, x2])
preds = tf.keras.layers.Dense(1, activation='sigmoid')(merged)

model = tf.keras.Model(inputs=[input1, input2], outputs=preds)
model.compile(loss='mse', optimizer='adam')

history = model.fit([X_train1, X_train2], y_train, epochs=25, validation_split=0.2, batch_size=32)

test_loss = model.evaluate([X_test1, X_test2], y_test)
print("Test Loss:", test_loss)



Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Test Loss: 0.2385886013507843


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

# Assuming X_test1 and X_test2 are your test data
# Make predictions
y_pred = model.predict([X_test1, X_test2])
# print(y_pred)
# Assuming y_test is your ground truth labels
# Convert the predictions to binary values (0 or 1)
y_pred_binary = (y_pred > 0.5).astype(int)

# Reshape y_test if needed
# y_test = y_test.reshape(y_pred.shape)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, y_pred_binary)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, y_pred_binary)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, y_pred_binary)
print("F1 Score:", f1)


conf_matrix = confusion_matrix(y_test, y_pred_binary)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.63
Precision: 0.4166666666666667
Recall: 0.4838709677419355
F1 Score: 0.44776119402985076
Confusion Matrix:
[[48 21]
 [16 15]]


# Transformers for Semantic Similarity

## Softmax Loss

In [None]:
! pip install datasets
# installing datasets
# used to download and asses datasets from huggingface

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-a

In [None]:

import datasets
# we are using the snli dataset train part
snli = datasets.load_dataset('snli', split='train')
# using a small subset of that dataset because whole dataset
# takes large amount of time
dataset = snli[:50000]
dataset = datasets.Dataset.from_dict(dataset)

print(type(dataset))

Downloading builder script:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

<class 'datasets.arrow_dataset.Dataset'>


In [None]:
print(f"before: {len(dataset)} rows")
dataset = dataset.filter(
    lambda x: 0 if x['label'] == -1 else 1
)
# the above uses lambda
print(f"after: {len(dataset)} rows")

# dataset contains -1 values in the label feature where no confident class could be assigned. We remove them using the filter method

before: 50000 rows


Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]

after: 49947 rows


In [None]:
! pip install sentence_transformers
# installing sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m96.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers<5.0.0,>=4.6.0->sentence_transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━

In [None]:
from sentence_transformers import InputExample
# InputExample is used to represent training examples for training sentence embedding models

from tqdm.auto import tqdm  # so we see progress bar also

train_samples = []
# empty list ot store training samples
for row in tqdm(dataset):
    train_samples.append(InputExample(
        texts=[row['premise'], row['hypothesis']]
        # appending to train samples using keys premise and hypothesis
    ))

  0%|          | 0/49947 [00:00<?, ?it/s]

In [None]:
from torch.utils.data import DataLoader

batch_size = 16

loader = DataLoader(
    train_samples, shuffle=True, batch_size=batch_size)
# making a dataloader using list train_samples made before


In [None]:
from sentence_transformers import models, SentenceTransformer

bert = models.Transformer('bert-base-uncased')
#  initialize a BERT model using the 'bert-base-uncased'
#  pre-trained BERT model from Hugging Face's model hub
pooler = models.Pooling(
    bert.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)
# Pooling module takes the dimension of the word embeddings produced by the BERT model
# (retrieved using bert.get_word_embedding_dimension())
# and specifies that you want to use mean pooling for token-level embeddings
model = SentenceTransformer(modules=[bert, pooler])

model

(…)rt-base-uncased/resolve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

(…)cased/resolve/main/tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

(…)bert-base-uncased/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)base-uncased/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [None]:
from sentence_transformers import losses
# importing softmax loss, arguments are model, embedding dimension and number of labels
loss = losses.SoftmaxLoss(model,sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=2)

In [None]:
epochs = 5
warmup_steps = int(len(loader) * epochs * 0.1)

# The warmup_steps is a concept often used in training deep learning models with learning rate schedules.
# It's the number of initial training steps during which the learning rate gradually increases to its target value.
# We use 10% of the total number of steps in the training dataset as the number of warmup steps

model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='./sbert_test_b',
    show_progress_bar=True,
)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3122 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3122 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3122 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3122 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3122 [00:00<?, ?it/s]

<H2> Testing

In [None]:
import datasets

sts = datasets.load_dataset('glue', 'stsb', split='validation')

# GLUE is a benchmark for evaluating the performance of natural language understanding models on a variety of NLP tasks.
# It was introduced to assess the generalization capabilities of models across a range of language understanding tasks.
# GLUE includes various datasets and tasks, and it provides a unified framework for evaluating and comparing the performance of different NLP models.

# (Semantic Textual Similarity Benchmark) : STSB is a specific dataset within the GLUE benchmark
# t focuses on the task of evaluating the semantic similarity between pairs of sentences


sts

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/803k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 1500
})

In [None]:
sts = sts.map(lambda x: {'label': x['label'] / 5.0})
# rescaling from 0 to 1 as originally from 0 to 5

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
from sentence_transformers import InputExample
# making lists for testing
samples = []
for sample in sts:
    samples.append(InputExample(
        texts=[sample['sentence1'], sample['sentence2']],
        label=sample['label']
    ))

In [None]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    samples, write_csv=False
)

# EmbeddingSimilarityEvaluator class.
# samples: This is the variable containing input examples.
# The EmbeddingSimilarityEvaluator uses these examples to evaluate the similarity of sentence embeddings produced by a model.
# These input examples often consist of pairs of sentences or texts and their associated similarity scores.

Spearman's rank correlation : $\rho = 1 - \frac{6\sum{d_i^2}}{n(n^2 - 1)}$
<br>
1. Rank the data:
Rank the values in both X and Y separately, from lowest to highest. Ties (equal values) are assigned the average of the ranks they would receive if they were distinct values.
Calculate the differences:

2. For each data point, compute the difference between the ranks in X and the ranks in Y.

3. Square the differences:
Square each of the differences calculated in step 2.
Calculate the sum of squared differences:

Sum up the squared differences from step 3



In [None]:
from sentence_transformers import SentenceTransformer
# loading the model and evaluation
model = SentenceTransformer('./sbert_test_b')
evaluator(model)

# Spearman’s rank correlation

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.27796023581315976

# Transformers for Semantic Similarity
# MNR custom loss

In [None]:
! pip install datasets

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.17.3-py3-none-a

In [None]:

import datasets

snli = datasets.load_dataset('snli', split='train')

dataset = snli[:50000]
dataset = datasets.Dataset.from_dict(dataset)


print(type(dataset))

Downloading builder script:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

<class 'datasets.arrow_dataset.Dataset'>


In [None]:
print(dataset.column_names)
print(len(dataset))

['premise', 'hypothesis', 'label']
50000


In [None]:
print(f"before: {len(dataset)} rows")
dataset = dataset.filter(
    lambda x: True if x['label'] == 0 else False
)
print(f"after: {len(dataset)} rows")

before: 50000 rows


Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]

after: 16679 rows


In [None]:
! pip install sentence_transformers


Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m65.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers<5.0.0,>=4.6.0->sentence_transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━

In [None]:
from sentence_transformers import InputExample
from tqdm.auto import tqdm  # so we see progress bar

train_samples = []
for row in tqdm(dataset):
    train_samples.append(InputExample(
        texts=[row['premise'], row['hypothesis']]
    ))

  0%|          | 0/16679 [00:00<?, ?it/s]

In [None]:
from sentence_transformers import datasets

batch_size = 32

loader = datasets.NoDuplicatesDataLoader(
    train_samples, batch_size=batch_size)

In [None]:
from sentence_transformers import models, SentenceTransformer

bert = models.Transformer('bert-base-uncased')
pooler = models.Pooling(
    bert.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

model = SentenceTransformer(modules=[bert, pooler])

model

(…)rt-base-uncased/resolve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

(…)cased/resolve/main/tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

(…)bert-base-uncased/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)base-uncased/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

##MNR Loss

1. Positive and Negative Pairs: Given a pair of positive examples (pairs that should have similar embeddings) and a set of negative examples (pairs that should have dissimilar embeddings), the loss function aims to minimize the distance (typically a cosine or Euclidean distance) between the embeddings of positive pairs and maximize the distance between the embeddings of negative pairs.

2. Margin: The loss function often includes a margin parameter, which specifies the minimum acceptable difference between the distances of positive and negative pairs. If the difference is less than the margin, it contributes to the loss.

3. Triplet Loss: The MultipleNegativesRankingLoss can be seen as a type of triplet loss, where you have an anchor example (A), a positive example (P), and multiple negative examples (N1, N2, N3, etc.). The loss encourages the distance between the anchor and positive example to be smaller than the distance between the anchor and any of the negative examples by at least the specified margin.

4. Objective Function: The objective is to minimize this loss function, which encourages semantically similar pairs to have similar embeddings and semantically dissimilar pairs to have dissimilar embeddings.

In [None]:
from sentence_transformers import losses

loss = losses.MultipleNegativesRankingLoss(model)

In [None]:
epochs = 1
warmup_steps = int(len(loader) * epochs * 0.1)

model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='./sbert_test_mnr2',
    show_progress_bar=True
)  # I set 'show_progress_bar=False' as it printed every step
#    on to a new line

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/521 [00:00<?, ?it/s]

<H2> Testing

In [None]:
import datasets

sts = datasets.load_dataset('glue', 'stsb', split='validation')

sts

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/803k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 1500
})

In [None]:
sts = sts.map(lambda x: {'label': x['label'] / 5.0})

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
!pip install sentence_transformers



In [None]:
from sentence_transformers import InputExample

samples = []
for sample in sts:
    samples.append(InputExample(
        texts=[sample['sentence1'], sample['sentence2']],
        label=sample['label']
    ))

In [None]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    samples, write_csv=False
)

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('./sbert_test_mnr2')

evaluator(model)
# Spearman’s rank correlation

0.830610960375524

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Obtain similarity scores from the model
similarity_scores = evaluator.get_similarity_scores()

# Convert similarity scores to binary classes based on a threshold
threshold = 0.5  # Set a threshold for binary classification
predicted_labels = [1 if score > threshold else 0 for score in similarity_scores]

# Get ground truth labels
ground_truth_labels = [sample.label for sample in samples]

# Calculate classification metrics
accuracy = accuracy_score(ground_truth_labels, predicted_labels)
precision = precision_score(ground_truth_labels, predicted_labels)
recall = recall_score(ground_truth_labels, predicted_labels)
f1 = f1_score(ground_truth_labels, predicted_labels)
conf_matrix = confusion_matrix(ground_truth_labels, predicted_labels)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)


AttributeError: ignored

# Similarity for Hindi language

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3

In [None]:
from datasets import load_dataset

dataset = load_dataset("Harsit/xnli2.0_train_hindi", split='train')
dataset = dataset[:20000]



Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/175M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
import datasets
dataset = datasets.Dataset.from_dict(dataset)

In [None]:
print(f"before: {len(dataset)} rows")
dataset = dataset.filter(
    lambda x: True if x['label'] == 0 else False
)
print(f"after: {len(dataset)} rows")

before: 20000 rows


Filter:   0%|          | 0/20000 [00:00<?, ? examples/s]

after: 6780 rows


In [None]:
print(len(dataset))
# column_names = dataset["train"].column_names
print(dataset.column_names)

20000
['premise', 'hypothesis', 'label']


In [None]:
# print(f"before: {len(dataset)} rows")
# dataset = dataset.filter(
#     lambda x: True if x['label'] == 0 else False
# )
# print(f"after: {len(dataset)} rows")

new_dataset = dataset.map(lambda example: {'label': 0 if example['label'] == 0 else 1})


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [None]:
! pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers<5.0.0,>=4.6.0->sentence_transformers)
  Do

In [None]:
from sentence_transformers import InputExample
from tqdm.auto import tqdm  # so we see progress bar

train_samples = []
for row in tqdm(dataset):
    train_samples.append(InputExample(
        texts=[row['premise'], row['hypothesis']]
    ))

  0%|          | 0/6780 [00:00<?, ?it/s]

In [None]:

from sentence_transformers import datasets

batch_size = 16

loader = datasets.NoDuplicatesDataLoader(
    train_samples, batch_size=batch_size)

In [None]:
from sentence_transformers import models, SentenceTransformer

# model = SentenceTransformer("l3cube-pune/indic-sentence-similarity-sbert")


(…)6998eff31ade2eb64c8a40076/.gitattributes:   0%|          | 0.00/491 [00:00<?, ?B/s]

(…)f8765166998eff31ade2eb64c8a40076/LICENSE:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

(…)765166998eff31ade2eb64c8a40076/README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

(…)5166998eff31ade2eb64c8a40076/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

(…)kage/Data/com.apple.CoreML/model.mlmodel:   0%|          | 0.00/165k [00:00<?, ?B/s]

weight.bin:   0%|          | 0.00/532M [00:00<?, ?B/s]

(…)sk/float32_model.mlpackage/Manifest.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/532M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

(…)6998eff31ade2eb64c8a40076/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(…)31ade2eb64c8a40076/tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

(…)765166998eff31ade2eb64c8a40076/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



In [None]:
sentences = ['फेडरर ने 7वां विंबलडन खिताब जीत लिया है', 'फेडरर अपने करियर में कुल 20 ग्रैंडस्लैम खिताब जीत चुके है ']
embeddings = model.encode(sentences)
print(embeddings)

[[ 0.7052014  -0.15928048 -0.40575644 ...  0.06123352  0.35014322
  -0.6802305 ]
 [ 0.51917285 -0.04316334 -0.39702818 ... -0.05502839  0.23912817
  -0.708233  ]]


In [None]:
dimension = len(embeddings[0])
print(dimension)

768


In [None]:
from sentence_transformers import models, SentenceTransformer

bert = models.Transformer('bert-base-uncased')
pooler = models.Pooling(
    bert.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

model = SentenceTransformer(modules=[bert, pooler])

model

(…)rt-base-uncased/resolve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

(…)cased/resolve/main/tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

(…)bert-base-uncased/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)base-uncased/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [None]:
from sentence_transformers import models, SentenceTransformer

bert = model
pooler = models.Pooling(
    dimension,
    pooling_mode_mean_tokens=True
)

model = SentenceTransformer(modules=[bert, pooler])

model

SentenceTransformer(
  (0): SentenceTransformer(
    (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [None]:
from sentence_transformers import losses

loss = losses.MultipleNegativesRankingLoss(model)

In [None]:
epochs = 1
warmup_steps = int(len(loader) * epochs * 0.1)

model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='./sbert_hindi',
    show_progress_bar=True
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/423 [00:00<?, ?it/s]

Testing

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.07M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
from datasets import load_dataset

dataset = load_dataset("Harsit/xnli2.0_train_hindi", split='train')
dataset = dataset[20000:30000]



In [None]:
import datasets
dataset = datasets.Dataset.from_dict(dataset)
print(len(dataset))
# print(dataset.column_names)

10000


In [None]:
new_dataset = dataset.map(lambda example: {'label': 0 if example['label'] == 0 else 1})


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
print(f"before: {len(dataset)} rows")
dataset = dataset.filter(
    lambda x: True if x['label'] == 0 else False
)
print(f"after: {len(dataset)} rows")

before: 5000 rows


Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

after: 1707 rows


In [None]:
from sentence_transformers import InputExample

samples = []
for sample in new_dataset:
    samples.append(InputExample(
        texts=[sample['premise'], sample['hypothesis']],
        label=sample['label']
    ))

In [None]:
from sentence_transformers.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator.from_input_examples(
    samples, write_csv=False
)

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('./sbert_hindi')

pred = evaluator(model, new_dataset['label'])
# Spearman’s rank correlation

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
print(pred)

0.5649044818019104


In [None]:
from datasets import load_dataset

dataset = load_dataset("ankitkupadhyay/xnli_hindi", split = 'test')
dataset = dataset[:1000]

In [None]:
import datasets
dataset = datasets.Dataset.from_dict(dataset)
print(len(dataset))

1000


In [None]:
new_dataset = dataset.map(lambda example: {'label': 0 if example['label'] == 0 else 1})


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("./sbert_hindi")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from sentence_transformers import InputExample

samples = []
for sample in new_dataset:
    samples.append(InputExample(
        texts=[sample['premise'], sample['hypothesis']],
        label=sample['label']
    ))

In [None]:
from sentence_transformers.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator.from_input_examples(
    samples, write_csv=False
)

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('./sbert_hindi')

pred = evaluator(model)
# Spearman’s rank correlation
print(pred)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.5367056896019131


In [None]:
from sentence_transformers import SentenceTransformer, evaluation

binary_evaluator = evaluation.BinaryClassificationEvaluator(
    input_examples=samples,
    write_csv=False
)

TypeError: ignored

In [None]:
sentences  = ['फेडरर ने 7वां विंबलडन खिताब जीत लिया है', 'फेडरर अपने करियर में कुल 20 ग्रैंडस्लैम खिताब जीत चुके है ']

In [None]:
sentence1 = []
sentence2 = []
label = []
for sample in new_dataset:
  sentence1.append(sample['premise'])
  sentence2.append(sample['hypothesis'])
  label.append(sample['label'])

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

model = SentenceTransformer('./sbert_hindi')
import numpy as np


# Example data for binary classification
pred = []
threshold = 0.4
for threshold in np.arange(0.4,0.5,0.1):

  for i in range (len(sentence1)):
    emb1 = model.encode(sentence1[i])
    emb2 = model.encode(sentence2[i])
    cosine_sim = cosine_similarity([emb1], [emb2])
    if (cosine_sim[0][0]>threshold):
      pred.append(1)
    else:
      pred.append(0)
  accuracy = accuracy_score(label, pred)
  f1 = f1_score(label, pred)

  precision = precision_score(label, pred)
  recall = recall_score(label, pred)
  confusion = confusion_matrix(label, pred)

  print(f"Accuracy for {threshold} : ", accuracy)
  print(f"F1 Score for {threshold} :", f1)
  print(f"Precision for {threshold}:", precision)
  print(f"Recall for {threshold} :", recall)
  print(f"Confusion Matrix for {threshold}:\n", confusion)





Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


NameError: ignored

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix


# Calculate accuracy
accuracy = accuracy_score(label, pred)

# Calculate F1 score
f1 = f1_score(label, pred)

# Calculate precision
precision = precision_score(label, pred)

# Calculate recall
recall = recall_score(label, pred)

# Create a confusion matrix
confusion = confusion_matrix(label, pred)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", confusion)


Accuracy: 0.5487
F1 Score: 0.6982683693253995
Precision: 0.6268155083423359
Recall: 0.7881074554784183
Confusion Matrix:
 [[ 265 3109]
 [1404 5222]]


In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("l3cube-pune/hindi-sentence-similarity-sbert")
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity


# Example data for binary classification
pred = []
threshold = 0.5
for i in range (len(sentence1)):
  emb1 = model.encode(sentence1[i])
  emb2 = model.encode(sentence2[i])
  cosine_sim = cosine_similarity([emb1], [emb2])
  if (cosine_sim[0][0]>threshold):
    pred.append(1)
  else:
    pred.append(0)




model.safetensors:   0%|          | 0.00/950M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/950M [00:00<?, ?B/s]

(…)6eb2fda0798267/sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

(…)346eb2fda0798267/special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

(…)8dfd67de7346eb2fda0798267/tokenizer.json:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

(…)e7346eb2fda0798267/tokenizer_config.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

(…)339338dfd67de7346eb2fda0798267/vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

(…)338dfd67de7346eb2fda0798267/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix


# Calculate accuracy
accuracy = accuracy_score(label, pred)

# Calculate F1 score
f1 = f1_score(label, pred)

# Calculate precision
precision = precision_score(label, pred)

# Calculate recall
recall = recall_score(label, pred)

# Create a confusion matrix
confusion = confusion_matrix(label, pred)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", confusion)


Accuracy: 0.3066
F1 Score: 0.43872429982192
Precision: 0.4731145251396648
Recall: 0.4089948686990643
Confusion Matrix:
 [[ 356 3018]
 [3916 2710]]


In [None]:
from sentence_transformers import models, SentenceTransformer

bert = models.Transformer('google/muril-base-cased')
pooler = models.Pooling(
    bert.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

model = SentenceTransformer(modules=[bert, pooler])

model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity


# Example data for binary classification
pred = []
threshold = 0.4
for i in range (len(sentence1)):
  emb1 = model.encode(sentence1[i])
  emb2 = model.encode(sentence2[i])
  cosine_sim = cosine_similarity([emb1], [emb2])
  if (cosine_sim[0][0]>threshold):
    pred.append(1)
  else:
    pred.append(0)




In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix


# Calculate accuracy
accuracy = accuracy_score(label, pred)

# Calculate F1 score
f1 = f1_score(label, pred)

# Calculate precision
precision = precision_score(label, pred)

# Calculate recall
recall = recall_score(label, pred)

# Create a confusion matrix
confusion = confusion_matrix(label, pred)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", confusion)


Accuracy: 0.6626
F1 Score: 0.7970648382052208
Precision: 0.6626
Recall: 1.0
Confusion Matrix:
 [[   0 3374]
 [   0 6626]]


# Random Testing Now

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [None]:
from datasets import load_dataset

dataset = load_dataset("Harsit/xnli2.0_train_hindi", split='train')
dataset = dataset[:20000]

import datasets
dataset = datasets.Dataset.from_dict(dataset)
print(f"before: {len(dataset)} rows")
dataset = dataset.filter(
    lambda x: True if x['label'] == 0 else False
)
print(f"after: {len(dataset)} rows")
new_dataset = dataset.map(lambda example: {'label': 0 if example['label'] == 0 else 1})

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/175M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

before: 20000 rows


Filter:   0%|          | 0/20000 [00:00<?, ? examples/s]

after: 6780 rows


Map:   0%|          | 0/6780 [00:00<?, ? examples/s]

In [None]:
! pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=981ee5ef3251bf390cfee84e13bb007462afaf0f3f926044c7cbc442da3f35f0
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_tr

In [None]:
from sentence_transformers import InputExample
from tqdm.auto import tqdm  # so we see progress bar

train_samples = []
for row in tqdm(dataset):
    train_samples.append(InputExample(
        texts=[row['premise'], row['hypothesis']]
    ))

  0%|          | 0/6780 [00:00<?, ?it/s]

In [None]:

from sentence_transformers import datasets

batch_size = 16

loader = datasets.NoDuplicatesDataLoader(
    train_samples, batch_size=batch_size)

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
# model = AutoModelForMaskedLM.from_pretrained("google/muril-base-cased")
# model = SentenceTransformer("google/muril-base-cased")

from transformers import AutoTokenizer, AutoModel

from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("google/muril-large-cased")
model = AutoModel.from_pretrained("google/muril-large-cased")



tokenizer_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/406 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.03G [00:00<?, ?B/s]

Some weights of the model checkpoint at google/muril-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
sentences = ['फेडरर ने 7वां विंबलडन खिताब जीत लिया है', 'फेडरर अपने करियर में कुल 20 ग्रैंडस्लैम खिताब जीत चुके है ']
embeddings = model.encode(sentences)
print(embeddings)

dimension = len(embeddings[0])
print(dimension)

AttributeError: ignored

In [None]:
from sentence_transformers import models, SentenceTransformer

bert = model
pooler = models.Pooling(
    dimension,
    pooling_mode_mean_tokens=True
)

model = SentenceTransformer(modules=[bert, pooler])

model

NameError: ignored

In [None]:
from sentence_transformers import models, SentenceTransformer

bert = models.Transformer('google/muril-base-cased')
pooler = models.Pooling(
    bert.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

model = SentenceTransformer(modules=[bert, pooler])

model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [None]:
from sentence_transformers import losses

loss1 = losses.MultipleNegativesRankingLoss(model)
loss2 = losses.SoftmaxLoss(model,sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=2)

In [None]:
epochs = 1
warmup_steps = int(len(loader) * epochs * 0.1)

model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='./sbert_hindi',
    show_progress_bar=True
)

TypeError: ignored

In [None]:
from sentence_transformers import InputExample
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

In [None]:
class MyDataset(Dataset):
    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return {
            'texts': self.samples[idx].texts,
            'label': self.samples[idx].label
        }

In [None]:
my_dataset = MyDataset(train_samples)

# Use a DataLoader with the custom dataset
batch_size = 16
loader = DataLoader(my_dataset, batch_size=batch_size, shuffle=True)

In [None]:
class CombinedLoss(torch.nn.Module):
    def __init__(self, ranking_loss, softmax_loss, weight_ranking=0.5, weight_softmax=0.5):
        super(CombinedLoss, self).__init__()
        self.ranking_loss = ranking_loss
        self.softmax_loss = softmax_loss
        self.weight_ranking = weight_ranking
        self.weight_softmax = weight_softmax

    def forward(self, model_output, labels):
        ranking_loss_value = self.ranking_loss(model_output[0], model_output[1])
        softmax_loss_value = self.softmax_loss(model_output[0], model_output[1], labels)
        combined_loss = self.weight_ranking * ranking_loss_value + self.weight_softmax * softmax_loss_value
        return combined_loss

In [None]:
ranking_loss = losses.MultipleNegativesRankingLoss(model)
softmax_loss = losses.SoftmaxLoss(
    model='./sbert_hindi',
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=2
)

# Instantiate the custom combined loss
# Set up your DataLoader
batch_size = 16
epochs = 1
warmup_steps = int(len(loader) * epochs * 0.1)
combined_loss = softmax_loss
# Set up the optimizer
optimizer = Adam(model.parameters(), lr=1e-5)

# Train the model
model.fit(
    train_objectives=[(loader, ranking_loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='./sbert_hindi1',
    show_progress_bar=True
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/423 [00:00<?, ?it/s]

In [None]:
.ranking_loss = losses.MultipleNegativesRankingLoss(model)
softmax_loss = losses.SoftmaxLoss(
    model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=2
)

# Set up your DataLoader
batch_size = 16
epochs = 1
warmup_steps = int(len(loader) * epochs * 0.1)

# Set up the optimizer
optimizer = Adam(model.parameters(), lr=1e-5)

# Train the model
for epoch in range(epochs):
    model.train()
    for batch in tqdm(loader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()

        # Obtain the embeddings and labels from your batch
        text_data = batch['texts']
        embeddings = model.encode(text_data)

        # Assuming your dataset has labels associated with each example
        labels = batch['label']

        # Convert labels to a tensor (adjust this based on your data structure)
        labels_tensor = torch.tensor(labels, dtype=torch.long)  # Ensure labels are of type long

        # Compute the individual losses
        ranking_loss_value = ranking_loss(embeddings[0], embeddings[1], labels_tensor)
        softmax_loss_value = softmax_loss(embeddings[0], embeddings[1], labels_tensor)

        # Combine losses with weights
        weight = 0.5
        combined_loss = weight * ranking_loss_value + (1 - weight) * softmax_loss_value

        # Backward and optimize
        combined_loss.backward()
        optimizer.step()

# Save your model after training
model.save('./sbert_hindi')

Epoch 1:   0%|          | 0/423 [00:00<?, ?it/s]

TypeError: ignored

Epoch 1:   0%|          | 0/424 [00:00<?, ?it/s]

AttributeError: ignored

In [None]:
from datasets import load_dataset

dataset = load_dataset("Harsit/xnli2.0_train_hindi", split='train')
dataset = dataset[20000:30000]

import datasets
dataset = datasets.Dataset.from_dict(dataset)
print(len(dataset))
# print(dataset.column_names)
new_dataset = dataset.map(lambda example: {'label': 0 if example['label'] == 0 else 1})
print(f"before: {len(dataset)} rows")
dataset = dataset.filter(
    lambda x: True if x['label'] == 0 else False
)
print(f"after: {len(dataset)} rows")
from sentence_transformers import InputExample

samples = []
for sample in new_dataset:
    samples.append(InputExample(
        texts=[sample['premise'], sample['hypothesis']],
        label=sample['label']
    ))

from sentence_transformers.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator.from_input_examples(
    samples, write_csv=False
)

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('./sbert_hindi')

pred = evaluator(model, new_dataset['label'])
# Spearman’s rank correlation
print(pred)

10000


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

before: 10000 rows


Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

after: 3374 rows
0.5796276915163054


In [None]:
sentence1 = []
sentence2 = []
label = []
for sample in new_dataset:
  sentence1.append(sample['premise'])
  sentence2.append(sample['hypothesis'])
  label.append(sample['label'])

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

model = SentenceTransformer('./sbert_hindi1')
import numpy as np


# Example data for binary classification
pred = []
threshold = 0.45
# for threshold in np.arange(0.4,0.5,0.1):

for i in range (len(sentence1)):
  emb1 = model.encode(sentence1[i])
  emb2 = model.encode(sentence2[i])
  cosine_sim = cosine_similarity([emb1], [emb2])
  if (cosine_sim[0][0]>threshold):
    pred.append(1)
  else:
    pred.append(0)
accuracy = accuracy_score(label, pred)
f1 = f1_score(label, pred)

precision = precision_score(label, pred)
recall = recall_score(label, pred)
confusion = confusion_matrix(label, pred)

print(f"Accuracy for {threshold} : ", accuracy)
print(f"F1 Score for {threshold} :", f1)
print(f"Precision for {threshold}:", precision)
print(f"Recall for {threshold} :", recall)
print(f"Confusion Matrix for {threshold}:\n", confusion)


Accuracy for 0.45 :  0.5927
F1 Score for 0.45 : 0.7405897713521432
Precision for 0.45: 0.6406611570247934
Recall for 0.45 : 0.8774524600060368
Confusion Matrix for 0.45:
 [[ 113 3261]
 [ 812 5814]]


In [None]:
print(sentence1[45])
print(sentence2[45])
print(label[45])
print(pred[45])
# 0 same
# 1 different

फिर भी आप एक बात में सही हैं।
आप दुख की बात है कि हर चीज में बहुत गलत हैं।
1
1


In [None]:
print(sentence1[195])
print(sentence2[195])
print(label[195])
print(pred[195])
# 0 same 1 different

वे कुछ सीख सकते थे - ज्यादा नहीं, शायद, लेकिन कुछ - जीवंत, गंदा ऑनलाइन चर्चाओं से वे गायब हैं।
कुछ ऑनलाइन चर्चाएँ होती हैं जिनमें उन्हें शामिल नहीं किया जाता है।
0
1


In [None]:
index = 489
print(sentence1[index])
print(sentence2[index])
print(label[index])
print(pred[index])
# 0 same 1 different

बड़ी पर्वत श्रृंखलाओं में 1,200 मीटर (4,000 फीट) और ऊपर, या छोटे पृथक पहाड़ों पर 600 मीटर (2,000 फीट) जितना कम, बड़े पेड़ और लियाना लता मर्टल, लॉरेल और ओक के पेड़ों को रास्ता देते हैं।
पहाड़ छोटे हैं।
1
1


In [None]:
index =8
print(sentence1[index])
print(sentence2[index])
print(label[index])
print(pred[index])
# 0 same 1 different

FASAB संघीय रिपोर्टिंग संस्थाओं के लिए लेखांकन के व्यापक आधार के रूप में संहिताकरण को प्रकाशित कर रहा है।
FASAB संघीय रिपोर्टिंग संस्थाओं के लिए लेखांकन के व्यापक आधार के रूप में संहिताकरण को प्रकाशित कर रहा है।
0
1


In [None]:
model = SentenceTransformer("l3cube-pune/hindi-sentence-similarity-sbert")
from sentence_transformers import models, SentenceTransformer

bert = model
pooler = models.Pooling(
    768,
    pooling_mode_mean_tokens=True
)

model = SentenceTransformer(modules=[bert, pooler])

model

SentenceTransformer(
  (0): SentenceTransformer(
    (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [None]:
from sentence_transformers import losses

loss = losses.MultipleNegativesRankingLoss(model)

In [None]:
epochs = 1
warmup_steps = int(len(loader) * epochs * 0.1)

model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='./finetuned_l3cube_hindi',
    show_progress_bar=True
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/521 [00:00<?, ?it/s]

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

model = SentenceTransformer('./finetuned_l3cube_hindi')
import numpy as np


# Example data for binary classification
pred = []
threshold = 0.45
# for threshold in np.arange(0.4,0.5,0.1):

for i in range (len(sentence1)):
  emb1 = model.encode(sentence1[i])
  emb2 = model.encode(sentence2[i])
  cosine_sim = cosine_similarity([emb1], [emb2])
  if (cosine_sim[0][0]>threshold):
    pred.append(1)
  else:
    pred.append(0)
accuracy = accuracy_score(label, pred)
f1 = f1_score(label, pred)

precision = precision_score(label, pred)
recall = recall_score(label, pred)
confusion = confusion_matrix(label, pred)

print(f"Accuracy for {threshold} : ", accuracy)
print(f"F1 Score for {threshold} :", f1)
print(f"Precision for {threshold}:", precision)
print(f"Recall for {threshold} :", recall)
print(f"Confusion Matrix for {threshold}:\n", confusion)


Accuracy for 0.45 :  0.53
F1 Score for 0.45 : 0.6896050719852067
Precision for 0.45: 0.6130812588069516
Recall for 0.45 : 0.7879565348626623
Confusion Matrix for 0.45:
 [[  79 3295]
 [1405 5221]]
