<a href="https://colab.research.google.com/github/BeniwalAnkit/Semantic-Semalarity/blob/main/enterpret.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Insights

In [None]:
import pandas as pd

# Load both splits from Excel files; the paths are relative, so the files must
# sit in the notebook's working directory.
eval_df = pd.read_excel("evaluation.xlsx")
train_df = pd.read_excel("train.xlsx")

In [None]:
# Sanity check: (rows, columns) of each split.
print("Train size:", train_df.shape)
print("Evaluation size:", eval_df.shape)

Train size: (2061, 3)
Evaluation size: (9000, 3)


In [None]:
# normalize=True reports each label's share of the rows rather than raw counts.
print("Train distribution:\n", train_df['label'].value_counts(normalize=True))
print("Evaluation distribution:\n", eval_df['label'].value_counts(normalize=True))

Train distribution:
 1    1.0
Name: label, dtype: float64
Evaluation distribution:
 0    0.666556
1    0.333444
Name: label, dtype: float64


In [None]:
train_df.head(10)

Unnamed: 0,text,reason,label
0,this is an amazing app for online classes!but,good app for conducting online classes,1
1,very practical and easy to use,app is user-friendly,1
2,this app is very good for video conferencing.,good for video conferencing,1
3,i can not download this zoom app,unable to download zoom app,1
4,i am not able to download this app,want to download the app,1
5,zoom is not working properly,app is not working,1
6,zoom is an excellent meeting app.,good app for conducting online meeting,1
7,i am not getting the virtual background option...,unable to switch virtual background,1
8,video quality is very poor,video quality is poor,1
9,i am unable to sign-in,want to login,1


In [None]:
eval_df.head(10)

Unnamed: 0,text,reason,label
0,the app is crashing when i play a vedio,app crashes during playback,1
1,but i want to connect it to the tv from one de...,want compatibility with more smart televisions,0
2,very helpful when and home working remotley,good app for work,0
3,this zoom so called and missed call and mobile...,receiving incorrect phone number message,0
4,one of my favorite apps,good for spending time,0
5,I have enjoyed watching my favorite shows and ...,good to watch shows,1
6,we have not had internet for a month and a hal...,unable to access live tv,0
7,this is great app when you download the video ...,good app to download videos,1
8,"excellent app for video conferencing, the only...",want to record on ipad,0
9,"also, i can continue watching where i left off.",unable to sync watched episodes,0


## Baseline Approach

In [None]:
def baseline_predict(text, reason):
    """Lexical-overlap baseline for matching a text against a reason.

    Both strings are lowercased and split on whitespace; the pair counts as a
    match as soon as the two word collections share at least one word.

    Args:
        text: The review text.
        reason: The candidate reason.

    Returns:
        1 if the two strings have a word in common, otherwise 0.
    """
    vocabulary = {token for token in text.lower().split()}
    shares_word = any(token in vocabulary for token in reason.lower().split())
    return 1 if shares_word else 0

In [None]:
from sklearn.metrics import classification_report

# Ground-truth labels next to the word-overlap prediction for every evaluation pair.
y_true = eval_df['label'].values
y_pred = list(map(baseline_predict, eval_df['text'], eval_df['reason']))

# Per-class precision / recall / F1 of the baseline.
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.34      0.48      5999
           1       0.39      0.85      0.54      3001

    accuracy                           0.51      9000
   macro avg       0.61      0.60      0.51      9000
weighted avg       0.68      0.51      0.50      9000



## Training Approach

In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m84.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1
Looking in in

In [None]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import EvalPrediction
from sklearn.metrics import precision_recall_fscore_support, classification_report
from datasets import Dataset
import random

In [None]:
# For generating negative samples

def generate_negative_samples(dataset, num_negative_samples, seed=None):
    """Create label-0 pairs by attaching a non-matching reason to a positive text.

    Each negative is built by drawing a random positive row (label == 1) and
    then drawing a reason from the pool of positive reasons. A drawn reason is
    rejected and redrawn whenever it is one of the reasons that the chosen text
    is paired with in a positive row, so a known-correct pair is never emitted
    with label 0.

    Args:
        dataset: DataFrame with 'text', 'reason' and 'label' columns.
        num_negative_samples: Number of negative rows to generate.
        seed: Optional seed for a private random generator. When None (the
            default) the module-level ``random`` state is used, as before.

    Returns:
        DataFrame with the same columns as ``dataset``; 'text', 'reason' and
        'label' (always 0) are filled in, any other column is left empty.

    Raises:
        IndexError: If negatives are requested but ``dataset`` has no positive rows.
        ValueError: If the chosen text is already paired with every distinct
            reason, so no non-matching reason exists for it.
    """
    rng = random if seed is None else random.Random(seed)

    positive_examples = dataset[dataset['label'] == 1]
    texts = positive_examples['text'].tolist()
    reasons = positive_examples['reason'].tolist()

    # A text can appear in several positive rows; collect every reason that is
    # known to be correct for it so none of them is reused as a "negative".
    valid_reasons = {}
    for text, reason in zip(texts, reasons):
        valid_reasons.setdefault(text, set()).add(reason)
    distinct_reason_count = len(set(reasons))

    negative_samples = []
    for _ in range(num_negative_samples):
        # Randomly select a positive example (IndexError if there are none).
        text = rng.choice(texts)
        excluded = valid_reasons[text]

        # `excluded` is a subset of all reasons, so equal sizes means there is
        # nothing left to draw and the retry loop below would never finish.
        if len(excluded) >= distinct_reason_count:
            raise ValueError(
                "Cannot build a negative sample: every available reason is a "
                "positive reason for the selected text."
            )

        # Randomly select a reason that is genuinely different for this text.
        negative_reason = rng.choice(reasons)
        while negative_reason in excluded:
            negative_reason = rng.choice(reasons)

        negative_samples.append({'text': text, 'reason': negative_reason, 'label': 0})

    return pd.DataFrame(negative_samples, columns=dataset.columns)

In [None]:
# Seed Python's RNG so the sampled negatives are identical on every run.
random.seed(0)

# Generate one negative sample per positive row (2061 on the original data),
# derived from the frame instead of hard-coded so it tracks the input size.
num_negative_samples = int((train_df['label'] == 1).sum())
negative_dataset = generate_negative_samples(train_df, num_negative_samples)

# Concatenate positive and negative datasets
new_traindf = pd.concat([train_df, negative_dataset], ignore_index=True)

# Shuffle the new dataset; the fixed random_state keeps the row order reproducible.
new_traindf = new_traindf.sample(frac=1, random_state=0).reset_index(drop=True)

In [None]:
new_traindf

Unnamed: 0,text,reason,label
0,"excellent app, but its licenses are very expen...",license fees are expensive,1
1,the video is regularly frozen,want to upload backgrounds,0
2,s22 ultra does not work.,features are missing,0
3,"in this situation, it is not possible to pay f...",want to get subscription in russia,1
4,none of my other movie apps have this problem,unable to play movies,1
...,...,...,...
4056,very good application developers please turn o...,want to turn off two step authentication,0
4057,"I am so sad, and light disappoint because this...",want to change background on iphone,1
4058,"another ""brilliant"" development of russian pro...",unable to use on s22 ultra,0
4059,i hate how the screen goes black or flickers.,messaging function is not working,0


In [None]:
pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125926 sha256=f64749a1467bee77f499ab80ede631d7a64e33847b62a276b77ac4f0e92aed64
  Stored in directory: /root/.cache/pi

In [None]:
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random

# Load pre-trained model and tokenizer.
# AutoModel loads the encoder without a task-specific head, which is why the
# loader reports the `cls.*` checkpoint weights as unused. The feature
# extraction cells call this model under torch.no_grad() to obtain embeddings.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
import torch.nn.functional as F

In [None]:
# Build one feature per training pair: the cosine similarity between the
# mean-pooled BERT embeddings of the text and of the reason.
X = []

for text, reason in zip(new_traindf['text'], new_traindf['reason']):
    # Encode both sentences as a batch of two, padded only to the longer one
    # rather than to the 512-token maximum.
    encoded = tokenizer([text, reason], padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        # Mean-pool over real tokens only. A plain .mean(dim=1) would also
        # average the hidden states at [PAD] positions into the embedding.
        mask = encoded["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        embeddings = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        # Row 0 is the text and row 1 the reason; slicing keeps a batch
        # dimension so `similarity` has shape (1,).
        similarity = F.cosine_similarity(embeddings[0:1], embeddings[1:2])
    X.append(similarity.numpy())

# Read the labels once from the frame, in the same row order as X.
y = new_traindf['label'].tolist()
y_arr = np.array(y)

In [None]:
# Train logistic regression model
# X holds a single feature per pair (the cosine similarity computed in the
# previous cell), so the fitted model amounts to a threshold on that score.
clf = LogisticRegression(random_state=0)
clf.fit(X, y_arr)

In [None]:
# Compute the same cosine-similarity feature for every evaluation pair.
X_eval = []

for text, reason in zip(eval_df['text'], eval_df['reason']):
    # Encode both sentences as a batch of two, padded only to the longer one
    # rather than to the 512-token maximum.
    encoded = tokenizer([text, reason], padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        # Mean-pool over real tokens only. A plain .mean(dim=1) would also
        # average the hidden states at [PAD] positions into the embedding.
        mask = encoded["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        embeddings = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        # Row 0 is the text and row 1 the reason; slicing keeps a batch
        # dimension so `similarity` has shape (1,).
        similarity = F.cosine_similarity(embeddings[0:1], embeddings[1:2])
    X_eval.append(similarity.numpy())

# Read the labels once from the frame, in the same row order as X_eval.
y_true = eval_df['label'].tolist()
y_true_arr = np.array(y_true)

y_pred = clf.predict(X_eval)

In [None]:
# Evaluate model: score the logistic-regression predictions against the evaluation labels.
acc, prec, rec, f1 = (
    metric(y_true_arr, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {acc:.2f}",
    f"Precision: {prec:.2f}",
    f"Recall: {rec:.2f}",
    f"F1 score: {f1:.2f}",
    sep="\n",
)

Accuracy: 0.61
Precision: 0.44
Recall: 0.63
F1 score: 0.52


In [None]:
# Train Gaussian Naive Bayes model on the same X / y_arr used for logistic regression

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X, y_arr)

In [None]:
# Naive Bayes predictions for the evaluation features.
y_gauss = classifier.predict(X_eval)

In [None]:
# Evaluate model: score the Gaussian Naive Bayes predictions against the evaluation labels.
acc, prec, rec, f1 = (
    metric(y_true_arr, y_gauss)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {acc:.2f}",
    f"Precision: {prec:.2f}",
    f"Recall: {rec:.2f}",
    f"F1 score: {f1:.2f}",
    sep="\n",
)

Accuracy: 0.63
Precision: 0.46
Recall: 0.58
F1 score: 0.51


In [None]:
# Random Forest: 10 entropy-criterion trees fitted on the same X / y_arr;
# random_state is fixed so the fitted forest is repeatable.

from sklearn.ensemble import RandomForestClassifier
classifier_R = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier_R.fit(X, y_arr)

In [None]:
# Random-forest predictions for the evaluation features.
y_RForest = classifier_R.predict(X_eval)

In [None]:
# Evaluate model: score the random-forest predictions against the evaluation labels.
acc, prec, rec, f1 = (
    metric(y_true_arr, y_RForest)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {acc:.2f}",
    f"Precision: {prec:.2f}",
    f"Recall: {rec:.2f}",
    f"F1 score: {f1:.2f}",
    sep="\n",
)

Accuracy: 0.59
Precision: 0.41
Recall: 0.56
F1 score: 0.47


## Fine-Tuning and Error Analysis

In [None]:
from torch.utils.data import DataLoader


In [None]:
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses

# NOTE: this rebinds `model`, which earlier cells use for the bert-base-uncased encoder.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# One InputExample per (text, reason) row. Passing the DataFrame positionally
# would store it as the example's `guid` and leave `texts` empty.
# CosineSimilarityLoss compares the cosine score with the label through an
# MSE loss, so the 0/1 label is passed as a float target.
train_examples = [
    InputExample(texts=[text, reason], label=float(label))
    for text, reason, label in zip(new_traindf['text'], new_traindf['reason'], new_traindf['label'])
]
train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=5)
train_loss = losses.CosineSimilarityLoss(model=model)

In [None]:
train_loss

CosineSimilarityLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
  (loss_fct): MSELoss()
  (cos_score_transformation): Identity()
)

In [None]:
from sentence_transformers import SentenceTransformer, SentencesDataset, losses
from sentence_transformers.readers import InputExample

# NOTE: this rebinds `model`; a fresh SentenceTransformer is loaded for the softmax setup.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# One InputExample per (text, reason) row. Passing the DataFrame positionally
# would store it as the example's `guid` and leave `texts` empty.
# SoftmaxLoss ends in a cross-entropy loss over `num_labels` classes, so the
# label is passed as an integer class index.
train_examples = [
    InputExample(texts=[text, reason], label=int(label))
    for text, reason, label in zip(new_traindf['text'], new_traindf['reason'], new_traindf['label'])
]
train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=10)
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=2)

In [None]:
print(train_loss)

SoftmaxLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
  (classifier): Linear(in_features=2304, out_features=2, bias=True)
  (loss_fct): CrossEntropyLoss()
)


In [None]:
def generate_negatives(df, multiplier=1):
    """Append word-shuffled copies of ``df`` as label-0 rows.

    For each of the ``multiplier`` rounds, every row of ``df`` is copied with
    the words of its 'reason' put into a different random order and its label
    set to 0. Rows whose reason cannot be reordered into something different
    (fewer than two distinct words) are left out of the negatives, so an
    unchanged (text, reason) pair is never emitted with label 0.

    Args:
        df: DataFrame with string 'reason' values and a 'label' column.
        multiplier: Number of independently shuffled negative copies to
            append. 0 returns the rows of ``df`` unchanged.

    Returns:
        A new DataFrame: the rows of ``df`` followed by the negative rows,
        with a fresh integer index. ``df`` itself is not modified.

    NOTE(review): a shuffled reason still contains exactly the words of the
    true reason. Confirm that word-order negatives are what the classifier
    should learn from, as opposed to mismatched reasons.
    """
    def _shuffle_words(reason):
        words = reason.split()
        # With fewer than two distinct words every ordering is identical.
        if len(set(words)) < 2:
            return None
        shuffled = random.sample(words, len(words))
        # Redraw on the identity permutation; a different ordering exists,
        # so this loop terminates.
        while shuffled == words:
            shuffled = random.sample(words, len(words))
        return ' '.join(shuffled)

    frames = [df]
    for _ in range(multiplier):
        negative_df = df.copy()
        negative_df['reason'] = negative_df['reason'].apply(_shuffle_words)
        negative_df['label'] = 0
        # Drop the rows for which no distinct shuffle exists.
        frames.append(negative_df.dropna(subset=['reason']))
    return pd.concat(frames, ignore_index=True)

def preprocess_dataset(df, tokenizer):
    """Tokenize (text, reason) pairs into a torch-formatted ``datasets.Dataset``.

    Args:
        df: DataFrame with 'text', 'reason' and 'label' columns.
        tokenizer: Tokenizer called as ``tokenizer(texts, reasons, ...)`` so
            each row is encoded as one sequence pair.

    Returns:
        A Dataset whose torch-formatted columns are 'input_ids',
        'attention_mask', 'label', plus 'token_type_ids' when the tokenizer
        produces them.
    """
    def encode(batch):
        # With batched=True each column arrives as a list. The tokenizer
        # returns plain lists (one entry per row), which is the shape
        # Dataset.map expects for any batch size. Returning tensors and
        # squeezing dim 0 would collapse a batch that contains one row.
        return tokenizer(batch['text'], batch['reason'], padding=True, truncation=True, max_length=512)

    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(encode, batched=True)

    # Keep the segment ids when present, so models that accept them can tell
    # the text from the reason.
    model_columns = [
        name
        for name in ('input_ids', 'token_type_ids', 'attention_mask', 'label')
        if name in dataset.column_names
    ]
    dataset.set_format(type='torch', columns=model_columns)
    return dataset
def compute_metrics(eval_pred: EvalPrediction):
    """Binary precision / recall / F1 for a Trainer evaluation pass.

    The predicted class is the argmax of ``eval_pred.predictions`` along
    axis 1; it is scored against ``eval_pred.label_ids``.

    Returns:
        Dict with the keys 'precision', 'recall' and 'f1'.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    scores = precision_recall_fscore_support(
        eval_pred.label_ids, predicted_classes, average='binary'
    )
    # The fourth element of `scores` (support) is not reported.
    return dict(zip(('precision', 'recall', 'f1'), scores[:3]))

# Build the augmented training frame under its own name. Reassigning
# `train_df` would make this cell non-idempotent: every re-run would stack
# another round of negatives on top of the already augmented frame.
train_pairs_df = generate_negatives(train_df, multiplier=1)

models = ['bert-base-uncased', 'distilbert-base-uncased', 'roberta-base']
model_results = {}

for model_name in models:
    print(f"Training and evaluating {model_name}")
    # Each checkpoint gets its own tokenizer, so both splits are re-tokenized per model.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    train_dataset = preprocess_dataset(train_pairs_df, tokenizer)
    eval_dataset = preprocess_dataset(eval_df, tokenizer)

    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        logging_dir=f'./logs/{model_name}',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Error Analysis
    predictions = trainer.predict(eval_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    # Score against the labels returned with the predictions, so labels and
    # predictions are guaranteed to be in the same order.
    report = classification_report(predictions.label_ids, preds, output_dict=True)
    model_results[model_name] = report

print("Error analysis:")
for model_name, report in model_results.items():
    print(f"Model: {model_name}")
    print(report)