In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
import random
import re
from random import shuffle

import numpy as np
import spacy
from datasets import load_dataset
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [None]:
# Load the small English spaCy pipeline used for tokenisation and lemmatisation.
# Only lemmas and stop-word flags are read from the resulting tokens, so the
# dependency parser and named-entity recogniser are disabled to avoid running
# them on every sentence.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
# Load the "labeled_final" configuration of the PAWS dataset from the Hugging Face Hub.
# Later cells read the 'sentence1', 'sentence2' and 'label' fields of its 'train' split.
dataset = load_dataset("google-research-datasets/paws", "labeled_final")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.79k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.43M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [None]:
def tokenize_sentence(sentence):
    """Convert a raw sentence into a list of lemmas.

    The text is lower-cased and stripped of surrounding whitespace, every
    character other than a-z or whitespace is deleted, and the result is
    run through the module-level spaCy pipeline ``nlp``.

    Args:
        sentence: The sentence to tokenise, as a string.

    Returns:
        A list with the lemma of each token that is alphabetic and is not
        a stop word, in their original order.
    """
    cleaned = re.sub(r'[^a-z\s]', '', sentence.lower().strip())

    lemmas = []
    for tok in nlp(cleaned):
        # Skip anything that is not purely alphabetic, and skip stop words.
        if not tok.is_alpha or tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas


# Seed used for the shuffle below so the document order handed to Doc2Vec
# is the same on every run of the notebook.
SHUFFLE_SEED = 42

# Prepare TaggedDocument data: one document per sentence of every training pair.
# Tags have the form "s1_<idx>" / "s2_<idx>", where <idx> is the position of the
# pair in the train split.
tagged_data = []
for idx, example in enumerate(dataset['train']):
    for prefix, field in (("s1", "sentence1"), ("s2", "sentence2")):
        tagged_data.append(
            TaggedDocument(words=tokenize_sentence(example[field]), tags=[f"{prefix}_{idx}"])
        )

# Shuffle the data (re-seeding right before the call keeps the cell idempotent)
random.seed(SHUFFLE_SEED)
shuffle(tagged_data)

In [None]:
# Train a Doc2Vec model on every tagged training sentence.
vector_size = 300
doc2vec_model = Doc2Vec(tagged_data, vector_size=vector_size, window=2, min_count=1, workers=4, epochs=40)

# The tokens of every training sentence were already computed when `tagged_data`
# was built. Index them by tag so the spaCy pipeline does not have to process
# the whole train split a second time; the token lists are the same ones
# `tokenize_sentence` would return again.
tokens_by_tag = {doc.tags[0]: doc.words for doc in tagged_data}

# Infer embeddings for similarity: one cosine similarity and one label per sentence pair
similarities = []
labels = []

for idx, example in enumerate(dataset['train']):
    sentence1_vector = doc2vec_model.infer_vector(tokens_by_tag[f"s1_{idx}"])
    sentence2_vector = doc2vec_model.infer_vector(tokens_by_tag[f"s2_{idx}"])
    similarity = cosine_similarity([sentence1_vector], [sentence2_vector])[0][0]
    similarities.append(similarity)
    labels.append(example['label'])

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Prepare for classification: the only feature is the cosine similarity of each
# sentence pair, reshaped into a single-column matrix.
X = np.array(similarities).reshape(-1, 1)
y = np.array(labels)

# Train-test split: hold out 20% of the pairs, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Number of cross-validation folds used by the grid search. Kept in one named
# constant so the value and its description cannot drift apart.
N_CV_FOLDS = 7

# Define parameter grid for Logistic Regression
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],        # Regularization type
    'solver': ['liblinear']         # Solver for l1 and l2
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    LogisticRegression(),
    param_grid,
    scoring='accuracy',  # Use accuracy as the evaluation metric
    cv=N_CV_FOLDS,       # 7-fold cross-validation
    verbose=1            # Print progress
)

# Perform the grid search on training data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Use the best model to make predictions on the held-out test portion
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the model on the held-out test portion
print("\nTest Set Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting 7 folds for each of 10 candidates, totalling 70 fits
Best Parameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Cross-Validation Accuracy: 0.5882084981068498

Test Set Accuracy: 0.590932091893533

Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.89      0.71      5549
           1       0.60      0.21      0.31      4332

    accuracy                           0.59      9881
   macro avg       0.59      0.55      0.51      9881
weighted avg       0.59      0.59      0.53      9881

