In [None]:
# Mount Google Drive inside the Colab runtime; the dataset paths used later
# in this notebook live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Upgrade transformers to the newest available release (-U is short for --upgrade).
!pip install -U transformers

In [None]:
# Reinstall transformers from scratch: --force-reinstall reinstalls the package
# (and its dependencies) even when an up-to-date copy is already present.
!pip install --upgrade --force-reinstall transformers

In [None]:
# Remove the currently installed torch/torchvision so the CUDA-specific wheels
# installed in the next cell replace them cleanly.
# -y answers pip's confirmation prompt up front; without it the cell waits for
# interactive input and stalls a "Run All" execution.
!pip uninstall -y torch torchvision

In [None]:
# Install torch, torchvision and torchaudio from the PyTorch wheel index for CUDA 11.8 (cu118).
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
# 1. Install scikit-learn (train/validation split and metrics) and pandas (JSONL loading)
!pip install scikit-learn pandas

In [None]:
# ===== 2. Import Libraries =====
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
import warnings
import logging

In [None]:
# Silence all Python warnings and route INFO-level messages through `logger`.
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Report which compute device is available.
if not torch.cuda.is_available():
    logger.info("CUDA not available, using CPU")
else:
    gpu_name = torch.cuda.get_device_name(0)
    logger.info(f"Using GPU: {gpu_name}")

# ===== 3. Load and Prepare Data =====
# The training file is JSON Lines (one record per line), hence lines=True.
train_jsonl_path = "/content/drive/MyDrive/WDC/80pair/wdcproducts80cc20rnd000un_train_medium_ditto_80.jsonl"
df = pd.read_json(train_jsonl_path, lines=True)
print(df.head())


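# Alternative (disabled): load the 20-, 50- and 80-pair training files and
# concatenate them into a single frame. Assumes the file paths are correct and accessible.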
# file1_path = "/content/drive/MyDrive/WDC/20pair/wdcproducts20cc80rnd000un_train_medium_ditto_20.jsonl"
# file2_path = "/content/drive/MyDrive/WDC/50pair/wdcproducts50cc50rnd000un_train_medium_ditto_50.jsonl"
# file3_path = "/content/drive/MyDrive/WDC/80pair/wdcproducts80cc20rnd000un_train_medium_ditto_80.jsonl"


# try:
#     df1 = pd.read_json(file1_path, lines=True)
#     df2 = pd.read_json(file2_path, lines=True)
#     df3 = pd.read_json(file3_path, lines=True)

#     # Concatenate the dataframes
#     df = pd.concat([df1, df2, df3], ignore_index=True)

#     print(df.head())

# except FileNotFoundError:
#     print(f"Error: One or more files not found. Please check the file paths.")
# except Exception as e:
#     print(f"An error occurred: {e}")


In [None]:
# ===== 4. Configurations =====
# Checkpoint to fine-tune: the small DeBERTa-v3 variant, chosen to avoid memory issues.
MODEL_NAME = 'microsoft/deberta-v3-small'

# Optimisation hyper-parameters.
EPOCHS = 3
BATCH_SIZE = 8  # reduced batch size for more stable training
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01

# Tokenisation: pairs are truncated/padded to this many tokens (reduced for efficiency).
MAX_LENGTH = 196

# Root folder for saved models and logs.
OUTPUT_DIR = './deberta_product_matching'

# Create the output directory and its logs sub-directory (no error if they already exist).
for directory in (OUTPUT_DIR, f'{OUTPUT_DIR}/logs'):
    os.makedirs(directory, exist_ok=True)

In [None]:
# ===== 5. Dataset Class =====
class ProductMatchingDataset(Dataset):
    """Dataset of (left text, right text, label) triples, tokenized lazily.

    Each item is produced on access by passing the two texts to `tokenizer`
    as a pair, truncated and padded to `max_len`, and attaching the label.
    """

    def __init__(self, texts_left, texts_right, labels, tokenizer, max_len):
        """Store the parallel sequences and the tokenizer settings.

        Args:
            texts_left: indexable collection of left-hand texts.
            texts_right: indexable collection of right-hand texts.
            labels: indexable collection of labels; its length defines the dataset size.
            tokenizer: callable applied to each text pair.
            max_len: value passed to the tokenizer as `max_length`.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts_left = texts_left
        self.texts_right = texts_right
        self.labels = labels

    def __len__(self):
        """Return the number of labelled pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize pair `idx` and return a dict of tensors including 'labels'."""
        # Entries are coerced to str before tokenization.
        left = str(self.texts_left[idx])
        right = str(self.texts_right[idx])

        encoded = self.tokenizer(
            left,
            right,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch axis of size 1; drop it so
        # the DataLoader can stack items itself.
        item = {}
        for name, tensor in encoded.items():
            item[name] = tensor.squeeze(0)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# ===== 6. Prepare Data =====
# Seed PyTorch's global RNG before anything stochastic happens. Without it the
# classification-head initialisation, dropout and DataLoader shuffling differ on
# every run, so results cannot be reproduced. The same seed drives the split.
# (Some GPU kernels may still introduce small run-to-run differences.)
SEED = 42
torch.manual_seed(SEED)

logger.info("Preparing train/validation split...")
# Hold out 20% of the rows for validation; the three parallel lists are split
# with the same permutation so left/right/label stay aligned.
train_texts_left, val_texts_left, train_texts_right, val_texts_right, train_labels, val_labels = train_test_split(
    df['text_left'].tolist(),
    df['text_right'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=SEED
)

logger.info(f"Train set: {len(train_labels)} samples")
logger.info(f"Validation set: {len(val_labels)} samples")

logger.info("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# ===== 7. Manual Training Loop (Alternative to Huggingface Trainer) =====
logger.info(f"Loading {MODEL_NAME} model...")
# Two output labels: the inference code in this notebook maps class 1 to
# "Match" and anything else to "No Match".
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create datasets
train_dataset = ProductMatchingDataset(train_texts_left, train_texts_right, train_labels, tokenizer, MAX_LENGTH)
val_dataset = ProductMatchingDataset(val_texts_left, val_texts_right, val_labels, tokenizer, MAX_LENGTH)

# Create data loaders (only the training loader shuffles)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

In [None]:
# Training loop
# Each epoch runs one optimisation pass over the training loader, then one
# evaluation pass over the validation loader. The checkpoint with the highest
# validation accuracy so far is written to "<OUTPUT_DIR>/best_model".
print("Starting training...")
best_accuracy = 0.0

for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")

    # ---- Training pass ----
    model.train()
    batch_losses = []
    for batch in train_dataloader:
        # Tensors must live on the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()

        # 'labels' is part of the batch, so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        batch_losses.append(loss.item())

        loss.backward()
        optimizer.step()

    # Mean loss per batch.
    train_loss = sum(batch_losses) / len(train_dataloader)
    print(f"Training loss: {train_loss:.4f}")

    # ---- Evaluation pass ----
    model.eval()
    eval_losses = []
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)
            eval_losses.append(outputs.loss.item())

            # Predicted class = index of the largest logit in each row.
            all_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
            all_labels.extend(batch['labels'].cpu().numpy())

    val_loss = sum(eval_losses) / len(val_dataloader)

    # Metrics; precision/recall/F1 are computed for the positive class (average='binary').
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels,
        all_preds,
        average='binary',
        zero_division=0,
    )

    print(f"Validation loss: {val_loss:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Keep a checkpoint whenever validation accuracy strictly improves.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        print(f"New best accuracy: {best_accuracy:.4f}. Saving model...")
        model_path = os.path.join(OUTPUT_DIR, "best_model")
        for artifact in (model, tokenizer):
            artifact.save_pretrained(model_path)

# ===== 8. Save Final Model =====
# Weights as they are after the last epoch, regardless of validation accuracy.
print("Saving final model...")
final_model_path = os.path.join(OUTPUT_DIR, "final_model")
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_model_path)

# ===== 9. Inference Function =====
def predict_match(model, tokenizer, text_left, text_right, device):
    """Classify a single pair of product texts.

    The pair is tokenized with truncation and padding to the module-level
    MAX_LENGTH, run through `model` without gradient tracking, and the
    logits are converted to probabilities with a softmax.

    Args:
        model: sequence-classification model; switched to eval mode here.
        tokenizer: tokenizer called with the two texts as a pair.
        text_left: first product text.
        text_right: second product text.
        device: device the input tensors are moved to before the forward pass.

    Returns:
        Tuple (result, confidence): result is "Match" when the predicted
        class is 1 and "No Match" otherwise; confidence is the softmax
        probability of the predicted class.
    """
    model.eval()
    encoded = tokenizer(
        text_left,
        text_right,
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )

    # Move every input tensor to the target device.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    with torch.no_grad():
        probs = torch.softmax(model(**on_device).logits, dim=-1)

    # A single pair was encoded, so row 0 holds its class probabilities.
    predicted_class = probs.argmax(dim=1).item()
    confidence = probs[0][predicted_class].item()

    if predicted_class == 1:
        return "Match", confidence
    return "No Match", confidence



In [None]:
# ===== 10. Test Inference =====
# Smoke test: classify the first row of the validation file and show the
# prediction next to the expected label.
print("\nTesting inference with sample data...")
val_df = pd.read_json(
    "/content/drive/MyDrive/WDC/80pair/wdcproducts80cc20rnd000un_valid_medium_ditto_80.jsonl",
    lines=True,
)
if len(val_df) > 0:
    sample_idx = 0
    sample_left, sample_right, expected_label = (
        val_df[column].iloc[sample_idx] for column in ('text_left', 'text_right', 'label')
    )

    print(f"Product 1: {sample_left}")
    print(f"Product 2: {sample_right}")

    result, confidence = predict_match(model, tokenizer, sample_left, sample_right, device)
    print(f"Prediction: {result} (Confidence: {confidence:.2f})")
    print(f"Expected: {'Match' if expected_label == 1 else 'No Match'}")

# Tell the user where the two checkpoints were written.
print("\nTraining complete! Model saved to:")
print(f"- Best model: {os.path.join(OUTPUT_DIR, 'best_model')}")
print(f"- Final model: {os.path.join(OUTPUT_DIR, 'final_model')}")

In [None]:
import os
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Path to your model and validation dataset
MODEL_PATH = os.path.join(OUTPUT_DIR, "final_model")  # or "best_model"
VAL_DATASET_PATH = "/content/drive/MyDrive/WDC/80pair/wdcproducts80cc20rnd000un_valid_medium_ditto_80.jsonl"
OUTPUT_CSV_PATH = os.path.join(OUTPUT_DIR, "validation_predictions.csv")
# Must match the max length used during training (196 in the configuration
# cell). The previous value of 128 truncated pairs more aggressively than the
# model was trained on, and because MAX_LENGTH is a notebook-wide global it
# also changed the value seen by any training cell re-run afterwards.
MAX_LENGTH = 196

# Load the validation dataset (JSON Lines: one record per line)
print(f"Loading validation dataset from {VAL_DATASET_PATH}...")
val_df = pd.read_json(VAL_DATASET_PATH, lines=True)
print(f"Loaded {len(val_df)} validation samples")

# Load model and tokenizer from the directory written by save_pretrained
print(f"Loading model from {MODEL_PATH}...")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

# Function to predict match
def predict_match(model, tokenizer, text_left, text_right, device):
    """Predict whether two product texts describe the same product.

    Tokenizes the pair (truncated and padded to the module-level MAX_LENGTH),
    runs one forward pass with gradients disabled and applies a softmax to
    the logits.

    Args:
        model: sequence-classification model; put into eval mode by this call.
        tokenizer: tokenizer invoked on the (text_left, text_right) pair.
        text_left: first product text.
        text_right: second product text.
        device: device to which the encoded tensors are moved.

    Returns:
        (result, confidence) where result is "Match" for predicted class 1,
        "No Match" for any other class, and confidence is the softmax
        probability of the predicted class.
    """
    model.eval()

    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )
    inputs = tokenizer(text_left, text_right, **tokenizer_options)

    # Inputs have to be on the same device as the model weights.
    inputs = dict((key, value.to(device)) for key, value in inputs.items())

    with torch.no_grad():
        logits = model(**inputs).logits
        class_probs = torch.nn.functional.softmax(logits, dim=-1)
        winner = torch.argmax(class_probs, dim=1).item()
        confidence = class_probs[0][winner].item()

    # Class 1 is the positive ("Match") class; everything else is "No Match".
    return {1: "Match"}.get(winner, "No Match"), confidence

# Make predictions on the entire validation set
# One forward pass per row; textual results feed the CSV, numeric ones feed the metrics.
print("Making predictions on the validation set...")
predictions = []
confidences = []
all_preds_numeric = []
all_labels_numeric = []

for idx, row in val_df.iterrows():
    # Progress message every 100 rows.
    if idx % 100 == 0:
        print(f"Processing sample {idx}/{len(val_df)}...")

    text_left = str(row['text_left'])
    text_right = str(row['text_right'])

    result, confidence = predict_match(model, tokenizer, text_left, text_right, device)
    predictions.append(result)
    confidences.append(confidence)

    # Numeric form of the prediction for the metric functions.
    all_preds_numeric.append(int(result == "Match"))

    # Labels may arrive as numbers or as strings; strings count as positive
    # only when they read '1' or 'true' (case-insensitive).
    raw_label = row['label']
    if not isinstance(raw_label, str):
        label_numeric = int(raw_label)
    else:
        label_numeric = int(raw_label.lower() in ['1', 'true'])
    all_labels_numeric.append(label_numeric)

# Create results dataframe
expected_text = ['Match' if label == 1 else 'No Match' for label in all_labels_numeric]
results_df = pd.DataFrame({
    'text_left': val_df['text_left'],
    'text_right': val_df['text_right'],
    'expected': expected_text,
    'prediction': predictions,
    'confidence': confidences,
})

# Save to CSV (without the index column)
results_df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"Predictions saved to {OUTPUT_CSV_PATH}")

# Calculate and print metrics; precision/recall/F1 refer to the positive class.
accuracy = accuracy_score(all_labels_numeric, all_preds_numeric)
precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels_numeric,
    all_preds_numeric,
    average='binary',
    zero_division=0,
)

print("\nOverall Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nPrediction complete!")

In [None]:
from pathlib import Path

# Check if previously saved model exists
model_path = Path(os.path.join(OUTPUT_DIR, "best_model"))

# Prefer the best checkpoint written during training; otherwise start from the
# pretrained weights with a fresh two-label classification head.
if model_path.exists():
    logger.info(f"Loading model from {model_path}...")
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
else:
    logger.info(f"Loading fresh model from {MODEL_NAME}...")
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The other two places in this notebook that load a model move it to `device`
# straight away; do the same here so the reloaded model is not left on a
# different device from the inputs that predict_match() sends to `device`.
model.to(device)
