# In [None]:
import torch
from transformers import DistilBertForMaskedLM, DistilBertTokenizerFast, Trainer, TrainingArguments
import pandas as pd
import numpy as np
import re
from uptrain import Signal, UpTrainDataset, UpTrainFilter, UpTrainFramework
from collections import Counter
import matplotlib.pyplot as plt


# Example fill-in-the-blank prompts, each containing one "[MASK]" placeholder.
# NOTE(review): `test_cases` is not referenced again in the visible part of
# this file; presumably it is meant for spot-checking the fine-tuned model.
test_cases = [
    "Nike shoes are very [MASK].",
    "I love wearing Nike [MASK].",
    "[MASK] is my favorite Nike product.",
    "The quality of Nike products is [MASK].",
    "The Nike store always has the latest [MASK]."
]


# Define data sources
# NOTE(review): both values are kaggle.com dataset URLs without a file
# extension, and they are handed unchanged to pd.read_csv through load_data()
# further down. They look like dataset landing pages rather than direct CSV
# downloads - confirm, and point these at downloaded CSV files if so.
nike_reviews_url = "https://www.kaggle.com/vik2012kvs/amazonreviews"
nike_tweets_url = "https://www.kaggle.com/parthsharma5795/nike-twitter-sentiment-dataset"


# Define UpTrain signals
def nike_positive_sentiment_func(row):
    """Tell whether a record is labelled with positive sentiment.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a "Sentiment" key.

    Returns:
        The result of comparing ``row["Sentiment"]`` with the exact,
        case-sensitive string "Positive".
    """
    sentiment = row["Sentiment"]
    return sentiment == "Positive"

def nike_related_func(row):
    """Tell whether a record's text mentions "nike" (case-insensitive).

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a "text" key.

    Returns:
        True when the text is a string containing "nike" in any letter case;
        False otherwise, including when the text is missing.
    """
    text = row["text"]
    # This predicate runs before any dropna() in the pipeline, and a missing
    # CSV cell is not a string (pandas yields float NaN), so calling .lower()
    # on it would raise AttributeError. Treat such rows as unrelated.
    if not isinstance(text, str):
        return False
    return "nike" in text.lower()

# Wrap each row-level predicate in an UpTrain Signal, passing a display name
# followed by the function.
# NOTE(review): confirm Signal(name, func) matches the installed uptrain release.
nike_positive_sentiment_signal = Signal("Nike Positive Sentiment", nike_positive_sentiment_func)
nike_related_signal = Signal("Nike Related", nike_related_func)


# Define function to load data
def load_data(file_path):
    """Read a CSV source into a pandas DataFrame.

    Args:
        file_path: Any source accepted by ``pd.read_csv`` (path, URL or
            file-like object).

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(file_path)


# Load Nike reviews and Nike tweets data
# Each call goes through load_data(), i.e. pd.read_csv, and runs as soon as
# this script is executed.
nike_reviews_data = load_data(nike_reviews_url)
nike_tweets_data = load_data(nike_tweets_url)


# Filter relevant data using UpTrain
# Reviews are filtered with the positive-sentiment signal and tweets with the
# Nike-mention signal (presumably keeping rows for which the signal is true).
# NOTE(review): this UpTrainFramework is created without arguments and the
# name is rebound to a configured instance further down. UpTrainDataset and
# filter_dataset come from uptrain, not this file - confirm they exist in the
# installed release and that the result exposes a `.data` DataFrame.
uptrain_framework = UpTrainFramework()

nike_reviews_filtered_data = uptrain_framework.filter_dataset(
    UpTrainDataset(nike_reviews_data),
    [nike_positive_sentiment_signal]
)

nike_tweets_filtered_data = uptrain_framework.filter_dataset(
    UpTrainDataset(nike_tweets_data),
    [nike_related_signal]
)


# Concatenate and clean the data
# Stack the filtered reviews and tweets into one frame, then drop exact
# duplicate rows.
nike_data = pd.concat([nike_reviews_filtered_data.data, nike_tweets_filtered_data.data])
nike_data.drop_duplicates(inplace=True)
# pd.concat fills columns that exist in only one source with NaN for the other
# source's rows, so a bare dropna() would discard every row from whichever
# dataset lacks any of the other's columns. Only "text" is consumed by the
# steps that follow, so require just that column to be present.
nike_data.dropna(subset=["text"], inplace=True)

def clean_text(text):
    """Normalize a piece of raw text for tokenization.

    Substitutions run in a fixed order: tokens starting with "http" are
    removed, then every character that is neither a word character nor
    whitespace, then runs of digits. The result is lower-cased and stripped
    of leading/trailing whitespace. Interior whitespace is left untouched,
    so removed tokens can leave consecutive spaces behind.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    substitutions = (
        (r"http\S+", ""),   # URLs (matched case-sensitively, before lowering)
        (r'[^\w\s]', ''),   # punctuation; "_" counts as a word character
        (r'\d+', ''),       # digits
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower().strip()

# Normalize every entry of the "text" column with clean_text and write the
# result back to the same column.
nike_data["text"] = nike_data["text"].apply(clean_text)


# Generate the retraining dataset
def generate_retraining_dataset(data, mask_token="[MASK]", mlm_probability=0.15, seed=None):
    """Build masked-language-model training examples from a text column.

    Each text is tokenized with the 'distilbert-base-uncased' tokenizer, a
    random subset of its ordinary tokens is replaced by ``mask_token``, and a
    label tensor is produced that holds the original token id at every
    position masked here and -100 everywhere else, so the loss is computed
    only on the masked positions.

    Args:
        data: pandas DataFrame with a "text" column of strings.
        mask_token: Token string written into the masked positions.
        mlm_probability: Independent chance for each ordinary token to be
            masked. At least one token is masked whenever any is eligible.
        seed: Optional integer seed for reproducible masking. ``None`` uses
            torch's global random state.

    Returns:
        A tuple ``(inputs, labels)``. ``inputs`` is a list of dicts of 1-D
        tensors (the tokenizer output with masking applied) and ``labels`` is
        the list of matching 1-D label tensors.
    """
    # The tokenizer is loaded on every call, as in the original code.
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    mask_token_id = tokenizer.convert_tokens_to_ids(mask_token)
    generator = None if seed is None else torch.Generator().manual_seed(seed)

    inputs = []
    labels = []
    for text in data["text"]:
        # truncation=True caps the sequence at the tokenizer's configured
        # maximum length so over-long texts cannot overflow the model.
        encoding = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            return_special_tokens_mask=True,
        )
        # Drop the batch dimension added by return_tensors="pt".
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        # The special-token mask is only needed to choose positions; it is
        # not a model input, so remove it from the example.
        special_tokens = encoding.pop("special_tokens_mask").bool()
        input_ids = encoding["input_ids"]

        # Special tokens are never masked. Positions that already hold the
        # mask token have no known target, so they are skipped and keep the
        # ignore label.
        maskable = ~special_tokens & (input_ids != mask_token_id)
        draws = torch.rand(input_ids.shape, generator=generator)
        chosen = maskable & (draws < mlm_probability)
        if maskable.any() and not chosen.any():
            # Short texts often draw no position at all; force one so that
            # every non-empty example contributes to the loss.
            candidates = torch.where(maskable)[0]
            pick = torch.randint(len(candidates), (1,), generator=generator)
            chosen[candidates[pick]] = True

        # -100 is the index ignored by the masked-LM loss.
        label_ids = torch.full_like(input_ids, -100)
        label_ids[chosen] = input_ids[chosen]
        # input_ids is the same tensor stored in `encoding`, so this masks
        # the example in place.
        input_ids[chosen] = mask_token_id

        inputs.append(encoding)
        labels.append(label_ids)
    return inputs, labels


# Build tokenized inputs and labels from the cleaned data.
# NOTE(review): `inputs` and `labels` are not referenced again in the visible
# part of this file; the training data further down is obtained through
# uptrain_framework.generate_retraining_dataset instead.
inputs, labels = generate_retraining_dataset(nike_data)


# Define the UpTrain Framework configuration
# The three *_ratio entries sum to 1.0 (0.8 / 0.1 / 0.1).
# NOTE(review): the key names and the UpTrainFilter("text", "min", 3)
# arguments are interpreted by uptrain, not by this file - confirm their
# meaning against the installed release.
config = {
    "framework": "uptrain",
    "num_samples": 3000,
    "signals": [nike_positive_sentiment_signal, nike_related_signal],
    "filters": [UpTrainFilter("text", "min", 3)],
    "train_ratio": 0.8,
    "validation_ratio": 0.1,
    "test_ratio": 0.1,
    "shuffle": True,
    "random_seed": 42
}

# Create the UpTrain framework
# This rebinds `uptrain_framework`, replacing the instance created earlier
# without a config.
uptrain_framework = UpTrainFramework(config)

# Generate the retraining dataset with filtered data
# The module-level generate_retraining_dataset function is passed as a
# callback, along with the column name "text" and the mask token.
# NOTE(review): how the framework invokes the callback is not visible here -
# confirm the expected callback signature and return type.
retraining_dataset = uptrain_framework.generate_retraining_dataset(
    nike_data,
    generate_retraining_dataset,
    "text",
    mask_token="[MASK]"
)

# Split the dataset into training, validation, and test sets
# The result is unpacked into exactly three datasets, in this order.
train_dataset, val_dataset, test_dataset = uptrain_framework.split_dataset(retraining_dataset)

# Define the model configuration and model
model_name = "distilbert-base-uncased"
model_config = {"output_hidden_states": True}
# from_pretrained's `config` argument expects a PretrainedConfig instance or a
# path/identifier string, not a plain dict. Unpacking the dict as keyword
# arguments lets from_pretrained apply each entry as an override on the
# checkpoint's own configuration.
model = DistilBertForMaskedLM.from_pretrained(model_name, **model_config)

# Define the training arguments
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` - confirm against the pinned version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    # load_best_model_at_end needs a checkpoint for every evaluation, so the
    # save schedule must match the evaluation schedule. The default save
    # strategy is step-based, which TrainingArguments rejects in combination
    # with epoch-based evaluation.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

# Define the trainer
# NOTE(review): no data_collator or tokenizer is supplied, so batching falls
# back to Trainer's default collation - confirm train_dataset / val_dataset
# yield equal-length tensors that already include a "labels" entry.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Fine-tune the model
trainer.train()

# Evaluate the model on the test set
# test_dataset is passed explicitly, so this evaluation uses the held-out
# test split rather than the validation set given to the Trainer above.
test_results = trainer.evaluate(test_dataset)

# Display the results
# The loss is read from the returned metrics under the "eval_loss" key.
print("Test loss:", test_results["eval_loss"])

