# In [None]:
import os
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np


# Seed Python's and PyTorch's random number generators with a fixed value.
random.seed(42)
torch.manual_seed(42)


# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


# Pretrained uncased BERT tokenizer and the matching masked-LM model,
# with the model moved onto the selected device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model = model.to(device)


def preprocess_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with the module tokenizer.

    Truncation and padding are both enabled and the maximum length is 512.
    The tokenizer's output is returned unchanged.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=512,
        truncation=True,
        padding=True,
    )
    return encoded


def load_and_preprocess_data(path='/content/cleaned_train.txt'):
    """Load the unlabeled training corpus, one example per line.

    Args:
        path: UTF-8 text file to read. Defaults to the location that was
            previously hard-coded, so existing zero-argument calls behave
            as before.

    Returns:
        A ``Dataset`` with a single ``'text'`` column containing every
        non-blank line of the file with surrounding whitespace stripped.
    """
    with open(path, 'r', encoding='utf-8') as file:
        # Iterate the file lazily instead of materializing it via readlines().
        stripped_lines = (line.strip() for line in file)
        # Blank lines are dropped: an empty string gives the BERT tokenizer
        # nothing but special tokens, so such an example has no position the
        # masked-LM objective could learn from.
        records = [{'text': text} for text in stripped_lines if text]
    return Dataset.from_list(records)

# Read the unlabeled corpus and tokenize it in batches.
unlabeled_dataset = load_and_preprocess_data().map(
    preprocess_function,
    batched=True,
)


# Collator configured for masked language modeling with a 0.15 masking
# probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)


# Hyper-parameters for the masked-LM training run.
training_args = TrainingArguments(
    output_dir='/content/results',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,  # keep at most two checkpoints on disk
    learning_rate=2e-6,
    logging_dir='/content/logs',
    logging_steps=500,
    report_to="none",  # no external experiment tracker
    # Mixed precision is only enabled when CUDA is present. The script
    # explicitly supports a CPU fallback for `device`, and transformers
    # rejects fp16=True when no supported accelerator is available.
    fp16=torch.cuda.is_available(),
)  # this closing parenthesis was missing, which made the file a SyntaxError


def compute_metrics(p):
    """Compute accuracy and macro-averaged F1, precision and recall.

    Works for both sequence-level labels (one label per example) and
    token-level labels (one label per position): everything is flattened
    before scoring, and positions labelled ``-100`` are excluded.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores with the class dimension last, or a tuple whose first
            element is such an array. ``labels`` holds the integer targets.

    Returns:
        A dict with the keys ``'accuracy'``, ``'macro_f1'``, ``'precision'``
        and ``'recall'``.
    """
    predictions, labels = p

    # When a model emits several outputs the predictions arrive as a tuple;
    # presumably the logits are its first element (Hugging Face convention).
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Collapse the class axis, then flatten so the sklearn metrics receive
    # 1-D arrays even when labels are per-token (2-D).
    predictions = np.asarray(predictions).argmax(axis=-1).reshape(-1)
    labels = np.asarray(labels).reshape(-1)

    # -100 is the ignore index the Hugging Face collators use for positions
    # that must not contribute to the loss; scoring them would be meaningless.
    scored = labels != -100
    predictions = predictions[scored]
    labels = labels[scored]

    accuracy = accuracy_score(labels, predictions)

    # zero_division=0 is now passed to f1_score as well, matching precision
    # and recall: an absent class scores 0 without emitting a warning.
    macro_f1 = f1_score(labels, predictions, average='macro', zero_division=0)
    precision = precision_score(labels, predictions, average='macro', zero_division=0)
    recall = recall_score(labels, predictions, average='macro', zero_division=0)

    return {
        'accuracy': accuracy,
        'macro_f1': macro_f1,
        'precision': precision,
        'recall': recall
    }


# Trainer for the masked-LM run. No eval_dataset is supplied here.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=unlabeled_dataset,
    compute_metrics=compute_metrics,
)


# Run the training loop.
trainer.train()


from transformers import BertForSequenceClassification


# Three-label sequence classifier created from the same pretrained checkpoint
# and moved onto the selected device.
classification_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)
classification_model = classification_model.to(device)

# Load the masked-LM model's weights into the classifier. strict=False means
# keys present in only one of the two models are skipped instead of raising.
# NOTE(review): no fine-tuning of the classification head is visible in this
# file — confirm the head is trained elsewhere before trusting its outputs.
classification_model.load_state_dict(
    model.state_dict(),
    strict=False,
)


def load_test_data(path='/content/processed_test.txt'):
    """Load the labelled test set.

    Each non-blank line is expected to consist of whitespace-separated words
    followed by an integer label as its final token.

    Args:
        path: UTF-8 text file to read. Defaults to the location that was
            previously hard-coded, so existing zero-argument calls behave
            as before.

    Returns:
        A ``Dataset`` with a ``'text'`` column (the words re-joined with
        single spaces) and a ``'label'`` column (the parsed integers).

    Raises:
        ValueError: If the final token of a non-blank line is not an integer.
            The message names the file and line number.
    """
    texts = []
    labels = []
    with open(path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            # str.split() with no arguments already ignores leading and
            # trailing whitespace, so a separate strip() is unnecessary.
            tokens = line.split()
            if not tokens:
                # Blank line (e.g. a trailing newline at the end of the file):
                # there is no label to unpack, so skip it rather than crash.
                continue
            *words, label = tokens
            try:
                labels.append(int(label))
            except ValueError as err:
                raise ValueError(
                    f"{path}:{line_number}: expected an integer label as the "
                    f"last token, got {label!r}"
                ) from err
            texts.append(' '.join(words))
    return Dataset.from_dict({'text': texts, 'label': labels})

# Read the labelled test set and tokenize it in batches.
test_dataset = load_test_data().map(
    preprocess_function,
    batched=True,
)


# Evaluation-only Trainer around the classifier; no TrainingArguments are
# passed, so the library defaults apply.
test_trainer = Trainer(
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    model=classification_model,
)


# Score the classifier on the test set and print the resulting metrics.
eval_results = test_trainer.evaluate(eval_dataset=test_dataset)
print(eval_results)


def predict_sentiment(text):
    """Classify a single piece of text with the module-level classifier.

    Args:
        text: The text to classify.

    Returns:
        ``"Positive"`` when the highest-scoring class index is 2,
        ``"Neutral"`` when it is 1, and ``"Negative"`` otherwise.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    ).to(device)

    # Inference must not depend on whatever mode the model was left in:
    # switch to eval mode so dropout is disabled, and restore the previous
    # mode afterwards so callers see no lasting change.
    was_training = classification_model.training
    classification_model.eval()
    try:
        # no_grad skips autograd bookkeeping, which prediction does not need.
        with torch.no_grad():
            outputs = classification_model(**inputs)
    finally:
        classification_model.train(was_training)

    prediction = torch.argmax(outputs.logits, dim=-1).item()
    if prediction == 2:
        return "Positive"
    if prediction == 1:
        return "Neutral"
    return "Negative"


# Classify one example sentence and print the predicted label.
sample_text = "This movie was fantastic! I loved the acting and the story."
sample_prediction = predict_sentiment(sample_text)
print("Prediction:", sample_prediction)
