In [None]:
import inspect

import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Loading the Dataset from Excel
file_path = "C:/Users/Anagha/Desktop/Training_Data_2.xlsx"  # Replace with your file path
df = pd.read_excel(file_path)

# Print the columns of the DataFrame to understand its structure
print(df.columns)


# Ensure the dataset has 'text' and 'label' columns
# Replace column names with the actual column names in your file
df = df.rename(columns={"cleaned_comment": "text", "Sentiment_Num": "label"})

# Fail early with a readable message when the expected columns are absent,
# instead of an opaque KeyError from dropna further down
missing_columns = [name for name in ("text", "label") if name not in df.columns]
if missing_columns:
    raise KeyError(
        f"Required column(s) {missing_columns} not found after renaming; "
        f"available columns: {list(df.columns)}"
    )

# Drop rows where 'text' or 'label' are missing
df = df.dropna(subset=["text", "label"])

# Coerce labels to floats first: values that cannot be parsed become NaN and
# are filtered out below, rather than raising inside np.isfinite
df = df.assign(label=pd.to_numeric(df["label"], errors="coerce").astype(float))

# Remove rows with non-finite values in 'label'; .copy() gives an independent
# frame so the column assignments below do not write into a filtered view.
# The original index is deliberately kept as-is.
df = df[np.isfinite(df["label"])].copy()

# Convert 'label' from float to integer type
df["label"] = df["label"].astype(int)

# Excel cells holding only digits are read as numbers; the tokenizer needs strings
df["text"] = df["text"].astype(str)

# Check unique labels and data type
print("Unique labels (after cleaning):", df["label"].unique())
print("Label data type:", df["label"].dtype)

# Convert the cleaned DataFrame to a Hugging Face Dataset format
dataset = Dataset.from_pandas(df)

# Tokenize the Dataset
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_data(example):
    """Tokenize the "text" field of ``example`` with the module-level tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its return value is passed back unchanged.
    """
    texts = example["text"]
    encoding_options = {"padding": "max_length", "truncation": True}
    return tokenizer(texts, **encoding_options)

# Apply the tokenization function to the dataset in batches
tokenized_dataset = dataset.map(tokenize_data, batched=True)

# Split the dataset into training and testing sets (90% train, 10% test)
# A fixed seed makes the split identical on every run
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
test_dataset = split["test"]


print("Train dataset labels:", train_dataset["label"])
print("Test dataset labels:", test_dataset["label"])

# Remove unnecessary columns from the datasets to keep only the required data.
# "__index_level_0__" is only present when the pandas index was carried over
# into the dataset, so drop just the columns that actually exist;
# remove_columns raises on unknown column names.
columns_to_drop = [
    name for name in ("text", "__index_level_0__") if name in train_dataset.column_names
]
train_dataset = train_dataset.remove_columns(columns_to_drop)
test_dataset = test_dataset.remove_columns(columns_to_drop)

# The classification head has one output per class and class ids index into it,
# so the labels must be exactly 0, 1, ..., num_labels - 1. Check this up front
# rather than failing with an out-of-bounds target error in the middle of training.
label_values = sorted(int(value) for value in df["label"].unique())
num_labels = len(label_values)
if label_values != list(range(num_labels)):
    raise ValueError(
        f"Labels must be the contiguous integers 0..{num_labels - 1}, got {label_values}; "
        "remap the label column before training."
    )

# Load the Pre-trained Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# Define Training Arguments and set up the training parameters

# The per-epoch evaluation option is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones; use whichever
# keyword the installed TrainingArguments accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    # Where to save model outputs and checkpoints
    output_dir="./results",
    # Save the model after each training cycle
    save_strategy="epoch",
    # How fast the model learns
    learning_rate=5e-5,
    # Number of samples processed at once during training
    per_device_train_batch_size=16,
    # Number of samples processed at once during evaluation
    per_device_eval_batch_size=16,
    # Total number of times the model will see the training data
    num_train_epochs=3,
    # Regularization to prevent overfitting
    weight_decay=0.01,
    # Where to save logs of training progress
    logging_dir="./logs",
    # How often to log training metrics
    logging_steps=10,
    # Use the best model found during training at the end
    load_best_model_at_end=True,
    # Metric used to determine the best model
    metric_for_best_model="eval_loss",
    # Check model performance at the end of each training cycle
    **{eval_strategy_key: "epoch"},
)

# Accuracy metric used during evaluation
def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for a ``(logits, labels)`` pair.

    The predicted class of each row is the argmax over the last axis of the
    logits; accuracy is the mean of the element-wise match with ``labels``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    hits = predicted_classes == labels
    return {"accuracy": hits.mean()}

# Collect everything the Trainer needs, then build it in one call
trainer_kwargs = {
    "model": model,                      # the model to train
    "args": training_args,               # training settings
    "train_dataset": train_dataset,      # training data
    "eval_dataset": test_dataset,        # evaluation data
    "tokenizer": tokenizer,              # tokenizer for processing text
    "compute_metrics": compute_metrics,  # function to calculate metrics
}
trainer = Trainer(**trainer_kwargs)

# Fine-tune the model
trainer.train()

# Score the held-out split: run prediction, take the argmax class per row,
# and print a per-class report against the true labels
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
report = classification_report(test_dataset["label"], preds)
print("Evaluation Metrics:\n", report)

# Persist the model and the tokenizer side by side for later use
for artifact in (model, tokenizer):
    artifact.save_pretrained("fine_tuned_bert")

# Inference with the fine-tuned model
from transformers import pipeline

# Text-classification pipeline loaded from the directory saved above
sentiment_pipeline = pipeline("text-classification", model="fine_tuned_bert")

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments,pipeline
from sklearn.metrics import classification_report

# Restore the fine-tuned BERT classifier and its tokenizer from disk
model_path = "fine_tuned_bert"  # Path to the saved fine-tuned model
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Read the new XLSX file with the actual testing data, expose the comment
# column as 'text' (replace 'cleaned_comment' with the actual column name in
# your file), and discard rows whose text is missing
new_file_path = "C:/Users/Anagha/Desktop/Youtube_Comments_Cleaned_Output.xlsx"  # Replace with your file path
new_df = (
    pd.read_excel(new_file_path)
    .rename(columns={"cleaned_comment": "text"})
    .dropna(subset=["text"])
)

# Sentiment pipeline built on the fine-tuned model and tokenizer
pipeline_options = {
    "model": model,
    "tokenizer": tokenizer,
    "truncation": True,  # truncate sequences that are too long
    "max_length": 512,   # maximum input length (BERT requirement)
}
sentiment_pipeline = pipeline("text-classification", **pipeline_options)

#Function to split long comments into smaller chunks
def split_long_comments(text, max_length=512):
    """Split ``text`` into chunks that each fit in ``max_length`` tokens.

    The text is tokenized without special tokens, and the chunk size leaves
    room for the special tokens the tokenizer adds when a chunk is encoded
    again. Cutting at the full ``max_length`` over ids that already include
    the special tokens would make the re-encoded chunks too long, and could
    leave a final chunk holding only a special token, which decodes to an
    empty string.

    Args:
        text: The comment to split.
        max_length: Maximum number of tokens per chunk, special tokens included.

    Returns:
        A list of decoded chunk strings in their original order. It always
        has at least one element: text that yields no tokens is returned
        unchanged as a single chunk.
    """
    # Tokens available for content once the special tokens are accounted for;
    # clamp to 1 so a tiny max_length cannot produce a zero step below.
    content_budget = max(1, max_length - tokenizer.num_special_tokens_to_add())

    token_ids = tokenizer(text, truncation=False, add_special_tokens=False)["input_ids"]
    if not token_ids:
        return [text]

    return [
        tokenizer.decode(token_ids[start:start + content_budget], skip_special_tokens=True)
        for start in range(0, len(token_ids), content_budget)
    ]

# Apply splitting to long comments
# Convert the 'text' column to a list; cast to str because Excel cells that
# hold only digits are read as numbers, which the tokenizer rejects
new_comments = new_df["text"].astype(str).tolist()

# Split every comment exactly once and remember how many chunks it produced,
# so predictions can be mapped back without tokenizing and splitting again
processed_comments = []
chunks_per_comment = []
for comment in new_comments:
    if len(tokenizer(comment)["input_ids"]) > 512:  # Check if comment exceeds token limit
        # `or [comment]` guarantees at least one chunk per comment so the
        # bookkeeping below stays aligned with the results list
        chunks = split_long_comments(comment) or [comment]
    else:
        chunks = [comment]  # Short comments are used as is
    processed_comments.extend(chunks)
    chunks_per_comment.append(len(chunks))

# Get the sentiment predictions for the processed comments
# (skip the pipeline call entirely when there is nothing to classify)
results = sentiment_pipeline(processed_comments) if processed_comments else []

# Map results back to the original DataFrame
# One predicted label per original comment
predicted_labels = []
# Track the current index in the results list
current_index = 0

for chunk_count in chunks_per_comment:
    chunk_labels = [
        pred["label"] for pred in results[current_index:current_index + chunk_count]
    ]
    # Majority vote over the chunk labels. dict.fromkeys keeps first-seen
    # order and max returns the first maximum, so a tie goes to the label of
    # the earliest chunk instead of depending on set ordering.
    predicted_labels.append(max(dict.fromkeys(chunk_labels), key=chunk_labels.count))
    current_index += chunk_count

# Add the predicted labels as a new column in the DataFrame
new_df["predicted_label"] = predicted_labels

#Saving the Results to a New Excel File
output_file_path = "C:/Users/Anagha/Desktop/Predicted_Test_Data2.xlsx"  # Replace with desired output file path
new_df.to_excel(output_file_path, index=False)

# Print a preview of the DataFrame
print(new_df.head())
