In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report

# --- Load the labelled training comments from the Excel workbook ---
file_path = "C:/Users/Anagha/IML_FinalProject/Training_data_Comments.xlsx"
df = pd.read_excel(file_path)

# Show the raw column names so the renaming below can be sanity-checked
print(df.columns)

# Standardise the column names to the 'text' / 'label' pair used downstream,
# then discard every row that is missing either value.
column_aliases = {"cleaned_comment": "text", "Sentiment_Num": "label"}
df = df.rename(columns=column_aliases).dropna(subset=["text", "label"])

# Keep only the rows whose label is a finite number (filters out +/-inf)
finite_label_mask = df["label"].map(np.isfinite)
df = df.loc[finite_label_mask]

# Cast the label column to an integer dtype
df["label"] = df["label"].astype(int)

# Report which labels survived the cleaning and how they are stored
print("Unique labels (after cleaning):", df["label"].unique())
print("Label data type:", df["label"].dtype)

# Wrap the cleaned DataFrame in a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Tokenizer matching the "bert-base-uncased" checkpoint fine-tuned below
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_data(example):
    """Tokenize the ``text`` field of *example* with the module-level tokenizer.

    The call requests ``padding="max_length"`` and ``truncation=True``; the
    tokenizer's output is returned unchanged so it can be used directly as a
    ``Dataset.map`` callback.
    """
    encoded = tokenizer(example["text"], truncation=True, padding="max_length")
    return encoded

# Apply the tokenization function to the dataset in batches
tokenized_dataset = dataset.map(tokenize_data, batched=True)

# Hold out 5% of the rows for evaluation (the real test data comes from a
# separate file later). A fixed seed keeps the held-out rows identical from
# run to run, so evaluation numbers are comparable.
split = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = split["train"]
test_dataset = split["test"]


print("Train dataset labels:", train_dataset["label"])
print("Test dataset labels:", test_dataset["label"])

# Remove columns the model does not need. "__index_level_0__" only exists
# when the pandas index was carried over into the Dataset, so drop just the
# columns that are actually present instead of failing on a missing name.
columns_to_drop = [
    name
    for name in ("text", "__index_level_0__")
    if name in train_dataset.column_names
]
train_dataset = train_dataset.remove_columns(columns_to_drop)
test_dataset = test_dataset.remove_columns(columns_to_drop)

# Load the pre-trained model with one output per distinct label.
# NOTE(review): this assumes the labels are the contiguous integers
# 0..k-1 -- confirm if the label scheme ever changes.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=df["label"].nunique(),
)

# Training configuration
training_args = TrainingArguments(
    # Where to save model outputs and checkpoints
    output_dir="./results",
    # Evaluate at the end of every epoch.
    # NOTE(review): recent transformers releases call this `eval_strategy`;
    # confirm against the installed version before renaming.
    evaluation_strategy="epoch",
    # Save a checkpoint at the end of every epoch (must match the evaluation
    # schedule because load_best_model_at_end is enabled)
    save_strategy="epoch",
    # Optimizer learning rate
    learning_rate=5e-5,
    # Number of samples processed at once during training
    per_device_train_batch_size=16,
    # Number of samples processed at once during evaluation
    per_device_eval_batch_size=16,
    # Total number of passes over the training data
    num_train_epochs=3,
    # Weight-decay regularization
    weight_decay=0.01,
    # Where to save logs of training progress
    logging_dir="./logs",
    # Log training metrics every 10 steps
    logging_steps=10,
    # Reload the best checkpoint when training finishes
    load_best_model_at_end=True,
    # Metric used to decide which checkpoint is best
    metric_for_best_model="eval_loss",
)

# Function to compute evaluation metrics
def compute_metrics(eval_pred):
    """Return the accuracy of the predictions in *eval_pred*.

    *eval_pred* unpacks into ``(logits, labels)``. The predicted class for
    each row is the index of its largest logit along the last axis; accuracy
    is the mean of the element-wise comparison with *labels*.

    Returns a dict with the single key ``"accuracy"``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    hits = predicted_classes == references
    return {"accuracy": hits.mean()}

# Create a Trainer instance with the model, training settings, datasets, and metric function
trainer = Trainer(
    # The model to train
    model=model,
    # Training settings
    args=training_args,
    # Training data
    train_dataset=train_dataset,
    # Evaluation data
    eval_dataset=test_dataset,
    # Tokenizer for processing text
    tokenizer=tokenizer,
    # Function to calculate metrics
    compute_metrics=compute_metrics,
)

# Training the Model
trainer.train()

# Evaluating the Model
# Get predictions for the held-out data
predictions = trainer.predict(test_dataset)
# Predicted class = index of the largest logit
preds = np.argmax(predictions.predictions, axis=-1)
# Use the label ids returned alongside the predictions as ground truth, and
# set zero_division=0 so classes that were never predicted report 0.0
# without an UndefinedMetricWarning for each affected metric.
print(
    "Evaluation Metrics:\n",
    classification_report(predictions.label_ids, preds, zero_division=0),
)

# Save the trained model and tokenizer for later use
model.save_pretrained("fine_tuned_bert")
tokenizer.save_pretrained("fine_tuned_bert")

# Using the Fine-Tuned Model for Inference
from transformers import pipeline

# Create a sentiment analysis pipeline with the fine-tuned model
sentiment_pipeline = pipeline("text-classification", model="fine_tuned_bert")

Message data =  12.0
Encrypted data =  3.0
Original Message Sent =  12.0

Index(['author', 'comment', 'published_at', 'cleaned_comment',
       'Sentiment_Num'],
      dtype='object')
Unique labels (after cleaning): [2 4 1 0 3]
Label data type: int32


Map:   0%|          | 0/606 [00:00<?, ? examples/s]

Train dataset labels: [4, 2, 2, 0, 1, 3, 2, 4, 2, 2, 2, 4, 2, 0, 1, 2, 3, 2, 3, 1, 1, 4, 0, 2, 0, 0, 3, 4, 2, 0, 0, 4, 0, 2, 3, 1, 4, 2, 2, 4, 4, 2, 2, 2, 2, 0, 2, 1, 4, 3, 4, 3, 3, 3, 2, 0, 4, 4, 4, 2, 2, 3, 2, 0, 2, 2, 0, 4, 2, 2, 3, 2, 4, 3, 4, 0, 2, 2, 2, 2, 4, 2, 2, 3, 3, 4, 0, 2, 4, 3, 2, 0, 2, 2, 1, 4, 4, 3, 0, 2, 2, 0, 2, 4, 2, 2, 4, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 4, 2, 0, 2, 2, 0, 4, 2, 4, 2, 2, 2, 3, 3, 2, 3, 2, 2, 4, 2, 0, 2, 2, 2, 2, 2, 1, 2, 4, 2, 2, 2, 2, 1, 4, 2, 2, 3, 0, 2, 2, 3, 2, 2, 2, 4, 3, 2, 0, 4, 0, 2, 3, 2, 4, 3, 4, 2, 2, 2, 4, 2, 2, 3, 0, 0, 4, 4, 3, 4, 2, 2, 2, 3, 2, 0, 0, 0, 2, 0, 2, 2, 1, 4, 2, 0, 4, 2, 2, 3, 0, 0, 3, 2, 0, 2, 3, 4, 4, 1, 0, 2, 0, 2, 0, 0, 2, 0, 1, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 0, 0, 3, 4, 2, 2, 0, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2, 4, 0, 2, 2, 2, 1, 2, 3, 2, 1, 2, 3, 4, 3, 4, 2, 3, 4, 2, 0, 2, 4, 1, 2, 0, 2, 4, 2, 2, 0, 4, 2, 2, 0, 1, 2, 0, 2, 2, 3, 3, 2, 2, 2, 0, 2, 2, 4, 2, 2, 3, 4, 2, 2, 2, 2, 2, 1, 2, 2, 0, 4, 0, 2, 2, 3, 2, 2, 2, 2, 4, 3, 0,

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/108 [00:00<?, ?it/s]

{'loss': 1.4589, 'grad_norm': 5.761514186859131, 'learning_rate': 4.5370370370370374e-05, 'epoch': 0.28}
{'loss': 1.4166, 'grad_norm': 5.267098426818848, 'learning_rate': 4.074074074074074e-05, 'epoch': 0.56}
{'loss': 1.3556, 'grad_norm': 7.86448335647583, 'learning_rate': 3.611111111111111e-05, 'epoch': 0.83}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 1.3161087036132812, 'eval_accuracy': 0.4838709677419355, 'eval_runtime': 18.1827, 'eval_samples_per_second': 1.705, 'eval_steps_per_second': 0.11, 'epoch': 1.0}
{'loss': 1.2809, 'grad_norm': 6.580168724060059, 'learning_rate': 3.148148148148148e-05, 'epoch': 1.11}
{'loss': 1.1678, 'grad_norm': 8.32638931274414, 'learning_rate': 2.6851851851851855e-05, 'epoch': 1.39}
{'loss': 1.0223, 'grad_norm': 6.178020477294922, 'learning_rate': 2.2222222222222223e-05, 'epoch': 1.67}
{'loss': 1.1319, 'grad_norm': 12.6719331741333, 'learning_rate': 1.7592592592592595e-05, 'epoch': 1.94}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 1.1971118450164795, 'eval_accuracy': 0.6129032258064516, 'eval_runtime': 18.3448, 'eval_samples_per_second': 1.69, 'eval_steps_per_second': 0.109, 'epoch': 2.0}
{'loss': 0.9895, 'grad_norm': 7.925891399383545, 'learning_rate': 1.2962962962962962e-05, 'epoch': 2.22}
{'loss': 0.8399, 'grad_norm': 6.837340831756592, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}
{'loss': 0.857, 'grad_norm': 7.451780796051025, 'learning_rate': 3.7037037037037037e-06, 'epoch': 2.78}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 1.2050241231918335, 'eval_accuracy': 0.6129032258064516, 'eval_runtime': 20.5858, 'eval_samples_per_second': 1.506, 'eval_steps_per_second': 0.097, 'epoch': 3.0}
{'train_runtime': 6285.3871, 'train_samples_per_second': 0.274, 'train_steps_per_second': 0.017, 'train_loss': 1.1221024107050013, 'epoch': 3.0}


  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluation Metrics:
               precision    recall  f1-score   support

           0       0.50      0.67      0.57         6
           1       0.00      0.00      0.00         4
           2       0.72      0.93      0.81        14
           3       0.00      0.00      0.00         2
           4       0.40      0.40      0.40         5

    accuracy                           0.61        31
   macro avg       0.32      0.40      0.36        31
weighted avg       0.49      0.61      0.54        31



In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments,pipeline
from sklearn.metrics import classification_report

# Directory holding the fine-tuned model and tokenizer saved earlier
model_path = "fine_tuned_bert"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Read the unlabelled comments, expose the cleaned comment as 'text', and
# discard rows that have no text to classify
new_file_path = "C:/Users/Anagha/IML_FinalProject/Youtube_Cleaned_Comments.xlsx"
new_df = (
    pd.read_excel(new_file_path)
    .rename(columns={"cleaned_comment": "text"})
    .dropna(subset=["text"])
)

# Text-classification pipeline backed by the fine-tuned model and tokenizer.
# Truncation is enabled with a 512-token cap so long inputs are cut down
# to that length.
sentiment_pipeline = pipeline(
    task="text-classification",
    tokenizer=tokenizer,
    model=model,
    max_length=512,
    truncation=True,
)

#Function to split long comments into smaller chunks
def split_long_comments(text, max_length=512):
    """Split *text* into pieces that fit a model input of *max_length* tokens.

    The text is tokenized with the module-level ``tokenizer`` *without*
    special tokens, cut into consecutive windows, and each window is decoded
    back to a string. The window size leaves room for the special tokens the
    tokenizer adds when a piece is encoded again, so a piece is not pushed
    over *max_length* just by those extra tokens.

    Args:
        text: The comment to split.
        max_length: Token budget of the model input, special tokens included.

    Returns:
        A list of strings in original order. Text that produces no tokens
        yields ``[""]`` so callers always get at least one piece.
    """
    token_ids = tokenizer(
        text, truncation=False, add_special_tokens=False
    )["input_ids"]
    if not token_ids:
        # Keep the one-piece-per-input guarantee for empty text.
        return [""]

    # Reserve space for the special tokens added on re-encoding.
    reserved = tokenizer.num_special_tokens_to_add(pair=False)
    window = max(1, max_length - reserved)

    # NOTE(review): a window boundary can fall inside a word, and decoding
    # then re-encoding is not guaranteed to reproduce the same token count;
    # the pipeline's own truncation remains the final safeguard.
    return [
        tokenizer.decode(token_ids[start:start + window], skip_special_tokens=True)
        for start in range(0, len(token_ids), window)
    ]

from collections import Counter

# Maximum number of tokens (special tokens included) per model input
MAX_TOKENS = 512

# Convert the 'text' column to a list
new_comments = new_df["text"].tolist()

# Split over-long comments into chunks and remember how many chunks each
# original comment produced, so predictions can be mapped back without
# tokenizing and splitting every comment a second time.
processed_comments = []
chunks_per_comment = []
for comment in new_comments:
    if len(tokenizer(comment)["input_ids"]) > MAX_TOKENS:
        chunks = split_long_comments(comment, max_length=MAX_TOKENS)
    else:
        chunks = [comment]
    processed_comments.extend(chunks)
    chunks_per_comment.append(len(chunks))

# Get the sentiment predictions for the processed comments
results = sentiment_pipeline(processed_comments)

# Map results back to the original comments
predicted_labels = []
# Position of the next unread prediction in `results`
current_index = 0
for chunk_count in chunks_per_comment:
    chunk_labels = [
        pred["label"]
        for pred in results[current_index:current_index + chunk_count]
    ]
    # Majority vote across the chunks of one comment. Counter resolves ties
    # in favour of the label seen first, so the outcome is deterministic
    # (iterating a set of strings is not).
    predicted_labels.append(Counter(chunk_labels).most_common(1)[0][0])
    current_index += chunk_count

# Add the predicted labels as a new column in the DataFrame
new_df["predicted_label"] = predicted_labels

# Saving the Results to a New Excel File
output_file_path = "C:/Users/Anagha/IML_FinalProject/Test_Data_Prediction.xlsx"  # Replace with desired output file path
new_df.to_excel(output_file_path, index=False)

# Print a preview of the DataFrame
print(new_df.head())


Token indices sequence length is longer than the specified maximum sequence length for this model (653 > 512). Running this sequence through the model will result in indexing errors


             author                                            comment  \
0     @guardiannews  For more context on this video:<br><a href="ht...   
1    @AndrejWatches  First time i see this.<br>Not buying gillette ...   
2      @user-rx162r                                               Hate   
3        @apall2764  Ah sh..., after 30 years of Gillette I need to...   
4  @yakovrokhlin316  I am done with Gillette razors. Switching to a...   

           published_at                                               text  \
0  2019-01-21T11:04:16Z                     for more context on this video   
1  2024-12-01T00:18:53Z  first time i see thisnot buying gillette ever ...   
2  2024-11-30T04:45:19Z                                               hate   
3  2024-11-29T14:15:02Z  ah sh after 30 years of gillette i need to fin...   
4  2024-11-29T12:56:26Z  i am done with gillette razors switching to an...   

  predicted_label  
0         LABEL_0  
1         LABEL_2  
2         LABEL_0  
3     