In [None]:
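# Optional proxy setup: uncomment the lines below only when the notebook has to
# reach the internet (e.g. to download the model) through this HTTP(S) proxy.
# import os

# proxy = "http://chnproxy.verizon.com:80"
# os.environ["HTTP_PROXY"] = proxy
# os.environ["HTTPS_PROXY"] = proxy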

In [4]:
import pandas as pd
from transformers import pipeline
from sklearn.metrics import classification_report
import numpy as np

# --- Configuration ---
# Path of the raw reviews CSV (a Colab-style /content upload location).
FILE_NAME = r"/content/reviews - updated.csv"

# Read the reviews into `df`, the DataFrame the remaining cells operate on.
df = pd.read_csv(filepath_or_buffer=FILE_NAME)

In [5]:
# 1. Prepare Text Data
# Builds the 'review_text' column consumed by the sentiment model by joining each
# review's 'title' and 'content', then creates the sentiment-analysis pipeline.
#
# Reads:   df (expects 'title' and 'content' columns)
# Defines: df['review_text'] and sentiment_pipeline (None when df is empty)
if not df.empty:

    # A missing title or content would otherwise turn the whole concatenation
    # into NaN (str + NaN -> NaN), and a NaN is not valid text input for the
    # model. Normalise both columns to plain strings first.
    title_text = df['title'].fillna('').astype(str)
    content_text = df['content'].fillna('').astype(str)

    # Concatenate 'title' and 'content' into a single input field.
    # A period '.' separates the title from the content; it is only inserted
    # when both parts are present so no stray ". " is fed to the model.
    both_present = title_text.ne('') & content_text.ne('')
    separator = both_present.map({True: ". ", False: ""})
    df['review_text'] = title_text + separator + content_text
    print("Review text combined (Title + Content).")

    # 2. Initialize BERT Sentiment Analysis Pipeline
    # Using DistilBERT fine-tuned on SST-2 for binary (POSITIVE/NEGATIVE) classification.
    print("Initializing BERT model for sentiment analysis...")
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english"
    )
    print("BERT pipeline initialized.")
else:
    print("Skipping pipeline initialization as DataFrame is empty.")
    sentiment_pipeline = None  # Later cells check this to skip classification

Review text combined (Title + Content).
Initializing BERT model for sentiment analysis...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


BERT pipeline initialized.


In [6]:
# Classify every review with the sentiment pipeline and attach the predictions
# to the DataFrame.
#
# Reads:   df['review_text'], sentiment_pipeline
# Defines: results (raw pipeline output with 'label' and 'score' per review),
#          df['sentiment_label'], df['sentiment_score']
# When the pipeline was not initialized, df is left untouched.
if sentiment_pipeline is not None:
    print(f"Classifying {len(df)} reviews using the BERT model. This may take a moment...")

    # Run the classification. truncation=True lets the pipeline handle long reviews.
    results = sentiment_pipeline(df['review_text'].tolist(), truncation=True)

    # Extract results into a DataFrame with the column names used downstream
    df_results = pd.DataFrame(results).rename(
        columns={'label': 'sentiment_label', 'score': 'sentiment_score'}
    )

    # Drop predictions left over from an earlier run of this cell before
    # attaching the fresh ones; otherwise re-running the cell would append a
    # second pair of identically named columns.
    df_without_predictions = df.reset_index(drop=True).drop(
        columns=df_results.columns, errors='ignore'
    )

    # Concatenate the new results back to the original DataFrame (row by row position)
    df = pd.concat([df_without_predictions, df_results], axis=1)

    print("Classification complete and results integrated into DataFrame.")
else:
    print("Skipping classification as pipeline was not initialized.")

Classifying 864 reviews using the BERT model. This may take a moment...
Classification complete and results integrated into DataFrame.


In [8]:
# Report how many reviews were classified as positive / negative and export the
# per-review predictions to a CSV file.
#
# Reads:  df, which must contain 'id', 'review_text', 'sentiment_label' and
#         'sentiment_score'.
# Writes: the file named by output_file (relative path).

if 'sentiment_label' not in df.columns:
    print("Cannot perform final calculations as classification results were not generated.")
else:
    # 3. Tally the predicted labels
    total_reviews = len(df)  # not used by the report printed below
    sentiment_counts = df['sentiment_label'].value_counts()

    print(
        "\n\n#########################################################",
        "             FINAL SENTIMENT ANALYSIS RESULTS            ",
        "#########################################################",
        f"Positive Reviews: {sentiment_counts.get('POSITIVE', 0)}",
        f"Negative Reviews: {sentiment_counts.get('NEGATIVE', 0)}",
        "---------------------------------------------------------",
        sep="\n",
    )

    # 4. Save Results
    output_file = "reviews_with_sentiment_bert.csv"
    export_columns = ['id', 'review_text', 'sentiment_label', 'sentiment_score']
    df[export_columns].to_csv(output_file, index=False)
    print(f"\n Final classified data saved to {output_file}.")



#########################################################
             FINAL SENTIMENT ANALYSIS RESULTS            
#########################################################
Positive Reviews: 260
Negative Reviews: 604
---------------------------------------------------------

 Final classified data saved to reviews_with_sentiment_bert.csv.


In [10]:
# Compares the model's predictions with hand-labelled ground truth and prints a
# scikit-learn classification report.
#
# Reads: results (raw pipeline output produced by the classification cell).
#
# NOTE: predictions are NOT recomputed here. The existing `results` list is
# matched to the ground truth rows purely by position, so the ground truth file
# must contain the same reviews, in the same order, as the classified dataset.

# --- Configuration for Ground Truth File ---
# NOTE: Replace this placeholder path with the actual path/name of your new uploaded CSV file.
GROUND_TRUTH_FILE_NAME = r"/content/Aarjav-setiment-reviews - reviews.csv"
GROUND_TRUTH_LABEL_COLUMN = 'Ground Truth review' # <<-- ADJUST THIS TO YOUR GROUND TRUTH COLUMN NAME!

try:
    # 1. Load the ground truth dataset
    df_test = pd.read_csv(GROUND_TRUTH_FILE_NAME)
    print(f"Successfully loaded {len(df_test)} reviews from '{GROUND_TRUTH_FILE_NAME}'.")

    if not df_test.empty:
        # 2. Attach predictions
        # Refuse to continue when the row counts differ: positional matching
        # would otherwise silently pair labels with the wrong reviews.
        if len(results) != len(df_test):
            raise ValueError(
                f"Prediction count ({len(results)}) does not match ground truth row count "
                f"({len(df_test)}); rows cannot be aligned by position."
            )
        df_test_results = pd.DataFrame(results)
        # to_numpy() assigns strictly by position, independent of either index.
        df_test['predicted_label'] = df_test_results['label'].to_numpy()

        # 3. Prepare labels for comparison (Sklearn requires consistent formatting)
        # Upper-case everything and strip stray whitespace from the manual labels
        # so that e.g. "Positive " still counts as POSITIVE.
        df_test['predicted_label_upper'] = df_test['predicted_label'].str.upper()
        df_test['ground_truth_label_upper'] = (
            df_test[GROUND_TRUTH_LABEL_COLUMN].astype(str).str.strip().str.upper()
        )

        # Ensure labels are binary (e.g., POSITIVE and NEGATIVE)
        valid_labels = ['POSITIVE', 'NEGATIVE']

        # 4. Filter out rows where ground truth label isn't one of the expected labels
        has_valid_label = df_test['ground_truth_label_upper'].isin(valid_labels)
        df_test_filtered = df_test[has_valid_label].copy()

        # Make dropped rows visible instead of shrinking the evaluation set silently.
        skipped_rows = len(df_test) - len(df_test_filtered)
        if skipped_rows:
            print(f" Skipped {skipped_rows} rows whose ground truth label is not one of {valid_labels}.")

        if df_test_filtered.empty:
            # classification_report cannot be computed on zero samples.
            print(" No rows with a valid ground truth label were found; nothing to evaluate.")
        else:
            y_true = df_test_filtered['ground_truth_label_upper']
            y_pred = df_test_filtered['predicted_label_upper']

            # 5. Calculate and print performance metrics
            print("\n\n#########################################################")
            print("          BERT MODEL PERFORMANCE (GROUND TRUTH)          ")
            print("#########################################################")
            print("\nClassification Report:\n")
            print(classification_report(y_true, y_pred, digits=4))
            print("---------------------------------------------------------")

except FileNotFoundError:
    print(f" Error: The ground truth file '{GROUND_TRUTH_FILE_NAME}' was not found. Please check the file path and ensure it's uploaded.")
except KeyError as e:
    print(f" Error: Column {e} not found in the ground truth file. Check the column names, especially '{GROUND_TRUTH_LABEL_COLUMN}'.")
except NameError as e:
    # Raised when `results` does not exist, i.e. the classification cell was skipped or not run.
    print(f" Error: {e}. Run the classification cell first so that predictions are available.")
except ValueError as e:
    print(f" Error: {e}")
except Exception as e:
    print(f" An unexpected error occurred: {e}")

Successfully loaded 864 reviews from '/content/Aarjav-setiment-reviews - reviews.csv'.


#########################################################
          BERT MODEL PERFORMANCE (GROUND TRUTH)          
#########################################################

Classification Report:

              precision    recall  f1-score   support

    NEGATIVE     0.9636    0.9604    0.9620       606
    POSITIVE     0.9077    0.9147    0.9112       258

    accuracy                         0.9468       864
   macro avg     0.9356    0.9376    0.9366       864
weighted avg     0.9469    0.9468    0.9468       864

---------------------------------------------------------
