In [10]:
# import os

# proxy = "http://chnproxy.verizon.com:80"
# os.environ["HTTP_PROXY"] = proxy
# os.environ["HTTPS_PROXY"] = proxy

In [11]:
import pandas as pd
from transformers import pipeline
import numpy as np

# --- Configuration ---
FILE_NAME = r"C:\Users\JAIAA4P\Downloads\reviews - updated.csv"

# Load the dataset
try:
    df = pd.read_csv(FILE_NAME)
    print(f"✅ Successfully loaded {len(df)} reviews from '{FILE_NAME}'.")
except FileNotFoundError:
    print(f"❌ Error: The file '{FILE_NAME}' was not found. Please check the file path.")
    df = pd.DataFrame() # Create an empty DataFrame to avoid errors later

✅ Successfully loaded 864 reviews from 'C:\Users\JAIAA4P\Downloads\reviews - updated.csv'.


In [13]:
# 1. Prepare Text Data
if not df.empty:
    df['title'] = df['title'].astype(str).fillna('')
    df['content'] = df['content'].astype(str).fillna('')
    
    # Concatenate 'title' and 'content' into a single input field
    # A period '.' is used to clearly separate the title from the content.
    df['review_text'] = df['title'] + ". " + df['content']
    print("✅ Review text combined (Title + Content).")

    # 2. Initialize BERT Sentiment Analysis Pipeline
    # Using DistilBERT fine-tuned on SST-2 for binary (POSITIVE/NEGATIVE) classification.
    print("⏳ Initializing BERT model for sentiment analysis...")
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model="bert-base-uncased"
    )
    print("✅ BERT pipeline initialized.")
else:
    print("Skipping pipeline initialization as DataFrame is empty.")
    sentiment_pipeline = None # Define pipeline as None if DF is empty

✅ Review text combined (Title + Content).
⏳ Initializing BERT model for sentiment analysis...


ProxyError: (MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/config.json (Caused by ProxyError(\'Unable to connect to proxy\', NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000018F2860B110>: Failed to resolve \'chnproxy.verizon.com\' ([Errno 11001] getaddrinfo failed)")))'), '(Request ID: ec92eca2-4106-4c86-aa95-2846a6462a19)')

In [None]:
# 1. Prepare Text Data
if not df.empty:
    df['title'] = df['title'].astype(str).fillna('')
    df['content'] = df['content'].astype(str).fillna('')
    
    # Concatenate 'title' and 'content' into a single input field
    # A period '.' is used to clearly separate the title from the content.
    df['review_text'] = df['title'] + ". " + df['content']
    print("✅ Review text combined (Title + Content).")

    # 2. Initialize BERT Sentiment Analysis Pipeline
    # Using DistilBERT fine-tuned on SST-2 for binary (POSITIVE/NEGATIVE) classification.
    print("⏳ Initializing BERT model for sentiment analysis...")
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english"
    )
    print("✅ BERT pipeline initialized.")
else:
    print("Skipping pipeline initialization as DataFrame is empty.")
    sentiment_pipeline = None # Define pipeline as None if DF is empty

In [None]:
if sentiment_pipeline:
    print(f"⏳ Classifying {len(df)} reviews using the BERT model. This may take a moment...")
    
    # Run the classification. The pipeline handles batching automatically.
    results = sentiment_pipeline(df['review_text'].tolist())

    # Extract results into a DataFrame
    df_results = pd.DataFrame(results)
    df_results.rename(columns={'label': 'sentiment_label', 'score': 'sentiment_score'}, inplace=True)

    # Concatenate the new results back to the original DataFrame
    df = pd.concat([df.reset_index(drop=True), df_results], axis=1)
    
    print("✅ Classification complete and results integrated into DataFrame.")
else:
    print("Skipping classification as pipeline was not initialized.")

In [None]:
if 'sentiment_label' in df.columns:
    # 3. Calculate Percentages
    total_reviews = len(df)
    sentiment_counts = df['sentiment_label'].value_counts()
    
    positivity_percentage = sentiment_counts.get('POSITIVE', 0) / total_reviews * 100
    negativity_percentage = sentiment_counts.get('NEGATIVE', 0) / total_reviews * 100

    print("\n\n#########################################################")
    print("             FINAL SENTIMENT ANALYSIS RESULTS            ")
    print("#########################################################")
    print(f"Total Reviews Analyzed: {total_reviews}")
    print(f"Positive Reviews: {sentiment_counts.get('POSITIVE', 0)} ({positivity_percentage:.2f}%)")
    print(f"Negative Reviews: {sentiment_counts.get('NEGATIVE', 0)} ({negativity_percentage:.2f}%)")
    print("---------------------------------------------------------")
    
    # 4. Provide Example Classifications
    print("\n--- Example Classifications & Defining Reason ---")
    
    # Example 1: Most Confident Positive Review
    pos_ex = df[df['sentiment_label'] == 'POSITIVE'].sort_values(by='sentiment_score', ascending=False).head(1).iloc[0]
    print("\n**Top Positive Review:**")
    print(f"  Sentiment: {pos_ex['sentiment_label']} (Score: {pos_ex['sentiment_score']:.4f})")
    print(f"  Review: \"{pos_ex['review_text']}\"")
    print(f"  **WHY POSITIVE:** The classification is defined by the high probability score (close to 1.0) assigned by the BERT model, indicating strong confidence in the presence of positive sentiment in the text.")

    # Example 2: Most Confident Negative Review
    neg_ex = df[df['sentiment_label'] == 'NEGATIVE'].sort_values(by='sentiment_score', ascending=False).head(1).iloc[0]
    print("\n**Top Negative Review:**")
    print(f"  Sentiment: {neg_ex['sentiment_label']} (Score: {neg_ex['sentiment_score']:.4f})")
    print(f"  Review: \"{neg_ex['review_text']}\"")
    print(f"  **WHY NEGATIVE:** The classification is defined by the high probability score (close to 1.0) assigned to the 'NEGATIVE' label, demonstrating the model's certainty that the text contains high-valence negative or frustrating language.")

    # 5. Save Results
    output_file = "reviews_with_sentiment_bert.csv"
    df[['id', 'review_text', 'sentiment_label', 'sentiment_score']].to_csv(output_file, index=False)
    print(f"\n✅ Final classified data saved to {output_file}.")
else:
    print("Cannot perform final calculations as classification results were not generated.")