# Streamlined Sentiment Analysis & Summary
This notebook runs sentiment analysis on a CSV of texts and generates summary statistics, charts, and a text report using a pre-trained RoBERTa model.
No training is required: it uses the `cardiffnlp/twitter-roberta-base-sentiment-latest` model as published.

## 1. Setup and Load Pre-trained Model

In [1]:
# Dependencies for the whole notebook: standard library first, then third-party.
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TextClassificationPipeline,
)

# Reaching this line means every import above resolved.
print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Load the pre-trained RoBERTa sentiment checkpoint and wrap it in a classification pipeline.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
print(f"Loading pre-trained model: {MODEL_NAME}")

# Tokenizer and classifier weights are both fetched from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Check for CUDA once; the pipeline gets device index 0 when it is available, otherwise -1.
has_gpu = torch.cuda.is_available()
pipeline_device = 0 if has_gpu else -1

sentiment_pipeline = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    framework="pt",
    device=pipeline_device,
)

# The model config carries the class-index -> label-name mapping; show it for reference.
labels = model.config.id2label
print("\nModel loaded successfully!")
print("Device: " + ("GPU" if has_gpu else "CPU"))
print(f"Label mappings: {labels}")

Loading pre-trained model: cardiffnlp/twitter-roberta-base-sentiment-latest


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu



Model loaded successfully!
Device: CPU
Label mappings: {0: 'negative', 1: 'neutral', 2: 'positive'}


## 2. Load Data

In [None]:
# Load the dataset from CSV and choose the column that holds the text to classify.
# Adjust the path and the candidate column names as needed.
csv_path = "1k_sample.csv"  # Change to your file: twitter_sentiment_data.csv, 5k_sample.csv, etc.

df = pd.read_csv(csv_path)
print(f"Loaded {len(df)} records from {csv_path}")
print(f"\nDataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")

# Column names (compared case-insensitively) that are accepted as the text column.
TEXT_COLUMN_CANDIDATES = ('text', 'tweet', 'comment', 'message', 'content', 'review')

# First choice: the first column whose name matches a known candidate.
# str() keeps the comparison safe when a column label is not a string (e.g. an integer header).
text_column = next(
    (col for col in df.columns if str(col).lower() in TEXT_COLUMN_CANDIDATES),
    None,
)

if text_column is None:
    # Fallback: the first column with a string-like dtype. Checking the dtype (not the values)
    # accepts both the classic object dtype and pandas' dedicated string dtype.
    text_column = next(
        (col for col in df.columns if pd.api.types.is_string_dtype(df[col].dtype)),
        None,
    )

if text_column is None:
    # Stop here with a clear message; otherwise later cells would fail on df[None].
    raise ValueError(
        f"Could not detect a text column in {csv_path}. "
        f"Available columns: {df.columns.tolist()}. Set text_column manually."
    )

print(f"\nUsing column '{text_column}' for sentiment analysis")
print("\nFirst few rows:")
df.head()

Loaded 890 records from 1k_sample.csv

Dataset shape: (890, 3)

Columns: ['sentiment', 'message', 'tweetid']

First few rows:


Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


## 3. Perform Sentiment Analysis

In [4]:
# Smoke-test the pipeline on the first few rows before running the whole dataset.
# Missing values become empty strings and everything is coerced to str, matching how the
# batch-analysis cell prepares its input, so a NaN row cannot break the slicing below.
sample_texts = df[text_column].head(5).fillna("").astype(str).tolist()
print("Testing on sample texts:\n")

for i, text in enumerate(sample_texts, 1):
    # Same truncation arguments as the batch-analysis cell, so the preview and the
    # full run treat long texts identically.
    result = sentiment_pipeline(text, top_k=1, truncation=True, max_length=512)
    print(f"{i}. Text: {text[:80]}...")
    print(f"   Sentiment: {result[0]['label']} (confidence: {result[0]['score']:.3f})\n")

KeyError: 'text'

In [None]:
# Classify every text, feeding the pipeline one fixed-size chunk at a time.
print("Analyzing all texts...")

batch_size = 32
# Missing values become empty strings and every value is coerced to str so that
# non-string cells (NaN, numbers) are never handed to the pipeline.
all_texts = df[text_column].fillna("").astype(str).tolist()
all_results = []

for i in range(0, len(all_texts), batch_size):
    batch = all_texts[i:i+batch_size]
    # Forward batch_size explicitly: without it the pipeline runs the items of a list
    # one at a time, so chunking alone would not batch the model's forward pass.
    # NOTE(review): batching is not guaranteed to be faster on CPU — measure if runtime matters.
    results = sentiment_pipeline(
        batch,
        top_k=1,
        truncation=True,
        max_length=512,
        batch_size=batch_size,
    )
    all_results.extend(results)
    # Report progress every 10 chunks to keep the output short.
    if (i // batch_size + 1) % 10 == 0:
        print(f"Processed {i+len(batch)}/{len(all_texts)} texts...")

print(f"\nAnalysis complete! Processed {len(all_results)} texts.")

In [None]:
# Attach each row's top label and its score to the dataframe.
# Every entry of all_results is indexed at [0] to get that row's label/score dict.
top_predictions = [result[0] for result in all_results]
df['predicted_sentiment'] = [prediction['label'] for prediction in top_predictions]
df['confidence_score'] = [prediction['score'] for prediction in top_predictions]

print("Predictions added to dataframe!")
print("\nSample results:")
preview_columns = [text_column, 'predicted_sentiment', 'confidence_score']
df[preview_columns].head(10)

## 4. Generate Summary Statistics

In [None]:
# Count and percentage share of each predicted label (both Series are reused by later cells).
sentiment_counts = df['predicted_sentiment'].value_counts()
sentiment_percentages = df['predicted_sentiment'].value_counts(normalize=True) * 100

banner = "=" * 60
table_header = f"\n{'Sentiment':<15} {'Count':<10} {'Percentage':<10}"

print(banner)
print("SENTIMENT ANALYSIS SUMMARY")
print(banner)
print(f"\nTotal texts analyzed: {len(df)}")
print(table_header)
print("-" * 40)
# One table row per label, in the order value_counts() returned them.
for sentiment, count in sentiment_counts.items():
    pct = sentiment_percentages[sentiment]
    print(f"{sentiment:<15} {count:<10} {pct:>6.2f}%")

print("\n" + banner)

In [None]:
# Describe how confident the model was, overall and per predicted label.
confidence = df['confidence_score']

print("\nCONFIDENCE SCORE STATISTICS")
print("=" * 60)
overall_stats = [
    ("Mean confidence", confidence.mean()),
    ("Median confidence", confidence.median()),
    ("Min confidence", confidence.min()),
    ("Max confidence", confidence.max()),
    ("Std deviation", confidence.std()),
]
for stat_name, stat_value in overall_stats:
    print(f"{stat_name}: {stat_value:.4f}")

# Mean confidence per label, in order of first appearance in the dataframe.
print("\nAverage confidence by sentiment:")
for sentiment in df['predicted_sentiment'].unique():
    has_sentiment = df['predicted_sentiment'] == sentiment
    avg_conf = df.loc[has_sentiment, 'confidence_score'].mean()
    print(f"  {sentiment}: {avg_conf:.4f}")

## 5. Visualizations

In [None]:
# Plot the predicted-sentiment distribution as a bar chart and a pie chart.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# One fixed label order with one colour per label, shared by both panels.
sentiment_order = ['negative', 'neutral', 'positive']
colors = ['#d62728', '#7f7f7f', '#2ca02c']

# value_counts() orders labels by frequency, so plotting sentiment_counts directly could pair
# a label with the wrong colour. Reindex to the fixed order (absent labels count as 0) so
# that entry i always matches colors[i] in both charts.
ordered_counts = sentiment_counts.reindex(sentiment_order, fill_value=0)

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Plot 1: Sentiment distribution (bar chart)
ordered_counts.plot(kind='bar', ax=axes[0], color=colors)
axes[0].set_title('Sentiment Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Sentiment', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].tick_params(axis='x', rotation=0)

# Add count labels just above each bar
for i, v in enumerate(ordered_counts):
    axes[0].text(i, v + 0.5, str(v), ha='center', va='bottom', fontweight='bold')

# Plot 2: Sentiment distribution (pie chart), same order and colours as the bars
axes[1].pie(ordered_counts.tolist(), labels=sentiment_order, autopct='%1.1f%%',
            startangle=90, colors=colors, textprops={'fontsize': 11})
axes[1].set_title('Sentiment Proportion', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Two views of model confidence: an overall histogram and a per-label box plot.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, box_ax = axes

confidence = df['confidence_score']
mean_confidence = confidence.mean()

# Left panel: histogram with the mean marked by a dashed red line.
hist_ax.hist(confidence, bins=30, edgecolor='black', alpha=0.7)
hist_ax.set_title('Confidence Score Distribution', fontsize=14, fontweight='bold')
hist_ax.set_xlabel('Confidence Score', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.axvline(
    mean_confidence,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f"Mean: {mean_confidence:.3f}",
)
hist_ax.legend()

# Right panel: confidence grouped by predicted label.
df.boxplot(column='confidence_score', by='predicted_sentiment', ax=box_ax)
box_ax.set_title('Confidence Score by Sentiment', fontsize=14, fontweight='bold')
box_ax.set_xlabel('Sentiment', fontsize=12)
box_ax.set_ylabel('Confidence Score', fontsize=12)
fig.suptitle('')  # Blank out the figure-level title

plt.tight_layout()
plt.show()

## 6. Detailed Analysis

In [None]:
# Show the three most confident predictions for each sentiment label.
print("MOST CONFIDENT PREDICTIONS BY SENTIMENT")
print("=" * 80)

for sentiment in ['positive', 'neutral', 'negative']:
    print(f"\n{sentiment.upper()}:")
    print("-" * 80)
    subset = df[df['predicted_sentiment'] == sentiment].nlargest(3, 'confidence_score')
    for idx, (_, row) in enumerate(subset.iterrows(), 1):
        # A missing text is stored as NaN (a float), which has no len(); show it as an
        # empty string instead of raising TypeError.
        raw_text = row[text_column]
        full_text = "" if pd.isna(raw_text) else str(raw_text)
        # Keep the listing compact: cut long texts to 100 characters and mark the cut.
        text = full_text[:100] + "..." if len(full_text) > 100 else full_text
        print(f"{idx}. {text}")
        print(f"   Confidence: {row['confidence_score']:.4f}\n")

In [None]:
# Show the five least confident predictions (potential edge cases worth a manual look).
print("\nLEAST CONFIDENT PREDICTIONS (Potential Edge Cases)")
print("=" * 80)

least_confident = df.nsmallest(5, 'confidence_score')
for idx, (_, row) in enumerate(least_confident.iterrows(), 1):
    # A missing text is stored as NaN (a float), which has no len(); show it as an
    # empty string instead of raising TypeError.
    raw_text = row[text_column]
    full_text = "" if pd.isna(raw_text) else str(raw_text)
    # Keep the listing compact: cut long texts to 100 characters and mark the cut.
    text = full_text[:100] + "..." if len(full_text) > 100 else full_text
    print(f"{idx}. Text: {text}")
    print(f"   Predicted: {row['predicted_sentiment']} (confidence: {row['confidence_score']:.4f})\n")

## 7. Export Results

In [None]:
# Persist the per-row predictions as CSV.
output_file = "sentiment_analysis_results.csv"
df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")

# Write a plain-text summary: label distribution followed by confidence statistics.
summary_file = "sentiment_summary.txt"
with open(summary_file, 'w') as f:
    confidence = df['confidence_score']

    report_lines = [
        "SENTIMENT ANALYSIS SUMMARY REPORT\n",
        "=" * 60 + "\n\n",
        f"Model: {MODEL_NAME}\n",
        f"Total texts analyzed: {len(df)}\n\n",
        "Sentiment Distribution:\n",
        "-" * 40 + "\n",
    ]
    # One line per label, in the order stored in sentiment_counts.
    report_lines.extend(
        f"  {sentiment}: {count} ({sentiment_percentages[sentiment]:.2f}%)\n"
        for sentiment, count in sentiment_counts.items()
    )
    report_lines += [
        "\nConfidence Statistics:\n",
        "-" * 40 + "\n",
        f"  Mean: {confidence.mean():.4f}\n",
        f"  Median: {confidence.median():.4f}\n",
        f"  Std Dev: {confidence.std():.4f}\n",
    ]
    f.writelines(report_lines)

print(f"Summary report saved to {summary_file}")
print("\nAnalysis complete!")