# Sentiment Analysis with Machine Learning

This notebook demonstrates how to use the Shameless Sentiment Analyser for:
1. Collecting data from Twitter using snscrape
2. Preprocessing and cleaning the data
3. Performing sentiment analysis using ML models
4. Visualizing and analyzing results

## Setup and Imports

In [None]:
import sys
from pathlib import Path

# Add parent directory to path
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Import our modules
from sentiment_analyser.scraper import TwitterCollector, DataStorage
from sentiment_analyser.models import SentimentAnalyzer, TextPreprocessor
from sentiment_analyser.config import get_settings

# Configure plotting
# Apply the 'seaborn-v0_8-darkgrid' matplotlib style sheet, then set seaborn's
# "husl" palette. NOTE(review): the palette is set after the style sheet,
# presumably so the style does not override the color cycle -- confirm before reordering.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# IPython line magic (not plain Python): render matplotlib figures inline in the notebook.
%matplotlib inline

# Reaching this line means every import above resolved without raising.
print("✅ All imports successful!")

## Configuration

In [None]:
# Project settings object; read again later when the results are saved
settings = get_settings()

# Tunable collection parameters: edit these to run a different study
SEARCH_QUERY = "python programming"  # Search term handed to the collector
MAX_TWEETS = 100  # Limit passed to the collector
LOOKBACK_DAYS = 7  # Size of the collection window, counted back from now
SINCE_DATE = (datetime.now() - timedelta(days=LOOKBACK_DAYS)).strftime("%Y-%m-%d")

# Echo the effective configuration so it is recorded in the cell output
for label, value in (
    ("Search Query", SEARCH_QUERY),
    ("Max Tweets", MAX_TWEETS),
    ("Since Date", SINCE_DATE),
):
    print(f"{label}: {value}")

## Step 1: Data Collection

Collect tweets matching `SEARCH_QUERY` (limited to `MAX_TWEETS`) using snscrape.

In [None]:
# Build the collector; rate_limit=1.0 is passed straight to TwitterCollector
collector = TwitterCollector(rate_limit=1.0)

print(f"Collecting tweets for query: '{SEARCH_QUERY}'...")
tweets = []

# Iterate over the search results, storing each tweet via its to_dict() form
tweet_stream = collector.search(
    query=SEARCH_QUERY,
    limit=MAX_TWEETS,
    since=SINCE_DATE,
)
for collected, tweet in enumerate(tweet_stream, start=1):
    tweets.append(tweet.to_dict())

    # Report progress after every 20th tweet
    if collected % 20 == 0:
        print(f"Collected {collected} tweets...")

print(f"\n✅ Collected {len(tweets)} tweets!")

In [None]:
# Fail early with a readable message when nothing was collected; otherwise the
# 'date' lookup below raises an opaque KeyError on an empty DataFrame.
if not tweets:
    raise ValueError(
        "No tweets were collected; adjust SEARCH_QUERY / SINCE_DATE in the "
        "configuration cell and re-run the collection cell."
    )

# Convert the list of tweet records to a DataFrame
df = pd.DataFrame(tweets)
# Parse dates up front: later cells index by 'date' and resample per day
df['date'] = pd.to_datetime(df['date'])

# Display sample. The author column is 'username', matching every other cell
# in this notebook (the previous 'user' key was inconsistent with them).
print("\nSample of collected tweets:")
df[['username', 'content', 'likes', 'retweets', 'date']].head()

## Step 2: Data Exploration

In [None]:
# Headline numbers for the collected dataset
total_tweets = len(df)
unique_users = df['username'].nunique()
first_date, last_date = df['date'].min(), df['date'].max()

print("Dataset Statistics:")
print(f"Total tweets: {total_tweets}")
print(f"Unique users: {unique_users}")
print(f"Date range: {first_date} to {last_date}")

# Descriptive statistics for the three engagement counters
engagement_summary = df[['likes', 'retweets', 'replies']].describe()
print("\nEngagement stats:")
print(engagement_summary)

In [None]:
# 2x2 overview grid: daily volume, engagement, most active users, languages
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_daily, ax_engagement), (ax_users, ax_languages) = axes

# Tweets per calendar day
tweets_per_day = df.set_index('date').resample('D').size()
tweets_per_day.plot(ax=ax_daily, kind='bar')
ax_daily.set(title='Tweets per Day', xlabel='Date', ylabel='Number of Tweets')

# Likes and retweets drawn as overlaid histograms on the same axes
for column, label in (('likes', 'Likes'), ('retweets', 'Retweets')):
    ax_engagement.hist(df[column], bins=30, alpha=0.7, label=label)
ax_engagement.set(title='Engagement Distribution', xlabel='Count', ylabel='Frequency')
ax_engagement.legend()

# Ten accounts with the most tweets in the sample
top_users = df['username'].value_counts().head(10)
top_users.plot(ax=ax_users, kind='barh')
ax_users.set(title='Top 10 Most Active Users', xlabel='Number of Tweets')

# Share of the ten most frequent values in the 'language' column
language_counts = df['language'].value_counts().head(10)
language_counts.plot(ax=ax_languages, kind='pie', autopct='%1.1f%%')
ax_languages.set_title('Language Distribution')

plt.tight_layout()
plt.show()

## Step 3: Text Preprocessing

In [None]:
# Initialize preprocessor. Per the flag names, this requests lowercasing and
# URL removal while leaving mentions, hashtags and emojis in the text.
preprocessor = TextPreprocessor(
    lowercase=True,
    remove_urls=True,
    remove_mentions=False,
    remove_hashtags=False,
    remove_emojis=False
)

# Clean all tweet texts in one batch and keep the result next to the original
df['clean_content'] = preprocessor.clean_batch(df['content'].tolist())

# Compare original vs cleaned text. The example count is capped at the number
# of rows available: a fixed range(3) raises IndexError from .iloc when fewer
# than three tweets were collected.
print("Original vs Cleaned Text Examples:\n")
for i in range(min(3, len(df))):
    example = df.iloc[i]
    print(f"--- Example {i+1} ---")
    print(f"Original: {example['content'][:100]}...")
    print(f"Cleaned:  {example['clean_content'][:100]}...")
    print()

## Step 4: Sentiment Analysis

In [None]:
# Options for the sentiment analyzer, collected in one place for easy tweaking
analyzer_options = {
    "model_name": "distilbert-base-uncased-finetuned-sst-2-english",
    "device": "cpu",
    "preprocess": False,  # Texts were already cleaned in the preprocessing step
}
analyzer = SentimentAnalyzer(**analyzer_options)

print("🤖 Sentiment analyzer initialized!")

In [None]:
# Run the analyzer over every cleaned text, 32 texts per batch
print("Analyzing sentiments...")
clean_texts = df['clean_content'].tolist()
sentiments = analyzer.analyze_batch(clean_texts, batch_size=32)

# Copy each field of the per-tweet results into its own DataFrame column
# (DataFrame column name -> key in the result record)
column_to_key = {
    'sentiment': 'sentiment',
    'sentiment_score': 'score',
    'sentiment_label': 'label',
}
for column, key in column_to_key.items():
    df[column] = [result[key] for result in sentiments]

print("\n✅ Sentiment analysis complete!")
print("\nSentiment Distribution:")
sentiment_distribution = df['sentiment'].value_counts()
print(sentiment_distribution)

In [None]:
# Show the first ten tweets next to their predicted sentiment and score
print("\nSample Results:")
preview_columns = ['content', 'sentiment', 'sentiment_score']
df.loc[:, preview_columns].head(10)

## Step 5: Visualize Results

In [None]:
# 2x2 results grid: overall split, daily trend, score histogram, engagement
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_share, ax_trend), (ax_scores, ax_engagement) = axes

# One color per sentiment label; labels missing from the map use the fallback
colors = {'positive': '#2ecc71', 'negative': '#e74c3c', 'neutral': '#95a5a6'}
fallback_color = '#3498db'
sentiment_names = df['sentiment'].unique()

# --- Overall sentiment distribution (pie) ---
sentiment_counts = df['sentiment'].value_counts()
ax_share.pie(
    sentiment_counts,
    labels=sentiment_counts.index,
    autopct='%1.1f%%',
    colors=[colors.get(name, fallback_color) for name in sentiment_counts.index],
    startangle=90,
)
ax_share.set_title('Overall Sentiment Distribution', fontsize=14, fontweight='bold')

# --- Tweets per day, one line per sentiment ---
df_time = df.set_index('date')
for name in sentiment_names:
    daily_counts = df_time.loc[df_time['sentiment'] == name].resample('D').size()
    ax_trend.plot(
        daily_counts.index,
        daily_counts.values,
        marker='o',
        label=name.capitalize(),
        color=colors.get(name, fallback_color),
    )

ax_trend.set_title('Sentiment Trend Over Time', fontsize=14, fontweight='bold')
ax_trend.set_xlabel('Date')
ax_trend.set_ylabel('Number of Tweets')
ax_trend.legend()
ax_trend.grid(True, alpha=0.3)

# --- Score histogram, one overlaid series per sentiment ---
for name in sentiment_names:
    scores = df.loc[df['sentiment'] == name, 'sentiment_score']
    ax_scores.hist(
        scores,
        bins=20,
        alpha=0.6,
        label=name.capitalize(),
        color=colors.get(name, fallback_color),
    )

ax_scores.set_title('Sentiment Confidence Score Distribution', fontsize=14, fontweight='bold')
ax_scores.set_xlabel('Confidence Score')
ax_scores.set_ylabel('Frequency')
ax_scores.legend()
ax_scores.grid(True, alpha=0.3)

# --- Mean engagement per sentiment as grouped bars ---
sentiment_engagement = df.groupby('sentiment')[['likes', 'retweets', 'replies']].mean()

positions = range(len(sentiment_engagement))
bar_width = 0.25
# (horizontal offset, column, legend label, bar color) for each bar in a group
bar_specs = (
    (-bar_width, 'likes', 'Likes', '#3498db'),
    (0, 'retweets', 'Retweets', '#2ecc71'),
    (bar_width, 'replies', 'Replies', '#e74c3c'),
)
for offset, column, label, bar_color in bar_specs:
    ax_engagement.bar(
        [pos + offset for pos in positions],
        sentiment_engagement[column],
        bar_width,
        label=label,
        color=bar_color,
    )

ax_engagement.set_title('Average Engagement by Sentiment', fontsize=14, fontweight='bold')
ax_engagement.set_xticks(positions)
ax_engagement.set_xticklabels(sentiment_engagement.index)
ax_engagement.set_ylabel('Average Count')
ax_engagement.legend()
ax_engagement.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## Step 6: Insights and Analysis

In [None]:
# List the five positive tweets with the highest sentiment score
print("🌟 Most Positive Tweets:\n")
is_positive = df['sentiment'] == 'positive'
positive_tweets = df.loc[is_positive].nlargest(5, 'sentiment_score')
separator = "-" * 80
for tweet in positive_tweets.to_dict('records'):
    print(f"Score: {tweet['sentiment_score']:.4f}")
    # Only the first 150 characters of the text are shown
    print(f"Tweet: {tweet['content'][:150]}...")
    print(f"By: @{tweet['username']} | Likes: {tweet['likes']} | Retweets: {tweet['retweets']}")
    print(separator)
    print()

In [None]:
# List the five negative tweets with the highest sentiment score
print("⚠️  Most Negative Tweets:\n")
is_negative = df['sentiment'] == 'negative'
negative_tweets = df.loc[is_negative].nlargest(5, 'sentiment_score')
divider = "-" * 80
for record in negative_tweets.to_dict('records'):
    print(f"Score: {record['sentiment_score']:.4f}")
    # Only the first 150 characters of the text are shown
    print(f"Tweet: {record['content'][:150]}...")
    print(f"By: @{record['username']} | Likes: {record['likes']} | Retweets: {record['retweets']}")
    print(divider)
    print()

In [None]:
# Overall score plus the count and share of each sentiment label
print("📊 Summary Statistics:\n")
print(f"Average sentiment score: {df['sentiment_score'].mean():.4f}")

for label in ('positive', 'negative', 'neutral'):
    count = (df['sentiment'] == label).sum()
    share = count / len(df) * 100
    print(f"{label.capitalize()} tweets: {count} ({share:.1f}%)")

# Mean likes / retweets / replies for each sentiment label
print("\n💬 Engagement by Sentiment:")
engagement_means = df.groupby('sentiment')[['likes', 'retweets', 'replies']].mean()
print(engagement_means.round(2))

## Step 7: Save Results

In [None]:
# Storage helper built from the processed-data directory in the settings
storage = DataStorage(settings.PROCESSED_DATA_DIR)

# Timestamped base name so repeated runs do not reuse the same file name
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"sentiment_analysis_{timestamp}"

# Write the same records twice: first as CSV, then as Parquet
writers = (
    ("csv", storage.save_csv),
    ("parquet", storage.save_parquet),
)
for extension, write in writers:
    saved_path = write(df.to_dict('records'), f"{filename}.{extension}")
    print(f"✅ Results saved to: {saved_path}")

## Conclusion

This notebook demonstrated:
- ✅ Collecting tweets using snscrape
- ✅ Preprocessing and cleaning text data
- ✅ Performing sentiment analysis with ML models
- ✅ Visualizing and analyzing results
- ✅ Saving processed data

### Next Steps:
1. Try different search queries and hashtags
2. Experiment with different ML models
3. Add more advanced visualizations
4. Implement real-time monitoring
5. Build a dashboard for results