# Twitter Sentiment Analysis - Exploratory Data Analysis
## Analyzing 500K tweets for sentiment patterns

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import warnings
# Suppress every warning for the rest of the session so cell output stays clean.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn "whitegrid" theme and a 12x6 default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Data

In [None]:
# Read the processed tweet CSV into the notebook-wide DataFrame `df`
TWEETS_CSV = '../data/processed_tweets.csv'
df = pd.read_csv(TWEETS_CSV)

# Report the row count, then show the first rows as the cell's output
n_tweets = df.shape[0]
print(f"Total tweets: {n_tweets}")
df.head()

## 2. Data Overview

In [None]:
# Basic statistics
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Number of null entries in each column
print("\nMissing values:")
print(df.isnull().sum())

## 3. Sentiment Distribution

In [None]:
# Sentiment distribution: bar chart of tweets per sentiment class, followed by
# the raw counts. Skipped when the dataset has no 'sentiment' column.
if 'sentiment' in df.columns:
    # Frequency-ordered counts (most common class first); printed below.
    sentiment_counts = df['sentiment'].value_counts()

    # value_counts() orders by frequency, not by class. The bars are therefore
    # drawn from an index-sorted copy with an explicit `order`, so the fixed
    # tick labels below cannot end up under the wrong bars.
    counts_by_class = sentiment_counts.sort_index()
    class_order = list(counts_by_class.index)

    plt.figure(figsize=(10, 6))
    sns.barplot(x=class_order, y=counts_by_class.values, order=class_order)
    plt.title('Sentiment Distribution', fontsize=16)
    plt.xlabel('Sentiment', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    # NOTE(review): assumes the class values sort ascending as
    # negative < neutral < positive -- confirm against the labelling step.
    # With any other number of classes the real values are kept as labels.
    if len(class_order) == 3:
        plt.xticks([0, 1, 2], ['Negative', 'Neutral', 'Positive'])
    plt.show()

    print("\nSentiment distribution:")
    print(sentiment_counts)

## 4. Text Length Analysis

In [None]:
# Text length distribution: histograms of characters and words per tweet.
# Guarded like the other cells, so a dataset without a 'text' column skips the
# section instead of raising KeyError.
if 'text' in df.columns:
    # Characters per tweet and whitespace-separated tokens per tweet.
    # Rows with missing text stay NaN in both derived columns.
    df['text_length'] = df['text'].str.len()
    df['word_count'] = df['text'].str.split().str.len()

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Character length (NaN rows dropped before binning)
    axes[0].hist(df['text_length'].dropna(), bins=50, edgecolor='black')
    axes[0].set_title('Character Length Distribution', fontsize=14)
    axes[0].set_xlabel('Characters')
    axes[0].set_ylabel('Frequency')

    # Word count (NaN rows dropped before binning)
    axes[1].hist(df['word_count'].dropna(), bins=50, edgecolor='black', color='orange')
    axes[1].set_title('Word Count Distribution', fontsize=14)
    axes[1].set_xlabel('Words')
    axes[1].set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

    # Series.mean() skips NaN, so these averages cover non-missing tweets only
    print(f"Average text length: {df['text_length'].mean():.2f} characters")
    print(f"Average word count: {df['word_count'].mean():.2f} words")

## 5. Hashtag Analysis

In [None]:
# Top hashtags: horizontal bar chart of the 20 most frequent hashtags.
# Skipped when the dataset has no 'hashtags' column.
if 'hashtags' in df.columns:
    hashtags = df['hashtags'].dropna()

    # Each cell is treated as a comma-separated list. Split and flatten in
    # pandas rather than a Python loop, then strip surrounding whitespace so
    # "a, b" yields "b" (not " b") and discard empty tokens produced by blank
    # cells or stray commas.
    # NOTE(review): counting stays case-sensitive, as before -- confirm whether
    # tags are already lower-cased upstream.
    tag_series = hashtags.astype(str).str.split(',').explode().str.strip()
    all_hashtags = tag_series[tag_series != ''].tolist()

    hashtag_freq = pd.Series(all_hashtags, dtype='object').value_counts().head(20)

    if hashtag_freq.empty:
        # Nothing to plot; avoid handing seaborn an empty series
        print("No hashtags found.")
    else:
        plt.figure(figsize=(12, 6))
        sns.barplot(x=hashtag_freq.values, y=hashtag_freq.index)
        plt.title('Top 20 Hashtags', fontsize=16)
        plt.xlabel('Frequency', fontsize=12)
        plt.ylabel('Hashtag', fontsize=12)
        plt.tight_layout()
        plt.show()

## 6. Word Cloud

In [None]:
# Generate a word cloud from all non-missing processed tweet text.
# Skipped when the dataset has no 'processed_text' column.
if 'processed_text' in df.columns:
    # astype(str) protects the join against non-string cells (e.g. a value the
    # CSV parser read as a number), which would make str.join raise TypeError.
    text = ' '.join(df['processed_text'].dropna().astype(str))

    if text.strip():
        wordcloud = WordCloud(width=1600, height=800, background_color='white',
                              max_words=100, colormap='viridis').generate(text)

        plt.figure(figsize=(16, 8))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud - All Tweets', fontsize=20)
        plt.tight_layout(pad=0)
        plt.show()
    else:
        # WordCloud.generate() raises when it is given no words at all
        print("No processed text available for the word cloud.")

## 7. Temporal Analysis

In [None]:
# Time-based analysis: daily tweet volume as a line chart.
# Skipped when the dataset has no 'created_at' column.
if 'created_at' in df.columns:
    # Parse the timestamps once, then derive calendar-date and hour columns
    timestamps = pd.to_datetime(df['created_at'])
    df['created_at'] = timestamps
    df['date'] = timestamps.dt.date
    df['hour'] = timestamps.dt.hour

    # Tweets per day, in chronological order
    daily_tweets = df['date'].value_counts().sort_index()

    volume_fig, volume_ax = plt.subplots(figsize=(14, 6))
    daily_tweets.plot(ax=volume_ax)
    volume_ax.set_title('Tweet Volume Over Time', fontsize=16)
    volume_ax.set_xlabel('Date', fontsize=12)
    volume_ax.set_ylabel('Number of Tweets', fontsize=12)
    volume_ax.grid(True, alpha=0.3)
    volume_fig.tight_layout()
    plt.show()

## 8. Engagement Metrics

In [None]:
# Engagement analysis: side-by-side histograms of retweet and like counts on a
# log-scaled frequency axis, followed by the mean of each metric.
# Runs only when both engagement columns are present.
if {'retweet_count', 'like_count'}.issubset(df.columns):
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # One entry per panel: (column, title, x-axis label, extra hist kwargs)
    panel_specs = [
        ('retweet_count', 'Retweet Count Distribution', 'Retweets', {}),
        ('like_count', 'Like Count Distribution', 'Likes', {'color': 'red'}),
    ]
    for ax, (column, title, x_label, hist_kwargs) in zip(axes, panel_specs):
        ax.hist(df[column], bins=50, edgecolor='black', **hist_kwargs)
        ax.set_title(title, fontsize=14)
        ax.set_xlabel(x_label)
        ax.set_ylabel('Frequency')
        ax.set_yscale('log')

    plt.tight_layout()
    plt.show()

    avg_retweets = df['retweet_count'].mean()
    avg_likes = df['like_count'].mean()
    print(f"Average retweets: {avg_retweets:.2f}")
    print(f"Average likes: {avg_likes:.2f}")

## 9. Engagement by Sentiment

In [None]:
# Sentiment vs engagement: grouped bar chart of mean likes and retweets per
# sentiment class.
# The guard lists every column the body reads; checking only 'sentiment' and
# 'like_count' would let a missing 'retweet_count' raise KeyError below.
required_columns = ('sentiment', 'like_count', 'retweet_count')
if all(column in df.columns for column in required_columns):
    sentiment_engagement = df.groupby('sentiment')[['like_count', 'retweet_count']].mean()

    sentiment_engagement.plot(kind='bar', figsize=(10, 6))
    plt.title('Average Engagement by Sentiment', fontsize=16)
    plt.xlabel('Sentiment', fontsize=12)
    plt.ylabel('Average Count', fontsize=12)
    # groupby() returns groups in sorted key order.
    # NOTE(review): the fixed labels assume that order is
    # negative < neutral < positive -- confirm against the labelling step.
    # With any other number of groups the real group keys are kept as labels.
    if len(sentiment_engagement) == 3:
        plt.xticks([0, 1, 2], ['Negative', 'Neutral', 'Positive'], rotation=0)
    else:
        plt.xticks(rotation=0)
    # Legend entries follow the column order selected above
    plt.legend(['Likes', 'Retweets'])
    plt.tight_layout()
    plt.show()