# Cryptocurrency Sentiment Analysis Demo

This notebook demonstrates the statistical correlation analysis between Twitter sentiment and Bitcoin price movements using sample data from June 15, 2025.

## 1. Load Sample Data

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Read both sample CSVs, parsing the 'timestamp' column as datetimes in each
tweets, prices = (
    pd.read_csv(csv_name, parse_dates=['timestamp'])
    for csv_name in ('sample_tweets.csv', 'sample_prices.csv')
)

# Report how many rows were loaded and the time span covered by the tweets
tweet_times = tweets['timestamp']
print(
    f"Loaded {len(tweets)} tweets and {len(prices)} price records",
    f"\nDate range: {tweet_times.min()} to {tweet_times.max()}",
    sep="\n",
)

## 2. Explore Sentiment Data

In [None]:
# Preview the first ten tweets with their sentiment scores
print("Sample tweets with sentiment scores:\n")
print(tweets.head(10)[['timestamp', 'handle', 'avg_score']])

# Descriptive statistics, then a positive / negative / neutral breakdown by sign
avg_scores = tweets['avg_score']
print("\nSentiment Score Statistics:")
print(avg_scores.describe())
print(
    f"\nPositive tweets (score > 0): {avg_scores.gt(0).sum()}",
    f"Negative tweets (score < 0): {avg_scores.lt(0).sum()}",
    f"Neutral tweets (score = 0): {avg_scores.eq(0).sum()}",
    sep="\n",
)

## 3. Explore Price Data

In [None]:
# Preview the first ten price records
print("Bitcoin hourly prices:\n")
print(prices.head(10)[['timestamp', 'price', 'volume_24h']])

# Descriptive statistics, the observed range, and the standard deviation of price
price_series = prices['price']
print("\nPrice Statistics:")
print(price_series.describe())
print(
    f"\nPrice range: ${price_series.min():.2f} - ${price_series.max():.2f}",
    f"Price volatility (std dev): ${price_series.std():.2f}",
    sep="\n",
)

## 4. Aggregate Sentiment by Hour

To correlate with hourly price data, we aggregate multiple tweets within each hour into a single sentiment score.

In [None]:
# Truncate each timestamp down to the start of its hour. This is a floor, not a
# round-to-nearest: 10:59 lands in the 10:00 bucket. Both frames get the same
# 'hour' key so they can be joined on it later.
# The lowercase 'h' alias is used because the uppercase 'H' alias is deprecated
# in pandas 2.2+.
tweets['hour'] = tweets['timestamp'].dt.floor('h')
prices['hour'] = prices['timestamp'].dt.floor('h')

# One row per hour: mean sentiment score and the number of tweets behind it.
# 'count' tallies non-null 'handle' values, so a tweet with a missing handle
# is not included in tweet_count.
hourly_sentiment = tweets.groupby('hour').agg(
    hourly_sentiment=('avg_score', 'mean'),
    tweet_count=('handle', 'count'),
)

print(f"Aggregated into {len(hourly_sentiment)} hourly sentiment values")
print("\nHourly sentiment samples:")
print(hourly_sentiment.head(10))

## 5. Merge Sentiment and Price Data

In [None]:
# Inner-join hourly sentiment with the price columns on the shared 'hour' key,
# keeping only hours present on both sides
price_columns = prices[['hour', 'price', 'volume_24h']]
merged = hourly_sentiment.merge(price_columns, how='inner', on='hour')

print(
    f"Merged dataset: {len(merged)} records with both sentiment and price",
    "\nMerged data sample:",
    merged.head(10),
    sep="\n",
)

## 6. Calculate Pearson Correlation

Implementation of Pearson correlation coefficient from first principles:

$$r = \frac{\sum_{i=1}^{n}(x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^{n}(x_i - \bar{x})^2 \cdot \sum_{i=1}^{n}(y_i - \bar{y})^2}}$$

In [None]:
def calculate_pearson_correlation(x, y):
    """
    Calculate the Pearson correlation coefficient from first principles.

    Values are paired by position (first with first, second with second),
    regardless of any index labels the inputs may carry.

    Args:
        x, y: Sized iterables of numbers with equal length (lists, tuples,
            NumPy arrays, pandas Series, ...).

    Returns:
        float: Pearson correlation coefficient (-1 to +1). Returns 0.0 when
        the coefficient is undefined: empty input, or zero variance in
        either input (which includes the single-element case).

    Raises:
        ValueError: If x and y have different lengths.
    """
    # Materialise both inputs so pairing below is strictly positional; indexing
    # with x[i] would be label-based on a pandas Series.
    xs = list(x)
    ys = list(y)

    n = len(xs)
    if n != len(ys):
        raise ValueError(
            f"x and y must have the same length (got {n} and {len(ys)})"
        )

    # No data points: the means are undefined, so treat it like the other
    # degenerate cases instead of dividing by zero.
    if n == 0:
        return 0.0

    # Calculate means
    mean_x = sum(xs) / n
    mean_y = sum(ys) / n

    # Deviations from the mean, reused by all three sums
    dev_x = [xi - mean_x for xi in xs]
    dev_y = [yi - mean_y for yi in ys]

    sum_xy = sum(dx * dy for dx, dy in zip(dev_x, dev_y))
    sum_x2 = sum(dx ** 2 for dx in dev_x)
    sum_y2 = sum(dy ** 2 for dy in dev_y)

    # Calculate correlation
    denominator = (sum_x2 * sum_y2) ** 0.5

    # Zero variance in either input makes r undefined; report it as 0.0
    if denominator == 0:
        return 0.0

    return sum_xy / denominator

# Pull the two merged columns out as raw arrays for the manual implementation
sentiment_array, price_array = (
    merged[column].values for column in ('hourly_sentiment', 'price')
)

# Pearson r from the hand-written function, and its square
r = calculate_pearson_correlation(sentiment_array, price_array)
r_squared = r ** 2

print(
    f"Pearson Correlation (r): {r:.4f}",
    f"Coefficient of Determination (R²): {r_squared:.4f}",
    f"\nInterpretation: {abs(r_squared * 100):.2f}% of price variance is explained by sentiment",
    sep="\n",
)

# Cross-check the manual result against pandas' built-in correlation
pandas_r = merged['hourly_sentiment'].corr(merged['price'])
print(
    f"\nVerification (pandas): {pandas_r:.4f}",
    f"Difference: {abs(r - pandas_r):.10f} (should be near zero)",
    sep="\n",
)

## 7. Statistical Interpretation

Classification of correlation strength based on |r| value:

In [None]:
def interpret_correlation(r):
    """
    Classify correlation strength from the magnitude of r.

    Args:
        r: Correlation coefficient; only its absolute value is used.

    Returns:
        str: "Very weak" (|r| < 0.3), "Weak" (< 0.5), "Moderate" (< 0.7),
        "Strong" (< 0.9), "Very strong" (otherwise), or "Undefined" when
        r is NaN.
    """
    import math

    abs_r = abs(r)

    # NaN compares False against every threshold, so without this guard it
    # would fall through to the final "Very strong" label.
    if math.isnan(abs_r):
        return "Undefined"

    # (exclusive upper bound, label), checked in ascending order
    bands = (
        (0.3, "Very weak"),
        (0.5, "Weak"),
        (0.7, "Moderate"),
        (0.9, "Strong"),
    )
    for upper_bound, label in bands:
        if abs_r < upper_bound:
            return label

    return "Very strong"

# Label the strength (from |r|) and the sign of the correlation.
classification = interpret_correlation(r)

# An r of exactly zero (or NaN) has no sign; do not report it as "negative".
if r > 0:
    direction = "positive"
elif r < 0:
    direction = "negative"
else:
    direction = "(no direction)"

print(f"Correlation Classification: {classification} {direction}")

# NOTE(review): the wording below is fixed text and presumes a weak
# correlation; only the classification label and the percentage are computed.
print(f"\nConclusion:")
print(f"This {classification.lower()} correlation suggests that Twitter sentiment")
print(f"has minimal predictive power for Bitcoin price movements in this sample.")
print(f"Only {r_squared * 100:.1f}% of price variance can be explained by sentiment.")

## 8. Directional Movement Analysis

Calculate what percentage of the time sentiment and price move in the same direction.

In [None]:
# Hour-over-hour changes (deltas), computed on a chronologically sorted copy
merged_sorted = merged.sort_values('hour').reset_index(drop=True)
merged_sorted['sentiment_change'] = merged_sorted['hourly_sentiment'].diff()
merged_sorted['price_change'] = merged_sorted['price'].diff()

# Drop only rows whose deltas are missing (the first row after diff). A bare
# dropna() would also discard hours with a NaN in an unrelated column such as
# volume_24h.
changes = merged_sorted.dropna(subset=['sentiment_change', 'price_change'])

# Product of the two signs: > 0 both moved the same way, < 0 they moved in
# opposite ways, == 0 at least one of them did not move at all.
sign_product = np.sign(changes['sentiment_change']) * np.sign(changes['price_change'])
same_direction = sign_product > 0
opposite_direction = sign_product < 0
no_movement = sign_product == 0

# Percentages are NaN when there are no hour-over-hour changes to compare
n_changes = len(changes)
if n_changes:
    same_direction_pct = (same_direction.sum() / n_changes) * 100
    opposite_direction_pct = (opposite_direction.sum() / n_changes) * 100
    no_movement_pct = (no_movement.sum() / n_changes) * 100
else:
    same_direction_pct = opposite_direction_pct = no_movement_pct = float('nan')

print(f"Directional Analysis:")
print(f"Same direction: {same_direction.sum()} / {n_changes} ({same_direction_pct:.1f}%)")
print(f"Opposite direction: {opposite_direction.sum()} / {n_changes} ({opposite_direction_pct:.1f}%)")
# Unchanged hours are reported separately rather than being counted as "opposite"
if no_movement.any():
    print(f"No movement in sentiment or price: {no_movement.sum()} / {n_changes} ({no_movement_pct:.1f}%)")
print(f"\nRandom chance would be 50%. Observed: {same_direction_pct:.1f}%")

## 9. Event Detection

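Identify sentiment spikes (hours whose sentiment is more than one standard deviation away from the mean) and list the price recorded in each spike hour, as a starting point for measuring subsequent price impact.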

In [None]:
# Spike threshold: one standard deviation of hourly sentiment around its mean
hourly_scores = merged['hourly_sentiment']
sentiment_mean = hourly_scores.mean()
sentiment_std = hourly_scores.std()
threshold = sentiment_std

print(
    "Sentiment Statistics:",
    f"Mean: {sentiment_mean:.4f}",
    f"Std Dev: {sentiment_std:.4f}",
    f"Threshold for spikes: ±{threshold:.4f}",
    sep="\n",
)

# Flag hours lying strictly above / below the mean by more than the threshold
merged_sorted['is_positive_spike'] = merged_sorted['hourly_sentiment'].gt(sentiment_mean + threshold)
merged_sorted['is_negative_spike'] = merged_sorted['hourly_sentiment'].lt(sentiment_mean - threshold)

positive_spikes = merged_sorted.loc[merged_sorted['is_positive_spike']]
negative_spikes = merged_sorted.loc[merged_sorted['is_negative_spike']]

print(
    "\nDetected Events:",
    f"Positive sentiment spikes: {len(positive_spikes)}",
    f"Negative sentiment spikes: {len(negative_spikes)}",
    sep="\n",
)

# Show the flagged hours, skipping a section entirely when it has no rows
spike_columns = ['hour', 'hourly_sentiment', 'price']

if len(positive_spikes):
    print("\nPositive spike examples:")
    print(positive_spikes[spike_columns])

if len(negative_spikes):
    print("\nNegative spike examples:")
    print(negative_spikes[spike_columns])

## 10. Summary

Key findings from this sample analysis:

In [None]:
# Recap of the values computed in the earlier cells, framed by banner rules
banner = "=" * 60
variance_pct = r_squared * 100

print(banner, "ANALYSIS SUMMARY", banner, sep="\n")

print(
    "\nDataset:",
    f"  - Tweets analyzed: {len(tweets)}",
    f"  - Price records: {len(prices)}",
    f"  - Merged hourly records: {len(merged)}",
    "  - Date: June 15, 2025",
    sep="\n",
)

print(
    "\nStatistical Results:",
    f"  - Pearson correlation (r): {r:.4f}",
    f"  - R² (variance explained): {r_squared:.4f} ({variance_pct:.2f}%)",
    f"  - Classification: {classification} {direction}",
    f"  - Same direction movements: {same_direction_pct:.1f}%",
    sep="\n",
)

print(
    "\nConclusion:",
    f"  Twitter sentiment shows a {classification.lower()} correlation with Bitcoin price.",
    f"  Sentiment explains only {variance_pct:.1f}% of price variance, suggesting",
    "  limited predictive power for short-term price movements.",
    "\n" + banner,
    sep="\n",
)

# Closing caveat about the sample size
print(
    "\nNote: This is a sample analysis with limited data (1 day).",
    "Full research used 4,441 hourly records showing similar weak correlations.",
    sep="\n",
)