In [1]:
#!pip install praw


In [2]:
import praw
import pandas as pd
from datetime import datetime, timedelta
import re
import time
import json
import os
from collections import defaultdict

# Try to import optional packages
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    VADER_AVAILABLE = True
except ImportError:
    print("❌ VADER not installed. Please install: pip install vaderSentiment")
    VADER_AVAILABLE = False

try:
    from textblob import TextBlob
    TEXTBLOB_AVAILABLE = True
except ImportError:
    print("⚠️  TextBlob not installed. Using VADER only.")
    TEXTBLOB_AVAILABLE = False

class HistoricalRealMadridSentiment:
    """
    Historical weekly sentiment analysis for Real Madrid fans from season start
    Covers La Liga (August-May) and Champions League (September-June)
    """
    
    def __init__(self, reddit_config):
        """Set up the Reddit client, sentiment analyzer and season metadata.

        Args:
            reddit_config (dict): Must contain 'client_id', 'client_secret'
                and 'user_agent'; the three values are handed to praw.Reddit.
        """

        # Build the Reddit client from exactly the three expected settings
        credentials = {
            key: reddit_config[key]
            for key in ('client_id', 'client_secret', 'user_agent')
        }
        self.reddit = praw.Reddit(**credentials)

        # VADER is optional at import time; keep None when it is missing
        self.vader_analyzer = SentimentIntensityAnalyzer() if VADER_AVAILABLE else None
        if self.vader_analyzer is not None:
            print("✅ VADER sentiment analyzer loaded")
        else:
            print("❌ VADER not available")

        # Lower-case substrings that mark a post as Real Madrid related
        self.rm_keywords = [
            'real madrid', 'madrid', 'bernabeu', 'bernabéu', 'hala madrid',
            'vinicius', 'bellingham', 'mbappe', 'modric', 'courtois',
            'ancelotti', 'los blancos', 'madridista', 'santiago bernabeu',
            'la liga', 'champions league', 'ucl', 'laliga'
        ]

        # Boundaries of the 2024-25 season used when generating weeks
        self.season_start = datetime(2024, 8, 1)
        self.season_end = datetime(2025, 6, 30)

        # Per-competition date window plus the keywords used to tag posts
        la_liga = {
            'start': datetime(2024, 8, 15),
            'end': datetime(2025, 5, 25),
            'keywords': ['la liga', 'laliga', 'league', 'primera']
        }
        champions_league = {
            'start': datetime(2024, 9, 15),
            'end': datetime(2025, 6, 1),
            'keywords': ['champions league', 'ucl', 'european cup', 'champions']
        }
        copa_del_rey = {
            'start': datetime(2024, 11, 1),
            'end': datetime(2025, 4, 30),
            'keywords': ['copa del rey', 'copa', 'cup']
        }
        self.competitions = {
            'la_liga': la_liga,
            'champions_league': champions_league,
            'copa_del_rey': copa_del_rey
        }
    
    def get_season_weeks(self):
        """Generate all weeks from season start to current date"""
        
        weeks = []
        current_date = self.season_start
        today = datetime.now()
        
        week_number = 1
        
        while current_date < today and current_date < self.season_end:
            week_end = current_date + timedelta(days=6)
            
            # Don't go beyond today
            if week_end > today:
                week_end = today
            
            # Determine active competitions for this week
            active_competitions = []
            for comp_name, comp_info in self.competitions.items():
                if comp_info['start'] <= current_date <= comp_info['end']:
                    active_competitions.append(comp_name)
            
            week_info = {
                'week_number': week_number,
                'start_date': current_date,
                'end_date': week_end,
                'active_competitions': active_competitions,
                'is_complete': week_end < today
            }
            
            weeks.append(week_info)
            
            # Move to next week
            current_date = week_end + timedelta(days=1)
            week_number += 1
        
        return weeks
    
    def collect_weekly_data(self, week_info, min_score=3, min_comments=2):
        """
        Collect Reddit data for a specific week
        
        Args:
            week_info (dict): Week information from get_season_weeks()
            min_score (int): Minimum post score
            min_comments (int): Minimum comments
        """
        
        print(f"\n📅 Week {week_info['week_number']}: {week_info['start_date'].strftime('%Y-%m-%d')} to {week_info['end_date'].strftime('%Y-%m-%d')}")
        print(f"🏆 Active competitions: {', '.join(week_info['active_competitions']) if week_info['active_competitions'] else 'Off-season'}")
        
        # Subreddits to search
        subreddits = ['realmadrid', 'soccer', 'LaLiga']
        all_posts = []
        
        # Convert to timestamps for Reddit API
        start_timestamp = int(week_info['start_date'].timestamp())
        end_timestamp = int(week_info['end_date'].timestamp())
        
        for subreddit_name in subreddits:
            print(f"   📍 Searching r/{subreddit_name}...")
            
            try:
                subreddit = self.reddit.subreddit(subreddit_name)
                
                # Search through different post types
                post_sources = []
                
                # Get new posts (more likely to be from the specific week)
                try:
                    post_sources.extend(list(subreddit.new(limit=200)))
                except:
                    pass
                
                # Get hot posts
                try:
                    post_sources.extend(list(subreddit.hot(limit=100)))
                except:
                    pass
                
                # Filter posts by date and relevance
                week_posts = []
                
                for post in post_sources:
                    try:
                        post_time = datetime.fromtimestamp(post.created_utc)
                        
                        # Check if post is in our week
                        if not (week_info['start_date'] <= post_time <= week_info['end_date']):
                            continue
                        
                        # Check reliability filters
                        if post.score < min_score or post.num_comments < min_comments:
                            continue
                        
                        # Check Real Madrid relevance
                        post_text = (post.title + ' ' + post.selftext).lower()
                        if not any(keyword in post_text for keyword in self.rm_keywords):
                            continue
                        
                        # Determine competition context
                        competition_context = self._detect_competition_context(
                            post_text, week_info['active_competitions']
                        )
                        
                        # Extract post data
                        post_data = {
                            'week_number': week_info['week_number'],
                            'week_start': week_info['start_date'],
                            'week_end': week_info['end_date'],
                            'post_id': post.id,
                            'subreddit': subreddit_name,
                            'title': post.title,
                            'selftext': post.selftext,
                            'score': post.score,
                            'upvote_ratio': post.upvote_ratio,
                            'num_comments': post.num_comments,
                            'created_utc': post_time,
                            'author': str(post.author) if post.author else 'deleted',
                            'permalink': f"https://reddit.com{post.permalink}",
                            'competition_context': competition_context,
                            'active_competitions': week_info['active_competitions'],
                            'reliability_score': self._calculate_reliability_score(post)
                        }
                        
                        # Get comments
                        comments = self._extract_comments(post, limit=5)
                        post_data['comments'] = comments
                        post_data['comment_count'] = len(comments)
                        
                        week_posts.append(post_data)
                        
                    except Exception as e:
                        print(f"     ⚠️ Error processing post: {e}")
                        continue
                
                all_posts.extend(week_posts)
                print(f"     ✅ Found {len(week_posts)} relevant posts")
                
                # Rate limiting
                time.sleep(0.2)
                
            except Exception as e:
                print(f"     ❌ Error accessing r/{subreddit_name}: {e}")
                continue
        
        print(f"📊 Week {week_info['week_number']} total: {len(all_posts)} posts")
        return all_posts
    
    def _detect_competition_context(self, text, active_competitions):
        """Detect which competition the post is about"""
        
        detected = []
        
        for comp_name in active_competitions:
            comp_keywords = self.competitions[comp_name]['keywords']
            if any(keyword in text for keyword in comp_keywords):
                detected.append(comp_name)
        
        # If no specific competition detected, mark as general
        if not detected and active_competitions:
            return ['general']
        elif not detected:
            return ['off_season']
        
        return detected
    
    def _calculate_reliability_score(self, post):
        """Calculate reliability score based on Reddit metrics"""
        
        # Base score from upvotes
        score_factor = min(post.score / 50, 1.0)  # Lower threshold for historical data
        
        # Upvote ratio (higher is better)
        ratio_factor = post.upvote_ratio
        
        # Comment engagement
        comment_factor = min(post.num_comments / 25, 1.0)  # Lower threshold
        
        # Author credibility (if available)
        author_factor = 0.1 if post.author and hasattr(post.author, 'comment_karma') else 0
        if author_factor and post.author.comment_karma > 500:  # Lower threshold
            author_factor = 0.2
        
        reliability = (score_factor * 0.3 + 
                      ratio_factor * 0.4 + 
                      comment_factor * 0.2 + 
                      author_factor * 0.1)
        
        return round(reliability, 3)
    
    def _extract_comments(self, post, limit=5):
        """Extract top comments from a post"""
        
        comments = []
        try:
            post.comments.replace_more(limit=1)
            
            top_comments = sorted(post.comments, key=lambda x: x.score, reverse=True)
            
            for comment in top_comments[:limit]:
                if hasattr(comment, 'body') and comment.score > 0:
                    comment_data = {
                        'body': comment.body,
                        'score': comment.score,
                        'created_utc': datetime.fromtimestamp(comment.created_utc)
                    }
                    comments.append(comment_data)
        except:
            pass
            
        return comments
    
    def analyze_historical_sentiment(self, historical_data):
        """Score the sentiment of every collected post.

        The title and self-text are cleaned and scored with VADER. When
        TextBlob is installed and yields a non-zero polarity, the two scores
        are blended 70/30; otherwise the VADER compound score is used alone.
        Posts with fewer than three words after cleaning are skipped, and a
        post that fails to analyze is reported and skipped.

        Args:
            historical_data (list[dict]): Posts as produced by
                collect_weekly_data(); 'title', 'selftext', 'score' and
                'num_comments' are read.

        Returns:
            list[dict]: Input posts extended with the sentiment fields, or
                an empty list when VADER is unavailable.
        """

        # Guard on the instance analyzer too, since it is what gets called below
        if not VADER_AVAILABLE or self.vader_analyzer is None:
            print("❌ Cannot analyze sentiment without VADER")
            return []

        print(f"\n🔬 Analyzing sentiment for {len(historical_data)} posts...")

        analyzed_data = []

        for post in historical_data:
            try:
                # Combine title and text
                full_text = post['title']
                if post['selftext']:
                    full_text += ' ' + post['selftext']

                # Clean text
                clean_text = self._clean_text(full_text)

                # Too short to carry a meaningful sentiment signal
                if len(clean_text.split()) < 3:
                    continue

                # VADER sentiment
                vader_scores = self.vader_analyzer.polarity_scores(clean_text)

                # TextBlob if available
                textblob_polarity = 0
                if TEXTBLOB_AVAILABLE:
                    try:
                        textblob_polarity = TextBlob(clean_text).sentiment.polarity
                    except Exception:
                        # Best effort: fall back to VADER only for this post
                        # (narrowed so Ctrl+C / SystemExit still propagate)
                        textblob_polarity = 0

                # Combined sentiment: a zero TextBlob polarity is treated as
                # "no signal", so VADER is used on its own in that case
                if TEXTBLOB_AVAILABLE and textblob_polarity != 0:
                    combined_sentiment = (vader_scores['compound'] * 0.7) + (textblob_polarity * 0.3)
                else:
                    combined_sentiment = vader_scores['compound']

                # Get sentiment label
                if combined_sentiment >= 0.1:
                    sentiment_label = 'Positive'
                elif combined_sentiment <= -0.1:
                    sentiment_label = 'Negative'
                else:
                    sentiment_label = 'Neutral'

                # Create analysis
                analysis = {
                    **post,
                    'clean_text': clean_text,
                    'vader_compound': vader_scores['compound'],
                    'vader_positive': vader_scores['pos'],
                    'vader_negative': vader_scores['neg'],
                    'vader_neutral': vader_scores['neu'],
                    'textblob_polarity': textblob_polarity,
                    'combined_sentiment': round(combined_sentiment, 3),
                    'sentiment_label': sentiment_label,
                    'overall_engagement': post['score'] + post['num_comments']
                }

                analyzed_data.append(analysis)

            except Exception as e:
                print(f"   ⚠️ Error analyzing post: {e}")
                continue

        print(f"✅ Sentiment analysis complete: {len(analyzed_data)} posts")
        return analyzed_data
    
    def _clean_text(self, text):
        """Clean text for sentiment analysis"""
        
        # Remove URLs
        text = re.sub(r'http[s]?://\S+', '', text)
        
        # Remove Reddit formatting
        text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
        text = re.sub(r'\/u\/\w+', '', text)
        text = re.sub(r'\/r\/\w+', '', text)
        
        # Remove special characters but keep Spanish accents
        text = re.sub(r'[^\w\s\u00C0-\u024F\u1E00-\u1EFF]', ' ', text)
        
        # Clean whitespace
        text = ' '.join(text.split())
        
        return text.strip()
    
    def generate_historical_report(self, analyzed_data):
        """Generate comprehensive historical report"""
        
        if not analyzed_data:
            print("❌ No data to analyze")
            return None
        
        df = pd.DataFrame(analyzed_data)
        
        print(f"\n📊 REAL MADRID HISTORICAL SENTIMENT REPORT")
        print(f"=" * 60)
        
        # Season overview
        total_weeks = df['week_number'].nunique()
        total_posts = len(df)
        
        season_start = df['week_start'].min().strftime('%Y-%m-%d')
        latest_week = df['week_start'].max().strftime('%Y-%m-%d')
        
        print(f"🏆 SEASON OVERVIEW:")
        print(f"   📅 Period: {season_start} to {latest_week}")
        print(f"   📊 Total weeks: {total_weeks}")
        print(f"   📈 Total posts: {total_posts}")
        print(f"   📊 Average posts per week: {total_posts/total_weeks:.1f}")
        
        # Weekly sentiment trends
        weekly_stats = df.groupby('week_number').agg({
            'combined_sentiment': 'mean',
            'sentiment_label': lambda x: x.value_counts().index[0],
            'overall_engagement': 'sum',
            'post_id': 'count',
            'week_start': 'first'
        }).round(3)
        
        print(f"\n📈 WEEKLY SENTIMENT TRENDS:")
        print(f"{'Week':<4} {'Date':<10} {'Sentiment':<9} {'Mood':<8} {'Posts':<5} {'Engagement':<10}")
        print(f"-" * 55)
        
        for week_num, row in weekly_stats.iterrows():
            date_str = row['week_start'].strftime('%m/%d')
            sentiment_str = f"{row['combined_sentiment']:.3f}"
            mood_emoji = "😊" if row['combined_sentiment'] > 0.1 else "😞" if row['combined_sentiment'] < -0.1 else "😐"
            
            print(f"{week_num:<4} {date_str:<10} {sentiment_str:<9} {mood_emoji:<8} {row['post_id']:<5} {row['overall_engagement']:<10}")
        
        # Competition-specific analysis
        print(f"\n🏆 COMPETITION-SPECIFIC SENTIMENT:")
        
        # Flatten competition context for analysis
        comp_data = []
        for _, row in df.iterrows():
            for comp in row['competition_context']:
                comp_entry = row.copy()
                comp_entry['competition'] = comp
                comp_data.append(comp_entry)
        
        if comp_data:
            comp_df = pd.DataFrame(comp_data)
            comp_sentiment = comp_df.groupby('competition').agg({
                'combined_sentiment': 'mean',
                'post_id': 'count'
            }).round(3)
            
            for comp, row in comp_sentiment.iterrows():
                mood = "😊" if row['combined_sentiment'] > 0.1 else "😞" if row['combined_sentiment'] < -0.1 else "😐"
                print(f"   {comp.replace('_', ' ').title()}: {row['combined_sentiment']:.3f} {mood} ({row['post_id']} posts)")
        
        # Overall season sentiment
        overall_sentiment = df['combined_sentiment'].mean()
        overall_mood = "😊 POSITIVE SEASON" if overall_sentiment > 0.1 else "😞 CHALLENGING SEASON" if overall_sentiment < -0.1 else "😐 MIXED SEASON"
        
        print(f"\n🎯 SEASON SUMMARY:")
        print(f"   📊 Overall sentiment: {overall_sentiment:.3f}")
        print(f"   🎭 Season mood: {overall_mood}")
        
        # Best and worst weeks
        best_week = weekly_stats.loc[weekly_stats['combined_sentiment'].idxmax()]
        worst_week = weekly_stats.loc[weekly_stats['combined_sentiment'].idxmin()]
        
        print(f"   📈 Best week: Week {weekly_stats['combined_sentiment'].idxmax()} ({best_week['combined_sentiment']:.3f})")
        print(f"   📉 Worst week: Week {weekly_stats['combined_sentiment'].idxmin()} ({worst_week['combined_sentiment']:.3f})")
        
        # Reliability assessment
        avg_reliability = df['reliability_score'].mean()
        print(f"   🎯 Data reliability: {avg_reliability:.3f}")
        
        # Create structured report
        report = {
            'season_overview': {
                'start_date': season_start,
                'latest_week': latest_week,
                'total_weeks': total_weeks,
                'total_posts': total_posts,
                'avg_posts_per_week': total_posts/total_weeks
            },
            'overall_sentiment': overall_sentiment,
            'season_mood': overall_mood,
            'weekly_trends': weekly_stats.to_dict('index'),
            'competition_sentiment': comp_sentiment.to_dict('index') if comp_data else {},
            'best_week': {
                'week': int(weekly_stats['combined_sentiment'].idxmax()),
                'sentiment': float(best_week['combined_sentiment'])
            },
            'worst_week': {
                'week': int(weekly_stats['combined_sentiment'].idxmin()),
                'sentiment': float(worst_week['combined_sentiment'])
            },
            'reliability_score': avg_reliability
        }
        
        return report, df
    
    def save_historical_data(self, analyzed_data, report=None, filename=None):
        """Save historical data and report"""
        
        if filename is None:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f'real_madrid_historical_sentiment_{timestamp}'
        
        # Save detailed data
        df = pd.DataFrame(analyzed_data)
        df.to_csv(f'{filename}.csv', index=False, encoding='utf-8')
        
        # Save report if provided
        if report:
            with open(f'{filename}_report.json', 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2, ensure_ascii=False, default=str)
        
        print(f"\n💾 Historical data saved:")
        print(f"   📊 {filename}.csv ({len(df)} posts)")
        if report:
            print(f"   📋 {filename}_report.json")
    
    def collect_full_season(self, save_weekly=True, max_weeks=None):
        """
        Collect sentiment data for the entire season week by week
        
        Args:
            save_weekly (bool): Save data after each week
            max_weeks (int): Limit number of weeks (for testing)
        """
        
        print("🏆 COLLECTING FULL SEASON REAL MADRID SENTIMENT DATA")
        print("=" * 60)
        
        # Get all weeks
        all_weeks = self.get_season_weeks()
        
        if max_weeks:
            all_weeks = all_weeks[:max_weeks]
            print(f"⚠️  Limited to first {max_weeks} weeks for testing")
        
        print(f"📅 Total weeks to process: {len(all_weeks)}")
        
        # Collect data for each week
        all_historical_data = []
        
        for i, week_info in enumerate(all_weeks, 1):
            print(f"\n🔄 Processing week {i}/{len(all_weeks)}...")
            
            try:
                week_data = self.collect_weekly_data(week_info)
                
                if week_data:
                    # Analyze sentiment immediately
                    week_analyzed = self.analyze_historical_sentiment(week_data)
                    all_historical_data.extend(week_analyzed)
                    
                    # Save weekly if requested
                    if save_weekly and week_analyzed:
                        week_filename = f"week_{week_info['week_number']:02d}_{week_info['start_date'].strftime('%Y%m%d')}"
                        self.save_historical_data(week_analyzed, filename=week_filename)
                
                # Rate limiting between weeks
                time.sleep(1)
                
            except Exception as e:
                print(f"❌ Error processing week {week_info['week_number']}: {e}")
                continue
        
        print(f"\n✅ COLLECTION COMPLETE!")
        print(f"📊 Total posts collected: {len(all_historical_data)}")
        
        if all_historical_data:
            # Generate full season report
            report, df = self.generate_historical_report(all_historical_data)
            
            # Save full season data
            self.save_historical_data(all_historical_data, report, "full_season_sentiment")
            
            return all_historical_data, report
        
        return [], None

# Main execution
def main():
    """Interactively collect and report historical Real Madrid sentiment.

    Prompts for the collection scope (full season, last 4 weeks, or a custom
    week range), runs the collection and prints a short summary.

    Reddit credentials are read from the environment variables
    REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET and REDDIT_USER_AGENT; when a
    variable is unset the original placeholder value is used.
    """
    import os  # local import keeps this function self-contained

    print("⚽ REAL MADRID HISTORICAL WEEKLY SENTIMENT ANALYZER")
    print("🏆 From La Liga & Champions League Season Start")
    print("=" * 60)

    # Reddit API configuration
    reddit_config = {
        'client_id': os.environ.get('REDDIT_CLIENT_ID', 'your_client_id'),
        'client_secret': os.environ.get('REDDIT_CLIENT_SECRET', 'your_client_secret'),
        'user_agent': os.environ.get('REDDIT_USER_AGENT', 'real_madrid_historical_sentiment_v1.0')
    }

    # Initialize analyzer
    analyzer = HistoricalRealMadridSentiment(reddit_config)

    def process_weeks(selected_weeks):
        """Collect, analyze, report on and save an explicit list of weeks."""
        all_data = []
        for week_info in selected_weeks:
            week_data = analyzer.collect_weekly_data(week_info)
            if week_data:
                all_data.extend(analyzer.analyze_historical_sentiment(week_data))

        weeks_report = None
        if all_data:
            weeks_report, _ = analyzer.generate_historical_report(all_data)
            analyzer.save_historical_data(all_data, weeks_report)
        return all_data, weeks_report

    # Show season overview
    weeks = analyzer.get_season_weeks()
    print(f"\n📅 SEASON OVERVIEW:")
    print(f"   🏆 Total weeks available: {len(weeks)}")
    print(f"   📅 Season start: {analyzer.season_start.strftime('%Y-%m-%d')}")
    print(f"   📅 Current week: Week {len(weeks)}")

    # Ask user for collection scope
    print(f"\n🔧 COLLECTION OPTIONS:")
    print(f"1. Full season (all {len(weeks)} weeks) - May take 1-2 hours")
    print(f"2. Last 4 weeks (testing)")
    print(f"3. Specific week range")

    choice = input("Choose option (1-3): ").strip()

    report = None

    if choice == "1":
        # Full season collection
        print(f"\n🚀 Starting full season collection...")
        historical_data, report = analyzer.collect_full_season(save_weekly=True)

    elif choice == "2":
        # The menu promises the LAST four weeks; collect_full_season's
        # max_weeks would take the first four, so slice the weeks here
        print(f"\n🧪 Testing with last 4 weeks...")
        historical_data, report = process_weeks(weeks[-4:])

    elif choice == "3":
        # Custom range
        try:
            start_week = int(input(f"Start week (1-{len(weeks)}): "))
            end_week = int(input(f"End week ({start_week}-{len(weeks)}): "))
        except ValueError:
            print("❌ Week numbers must be whole numbers")
            return

        # Reject ranges outside 1..len(weeks); a start of 0 or less would
        # otherwise be read as a negative slice index
        if not 1 <= start_week <= end_week <= len(weeks):
            print("❌ Invalid week range")
            return

        print(f"\n🎯 Processing weeks {start_week} to {end_week}...")
        historical_data, report = process_weeks(weeks[start_week - 1:end_week])

    else:
        print("❌ Invalid choice")
        return

    if historical_data:
        print(f"\n🎯 ANALYSIS COMPLETE!")
        print(f"📊 Total posts analyzed: {len(historical_data)}")
        if report:
            print(f"📈 Season sentiment: {report['overall_sentiment']:.3f}")
            print(f"🎭 Season mood: {report['season_mood']}")
    else:
        print("❌ No data collected")

if __name__ == "__main__":
    # Setup checklist shown before anything talks to Reddit
    setup_lines = (
        "📋 SETUP FOR HISTORICAL ANALYSIS:",
        "=" * 40,
        "1. Install: pip install praw pandas vaderSentiment",
        "2. Get Reddit API credentials",
        "3. This will collect data from August 2024 to present",
        "4. Full season collection may take 1-2 hours",
        "5. Data is saved weekly for backup",
        "\n" + "=" * 50,
    )
    for setup_line in setup_lines:
        print(setup_line)

    # Only start the collection on an explicit "y"
    answer = input("Ready to start historical collection? (y/n): ")
    if answer.strip().lower() == 'y':
        main()
    else:
        print("Setup your Reddit credentials first, then run the script!")

📋 SETUP FOR HISTORICAL ANALYSIS:
1. Install: pip install praw pandas vaderSentiment
2. Get Reddit API credentials
3. This will collect data from August 2024 to present
4. Full season collection may take 1-2 hours
5. Data is saved weekly for backup

⚽ REAL MADRID HISTORICAL WEEKLY SENTIMENT ANALYZER
🏆 From La Liga & Champions League Season Start
✅ VADER sentiment analyzer loaded

📅 SEASON OVERVIEW:
   🏆 Total weeks available: 48
   📅 Season start: 2024-08-01
   📅 Current week: Week 48

🔧 COLLECTION OPTIONS:
1. Full season (all 48 weeks) - May take 1-2 hours
2. Last 4 weeks (testing)
3. Specific week range

🚀 Starting full season collection...
🏆 COLLECTING FULL SEASON REAL MADRID SENTIMENT DATA
📅 Total weeks to process: 48

🔄 Processing week 1/48...

📅 Week 1: 2024-08-01 to 2024-08-07
🏆 Active competitions: Off-season
   📍 Searching r/realmadrid...
     ✅ Found 0 relevant posts
   📍 Searching r/soccer...
     ✅ Found 0 relevant posts
   📍 Searching r/LaLiga...
     ✅ Found 0 relevant pos