<a href="https://colab.research.google.com/github/Azaidi317/LLM-Finetuning-Projects/blob/main/stock_sentiment_using_Microsoft_phi2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install praw

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.3/189.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0


In [None]:
import praw
import pandas as pd
from datetime import datetime
from transformers import pipeline
from tqdm import tqdm
import time
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import yfinance as yf

class AdvancedStockAnalyzer:
    def __init__(self, client_id, client_secret, user_agent):
        # Initialize Reddit API
        self.reddit = praw.Reddit(
            client_id='Nr1OwEqV_a8GVY3_jxU9-w',
            client_secret='sO0micrWKauECqX7bkR4ztxtLxtsEA',
            user_agent='stock_bot/1.0 by Same_Can_7313',
            check_for_async=False
        )

        # Initialize advanced sentiment analyzer
        # Using BloombergGPT fine-tuned for financial analysis
        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
        self.model = AutoModelForSequenceClassification.from_pretrained(
            "microsoft/phi-2",
            num_labels=5,  # Strong Bearish, Bearish, Neutral, Bullish, Strong Bullish
            trust_remote_code=True
        )

        # Move model to GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Custom prompts for better analysis
        self.prompt_template = """
        Analyze the following text about {stock_symbol} stock and determine the sentiment:

        Text: {text}

        Consider the following factors:
        1. Overall market sentiment
        2. Financial metrics mentioned
        3. Technical analysis indicators
        4. Company-specific news
        5. Industry trends

        Classify the sentiment as one of:
        - Strong Bearish (-2)
        - Bearish (-1)
        - Neutral (0)
        - Bullish (1)
        - Strong Bullish (2)

        Also extract key points that support this sentiment.
        """

    def get_stock_data(self, symbol, period="1mo"):
        """Get recent stock data for context"""
        try:
            stock = yf.Ticker(symbol)
            history = stock.history(period=period)
            return history
        except Exception as e:
            print(f"Error getting stock data: {str(e)}")
            return None

    def analyze_text_sentiment(self, text, stock_symbol):
        """Advanced sentiment analysis with context"""
        try:
            # Format prompt with context
            formatted_prompt = self.prompt_template.format(
                stock_symbol=stock_symbol,
                text=text
            )

            # Tokenize and analyze
            inputs = self.tokenizer(
                formatted_prompt,
                return_tensors="pt",
                truncation=True,
                max_length=512
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

            # Map predictions to sentiments
            sentiment_map = {
                0: "Strong Bearish",
                1: "Bearish",
                2: "Neutral",
                3: "Bullish",
                4: "Strong Bullish"
            }

            sentiment_idx = torch.argmax(predictions).item()
            confidence = predictions[0][sentiment_idx].item()

            # Calculate sentiment score (-2 to 2)
            sentiment_score = sentiment_idx - 2

            return {
                'label': sentiment_map[sentiment_idx],
                'score': sentiment_score,
                'confidence': confidence
            }

        except Exception as e:
            print(f"Error analyzing sentiment: {str(e)}")
            return {
                'label': "Neutral",
                'score': 0,
                'confidence': 0
            }

    def analyze_with_market_context(self, text, stock_symbol, stock_data=None):
        """Analyze sentiment with market context"""
        # Get basic sentiment
        base_sentiment = self.analyze_text_sentiment(text, stock_symbol)

        if stock_data is not None:
            # Calculate recent price trend
            recent_return = (
                stock_data['Close'][-1] / stock_data['Close'][0] - 1
            ) * 100

            # Adjust sentiment based on market context
            if abs(recent_return) > 10:  # Significant price movement
                if recent_return > 0:
                    base_sentiment['score'] *= 1.2  # Amplify positive sentiment
                else:
                    base_sentiment['score'] *= 0.8  # Dampen negative sentiment

        return base_sentiment

    def collect_and_analyze(self, subreddit_name="wallstreetbets",
                          stock_symbol="NVDA", limit=100):
        """Collect posts and perform advanced analysis"""
        try:
            # Get stock data for context
            stock_data = self.get_stock_data(stock_symbol)

            # Access subreddit
            subreddit = self.reddit.subreddit(subreddit_name)
            posts_data = []

            print(f"Analyzing posts about {stock_symbol} from r/{subreddit_name}...")

            for post in tqdm(subreddit.search(stock_symbol, limit=limit, sort='new')):
                # Combine title and text
                full_text = f"{post.title} {post.selftext}"

                # Analyze sentiment with market context
                sentiment = self.analyze_with_market_context(
                    full_text,
                    stock_symbol,
                    stock_data
                )

                post_data = {
                    'title': post.title,
                    'text': post.selftext,
                    'score': post.score,
                    'created_utc': datetime.fromtimestamp(post.created_utc),
                    'num_comments': post.num_comments,
                    'url': f"https://reddit.com{post.permalink}",
                    'sentiment': sentiment['label'],
                    'sentiment_score': sentiment['score'],
                    'confidence': sentiment['confidence']
                }
                posts_data.append(post_data)
                time.sleep(0.5)

            # Create DataFrame
            df = pd.DataFrame(posts_data)

            # Calculate weighted sentiment score
            df['weighted_sentiment'] = df['sentiment_score'] * df['score'] * df['confidence']
            overall_sentiment = df['weighted_sentiment'].sum() / df['score'].sum()

            # Save results
            filename = f"{stock_symbol}_advanced_sentiment.csv"
            df.to_csv(filename, index=False)

            return df, overall_sentiment

        except Exception as e:
            print(f"Error in analysis: {str(e)}")
            return None, None

# Example usage
if __name__ == "__main__":
    # Initialize analyzer
    analyzer = AdvancedStockAnalyzer(
        client_id="your_client_id",
        client_secret="your_client_secret",
        user_agent="AdvancedStockBot/1.0"
    )

    # Analyze stock
    df, overall_sentiment = analyzer.collect_and_analyze(
        subreddit_name="wallstreetbets",
        stock_symbol="NVDA",
        limit=50
    )

    if df is not None:
        print("\nAnalysis Results:")
        print(f"Overall Sentiment Score: {overall_sentiment:.2f}")
        print("\nSentiment Distribution:")
        print(df['sentiment'].value_counts(normalize=True))

        print("\nMost Impactful Posts:")
        impact_posts = df.nlargest(5, 'weighted_sentiment')
        for _, post in impact_posts.iterrows():
            print(f"\nTitle: {post['title']}")
            print(f"Sentiment: {post['sentiment']} (Score: {post['sentiment_score']:.2f})")
            print(f"Confidence: {post['confidence']:.2f}")
            print(f"Post Score: {post['score']}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Analyzing posts about NVDA from r/wallstreetbets...


  stock_data['Close'][-1] / stock_data['Close'][0] - 1
50it [00:32,  1.55it/s]


Analysis Results:
Overall Sentiment Score: 0.98

Sentiment Distribution:
sentiment
Strong Bullish    0.88
Bearish           0.10
Strong Bearish    0.02
Name: proportion, dtype: float64

Most Impactful Posts:

Title: YOLO $35k into 1.1MM to save the family farm. 
Sentiment: Strong Bullish (Score: 2.00)
Confidence: 0.65
Post Score: 12004

Title: I ded it agan
Sentiment: Strong Bullish (Score: 2.00)
Confidence: 0.40
Post Score: 3805

Title: Started taking NVidia gains after 12 years
Sentiment: Strong Bullish (Score: 2.00)
Confidence: 0.44
Post Score: 3003

Title: Mega Bull is about to be unleashed
Sentiment: Strong Bullish (Score: 2.00)
Confidence: 0.34
Post Score: 2708

Title: I shorted MSTR; no such thing as infinite money glitch 
Sentiment: Strong Bullish (Score: 2.00)
Confidence: 0.49
Post Score: 1074





Code to read it from all subreddits, not just one!

In [8]:
import praw
import pandas as pd
from datetime import datetime, timedelta
from transformers import pipeline
from tqdm import tqdm
import time

class RedditStockAnalyzer:
    def __init__(self, client_id, client_secret, user_agent):
        # Initialize Reddit API
        self.reddit = praw.Reddit(
            client_id='Nr1OwEqV_a8GVY3_jxU9-w',
            client_secret='sO0micrWKauECqX7bkR4ztxtLxtsEA',
            user_agent='stock_bot/1.0 by Same_Can_7313',
            check_for_async=False
        )

        # Initialize sentiment analyzer
        self.sentiment_analyzer = pipeline(
            "sentiment-analysis",
            model="ProsusAI/finbert"
        )

        # Popular finance-related subreddits to check individually
        self.finance_subreddits = [
            "wallstreetbets", "stocks", "investing", "stockmarket",
            "options", "pennystocks", "cryptocurrency", "finance",
            "business", "SecurityAnalysis", "algotrading"
        ]

    def analyze_text_sentiment(self, text):
        """Analyze sentiment of text"""
        try:
            if not text or pd.isna(text):
                return {'label': 'neutral', 'score': 0.0}
            result = self.sentiment_analyzer(text[:512])[0]
            return result
        except Exception as e:
            print(f"Sentiment analysis error: {str(e)}")
            return {'label': 'neutral', 'score': 0.0}

    def search_all_reddit(self, stock_symbol, time_filter='week', limit=100):
        """Search all of Reddit for stock mentions"""
        all_posts = []

        try:
            print(f"Searching all Reddit for {stock_symbol}...")

            # Search r/all
            for submission in tqdm(self.reddit.subreddit("all").search(
                f'"{stock_symbol}"', # Exact match search
                sort='hot',
                time_filter=time_filter,
                limit=limit
            )):
                # Basic filtering to avoid false positives
                if self._is_valid_stock_mention(submission.title + " " + submission.selftext, stock_symbol):
                    post_data = self._extract_post_data(submission, stock_symbol, "r/all")
                    all_posts.append(post_data)

            # Search specific finance subreddits
            for subreddit in self.finance_subreddits:
                try:
                    print(f"\nSearching r/{subreddit}...")
                    subreddit_posts = self.reddit.subreddit(subreddit).search(
                        f'"{stock_symbol}"',
                        sort='hot',
                        time_filter=time_filter,
                        limit=int(limit/2)  # Smaller limit for individual subreddits
                    )

                    for submission in subreddit_posts:
                        if self._is_valid_stock_mention(submission.title + " " + submission.selftext, stock_symbol):
                            post_data = self._extract_post_data(submission, stock_symbol, subreddit)
                            all_posts.append(post_data)

                except Exception as e:
                    print(f"Error searching r/{subreddit}: {str(e)}")
                    continue

                time.sleep(0.5)  # Respect rate limits

            # Convert to DataFrame and remove duplicates
            df = pd.DataFrame(all_posts)
            df = df.drop_duplicates(subset=['id'])

            # Calculate sentiment statistics
            stats = self._calculate_statistics(df)

            # Save results
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{stock_symbol}_reddit_all_{timestamp}.csv"
            df.to_csv(filename, index=False)
            print(f"\nData saved to {filename}")

            return df, stats

        except Exception as e:
            print(f"Error searching Reddit: {str(e)}")
            return None, None

    def _is_valid_stock_mention(self, text, symbol):
        """Check if stock mention is likely valid"""
        # Convert text to uppercase for comparison
        text = text.upper()
        symbol = symbol.upper()

        # Check for exact match with word boundaries
        if f" {symbol} " in f" {text} ":
            # Additional checks to avoid false positives
            common_words = ["THE", "AND", "BUT", "FOR", "ARE"]
            if symbol in common_words:
                return False

            # Check for context
            financial_terms = ["STOCK", "SHARE", "PRICE", "MARKET", "TRADE", "BUY", "SELL", "$"]
            return any(term in text for term in financial_terms)

        return False

    def _extract_post_data(self, submission, symbol, subreddit):
        """Extract relevant data from a submission"""
        # Combine title and text for sentiment analysis
        full_text = f"{submission.title} {submission.selftext}"
        sentiment = self.analyze_text_sentiment(full_text)

        return {
            'id': submission.id,
            'title': submission.title,
            'text': submission.selftext,
            'subreddit': subreddit,
            'score': submission.score,
            'upvote_ratio': submission.upvote_ratio,
            'num_comments': submission.num_comments,
            'created_utc': datetime.fromtimestamp(submission.created_utc),
            'url': f"https://reddit.com{submission.permalink}",
            'sentiment': sentiment['label'],
            'sentiment_score': sentiment['score'],
            'symbol': symbol
        }

    def _calculate_statistics(self, df):
        """Calculate various statistics about the posts"""
        stats = {
            'total_posts': len(df),
            'unique_subreddits': df['subreddit'].nunique(),
            'total_comments': df['num_comments'].sum(),
            'avg_score': df['score'].mean(),
            'sentiment_distribution': df['sentiment'].value_counts().to_dict(),
            'top_subreddits': df['subreddit'].value_counts().head(5).to_dict(),
            'high_impact_posts': df.nlargest(5, 'score')[
                ['title', 'subreddit', 'sentiment', 'score', 'url']
            ].to_dict('records')
        }
        return stats

def print_analysis(symbol, df, stats):
    """Print analysis results in a readable format"""
    print(f"\nAnalysis Results for {symbol}")
    print("=" * 50)

    print(f"\nTotal Posts Found: {stats['total_posts']}")
    print(f"Unique Subreddits: {stats['unique_subreddits']}")
    print(f"Total Comments: {stats['total_comments']}")
    print(f"Average Score: {stats['avg_score']:.2f}")

    print("\nSentiment Distribution:")
    for sentiment, count in stats['sentiment_distribution'].items():
        percentage = (count / stats['total_posts']) * 100
        print(f"{sentiment}: {count} posts ({percentage:.1f}%)")

    print("\nTop Subreddits:")
    for subreddit, count in stats['top_subreddits'].items():
        print(f"{subreddit}: {count} posts")

    print("\nMost Impactful Posts:")
    for post in stats['high_impact_posts']:
        print(f"\nTitle: {post['title']}")
        print(f"Subreddit: {post['subreddit']}")
        print(f"Score: {post['score']}")
        print(f"Sentiment: {post['sentiment']}")
        print(f"URL: {post['url']}")

# Example usage
if __name__ == "__main__":
    # Initialize analyzer
    analyzer = RedditStockAnalyzer(
        client_id="YOUR_CLIENT_ID",
        client_secret="YOUR_CLIENT_SECRET",
        user_agent="StockMentionBot/1.0"
    )

    # Search for stock mentions
    stock_symbol = "NVDA"  # Change to any stock symbol
    df, stats = analyzer.search_all_reddit(
        stock_symbol=stock_symbol,
        time_filter='week',  # Options: hour, day, week, month, year, all
        limit=200
    )

    if df is not None and stats is not None:
        print_analysis(stock_symbol, df, stats)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Searching all Reddit for NVDA...


200it [00:09, 21.59it/s]



Searching r/wallstreetbets...

Searching r/stocks...

Searching r/investing...

Searching r/stockmarket...

Searching r/options...

Searching r/pennystocks...

Searching r/cryptocurrency...

Searching r/finance...

Searching r/business...

Searching r/SecurityAnalysis...

Searching r/algotrading...

Data saved to NVDA_reddit_all_20241122_005735.csv

Analysis Results for NVDA

Total Posts Found: 79
Unique Subreddits: 4
Total Comments: 2211
Average Score: 44.95

Sentiment Distribution:
neutral: 51 posts (64.6%)
positive: 14 posts (17.7%)
negative: 14 posts (17.7%)

Top Subreddits:
r/all: 73 posts
wallstreetbets: 3 posts
options: 2 posts
stockmarket: 1 posts

Most Impactful Posts:

Title: Is RKLB the new NVDA or something? How is this kind of stock rise even possible??
Subreddit: r/all
Score: 723
Sentiment: negative
URL: https://reddit.com/r/wallstreetbets/comments/1gs0tva/is_rklb_the_new_nvda_or_something_how_is_this/

Title: Day 12 of the 1k account challenge 
Subreddit: r/all
Score: 6