In [2]:
from pymongo import MongoClient
# Connect to the local MongoDB instance that holds the scraped data.
client = MongoClient(host="localhost", port=27017)
db = client["text_mining"]

# Collection handles shared by every later cell.
mongo_posts, mongo_comments = db["posts"], db["comments"]

# Post ids as seen from the posts collection and from the comments collection.
post_ids = mongo_posts.distinct("id")
post_ids2 = mongo_comments.distinct("post_id")

In [3]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Shared analyzer instance used by every scoring cell below.
sid = SentimentIntensityAnalyzer()

# Extra trading-slang entries layered on top of the stock VADER lexicon.
BULLISH, BEARISH = 1, -1
new_words = dict(
    bull=BULLISH,
    bullish=BULLISH,
    moon=BULLISH,
    tank=BEARISH,
    bear=BEARISH,
    bearish=BEARISH,
    calls=BULLISH,
    puts=BEARISH,
    green=BULLISH,
    red=BEARISH,
)
sid.lexicon.update(new_words)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/benjoso/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:

def analyze_comments(comments):
    """Score every comment's text with the shared VADER analyzer.

    Args:
        comments: iterable of mappings that each carry the comment text
            under the "body" key.

    Returns:
        A list with one `sid.polarity_scores` result per comment, in the
        order the comments were supplied.
    """
    return [sid.polarity_scores(entry["body"]) for entry in comments]

def analyze_text(text):
    """Return the VADER polarity scores of a single piece of text.

    Args:
        text: the string to score.

    Returns:
        Whatever `sid.polarity_scores` yields for `text`.
    """
    polarity = sid.polarity_scores(text)
    return polarity

In [5]:
def analyze_score_individual_comments(scores):
    """Classify each post as positive or negative from its per-comment scores.

    Two independent votes are taken for every post:

    * majority vote - are there more positive or more negative comments
      (by the sign of each comment's "compound" value)? Ties count for
      neither side, and neutral comments (compound == 0) do not vote.
    * polarity vote - is the sum of all "compound" values above or below
      zero? A sum of exactly zero counts for neither side.

    Args:
        scores: iterable with one entry per post; each entry is an iterable
            of score mappings that contain a numeric "compound" key.

    Returns:
        A 4-tuple ``(majority_positive_posts, majority_negative_posts,
        polarity_positive_posts, polarity_negative_posts)``.
    """
    total_ammount_positive = 0
    total_ammount_negative = 0
    total_polarity_positive = 0
    total_polarity_negative = 0

    for score in scores:
        # Per-post tallies; each counter needs its own assignment because
        # `a, b = 0` tries to unpack the integer and fails.
        ammount_positive = 0
        ammount_negative = 0
        polarity = 0

        for entry in score:
            compound = entry["compound"]
            if compound > 0:
                ammount_positive += 1
            elif compound < 0:
                ammount_negative += 1
            # compound == 0 is neutral and only contributes to the sum below
            polarity += compound

        if ammount_negative < ammount_positive:
            total_ammount_positive += 1
        elif ammount_positive < ammount_negative:
            total_ammount_negative += 1

        if polarity > 0:
            total_polarity_positive += 1
        elif polarity < 0:
            total_polarity_negative += 1

    return (
        total_ammount_positive,
        total_ammount_negative,
        total_polarity_positive,
        total_polarity_negative,
    )
        
def analyze_score_combined_comments(scores):
    """Count how many combined-text scores lean negative and how many positive.

    Args:
        scores: iterable of score mappings, each with a numeric "compound"
            key.

    Returns:
        ``(total_negative, total_positive)`` - note the negative count comes
        first. Scores whose compound value is exactly zero are counted in
        neither total.
    """
    compounds = [item["compound"] for item in scores]
    total_positive = sum(1 for value in compounds if value > 0)
    total_negative = sum(1 for value in compounds if value < 0)
    return total_negative, total_positive

In [263]:
from datetime import datetime, timedelta
from pytz import timezone
# The cut-off is 09:30 New York time. pytz's "EST" is a fixed UTC-5 zone that
# ignores daylight saving time, which would place the cut-off an hour late
# for most of the year, so a DST-aware zone is used instead.
market_tz = timezone("America/New_York")

scores_individual_comments = []
scores_combined_comments = []
for post_id in post_ids:
    post = mongo_posts.find_one({"id": post_id})
    comments = mongo_comments.find({"post_id": post_id})

    # Opening bell on the calendar day after the post was created. The epoch
    # value is converted straight into market time so the result does not
    # depend on the time zone of the machine running the notebook.
    # NOTE(review): the following calendar day may be a weekend or holiday;
    # this cell does not look up the next actual trading day.
    posted_at = datetime.fromtimestamp(post["created_utc"], tz=market_tz)
    next_day = posted_at.date() + timedelta(days=1)
    next_day_opening = market_tz.localize(
        datetime(next_day.year, next_day.month, next_day.day, 9, 30)
    ).timestamp()

    # Keep only comments written before that opening bell. Both sides of the
    # comparison are epoch seconds, so no datetime round-trip is needed.
    comments_before_opening = [
        comment for comment in comments
        if comment["created_utc"] < next_day_opening
    ]

    # One list of per-comment scores per post, plus one score for the
    # concatenation of all of the post's retained comments.
    scores_individual_comments.append(analyze_comments(comments_before_opening))
    scores_combined_comments.append(
        analyze_text(" ".join([c["body"] for c in comments_before_opening]))
    )

tap, tan, tpp, tpn = analyze_score_individual_comments(scores_individual_comments)

total_n, total_p = analyze_score_combined_comments(scores_combined_comments)



KeyboardInterrupt: 

## Add sentiment score to all comments

In [6]:
# Score every comment of every known post and store the result on the comment
# document under "sentiment_score".
# The post document itself is never read here, so it is no longer fetched -
# that removes one unused query per post.
count = 0
for post_id in post_ids:
    for comment in mongo_comments.find({"post_id": post_id}):
        count += 1
        if count % 1000 == 0:
            print(f"Updating comment number {count}")
        polarity_score = sid.polarity_scores(comment["body"])
        mongo_comments.update_one(
            {"id": comment["id"], "post_id": post_id},
            {"$set": {"sentiment_score": polarity_score}},
        )



Updating comment number 1000
Updating comment number 2000
Updating comment number 3000
Updating comment number 4000
Updating comment number 5000
Updating comment number 6000
Updating comment number 7000
Updating comment number 8000
Updating comment number 9000
Updating comment number 10000
Updating comment number 11000
Updating comment number 12000
Updating comment number 13000
Updating comment number 14000
Updating comment number 15000
Updating comment number 16000
Updating comment number 17000
Updating comment number 18000
Updating comment number 19000
Updating comment number 20000
Updating comment number 21000
Updating comment number 22000
Updating comment number 23000
Updating comment number 24000
Updating comment number 25000
Updating comment number 26000
Updating comment number 27000
Updating comment number 28000
Updating comment number 29000
Updating comment number 30000
Updating comment number 31000
Updating comment number 32000
Updating comment number 33000
Updating comment nu

# ALPHA VANTAGE download

In [None]:
from alpha_vantage.timeseries import TimeSeries
import pandas
import os

# Credentials must not live in the notebook: read the key from the environment.
# The key that used to be hardcoded here has been exposed and should be revoked.
API_KEY = os.environ["ALPHAVANTAGE_API_KEY"]
ts = TimeSeries(key=API_KEY, output_format='pandas')

# The call's result is indexed with [0]; only that first element (the price
# frame) is used.
res = ts.get_daily_adjusted(symbol="SPY", outputsize="full")[0]

# Keep just the adjusted close and give it a short name for the cells below.
res = res[["5. adjusted close"]].rename(columns={'5. adjusted close': "close"})


## Add close price to comments

In [None]:
from datetime import datetime, timedelta
from pytz import timezone

one_day_offset = timedelta(days=1)

# Upper bound on how many calendar days are scanned for a trading day. Without
# a bound the search never terminates for a date outside the price table
# (for example a comment newer than the latest downloaded close).
MAX_TRADING_DAY_GAP = 14

# ISO date string -> adjusted close, built once so each lookup is a dict access
# instead of a scan over the whole price table.
close_by_day = {str(day)[:10]: float(price) for day, price in res["close"].items()}


def _find_trading_day(start, step):
    """Return the first date with a known close, walking from `start` by `step`.

    Args:
        start: `datetime.date` to begin at (inclusive).
        step: `timedelta` added after every miss; negative to walk backwards.

    Returns:
        The matching `datetime.date`, or None when no close is found within
        MAX_TRADING_DAY_GAP steps.
    """
    date = start
    for _ in range(MAX_TRADING_DAY_GAP):
        if str(date) in close_by_day:
            return date
        date = date + step
    return None


skipped_comments = 0
for post_id in post_ids:
    comments = mongo_comments.find({"post_id": post_id})
    for comment in comments:
        # NOTE(review): this is the machine's local calendar date, not the
        # exchange's - confirm whether market time was intended.
        start_day = datetime.fromtimestamp(comment["created_utc"]).date()

        # First trading day strictly after the comment, and the latest trading
        # day on or before it.
        next_trading_day = _find_trading_day(start_day + one_day_offset, one_day_offset)
        previous_trading_day = _find_trading_day(start_day, -one_day_offset)
        if next_trading_day is None or previous_trading_day is None:
            skipped_comments += 1
            continue

        next_close = close_by_day[str(next_trading_day)]
        prev_close = close_by_day[str(previous_trading_day)]

        positive_day = next_close >= prev_close
        # Relative change from the previous close to the next one. Written as
        # next/prev - 1 so that a rising day yields a positive number and
        # agrees with `positive_day` (the former 1 - next/prev had the
        # opposite sign).
        development = next_close / prev_close - 1

        # Single write per comment. "next_trading_day" is stored as well
        # because the second analysis groups comments by that field; it is
        # saved as an ISO date string - presumably any stable key works there,
        # verify against existing data before re-running.
        mongo_comments.update_one(
            {"id": comment["id"], "post_id": post_id},
            {"$set": {
                "spy_closing_price": next_close,
                "next_trading_day": str(next_trading_day),
                "next_trading_day_positive": positive_day,
                "next_trading_day_development": development,
            }},
        )

if skipped_comments:
    print(f"Skipped {skipped_comments} comments with no trading day within {MAX_TRADING_DAY_GAP} days")
   

# Analyze results

## First analysis

In [265]:
# Cross-tabulate every comment: did the market rise on the next trading day,
# and is the comment's compound score above, below or exactly zero?
tallies = {
    (day, mood): 0
    for day in (True, False)
    for mood in ("positive", "negative", "neutral")
}

for comment in mongo_comments.find():
    positive_day = comment["next_trading_day_positive"]
    sentiment = comment["sentiment_score"]["compound"]
    if sentiment > 0:
        mood = "positive"
    elif sentiment < 0:
        mood = "negative"
    else:
        mood = "neutral"
    # Any falsy flag is treated as a negative day.
    tallies[(bool(positive_day), mood)] += 1

# Unpack into the individual names the ratio cell reads.
positive_day_positive_sentiment = tallies[(True, "positive")]
positive_day_negative_sentiment = tallies[(True, "negative")]
positive_day_neutral_sentiment = tallies[(True, "neutral")]
negative_day_positive_sentiment = tallies[(False, "positive")]
negative_day_negative_sentiment = tallies[(False, "negative")]
negative_day_neutral_sentiment = tallies[(False, "neutral")]

print(f"{positive_day_positive_sentiment=}")
print(f"{positive_day_negative_sentiment=}")
print(f"{positive_day_neutral_sentiment=}")
print(f"{negative_day_positive_sentiment=}")
print(f"{negative_day_negative_sentiment=}")
print(f"{negative_day_neutral_sentiment=}")

    
        
    

positive_day_positive_sentiment=287403
positive_day_negative_sentiment=246476
positive_day_neutral_sentiment=303434
negative_day_positive_sentiment=242639
negative_day_negative_sentiment=209308
negative_day_neutral_sentiment=262262


In [266]:
# Share of positive vs. negative comments per kind of day; neutral comments
# are left out of the denominators.
rated_on_positive_days = positive_day_positive_sentiment + positive_day_negative_sentiment
rated_on_negative_days = negative_day_positive_sentiment + negative_day_negative_sentiment

print(f"positive comments on positive days: {positive_day_positive_sentiment / rated_on_positive_days}")
print(f"negative comments on positive days: {positive_day_negative_sentiment / rated_on_positive_days}")
print(f"positive comments on negative days: {negative_day_positive_sentiment / rated_on_negative_days}")
print(f"negative comments on negative days: {negative_day_negative_sentiment / rated_on_negative_days}")


positive comments on positive days: 0.5383298462760289
negative comments on positive days: 0.4616701537239712
positive comments on negative days: 0.5368748990478972
negative comments on negative days: 0.4631251009521028


## Second analysis

In [7]:
# Every distinct value of the "next_trading_day" field across all comments.
# NOTE(review): no visible cell writes this field - confirm where it is set
# before relying on a fresh run.
dates = mongo_comments.distinct("next_trading_day")

In [8]:
# Build one summary per trading day: counts of positive / negative / neutral
# comments, the sum of their compound scores, and the VADER score of all the
# day's comment texts joined together.
result = {}
for date in dates:
    comments = mongo_comments.find({"next_trading_day": date})
    positive_sentiment = 0
    negative_sentiment = 0
    neutral_sentiment = 0
    total_sentiment = 0
    # Reset per date so a value from the previous date can never leak in.
    positive_day = None
    bodies = []
    for comment in comments:
        bodies.append(comment["body"])
        # The flag is read from each comment; the last one read is stored below.
        positive_day = comment["next_trading_day_positive"]
        sentiment = comment["sentiment_score"]["compound"]
        total_sentiment += sentiment
        if sentiment > 0:
            positive_sentiment += 1
        elif sentiment < 0:
            negative_sentiment += 1
        else:
            neutral_sentiment += 1

    # Join once instead of growing a string comment by comment.
    text = " ".join(bodies)
    combined_text_sentiment = sid.polarity_scores(text)
    result[date] = {
        "positive_day": positive_day,
        "positive_sentiment": positive_sentiment,
        "negative_sentiment": negative_sentiment,
        "neutral_sentiment": neutral_sentiment,
        "total_sentiment": total_sentiment,
        "combined_text_sentiment": combined_text_sentiment
    }

    # A day whose comments are all neutral has no positive/negative ratio;
    # report NaN instead of dividing by zero.
    rated_comments = positive_sentiment + negative_sentiment
    sentiment_ratio = positive_sentiment / rated_comments if rated_comments else float("nan")
    print(f"{positive_day=}, sentiment ratio: {sentiment_ratio}, total sentiment: {total_sentiment}, combined text sentiment. {combined_text_sentiment}")

positive_day=True, sentiment ratio: 0.5909090909090909, total sentiment: 10.315999999999999, combined text sentiment. {'neg': 0.129, 'neu': 0.691, 'pos': 0.179, 'compound': 0.9993}
positive_day=False, sentiment ratio: 0.5264437689969604, total sentiment: 31.199599999999855, combined text sentiment. {'neg': 0.145, 'neu': 0.708, 'pos': 0.147, 'compound': 0.9747}
positive_day=True, sentiment ratio: 0.5256140350877193, total sentiment: 31.273799999999973, combined text sentiment. {'neg': 0.128, 'neu': 0.722, 'pos': 0.15, 'compound': 1.0}
positive_day=True, sentiment ratio: 0.5464646464646464, total sentiment: 59.2673999999999, combined text sentiment. {'neg': 0.123, 'neu': 0.73, 'pos': 0.147, 'compound': 1.0}
positive_day=True, sentiment ratio: 0.5522088353413654, total sentiment: 57.59929999999994, combined text sentiment. {'neg': 0.114, 'neu': 0.731, 'pos': 0.155, 'compound': 1.0}
positive_day=True, sentiment ratio: 0.557784911717496, total sentiment: 67.39569999999985, combined text sen

KeyboardInterrupt: 

In [None]:
def _mean_over_days(total, day_count):
    """Return total / day_count, or NaN when no day contributed."""
    return total / day_count if day_count else float("nan")


# Running sums, split by whether the following trading day closed higher.
# Each is turned into a per-day average after the loop.
average_sentiment_positive = 0
average_sentiment_negative = 0
average_sentiment_neutral = 0
negative_days = 0
positive_days = 0

average_combined_sentiment_positive_days = 0
average_combined_sentiment_negative_days = 0

total_sentiment_positive_days = 0
total_sentiment_negative_days = 0

for date in result:
    positive_day = result[date]["positive_day"]
    positive_sentiment = result[date]["positive_sentiment"]
    negative_sentiment = result[date]["negative_sentiment"]
    neutral_sentiment = result[date]["neutral_sentiment"]
    total_sentiment = result[date]["total_sentiment"]
    combined_text_sentiment = result[date]["combined_text_sentiment"]

    # Days without a single positive or negative comment have no ratio and
    # are left out of every average.
    rated_comments = positive_sentiment + negative_sentiment
    if rated_comments > 0:
        positive_ratio = positive_sentiment / rated_comments
        mean_compound = total_sentiment / (rated_comments + neutral_sentiment)
        if positive_day:
            positive_days += 1
            average_sentiment_positive += positive_ratio
            average_combined_sentiment_positive_days += combined_text_sentiment["compound"]
            total_sentiment_positive_days += mean_compound
        else:
            negative_days += 1
            average_sentiment_negative += positive_ratio
            average_combined_sentiment_negative_days += combined_text_sentiment["compound"]
            total_sentiment_negative_days += mean_compound

# Turn the sums into per-day averages. If one kind of day never occurred
# (for example when `result` only holds a partial run) the average is NaN
# rather than a ZeroDivisionError.
average_sentiment_positive = _mean_over_days(average_sentiment_positive, positive_days)
average_sentiment_negative = _mean_over_days(average_sentiment_negative, negative_days)

average_combined_sentiment_positive_days = _mean_over_days(average_combined_sentiment_positive_days, positive_days)
average_combined_sentiment_negative_days = _mean_over_days(average_combined_sentiment_negative_days, negative_days)

total_sentiment_positive_days = _mean_over_days(total_sentiment_positive_days, positive_days)
total_sentiment_negative_days = _mean_over_days(total_sentiment_negative_days, negative_days)

print(f"Ammount of positive days: {positive_days}")
print(f"Ammount of negative days: {negative_days}")
print()
print(f"average ratio of positive comments on a positive day: {average_sentiment_positive}")
print(f"average ratio of positive comments on a negative day: {average_sentiment_negative}")
print()
print(f"average sentiment on a positive day: {total_sentiment_positive_days}")
print(f"average sentiment on a negative day: {total_sentiment_negative_days}")
print()
print(f"average combined sentiment on positive days {average_combined_sentiment_positive_days}")
print(f"average combined sentiment on negative days {average_combined_sentiment_negative_days}")

# Trying other methods

## 1. Preprocessing

In [34]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Lemmatise every comment and store the purely alphabetic lemmas on the
# comment document under "preprocessed_comment".
count = 0
for count, comment in enumerate(mongo_comments.find(), start=1):
    # Drop Reddit user and subreddit mentions before parsing.
    words = [
        word for word in comment['body'].split()
        if not word.startswith(('/u/', '/r/'))
    ]

    doc = nlp(" ".join(words))
    # Keep a lemma only when it consists of letters alone.
    processed = [token.lemma_ for token in doc if token.lemma_.isalpha()]

    selector = {"id": comment["id"], "post_id": comment["post_id"]}
    mongo_comments.update_one(selector, {"$set": {"preprocessed_comment": processed}})

    # Show a sample every 100 comments as a progress indicator.
    if not count % 100:
        print(f"comment number {count}: {processed}")

    #processed = [token.lemma_ for token in doc if token.lemma_.isalpha() and not (token.is_space or token.like_num or token.like_url or str(token).startswith('/u/')) ]
    
    

comment number 100: ['europe', 'and', 'asia', 'wait', 'til', 'american', 'buyer', 'come', 'in']
comment number 200: ['that', 's', 'complete', 'bullshit', 'mean', 'how', 'do', 'justify', 'sell', 'a', 'day', 'that', 'be', 'not', 'even', 'use', 'the', 'option', 'should', 'either', 'be', 'x', 'percent', 'higher', 'or', 'lower', 'that', 'week', 'and', 'adjust', 'accordingly', 'once', 'hit', 'that', 'holiday', 'or', 'off', 'day', 'that', 'should', 'not', 'be', 'allow', 'but', 'in', 'the', 'end', 'why', 'be', 'option', 'buyer', 'buy', 'week', 'know', 'THEY', 'are', 'short', 'change', 'a', 'full', 'day', 'that', 'make', 'mad', 'will', 'never', 'buy', 'a', 'short', 'week', 'option', 'again', 'but', 'problem', 'be', 'do', 'not', 'cut', 'loss', 'early', 'so', 'that', 'option', 'do', 'not', 'work', 'for', 'usually', 'hold', 'for', 'most', 'of', 'the', 'day', 'or', 'average', 'down', 'before', 'sell', 'so', 'that', 'option', 'really', 'be', 'not', 'in', 'book', 'well', 'just', 'hope', 'to', 'close'