In [1]:
import pandas as pd
import re

import seaborn as sns

import matplotlib.pyplot as plt
import strip_markdown
import warnings
warnings.filterwarnings("ignore")

import pyarrow.parquet as pq

import functools

### Data preprocessing
1. Filter comments that contains bot garbe massage. specifically filtering the markdown table format
2. Remove markdown formating string
3. Romove URL
4. Remove returning line "\n"
5. Remove Emoji

In [2]:
import glob
import os
folder_path = "/Users/chelseayeung/Documents/MFIN7036-Group-Project/Analysis/data"
csv_files = glob.glob(os.path.join(folder_path, "*.pq"))

dataframes = []
for csv_file in csv_files:
    df = pd.read_parquet(csv_file)
    file_name = csv_file.split("/")[-1]
    file_name = file_name.split(".pq")[0]
    if "new" in file_name:
        df['subreddit'] = file_name.split("_")[0]
        df['topic'] = "all"
    else:
        df['subreddit'] = "WSB"
        if "bets" in file_name:
            df['topic'] = file_name.split("_")[1]
        else:
            df['topic'] = file_name.split("_")[0]
    dataframes.append(df)

In [3]:
sample = pd.concat(dataframes).drop(['distinguished'],axis=1)
sample = sample.rename(columns={'cbody':'Comment Body','title':'Post Title','selftext':'Post Selftxt'})
sample = sample[~sample['Comment Body'].isna()]

In [7]:
for i in sample['Comment Body'].head(2):
    print(i)

They faked it? Maybe they had my girlfriend help them as she is great at faking it.
Fake it till you make it, right? ![img](emote|t5_2th52|4260)


In [4]:
def remove_urls(text):
    """
    Removes URLs from a given text string.
    """
    return re.sub(r'http\S+', '', text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # dingbats
                               u"\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [5]:
sample['Comment Body'] = sample['Comment Body'].apply(strip_markdown.strip_markdown)  # clean all markdown format
sample['Comment Body'] = sample['Comment Body'].apply(remove_urls) # remove urls
sample['Comment Body'] = sample['Comment Body'].str.strip('\n')

# 去除emoji
sample['Post Title'] = sample['Post Title'].apply(remove_emoji)
sample['Post Selftxt'] = sample['Post Selftxt'].apply(remove_emoji)
sample['Comment Body'] = sample['Comment Body'].apply(remove_emoji)
# sample[['Post Title','Post Selftxt','Comment Body']].to_excel('Demoji.xlsx')

### Sentiment analysis

1. Tokenize comments by LLM (Huggingface transformers)
2. Use Loughran-McDonald Master Dictionary w/ Sentiment Word Lists (which specifically focus on financial wording) https://sraf.nd.edu/loughranmcdonald-master-dictionary/ to assign sentiment scores and extract positive/negative words.

In [6]:
from pysentiment2_updated import LM # updated pysentiment2 with the lastest version of LM dictionary
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# output = {'Positive cnt':[],'Negative cnt':[],'Polarity':[],'Subjectivity':[],'Positive words':[],'Negative words':[],'Tokenized':[]}
lm = LM()
def sentiment_analysis(text):

    tokens = tokenizer.tokenize(text)
    score = lm.get_score(tokens)
    return [score['Positive'],score['Negative'],score['Polarity'],score['Subjectivity'],score['Positive words'],score['Negative words'],score['tokens']]

In [7]:
import time
start = time.time()
output = sample['Comment Body'].apply(sentiment_analysis)
output = pd.DataFrame(output)

def get_item(series,index):
    return series[index]

sample['Positive cnt']=output['Comment Body'].apply(get_item, index=0)
sample['Negative cnt'] = output['Comment Body'].apply(get_item, index=1)
sample['Polarity'] = output['Comment Body'].apply(get_item, index=2)
sample['Subjectivity'] = output['Comment Body'].apply(get_item, index=3)
sample['Positive words'] = output['Comment Body'].apply(get_item, index=4)
sample['Negative words'] = output['Comment Body'].apply(get_item, index=5)
sample['Tokenized'] = output['Comment Body'].apply(get_item, index=6)

Token indices sequence length is longer than the specified maximum sequence length for this model (925 > 512). Running this sequence through the model will result in indexing errors


In [8]:
sample[['id','Post Title','time','Positive cnt','Comment Body','cnum_replies','subreddit', 'topic',\
        'Negative cnt','Polarity','Subjectivity','Positive words','Negative words'\
        ,'Tokenized']].to_parquet('output/data_sample_3w_output.pq')


In [10]:
folder_path = "/Users/chelseayeung/Documents/MFIN7036-Group-Project/Analysis/stock_data"
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

stock_data = []
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    df['company'] = csv_file.split("/")[-1].strip(".csv")
    stock_data.append(df)

In [11]:
stock_price = pd.concat(stock_data)

In [12]:
folder_path = "/Users/chelseayeung/Documents/MFIN7036-Group-Project/Analysis/data"
csv_files = glob.glob(os.path.join(folder_path, "*.pq"))

dataframes = []
for csv_file in csv_files:
    file_name = csv_file.split("/")[-1]
    if file_name[:4]=="bets":
        topic = file_name.strip(".pq").split("_")[1].upper()
        
#     topic = file_name.strip(".pq").split("_")[1]
        df = pd.read_parquet(csv_file)
#     df['subreddit'] = subreddit
#     df['topic'] = topic
        text_data= df['cbody'][:1000].str.cat()
        f = open("output/"+topic+"_wordcloud.txt", "a")
        f.write(text_data)
        f.close()

In [13]:
df = pd.read_parquet('output/data_sample_3w_output.pq')
df = df.rename(columns={'time':'Date'})
df['Date'] = [x.strftime("%Y-%m-%d") for x in df['Date']]
folder_path = "/Users/chelseayeung/Documents/MFIN7036-Group-Project/Analysis/stock_data"
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
stock_data = []
company = []
for csv_file in csv_files:
    stock_df = pd.read_csv(csv_file)[['Date','Close']]
    company_name = csv_file.split("/")[-1].strip(".csv")
    company.append(company_name)
    stock_df = stock_df.rename(columns={"Close":"StockPrice"})
#     stock_df['Date'] = [x.strftime("%Y-%m-%d") for x in df['Date']]

    stock_df['company'] = csv_file.split("/")[-1].strip(".csv")
    stock_data.append(stock_df)
#     df = df.merge(stock_df,how="left",on="Date")
stock_price = pd.concat(stock_data)

In [14]:
stock_price.to_parquet("output/stock_price.parquet")

In [15]:
folder_path = "/Users/chelseayeung/Documents/MFIN7036-Group-Project/Analysis/data"
csv_files = glob.glob(os.path.join(folder_path, "*.pq"))

bet_company = []
for csv_file in csv_files:
    file_name = csv_file.split("/")[-1]
#     if file_name[:4]=="bets":
    company_name = csv_file.split("/")[-1].strip(".pq")
    if "new" not in company_name:
        bet_company.append(company_name.split("_"))
        

Gemini_sample
bets_ai_search
Google_sample
OpenAI_new
Microsoft_sample
bets_nvda_search
AMD_sample
Sora_sample
NVDA_Stock_new
ChatGPT_sample


In [68]:
df = pd.read_parquet("output/data_sample_3w_output.pq") 
words = df[df['subreddit']=="WSB"][['subreddit','topic','Positive words','Negative words']]
topic_words_str = {"topic":[],"pos_word":[],'neg_word':[]}

# topic_data = {
# }
# for i in subreddits:
#     topic_data[i] = list(df[df['subreddit']==i]['Polarity'])
def word_str(list):
    return " ".join(list)
words['pos_str'] = words['Positive words'].apply(word_str)
words['neg_str'] = words['Negative words'].apply(word_str)

for i in set(words['topic']):
    w = words[words['topic']==i]
    pos = word_str(list(w['pos_str']))
    pos = re.sub(r'\s+', ' ', pos)
    neg = word_str(list(w['neg_str']))
    neg = re.sub(r'\s+', ' ', neg)
    topic_words_str['topic'].append(i)
    topic_words_str['pos_word'].append(pos)
    topic_words_str['neg_word'].append(neg)

In [69]:
pd.DataFrame(topic_words_str).to_parquet("output/wsb_topic_words.parquet")

In [70]:
pd.DataFrame(topic_words_str)

Unnamed: 0,topic,pos_word,neg_word
0,Microsoft,leadership reward better great superior easie...,break bad bubble bad fail shut stop lose ques...
1,AMD,enjoy profit profit reward better profit prof...,er loss contract bad lose correct ill er lose...
2,ChatGPT,better great loyal better better great better...,ill bad stop shut fake bad argument slow argu...
3,ai,profit better boom lead progress profit stron...,bubble bubble bubble bubble bubble bubble los...
4,Sora,great dream revolution lead superior better p...,worst lose bubble bubble fake bankrupt conten...
5,Gemini,great better profit better better revolution b...,fake fake lie fraud fake stop fake lose lose l...
6,nvda,profit profit great profit profit great profi...,resign scandal drop drop lose contract questi...
7,Google,better win profit win win great better streng...,content suffer stop content loss bad fire pro...
