In [1]:
import pandas as pd
import re

import seaborn as sns

import matplotlib.pyplot as plt
import strip_markdown
import warnings
warnings.filterwarnings("ignore")

import pyarrow.parquet as pq

### Data preprocessing
1. Filter out comments that contain bot-generated garbage messages, specifically those using the markdown table format
2. Remove markdown formatting
3. Remove URLs
4. Remove line breaks ("\n")
5. Remove Emoji

In [5]:
import glob
import os
folder_path = "/Users/chelseayeung/Documents/MFIN7036-Group-Project/Analysis/data"
# NOTE: despite the variable name, these are parquet ("*.pq") files.
csv_files = glob.glob(os.path.join(folder_path, "*.pq"))


def source_labels(path):
    """Derive the (subreddit, topic) labels encoded in a data file's name.

    The directory part and the extension of `path` are discarded with
    `os.path` helpers, so the result does not depend on the path separator
    used by the platform.

    Rules, applied to the remaining file stem:
    * stem contains "new"  -> subreddit is the text before the first "_",
      topic is "all";
    * otherwise subreddit is "WSB" and the topic is the second "_"-separated
      part when the stem contains "bets", else the first part.

    Raises IndexError when a "bets" stem has no "_"-separated second part.
    """
    file_name = os.path.splitext(os.path.basename(path))[0]
    parts = file_name.split("_")
    if "new" in file_name:
        return parts[0], "all"
    topic = parts[1] if "bets" in file_name else parts[0]
    return "WSB", topic


# Load every file and tag its rows with the labels taken from the file name.
dataframes = []
for csv_file in csv_files:
    df = pd.read_parquet(csv_file)
    df['subreddit'], df['topic'] = source_labels(csv_file)
    dataframes.append(df)

In [6]:
# Stack the per-file frames, drop the 'distinguished' column and switch the
# three text columns to readable names.
column_labels = {'cbody':'Comment Body','title':'Post Title','selftext':'Post Selftxt'}
sample = (
    pd.concat(dataframes)
    .drop(columns=['distinguished'])
    .rename(columns=column_labels)
)
# Keep only the rows whose comment body is present.
sample = sample[sample['Comment Body'].notna()]
sample

Unnamed: 0,id,Post Title,author,time,Post Selftxt,score,upvote_ratio,num_comments,num_top_level_comments,subreddit_id,...,clink_id_pid,ctime,Comment Body,cdistinguished,cscore,cnum_replies,csubreddit_id_t,csubreddit_id_sid,subreddit,topic
1,18dkkml,Google's best Gemini demo was faked,xoxoxoxoxo,2023-12-08,It does amaze me how Alphabet dropped the ball...,831,0.89,225,46,2th52,...,18dkkml,2023-12-08,They faked it? Maybe they had my girlfriend he...,,576.0,15.0,t5,2th52,WSB,Gemini
2,18dkkml,Google's best Gemini demo was faked,xoxoxoxoxo,2023-12-08,It does amaze me how Alphabet dropped the ball...,831,0.89,225,46,2th52,...,18dkkml,2023-12-08,"Fake it till you make it, right? ![img](emote|...",,210.0,16.0,t5,2th52,WSB,Gemini
3,18dkkml,Google's best Gemini demo was faked,xoxoxoxoxo,2023-12-08,It does amaze me how Alphabet dropped the ball...,831,0.89,225,46,2th52,...,18dkkml,2023-12-08,That is not quacktastic,,65.0,2.0,t5,2th52,WSB,Gemini
4,18dkkml,Google's best Gemini demo was faked,xoxoxoxoxo,2023-12-08,It does amaze me how Alphabet dropped the ball...,831,0.89,225,46,2th52,...,18dkkml,2023-12-08,Oof. To have to make shit up to look impressiv...,,122.0,23.0,t5,2th52,WSB,Gemini
5,18dkkml,Google's best Gemini demo was faked,xoxoxoxoxo,2023-12-08,It does amaze me how Alphabet dropped the ball...,831,0.89,225,46,2th52,...,18dkkml,2023-12-09,Sounds like fraud to me. To manipulate invest...,,7.0,0.0,t5,2th52,WSB,Gemini
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21410,zv32ac,Lumber prices back to normal levels,,2022-12-26,,11016,0.98,659,188,2th52,...,zv32ac,2022-12-26,We always said that about gas stations. The s...,,4.0,0.0,t5,2th52,WSB,ChatGPT
21411,zv32ac,Lumber prices back to normal levels,,2022-12-26,,11016,0.98,659,188,2th52,...,zv32ac,2022-12-26,Go cut a tree down and make lumber yourself th...,,1.0,2.0,t5,2th52,WSB,ChatGPT
21412,zv32ac,Lumber prices back to normal levels,,2022-12-26,,11016,0.98,659,188,2th52,...,zv32ac,2022-12-26,I would fucking love to get my hands on some e...,,2.0,0.0,t5,2th52,WSB,ChatGPT
21413,zv32ac,Lumber prices back to normal levels,,2022-12-26,,11016,0.98,659,188,2th52,...,zv32ac,2022-12-26,I fucking did. Stole a Chrismas tree… \nI am M...,,6.0,0.0,t5,2th52,WSB,ChatGPT


In [7]:
def remove_urls(text):
    """
    Strip URL-like runs from a string.

    Every run made of the literal "http" plus the non-whitespace characters
    that immediately follow it is deleted; surrounding whitespace is kept.
    """
    url_run = re.compile(r'http\S+')
    return url_run.sub('', text)

# Compiled once so the character class is not rebuilt on every call.
# The symbol ranges are listed individually on purpose: a single span from
# U+24C2 to U+1F251 would also cover CJK ideographs, kana, Hangul and most
# other BMP scripts, deleting ordinary text along with the emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled M
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U0000FE0F"             # variation selector-16 left behind by emoji
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs, flags
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """
    Return `text` with emoji and pictographic symbols removed.

    Only characters inside the symbol ranges of `_EMOJI_PATTERN` are deleted;
    letters of any script (including CJK) are left untouched.

    Values that are not `str` (for example None or NaN coming from a
    DataFrame column) are returned unchanged instead of raising TypeError.
    """
    if not isinstance(text, str):
        return text
    return _EMOJI_PATTERN.sub('', text)

In [8]:
# Comment bodies: clean the markdown formatting first, then delete URL runs.
sample['Comment Body'] = sample['Comment Body'].apply(strip_markdown.strip_markdown)
sample['Comment Body'] = sample['Comment Body'].apply(remove_urls)
# Trim leading/trailing newlines, then turn the remaining (interior) ones into
# spaces. str.strip('\n') alone only touches the two ends of each string, and
# replacing with a space keeps words on adjacent lines from being glued together.
sample['Comment Body'] = (
    sample['Comment Body']
    .str.strip('\n')
    .str.replace('\n', ' ', regex=False)
)

# Remove emoji from every free-text column.
for text_column in ('Post Title', 'Post Selftxt', 'Comment Body'):
    sample[text_column] = sample[text_column].apply(remove_emoji)
# sample[['Post Title','Post Selftxt','Comment Body']].to_excel('Demoji.xlsx')

### Sentiment analysis

1. Tokenize comments with a Hugging Face `transformers` tokenizer (`bert-base-uncased`)
2. Use the Loughran-McDonald Master Dictionary with Sentiment Word Lists (which focuses specifically on financial vocabulary, https://sraf.nd.edu/loughranmcdonald-master-dictionary/) to assign sentiment scores and extract positive/negative words.

In [9]:
from pysentiment2_updated import LM # updated pysentiment2 with the lastest version of LM dictionary
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer obtained through AutoTokenizer.from_pretrained for "bert-base-uncased".
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# One empty list per result column.
# NOTE(review): `output` is rebound when sentiment_analysis is applied to the
# comment bodies, so this dict looks vestigial — confirm before relying on it.
output = {
    column: []
    for column in ('Positive cnt', 'Negative cnt', 'Polarity', 'Subjectivity',
                   'Positive words', 'Negative words', 'Tokenized')
}
# Scorer from pysentiment2_updated (LM dictionary).
lm = LM()
def sentiment_analysis(text):
    """
    Score one piece of text with the module-level `tokenizer` and `lm`.

    Returns a 7-item list: the 'Positive', 'Negative', 'Polarity',
    'Subjectivity', 'Positive words' and 'Negative words' entries of
    `lm.get_score(tokens)`, in that order, followed by the token list itself.
    """
    tokens = tokenizer.tokenize(text)
    scores = lm.get_score(tokens)
    score_keys = ('Positive', 'Negative', 'Polarity', 'Subjectivity',
                  'Positive words', 'Negative words')
    return [scores[key] for key in score_keys] + [tokens]

In [10]:
import time
# Time the scoring pass over every comment body (wall-clock seconds are printed).
start = time.time()
# One result list per comment, kept as a single-column frame named 'Comment Body'.
output = sample['Comment Body'].apply(sentiment_analysis).to_frame()
end = time.time()
print(end - start)

Token indices sequence length is longer than the specified maximum sequence length for this model (925 > 512). Running this sequence through the model will result in indexing errors


341.16359996795654


In [11]:
def get_item(series,index):
    """Look up `index` in `series` with the subscript operator and return the result."""
    value = series[index]
    return value

# Each cell of output['Comment Body'] is the list returned by
# sentiment_analysis; copy list position i into the i-th column named below.
result_columns = ['Positive cnt', 'Negative cnt', 'Polarity', 'Subjectivity',
                  'Positive words', 'Negative words', 'Tokenized']
for position, result_column in enumerate(result_columns):
    sample[result_column] = output['Comment Body'].apply(get_item, index=position)

In [12]:
# Columns written to the parquet file, in this order.
export_columns = [
    'id', 'Post Title', 'time', 'Positive cnt', 'Comment Body', 'cnum_replies',
    'subreddit', 'topic', 'Negative cnt', 'Polarity', 'Subjectivity',
    'Positive words', 'Negative words', 'Tokenized',
]
sample[export_columns].to_parquet('output/data_sample_3w_output.pq')


In [None]:
# Display the frame, which now carries the sentiment columns assigned above.
sample

In [None]:
# 20-bin histogram with a KDE overlay of the per-comment 'Polarity' values.
fig, ax = plt.subplots()
sns.histplot(sample['Polarity'], bins=20, kde=True, ax=ax)
ax.set_xlabel("sentiment score of comment")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of sentiment score")
plt.show()

In [None]:
# 20-bin histogram with a KDE overlay of the per-comment 'Subjectivity' values.
fig, ax = plt.subplots()
sns.histplot(sample['Subjectivity'], bins=20, kde=True, ax=ax)
ax.set(
    xlabel="subjectivity of comments",
    ylabel="Frequency",
    title="Distribution of subjectivity",
)
plt.show()

In [None]:
folder_path = "/Users/chelseayeung/Documents/MFIN7036-Group-Project/Analysis/stock_data"
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

# One frame per price file, tagged with the company name taken from the file name.
stock_data = []
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    # splitext removes exactly the ".csv" extension. str.strip(".csv") strips any
    # leading/trailing '.', 'c', 's' or 'v' characters instead, so a file such as
    # "csco.csv" would be labelled "o". basename also avoids assuming "/" separators.
    df['company'] = os.path.splitext(os.path.basename(csv_file))[0]
    stock_data.append(df)

In [None]:
# Stack the per-company frames collected in `stock_data` into one table.
stock_price = pd.concat(stock_data)

In [None]:
# Read the ChatGPT sample file and join its first 1000 'cbody' values into
# one string.
table = pq.read_table('data/Chatgpt_sample.pq').to_pandas()
first_comments = table['cbody'][:1000]
text_data = first_comments.str.cat()

In [None]:
folder_path = "/Users/chelseayeung/Documents/MFIN7036-Group-Project/Analysis/data"
csv_files = glob.glob(os.path.join(folder_path, "*.pq"))

# NOTE(review): nothing in this cell reads `dataframes`; the reset is kept only
# so that module state after the cell is unchanged.
dataframes = []
# For each file whose name starts with "bets", append the text of its first
# 1000 comments to output/<TOPIC>_wordcloud.txt.
for csv_file in csv_files:
    file_name = os.path.basename(csv_file)
    if not file_name.startswith("bets"):
        continue
    # splitext removes exactly the ".pq" extension. str.strip(".pq") strips any
    # leading/trailing '.', 'p' or 'q' characters instead, e.g. "bets_app.pq"
    # would become "bets_a".
    topic = os.path.splitext(file_name)[0].split("_")[1].upper()

    df = pd.read_parquet(csv_file)
    text_data = df['cbody'][:1000].str.cat()
    # Append mode is kept, so re-running the cell adds the text again.
    # `with` closes the handle even if the write fails, and the explicit
    # encoding keeps non-ASCII comment text writable on every platform.
    with open("output/"+topic+"_wordcloud.txt", "a", encoding="utf-8") as f:
        f.write(text_data)

In [None]:
df = pd.read_parquet('output/data_sample_3w_output.pq')
df = df.rename(columns={'time':'Date'})
# Format every date as a "YYYY-MM-DD" string.
# NOTE(review): presumably done so the column can be matched against the 'Date'
# column of the price CSVs; no merge is performed in this cell — confirm.
df['Date'] = [x.strftime("%Y-%m-%d") for x in df['Date']]
folder_path = "/Users/chelseayeung/Documents/MFIN7036-Group-Project/Analysis/stock_data"
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
stock_data = []
company = []
for csv_file in csv_files:
    stock_df = pd.read_csv(csv_file)[['Date','Close']]
    # splitext removes exactly the ".csv" extension. str.strip(".csv") strips any
    # leading/trailing '.', 'c', 's' or 'v' characters instead ("csco.csv" -> "o").
    company_name = os.path.splitext(os.path.basename(csv_file))[0]
    company.append(company_name)
    stock_df = stock_df.rename(columns={"Close":"StockPrice"})
    # Reuse the name computed above rather than parsing the path a second time.
    stock_df['company'] = company_name
    stock_data.append(stock_df)
# Long table: one row per (Date, company) closing price.
stock_price = pd.concat(stock_data)

In [None]:
# Write the combined price table to a parquet file.
stock_price.to_parquet("output/stock_price.parquet")

In [None]:
folder_path = "/Users/chelseayeung/Documents/MFIN7036-Group-Project/Analysis/data"
csv_files = glob.glob(os.path.join(folder_path, "*.pq"))

# Collect the "_"-separated name parts of every data file whose name does not
# contain "new".
bet_company = []
for csv_file in csv_files:
    file_name = os.path.basename(csv_file)
    # splitext removes exactly the ".pq" extension. str.strip(".pq") strips any
    # leading/trailing '.', 'p' or 'q' characters instead, e.g. "palantir.pq"
    # would become "alantir".
    company_name = os.path.splitext(file_name)[0]
    print(company_name)
    if "new" not in company_name:
        bet_company.append(company_name.split("_"))
        

In [None]:
# Display the collected file-name parts.
bet_company

In [None]:
# NOTE(review): bare list literal that is never bound to a name — presumably a
# scratch list of the topics/companies covered; confirm whether it should be assigned.
['Gemini', 'Google','Microsoft','nvda','AMD','Sora','ChatGPT',]