In [1]:
import pandas as pd
import re

import seaborn as sns
import matplotlib.pyplot as plt
import strip_markdown
import warnings
warnings.filterwarnings("ignore")

import pyarrow.parquet as pq


### Data preprocessing
1. Filter comments that contains bot garbe massage. specifically filtering the markdown table format
2. Remove markdown formating string
3. Romove URL
4. Remove returning line "\n"
5. Remove Emoji

In [4]:
sample = pd.read_parquet('data_sample_3w.pq')
sample = sample.rename(columns={'cbody':'Comment Body','title':'Post Title','selftext':'Post Selftxt'})
sample = sample[~sample['Comment Body'].str.contains(':--|:--|:--|:--')] # filter comments containing table

In [5]:
# def extract_text_from_link(link_text):
#     """
#     Removes the link and extracts the text from a given link_text.
#     """
#     # Match the link pattern (e.g., 'dot.com')
#     pattern = r'\[([^]]+)\]\([^)]+\)'

#     # Use re.sub() to replace the link with the extracted text
#     extracted_text = re.sub(pattern, r'\1', link_text)
#     return extracted_text

def remove_urls(text):
    """
    Removes URLs from a given text string.
    """
    return re.sub(r'http\S+', '', text)

# def remove_image_markdown_and_text(text):
#     """
#     Removes the entire image markdown (e.g., !img) from the given text.
#     """
#     pattern = r'!\[.*?\]\(.*?\)'
#     cleaned_text = re.sub(pattern, '', text)
#     return cleaned_text

def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # dingbats
                               u"\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [6]:
# sample['Comment Body'] = sample['Comment Body'].apply(remove_image_markdown_and_text)
# sample['Comment Body'] = sample['Comment Body'].apply(extract_text_from_link)

sample['Comment Body'] = sample['Comment Body'].apply(strip_markdown.strip_markdown)  # clean all markdown format
sample['Comment Body'] = sample['Comment Body'].apply(remove_urls) # remove urls
sample['Comment Body'] = sample['Comment Body'].str.strip('\n')

# 去除emoji
sample['Post Title'] = sample['Post Title'].apply(remove_emoji)
sample['Post Selftxt'] = sample['Post Selftxt'].apply(remove_emoji)
sample['Comment Body'] = sample['Comment Body'].apply(remove_emoji)
sample[['Post Title','Post Selftxt','Comment Body']].to_excel('Demoji.xlsx')

### Sentiment analysis

1. Tokenize comments by LLM (Huggingface transformers)
2. Use Loughran-McDonald Master Dictionary w/ Sentiment Word Lists (which specifically focus on financial wording) https://sraf.nd.edu/loughranmcdonald-master-dictionary/ to assign sentiment scores and extract positive/negative words.

In [7]:
from pysentiment2_updated import LM # updated pysentiment2 with the lastest version of LM dictionary
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

output = {'Positive cnt':[],'Negative cnt':[],'Polarity':[],'Subjectivity':[],'Positive words':[],'Negative words':[],'Tokenized':[]}
def sentiment_analysis(text):
    lm = LM()
    tokens = tokenizer.tokenize(text)
    score = lm.get_score(tokens)

    return [score['Positive'],score['Negative'],score['Polarity'],score['Subjectivity'],score['Positive words'],score['Negative words'],tokens]

In [12]:
import time
start = time.time()
output = sample.head(500)['Comment Body'].apply(sentiment_analysis)
output = pd.DataFrame(output)
end=time.time()
print(end-start)

108.05680894851685


In [None]:
def get_item(series,index):
    return series[index]

output = sample['Comment Body'].apply(sentiment_analysis)
output = pd.DataFrame(output)

sample['Positive cnt']=output['Comment Body'].apply(get_item, index=0)
sample['Negative cnt'] = output['Comment Body'].apply(get_item, index=1)
sample['Polarity'] = output['Comment Body'].apply(get_item, index=2)
sample['Subjectivity'] = output['Comment Body'].apply(get_item, index=3)
sample['Positive words'] = output['Comment Body'].apply(get_item, index=4)
sample['Negative words'] = output['Comment Body'].apply(get_item, index=5)
sample['Tokenized'] = output['Comment Body'].apply(get_item, index=6)

In [None]:
sample

In [None]:
sns.histplot(sample['Polarity'], bins=20, kde=True)
plt.xlabel("sentiment score of comment")
plt.ylabel("Frequency")
plt.title("Distribution of sentiment score")
plt.show()

In [None]:
sns.histplot(sample['Subjectivity'], bins=20, kde=True)
plt.xlabel("subjectivity of comments")
plt.ylabel("Frequency")
plt.title("Distribution of subjectivity")
plt.show()

In [None]:
# plt.figure(figsize=(8, 6))
# sns.scatterplot(x="Subjectivity", y="Sentiment", data=sample)
# plt.xlabel("subjectivity")
# plt.ylabel("sentiment")
# plt.title("Scatter Plot: subjectivity vs. sentiment")
# plt.show()