In [11]:
import pandas as pd
import json
import praw
import requests
from datetime import datetime
import csv
import time
import matplotlib.pyplot as plt

In [11]:
# The 3 subreddits we are interested in: r/Bitcoin, r/BitcoinBeginners, r/BitcoinMarkets

## Scraper

In [4]:
def getConfig():
    with open('config.json') as f:
        return json.load(f)
    
config = getConfig()
    
# Setup PRAW
reddit = praw.Reddit(
    client_id=config['client_id'],
    client_secret=config['client_secret'],
    user_agent=config['user_agent']
)


def get_submission_info(submission):
    """Extract desired information from a submission object."""
    return {
        "author": submission.author.name if submission.author else "[deleted]",
        "created_utc": datetime.fromtimestamp(submission.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
        "distinguished": submission.distinguished,
        "edited": submission.edited if isinstance(submission.edited, bool) else bool(submission.edited),
        "is_original_content": submission.is_original_content,
        "is_self": submission.is_self,
        "link_flair_text": submission.link_flair_text,
        "locked": submission.locked,
        "nsfw": submission.over_18,
        "num_comments": submission.num_comments,
        "permalink": submission.permalink,
        "score": submission.score,
        "selftext": submission.selftext,
        "spoiler": submission.spoiler,
        "stickied": submission.stickied,
        "subreddit": submission.subreddit.display_name,
        "title": submission.title,
        "upvote_ratio": submission.upvote_ratio
    }

def get_top_submissions_and_comments(subreddit_name, start_year, end_year, limit=100, save_interval=10, filename='reddit_comments_data.csv'):
    subreddit = reddit.subreddit(subreddit_name)
    all_data = []
    submission_count = 0

    for year in range(start_year, end_year + 1):
        start_timestamp = int(datetime(year, 1, 1).timestamp())
        end_timestamp = int(datetime(year, 12, 31).timestamp())

        # Iterate over the top submissions
        try:
            for submission in subreddit.top(limit=None):
                if start_timestamp <= submission.created_utc <= end_timestamp:
                    submission_info = get_submission_info(submission)
                    
                    # Extract comments for each submission
                    submission.comments.replace_more(limit=None)
                    for comment in submission.comments.list():
                        comment_info = submission_info.copy()  # Start with submission info
                        comment_info.update({
                            "author": comment.author.name if comment.author else "[deleted]",
                            "created_utc": datetime.fromtimestamp(comment.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
                            "distinguished": comment.distinguished,
                            "edited": comment.edited if isinstance(comment.edited, bool) else bool(comment.edited),
                            "is_original_content": False,  # Comments are not original content
                            "is_self": False,  # Comments are not self-posts
                            "link_flair_text": None,  # Comments don't have link flair
                            "locked": False,  # Comments are not locked
                            "nsfw": False,  # Comments don't have an NSFW flag
                            "num_comments": 0,  # Comments don't have comment counts
                            "permalink": f"https://www.reddit.com{comment.permalink}",
                            "score": comment.score,
                            "selftext": comment.body,
                            "spoiler": False,  # Comments don't have a spoiler flag
                            "stickied": comment.stickied,
                            "subreddit": subreddit_name,
                            "title": submission.title,
                            "upvote_ratio": submission.upvote_ratio  # Using the upvote ratio of the submission
                        })
                        all_data.append(comment_info)
                    
                    submission_count += 1

                    # Periodically save data to CSV
                    if submission_count % save_interval == 0:
                        save_to_csv(all_data, filename)
                        print(f"Saved data for {submission_count} submissions.")
                        all_data.clear()  # Clear the list after saving

                    # Respect Reddit's rate limit
                    time.sleep(1)
            
        except praw.exceptions.APIException as e:
            print(f"API Exception encountered: {e}")
            time.sleep(10)  # Wait and retry
        except Exception as e:
            print(f"An error occurred: {e}")
            time.sleep(5)  # Wait and retry

    # Save any remaining data
    if all_data:
        save_to_csv(all_data, filename)
        print(f"Final save: {submission_count} submissions processed.")

def save_to_csv(data, filename):
    headers = [
        "author", "created_utc", "distinguished", "edited", "is_original_content", 
        "is_self", "link_flair_text", "locked", "nsfw", "num_comments", "permalink", 
        "score", "selftext", "spoiler", "stickied", "subreddit", "title", "upvote_ratio"
    ]
    
    with open(filename, mode='a', newline='', encoding='utf-8') as file:  # Use 'a' mode to append to the file
        writer = csv.DictWriter(file, fieldnames=headers)
        if file.tell() == 0:  # If the file is empty, write the header
            writer.writeheader()
        writer.writerows(data)


subreddit_name = 'BitcoinBeginners' #Change to 'BitcoinBeginners' or 'BitcoinMarkets' to get data for those subreddits
get_top_submissions_and_comments(subreddit_name, 2017, 2024)

Saved data for 10 submissions.
Saved data for 20 submissions.
Saved data for 30 submissions.
Saved data for 40 submissions.
Saved data for 50 submissions.
Saved data for 60 submissions.
Saved data for 70 submissions.
Saved data for 80 submissions.
Saved data for 90 submissions.
Saved data for 100 submissions.
Saved data for 110 submissions.
Saved data for 120 submissions.
Saved data for 130 submissions.
Saved data for 140 submissions.
Saved data for 150 submissions.
Saved data for 160 submissions.
Saved data for 170 submissions.
Saved data for 180 submissions.
Saved data for 190 submissions.
Saved data for 200 submissions.
Saved data for 210 submissions.
Saved data for 220 submissions.
Saved data for 230 submissions.
Saved data for 240 submissions.
Saved data for 250 submissions.
Saved data for 260 submissions.
Saved data for 270 submissions.
Saved data for 280 submissions.
Saved data for 290 submissions.
Saved data for 300 submissions.
Saved data for 310 submissions.
Saved data for 32

In [4]:
reddit_bitcoin = pd.read_excel('ANL488_Git/Data/reddit_comments_data_bitcoin.xlsx')

reddit_bitcoinBeginners = pd.read_csv('ANL488_Git/Data/reddit_comments_data_bitcoinbeginners.csv')

reddit_bitcoinMarkets = pd.read_csv('ANL488_Git/Data/reddit_comments_data_bitcoinmarkets.csv')

combined_reddit = pd.concat([reddit_bitcoin, reddit_bitcoinBeginners, reddit_bitcoinMarkets])

# see how many rows and columns are in the dataframe
combined_reddit.shape

(562196, 18)

In [7]:
# turn the 'created_utc' column into a datetime object
combined_reddit['created_utc'] = pd.to_datetime(combined_reddit['created_utc'])

In [9]:
date = combined_reddit['created_utc'].dt.date
time = combined_reddit['created_utc'].dt.time

# create a new column for the date
combined_reddit['date'] = date

# create a new column for the time
combined_reddit['time'] = time

combined_reddit.head(5)

Unnamed: 0,author,created_utc,distinguished,edited,is_original_content,is_self,link_flair_text,locked,nsfw,num_comments,permalink,score,selftext,spoiler,stickied,subreddit,title,upvote_ratio,date,time
0,SPOKANARCHY,2017-11-29 09:49:15,,False,False,False,,False,False,0,https://www.reddit.com/r/Bitcoin/comments/7g9c...,2472,Where‚Äôs the guy that‚Äôs going to eat his le...,False,False,Bitcoin,"It's official! 1 Bitcoin = $10,000 USD",0.81,2017-11-29,09:49:15
1,TarAldarion,2017-11-29 09:48:31,,False,False,False,,False,False,0,https://www.reddit.com/r/Bitcoin/comments/7g9c...,6798,It's official. 100 million dollar pizza.,False,False,Bitcoin,"It's official! 1 Bitcoin = $10,000 USD",0.81,2017-11-29,09:48:31
2,walloon5,2017-11-29 09:31:00,,False,False,False,,False,False,0,https://www.reddit.com/r/Bitcoin/comments/7g9c...,2282,Wooooo!!!!\n\nI watched the wall fall on GDAX ...,False,False,Bitcoin,"It's official! 1 Bitcoin = $10,000 USD",0.81,2017-11-29,09:31:00
3,flclst3v3,2017-11-29 09:27:14,,False,False,False,,False,False,0,https://www.reddit.com/r/Bitcoin/comments/7g9c...,2166,See you at 15k,False,False,Bitcoin,"It's official! 1 Bitcoin = $10,000 USD",0.81,2017-11-29,09:27:14
4,ditto755,2017-11-29 09:40:44,,False,False,False,,False,False,0,https://www.reddit.com/r/Bitcoin/comments/7g9c...,1672,[Full Celebration](https://www.youtube.com/wat...,False,False,Bitcoin,"It's official! 1 Bitcoin = $10,000 USD",0.81,2017-11-29,09:40:44


## Getting sentiment score

In [11]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from textblob import TextBlob
import emoji
from datetime import datetime
import re
from transformers import  BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader
import torch
from sklearn.model_selection import train_test_split
import accelerate

In [12]:
# downloading the stopwords and supporting libraries for text preprocessing
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bokyannchou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/bokyannchou/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [13]:
# choosing only required columns
final_data_sentiment = combined_reddit[['date','time','selftext', 'title']]

In [14]:
# make sure the 'selftext' column is a string
final_data_sentiment['selftext'] = final_data_sentiment['selftext'].astype(str)
final_data_sentiment['title'] = final_data_sentiment['title'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data_sentiment['selftext'] = final_data_sentiment['selftext'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data_sentiment['title'] = final_data_sentiment['title'].astype(str)


In [16]:
# Preprocess the text
def preprocess_text(text):

    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # lower the text
    text = text.lower()
    return text

final_data_sentiment['selftext_cleaned'] = final_data_sentiment['selftext'].apply(preprocess_text)
final_data_sentiment['title_cleaned'] = final_data_sentiment['title'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data_sentiment['selftext_cleaned'] = final_data_sentiment['selftext'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data_sentiment['title_cleaned'] = final_data_sentiment['title'].apply(preprocess_text)


In [19]:
# final_data_sentiment date range
final_data_sentiment['date'].min(), final_data_sentiment['date'].max()

(datetime.date(2017, 1, 4), datetime.date(2024, 9, 8))

In [26]:
final_data_sentiment['hour'] = final_data_sentiment['time'].apply(lambda x: x.hour)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data_sentiment['hour'] = final_data_sentiment['time'].apply(lambda x: x.hour)


In [42]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from datasets import Dataset
import torch
import logging

# Configure logging to track errors
logging.basicConfig(filename='error_log.log', level=logging.ERROR)

# Use a lighter, faster model (DistilBERT)
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Sentiment-analysis pipeline
device = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)


# Combine title and comment
def combine_title_comment(row):
    return f"{row['title_cleaned']} [SEP] {row['selftext_cleaned']}"

final_data_sentiment['combined_text'] = final_data_sentiment.apply(combine_title_comment, axis=1)

# Combine date and time to form datetime, both is already in datetime format
final_data_sentiment['datetime'] = final_data_sentiment.apply(lambda x: datetime.combine(x['date'], x['time']), axis=1)

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(final_data_sentiment[['combined_text', 'datetime']])

# Truncate input length to 128 tokens for faster processing
def tokenize_function(batch):
    return tokenizer(batch['combined_text'], truncation=True, padding=True, max_length=128)

# Tokenize the dataset
dataset = dataset.map(tokenize_function, batched=True, batch_size=128)

# Perform sentiment analysis with batch processing and error handling
def analyze_sentiment(batch):
    try:
        results = sentiment_pipeline(batch['combined_text'])
        batch['sentiment'] = [res['label'] for res in results]
        batch['score'] = [res['score'] for res in results]
    except Exception as e:
        logging.error(f"Sentiment analysis error: {e}")
        batch['sentiment'] = ["ERROR"] * len(batch['combined_text'])
        batch['score'] = [0.0] * len(batch['combined_text'])
    return batch

# Process in larger batches (increase batch size to 64 or 128)
dataset = dataset.map(analyze_sentiment, batched=True, batch_size=64)

# Aggregate sentiment by date
df_results = pd.DataFrame(dataset)

# Handle rows where there were errors in sentiment analysis
df_results = df_results[df_results['sentiment'] != 'ERROR']

# Aggregate sentiment by date and hour
df_results['datetime'] = pd.to_datetime(df_results['datetime'])
df_results['hour'] = df_results['datetime'].dt.hour
df_results['date'] = df_results['datetime'].dt.date

df_agg = df_results.groupby(['date', 'hour']).agg(
    sentiment_score_avg=('score', 'mean'),
    positive_count=('sentiment', lambda x: sum(1 for label in x if label == 'POSITIVE')),
    negative_count=('sentiment', lambda x: sum(1 for label in x if label == 'NEGATIVE'))
).reset_index()

print(df_agg)

# Save the results
df_agg.to_csv('Data/reddit_sentiment_analysis.csv', index=False)

#save as parquet file as well
df_agg.to_parquet('Data/reddit_sentiment_analysis.parquet', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data_sentiment['combined_text'] = final_data_sentiment.apply(combine_title_comment, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data_sentiment['datetime'] = final_data_sentiment.apply(lambda x: datetime.combine(x['date'], x['time']), axis=1)


Map:   0%|          | 0/562196 [00:00<?, ? examples/s]

Map:   0%|          | 0/562196 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2647 > 512). Running this sequence through the model will result in indexing errors


             date  hour  sentiment_score_avg  positive_count  negative_count
0      2017-01-04    12             0.949070              11              21
1      2017-01-04    13             0.935518              22              50
2      2017-01-04    14             0.906919              13              22
3      2017-01-04    15             0.955167               5              12
4      2017-01-04    16             0.956778               7              26
...           ...   ...                  ...             ...             ...
28071  2024-09-07    22             0.998383               0               3
28072  2024-09-07    23             0.997304               0               6
28073  2024-09-08     2             0.999295               0               1
28074  2024-09-08     3             0.994338               0               3
28075  2024-09-08     6             0.999112               0               1

[28076 rows x 5 columns]
