In [63]:
import pandas as pd
from transformers import pipeline

# Read the Reddit posts CSV into a DataFrame.
csv_file_path = '../data/data_collection/reddit.csv'
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Build the sentiment-analysis pipeline on top of the DistilBERT checkpoint
# fine-tuned on SST-2.
classifier = pipeline(
    task='sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)

def annotate_sentiment(text, neutral_threshold_lower=0.4, neutral_threshold_upper=0.6, clf=None):
    """Label a piece of text as 'positive', 'negative' or 'neutral'.

    The text is scored by a sentiment classifier. The lower-cased predicted
    label is returned unless its score lies inside the closed interval
    [neutral_threshold_lower, neutral_threshold_upper]; in that case the
    prediction is treated as not confident enough and 'neutral' is returned.

    Parameters
    ----------
    text : str
        Text to classify. Non-string values (for example the NaN pandas uses
        for a missing cell) and blank strings are labelled 'neutral' without
        calling the classifier.
    neutral_threshold_lower, neutral_threshold_upper : float, optional
        Inclusive bounds of the score band that is mapped to 'neutral'.
    clf : callable, optional
        Classifier to use. It is called as ``clf(text, truncation=True)`` and
        must return a sequence whose first item is a mapping with 'label' and
        'score' keys. Defaults to the module-level ``classifier``.

    Returns
    -------
    str
        The lower-cased predicted label, or 'neutral'.

    Notes
    -----
    Labelling is best-effort: any exception raised while classifying is
    printed and the text is labelled 'neutral' instead of aborting the run.

    NOTE(review): with a two-class model the score of the top label is
    presumably never below 0.5, so the lower bound likely has no effect -
    confirm before tuning it.
    """
    # Nothing to classify: skip the model call instead of relying on it to fail.
    if not isinstance(text, str) or not text.strip():
        return 'neutral'

    predict = classifier if clf is None else clf
    try:
        # truncation=True lets over-long inputs be classified on their leading
        # tokens rather than raising and being silently labelled 'neutral'.
        top_prediction = predict(text, truncation=True)[0]
        label = top_prediction['label'].lower()
        score = top_prediction['score']
    except Exception as e:
        print(f"Error processing text: {e}")
        return 'neutral'

    # A score inside the band means the model is not confident either way.
    if neutral_threshold_lower <= score <= neutral_threshold_upper:
        return 'neutral'
    return label

# Attach a sentiment label to every post, derived from its title
df['Sentiment'] = df['Title'].map(annotate_sentiment)

# Write the labelled posts out as a separate CSV (row index omitted)
annotated_file_path = '../data/data_collection/annotated_dataset.csv'
df.to_csv(path_or_buf=annotated_file_path, index=False)

print(f"Annotated dataset saved to {annotated_file_path}")


Annotated dataset saved to ../data/data_collection/annotated_dataset.csv


In [64]:
import pandas as pd

# Read the labelled posts back from disk
annotated_df = pd.read_csv(
    filepath_or_buffer='../data/data_collection/annotated_dataset.csv'
)

# Tally how many posts received each sentiment label
sentiment_counts = annotated_df.loc[:, 'Sentiment'].value_counts()

# Show the tally
print(sentiment_counts)


Sentiment
negative    3270
positive    2221
neutral      156
Name: count, dtype: int64


In [105]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from emoji import demojize
import re

# Fetch the NLTK packages this notebook relies on
for _nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_package)

# Read the sentiment-annotated posts
df_annotated = pd.read_csv(
    filepath_or_buffer='../data/data_collection/annotated_dataset.csv'
)

# Lookup table from lower-case slang / abbreviation to its plain-English expansion.
# Keys are matched against whole tokens by slang_dict.get(token, token).
slang_dict = {
    "btw": "by the way",
    "lol": "laughing out loud",
    "idk": "I do not know",
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "brb": "be right back",
    "tbh": "to be honest",
    "lmao": "laughing my ass off",
    "rofl": "rolling on the floor laughing",
    "smh": "shaking my head",
    "omg": "oh my god",
    "ttyl": "talk to you later",
    "afaik": "as far as I know",
    "irl": "in real life",
    "thx": "thanks",
    "pls": "please",
    "dm": "direct message",
    "fyi": "for your information",
    # Alphanumeric shorthands
    "b4": "before",
    "gr8": "great",
    # Single-letter shorthands
    "u": "you",
    "r": "are",
    "yolo": "you only live once",
    "np": "no problem",
    "g2g": "got to go",
    "tldr": "too long, didn’t read",
    "jk": "just kidding",
    "bff": "best friends forever",
    "icymi": "in case you missed it",
    "fomo": "fear of missing out",
    "ftw": "for the win",
    "wtf": "what the f***",
    "nsfw": "not safe for work",
    "nbd": "no big deal",
    "faq": "frequently asked questions",
    "afk": "away from keyboard",
    "asap": "as soon as possible",
}

# Collect the hashtags that occur in a piece of text
def process_hashtags(text):
    """Return every '#word' hashtag found in `text`, space-separated and in order.

    A hashtag is a '#' followed by one or more word characters. An empty
    string is returned when the text contains none.
    """
    found = (match.group(0) for match in re.finditer(r'#\w+', text))
    return ' '.join(found)

# Preprocessing function
def _text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    if not hasattr(_text_resources, 'cached'):
        _text_resources.cached = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _text_resources.cached


def preprocess_text(text):
    """Normalise a post title into a space-separated string of alphabetic lemmas.

    Steps:
      1. Emojis are converted to their textual names and the hashtags found
         in the original text are appended.
      2. The text is tokenised; tokens are lower-cased, English stop words are
         dropped and the remaining tokens are lemmatised.
      3. Tokens found in ``slang_dict`` are replaced by the words of their
         expansion (stop words removed, lemmatised).
      4. Only purely alphabetic words are kept.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN for a missing title) yields
        an empty string.

    Returns
    -------
    str
        The cleaned words joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        return ''

    # Building the stop-word set and the lemmatizer is loop-invariant work,
    # so it is done once and reused across calls.
    stop_words, lemmatizer = _text_resources()

    # demojize() writes an emoji as ":red_heart:". Left as-is, the name keeps
    # its underscores, fails the isalpha() filter below and the emoji is lost,
    # so turn each ":some_name:" shortcode into plain words first.
    emoji_as_words = re.sub(
        r':([^\s:]+):',
        lambda match: ' ' + match.group(1).replace('_', ' ').replace('-', ' ') + ' ',
        demojize(text),
    )
    # NOTE(review): the hashtags are still present in the text itself, so
    # appending them presumably makes their words appear twice - confirm that
    # this extra weighting is intended.
    text = emoji_as_words + ' ' + process_hashtags(text)

    words = []
    for raw_token in nltk.word_tokenize(text):
        token = raw_token.lower()
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)

        expansion = slang_dict.get(lemma)
        if expansion is None:
            candidates = [lemma]
        else:
            # An expansion such as 'laughing out loud' is several words. Kept
            # as one token it contains spaces and would be thrown away by the
            # isalpha() filter, so split it and clean each word like the rest.
            candidates = [
                lemmatizer.lemmatize(part.lower())
                for part in nltk.word_tokenize(expansion)
                if part.lower() not in stop_words
            ]

        # Keep purely alphabetic words only (drops punctuation, numbers, ...).
        words.extend(word for word in candidates if word.isalpha())

    return ' '.join(words)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/diegobolanos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/diegobolanos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/diegobolanos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [106]:
# Number of whitespace-separated words in each title. The vectorised string
# accessor replaces a per-row lambda and yields NaN for a missing title
# instead of raising on it (the column then becomes float rather than int).
df_annotated['num_words'] = df_annotated['Title'].str.split().str.len()

In [107]:
# Word count of the longest title, noted for later use
df_annotated['num_words'].max()

62

In [108]:
# Run every title through the text-cleaning function and keep the result in a new column
df_annotated['Processed_Title'] = df_annotated['Title'].map(preprocess_text)

In [109]:
# Store the sentiment labels as a pandas categorical, then summarise the column
df_annotated['Sentiment'] = pd.Categorical(df_annotated['Sentiment'])
df_annotated['Sentiment'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 5647 entries, 0 to 5646
Series name: Sentiment
Non-Null Count  Dtype   
--------------  -----   
5647 non-null   category
dtypes: category(1)
memory usage: 5.8 KB


In [110]:
# Show the distinct sentiment labels together with the column's category list.
df_annotated['Sentiment'].unique()

['negative', 'positive', 'neutral']
Categories (3, object): ['negative', 'neutral', 'positive']

In [111]:
# Replace each label by its integer category code (its position in the
# column's category list; a missing label becomes -1).
# The integer check keeps the cell re-runnable: once the column holds codes
# it no longer has a .cat accessor, so encoding it a second time would raise.
if not pd.api.types.is_integer_dtype(df_annotated['Sentiment']):
    df_annotated['Sentiment'] = df_annotated['Sentiment'].astype('category').cat.codes

In [112]:
# Preview the first rows to sanity-check the frame after the preceding steps.
df_annotated.head()

Unnamed: 0,Title,Subreddit,Author,Likes,Comments,Sentiment,num_words,Processed_Title
0,Texas Supreme Court Rules Against Woman Who So...,news,Lifeboatb,7231,912,0,10,texas supreme court rule woman sought abortion
1,Texas woman who sought court permission for ab...,news,11-110011,24560,2327,0,15,texas woman sought court permission abortion l...
2,Home Alone star gets life-saving cancer surger...,news,ILikeTalkn2Myself,8341,499,0,11,home alone star get cancer surgery raised online
3,Russian opposition leader Navalny missing from...,news,agnesiswitch,12825,742,0,10,russian opposition leader navalny missing pris...
4,Special counsel goes directly to Supreme Court...,news,hamsterberry,5294,484,0,15,special counsel go directly supreme court reso...


In [113]:
import os

# Define the file path to save the preprocessed dataset
preprocessed_file_path = '../data/preprocessing/preprocessed.csv'

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(preprocessed_file_path), exist_ok=True)

# Save the preprocessed dataset (row index omitted)
df_annotated.to_csv(preprocessed_file_path, index=False)

print(f"Preprocessed dataset saved to {preprocessed_file_path}")

Preprocessed dataset saved to ../data/preprocessing/preprocessed.csv
