In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.svm import SVC
import string
import random
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from sklearn.inspection import permutation_importance

In [None]:
# Number of comments sampled from each dataset
size = 10000
# Fixed seed so that the same comments are drawn on every run
sample_seed = 42

# Load in the two datasets
comments_neg = pd.read_csv("data/comments_negative.csv")
comments_pos = pd.read_csv("data/comments_positive.csv")

# Remove invalid entries *before* sampling, so that rows without text do not
# reduce the number of comments that end up in the sample
comments_neg = comments_neg[comments_neg['text'].notna()]
comments_pos = comments_pos[comments_pos['text'].notna()]

# Sample `size` Reddit comments per dataset (2 * size in total); the sample is
# capped at the number of valid rows because sampling without replacement
# cannot return more rows than exist
comments_neg = comments_neg.sample(min(size, len(comments_neg)), random_state=sample_seed)
comments_pos = comments_pos.sample(min(size, len(comments_pos)), random_state=sample_seed)

In [None]:
# Grouped bar chart of the mean and standard deviation of the comment scores
pos_scores = comments_pos['score']
neg_scores = comments_neg['score']

pos_stats = [pos_scores.mean(), pos_scores.std()]
# The negative dataset is plotted with absolute values
neg_stats = [abs(stat) for stat in (neg_scores.mean(), neg_scores.std())]

bar_width = 0.25
bar_pos_1 = np.arange(len(pos_stats))
bar_pos_2 = bar_pos_1 + bar_width

plt.bar(bar_pos_1, pos_stats, width=bar_width, color='b', label='Positive')
plt.bar(bar_pos_2, neg_stats, width=bar_width, color='r', label='Negative')

# Each tick sits halfway between the two bars of its group
plt.xticks(list(bar_pos_1 + bar_width / 2), ['mean', 'standard deviation'])
plt.yticks(list(range(0, 250, 20)))
plt.xlabel("Statistic")
plt.ylabel("Value")
plt.title("Reddit Comment Scores’ Analysis")
plt.legend()
# plt.savefig("stats")
plt.show()




In [None]:
# Box plots of the comment scores; negative scores enter as absolute values
group_names = ["Positive", "Negative"]
boxplot_data = [comments_pos['score'], comments_neg['score'].abs()]

plt.figure(figsize=(8, 5))
plt.boxplot(
    boxplot_data,
    labels=group_names,
    showfliers=False,   # outliers are not drawn
    patch_artist=True,
)
plt.yticks(list(range(0, 400, 20)))
plt.title("Reddit Comment Scores' Analysis")
# plt.savefig("boxplot")
plt.show()

In [None]:
# Stop words and punctuation marks to be removed from tokenized comments.
# Both are stored as sets: they are only used for membership tests, which a
# set answers in constant time instead of scanning a list.
stopwords_eng = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# Function that returns list of unigrams from sentences after pre-processing them
def tokenizer(sentences):
    """Tokenize sentences into one flat list of unigrams.

    Every sentence is split with ``word_tokenize``. Tokens that are English
    stop words (in any casing) or single punctuation characters are dropped;
    the remaining tokens keep their original casing.

    Args:
        sentences: Iterable of strings to tokenize.

    Returns:
        list: The remaining tokens of all sentences, in input order.
    """
    unigrams = []
    for sentence in sentences:
        # The stop-word check is done on the lower-cased token so that
        # capitalised stop words ("The", "I", "It") are filtered out as well
        unigrams.extend(
            word for word in word_tokenize(sentence)
            if word.lower() not in stopwords_eng and word not in punctuation
        )

    return unigrams

# Unigrams of the whole corpus: positive comments first, then negative ones
unigrams = tokenizer(comments_pos['text'].tolist()) + tokenizer(comments_neg['text'].tolist())

In [None]:
# Build the vocabulary using a threshold of 5
def build_vocab(unigrams, threshold=5):
    """Build the vocabulary from a collection of unigrams.

    Unigrams are lower-cased before counting. A word enters the vocabulary
    only if it occurs strictly more than ``threshold`` times.

    Args:
        unigrams: Iterable of string tokens.
        threshold: Count a word has to exceed to be kept. Defaults to 5.

    Returns:
        list: Lower-cased vocabulary words, in order of first occurrence.
    """
    unigram_counts = Counter(word.lower() for word in unigrams)
    return [unigram for unigram, count in unigram_counts.items() if count > threshold]

# Vocabulary of the whole corpus, built from the unigrams of both datasets
vocab = build_vocab(unigrams)

In [None]:
# Visualize texts using Wordclouds
positive_tokens = tokenizer(comments_pos['text'].tolist())
negative_tokens = tokenizer(comments_neg['text'].tolist())
positive_text_str = " ".join(positive_tokens)
negative_text_str = " ".join(negative_tokens)

# Generate both clouds first, then draw each of them on a figure of its own
positive_cloud = WordCloud(max_font_size=40).generate(positive_text_str)
negative_cloud = WordCloud(max_font_size=40).generate(negative_text_str)

plt.figure(figsize=(10, 10))
plt.title("Positive Comments")
plt.imshow(positive_cloud)

plt.figure(figsize=(10, 10))
plt.title("Negative Comments")
plt.imshow(negative_cloud)

In [None]:
sent_analyzer = SentimentIntensityAnalyzer()

negative_comments_list = comments_neg['text'].tolist()

# Store sentiment scores for plots: sent_scores holds one list per score
# component, in the order negativity, neutrality, positivity
sent_analyses = [sent_analyzer.polarity_scores(neg_comment) for neg_comment in negative_comments_list]
sent_scores = [[sent_analysis[key] for sent_analysis in sent_analyses] for key in ("neg", "neu", "pos")]

# Store sentiment score means for plots
means = [sum(x)/len(x) for x in sent_scores]


# Plot the negative comments data: one histogram per score component
fig, axs = plt.subplots(1, 3, figsize=(16, 5))

panel_names = ("Negativity", "Neutrality", "Positivity")
for ax, panel_name, scores, mean in zip(axs, panel_names, sent_scores, means):
    ax.hist(scores)
    ax.set_title(f"{panel_name} \n(μ = {round(mean, 3)})")
    ax.set_xlabel("Sentiment Score")
    ax.set_ylabel("Count")
    ax.set_xticks((0, 0.5, 1))
    ax.label_outer()

fig.suptitle("Negative Comments", y=1.05)
# fig.savefig("neg_sent")
# plt.show() rather than fig.show(): Figure.show() is meant for GUI backends
# and only emits a warning on a non-GUI (inline) backend; the other plots of
# this notebook use plt.show() as well
plt.show()


In [None]:
# Plot the positive comments data
sent_analyzer = SentimentIntensityAnalyzer()

positive_comments_list = comments_pos['text'].tolist()

# Sentiment scores of every positive comment: sent_scores holds one list per
# score component, in the order negativity, neutrality, positivity
sent_analyses = [sent_analyzer.polarity_scores(pos_comment) for pos_comment in positive_comments_list]
sent_scores = [[sent_analysis[key] for sent_analysis in sent_analyses] for key in ("neg", "neu", "pos")]

# Mean of each score component, shown in the panel titles
means = [sum(x)/len(x) for x in sent_scores]

# One histogram per score component
fig, axs = plt.subplots(1, 3, figsize=(16, 5))

panel_names = ("Negativity", "Neutrality", "Positivity")
for ax, panel_name, scores, mean in zip(axs, panel_names, sent_scores, means):
    ax.hist(scores)
    ax.set_title(f"{panel_name} \n(μ = {round(mean, 3)})")
    ax.set_xlabel("Sentiment Score")
    ax.set_ylabel("Count")
    ax.set_xticks((0, 0.5, 1))
    ax.label_outer()

fig.suptitle("Positive Comments", y=1.05)
# fig.savefig("pos_sent")
# plt.show() rather than fig.show(): Figure.show() is meant for GUI backends
# and only emits a warning on a non-GUI (inline) backend; the other plots of
# this notebook use plt.show() as well
plt.show()

In [None]:
# Function that generates bag of words representation for sentences
def generate_bow(sentences):
    """Build a bag-of-words count vector over ``vocab`` for every sentence.

    Each sentence is tokenized with ``word_tokenize``; stop words and single
    punctuation characters are skipped. The remaining tokens are lower-cased
    and counted in the column of the matching vocabulary word. Tokens that
    are not part of the vocabulary are ignored.

    Args:
        sentences: Iterable of strings.

    Returns:
        list: One list of ``len(vocab)`` integer counts per sentence.
    """
    # Word -> column lookup, instead of scanning the whole vocabulary for
    # every token (vocabulary words are expected to be unique)
    vocab_index = {vocab_word: i for i, vocab_word in enumerate(vocab)}

    feature_matrix = []
    for k, sentence in enumerate(sentences):
        if k % 10000 == 0:
            print(k)  # coarse progress indicator
        bow_vector = [0] * len(vocab)
        for word in word_tokenize(sentence):
            if word in stopwords_eng or word in punctuation:
                continue
            # build_vocab lower-cases every word, so the token has to be
            # lower-cased too or capitalised words would never be counted
            i = vocab_index.get(word.lower())
            if i is not None:
                bow_vector[i] += 1
        feature_matrix.append(bow_vector)
    return feature_matrix

# Texts of all comments: the positive dataset first, then the negative one
all_comments = comments_pos['text'].tolist() + comments_neg['text'].tolist()

feature_matrix = generate_bow(all_comments)

In [None]:
# Generate labels (1 if positive, -1 if negative), based on the sign of the score
comments_df = pd.concat([comments_pos, comments_neg])
# Rows of the feature matrix and rows of comments_df are paired by position
assert len(feature_matrix) == len(comments_df), "feature matrix and comments are out of sync"
labels = np.where(comments_df['score'].to_numpy() < 0, -1.0, 1.0)

# Add meta features
feature_names = vocab.copy()
meta_features = ["controversiality", "pos_sent", "neg_sent", "parent_pos_sent",
                 "parent_neg_sent", "parent_score", "parent_controversiality"]
feature_names.extend(meta_features)

sent_analyzer = SentimentIntensityAnalyzer()

# Add meta features to feature matrix (in the order given by meta_features)
n_bow_features = len(vocab)
for bow_vector, (_, row) in zip(feature_matrix, comments_df.iterrows()):
    # Drop meta features left over from an earlier run of this cell, so that
    # running the cell again does not append them a second time
    del bow_vector[n_bow_features:]

    comment_sent_scores = sent_analyzer.polarity_scores(row['text'])
    # A missing parent text (NaN) is scored as an empty string, because the
    # sentiment analyzer only accepts strings
    parent_text = row['parent_text']
    parent_sent_scores = sent_analyzer.polarity_scores("" if pd.isna(parent_text) else parent_text)

    bow_vector.extend([
        int(row['controversiality']),
        comment_sent_scores["pos"],
        comment_sent_scores["neg"],
        parent_sent_scores["pos"],
        parent_sent_scores["neg"],
        int(row['parent_score']),
        int(row['parent_controversiality']),
    ])

In [None]:
# Generate train and test sets (80% / 20%)

labels = [int(label) for label in labels]
zipped = list(zip(feature_matrix, labels))
# Shuffle with a dedicated, seeded generator so that the split, and every
# score computed from it, is the same on each run
random.Random(42).shuffle(zipped)
split_point = int(0.8*len(zipped))
train_zipped = zipped[:split_point]
test_zipped = zipped[split_point:]
# Unzip the (features, label) pairs back into parallel sequences
x_train, y_train = zip(*train_zipped)
x_test, y_test = zip(*test_zipped)

In [None]:
# Metrics printed for each baseline, in output order
baseline_metrics = (
    ("f1 score:", f1_score),
    ("precision score:", precision_score),
    ("recall score:", recall_score),
    ("accuracy score:", accuracy_score),
)

# majority baseline: always predicts the positive class
print("Majority Baseline")
y_pred = [1] * len(y_test)
for metric_label, metric in baseline_metrics:
    print(metric_label, metric(y_test, y_pred))

# random baseline: draws 0 or 1 per test sample and maps 0 to the negative class
print("Random Baseline")
y_pred = [1 if draw else -1 for draw in np.random.randint(0, 2, len(y_test))]
for metric_label, metric in baseline_metrics:
    print(metric_label, metric(y_test, y_pred))


In [None]:
# Train a support vector machine with a linear kernel on the training set
svm_params = {"C": 1, "kernel": "linear", "max_iter": 7000, "verbose": True}
svm_classifier = SVC(**svm_params)
svm_classifier.fit(x_train, y_train)

In [None]:
# Score the SVM classifier on the test set
y_pred = svm_classifier.predict(x_test)
for metric_label, metric in (
    ("f1 score:", f1_score),
    ("precision score:", precision_score),
    ("recall score:", recall_score),
    ("accuracy score:", accuracy_score),
):
    print(metric_label, metric(y_test, y_pred))

# Keep the learned feature weights for the feature analysis
svm_coef = svm_classifier.coef_

In [None]:
# Pick the 20 most positive and the 20 most negative SVM weights
svm_coef = svm_coef.ravel()
feature_names = np.asarray(feature_names)

# Indices in ascending weight order: most negative first, most positive last
ascending_order = np.argsort(svm_coef)
top_negative_coefficients = ascending_order[:20]
top_positive_coefficients = ascending_order[-20:]

# The positive selection is reversed so that it starts with the largest weight
top_pos_features = feature_names[top_positive_coefficients][::-1]
top_pos_weights = svm_coef[top_positive_coefficients][::-1]
top_neg_features = feature_names[top_negative_coefficients]
top_neg_weights = svm_coef[top_negative_coefficients]


In [None]:
# Visualize top positive features for SVM
# One colour per plotted bar: sized from the positive weights that are drawn
# here (previously sized from the negative weights by mistake)
col = ['blue'] * len(top_pos_weights)
# Bars show the magnitude of each weight
imp = [abs(x) for x in top_pos_weights]

plt.figure(figsize=(6,4))
plt.barh(range(len(top_pos_features)), imp, color = col, align='center')
plt.yticks(range(len(top_pos_features)), top_pos_features)
# Inverted y axis: the first feature of the list is drawn at the top
plt.gca().invert_yaxis()
plt.ylabel("Feature Name")
plt.xlabel("Feature Weight")
plt.title("Feature Importance for Positive Comments")
# plt.savefig("pos_feature_importance")
plt.show()


In [None]:
# Horizontal bar chart of the top negative SVM features (signed weights)
imp = list(top_neg_weights)
col = ['red'] * len(imp)
bar_positions = range(len(top_neg_features))

plt.figure(figsize=(6, 4))
plt.barh(bar_positions, imp, align='center', color=col)
plt.yticks(bar_positions, top_neg_features)
# Inverted y axis: the first feature of the list is drawn at the top
plt.gca().invert_yaxis()
plt.title("Feature Importance for Negative Comments")
plt.xlabel("Feature Weight")
plt.ylabel("Feature Name")
# plt.savefig("neg_feature_importance")
plt.show()


In [None]:
# Train a Logistic Regression model and score it on the test set
lr_model = LogisticRegression(solver="liblinear", max_iter=2000, verbose=1)
lr_model.fit(x_train, y_train)

y_pred = lr_model.predict(x_test)
for metric_label, metric in (
    ("f1 score:", f1_score),
    ("precision score:", precision_score),
    ("recall score:", recall_score),
    ("accuracy score:", accuracy_score),
):
    print(metric_label, metric(y_test, y_pred))

In [None]:
# Train a Random Forest with 300 trees and score it on the test set
rf_classifier = RandomForestClassifier(n_estimators=300)
rf_classifier.fit(x_train, y_train)

y_pred = rf_classifier.predict(x_test)
for metric_label, metric in (
    ("f1 score:", f1_score),
    ("precision score:", precision_score),
    ("recall score:", recall_score),
    ("accuracy score:", accuracy_score),
):
    print(metric_label, metric(y_test, y_pred))


In [None]:
# Generate top features for Random Forest model
rf_permutation_importance = permutation_importance(rf_classifier, x_test, y_test)

# Ascending order, so the largest mean importance is drawn as the topmost bar
sorted_indices = rf_permutation_importance.importances_mean.argsort()
# np.asarray makes the names indexable with an index array even if
# feature_names is still a plain list when this cell runs
sorted_feature_names = np.asarray(feature_names)[sorted_indices]
# Fixed: the importances were read from an undefined name (perm_importance)
plt.barh(sorted_feature_names, rf_permutation_importance.importances_mean[sorted_indices])
plt.xlabel("Permutation Importance")
plt.ylabel("Feature Name")
plt.title("Feature Importance for Random Forest")
plt.show()