In [None]:
'''
COMP3800 Project 3 Ezra Jones
'''

import numpy as np
import nltk as nltk
#nltk.download('stopwords')
import pandas as pd
import sklearn as sk
import re as re
import matplotlib.pyplot as plt
import wordcloud as wc
import spacy
import seaborn as sns
import csv


# Single seed shared by every stochastic step in the notebook.
SEED = 3800
# Seed NumPy's legacy global generator.
np.random.seed(SEED)
# NOTE: a bare `sk.utils.check_random_state(SEED)` call used to sit here. It
# only builds and returns a RandomState object; with the return value thrown
# away it seeded nothing, so it was removed. scikit-learn objects must be given
# `random_state=SEED` explicitly to be reproducible.
# Silence pandas' chained-assignment warning: later cells add columns to
# frames that were produced by filtering another frame.
pd.options.mode.chained_assignment = None

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer #Porter2 Stemmer
from textblob import TextBlob
from datetime import datetime
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neural_network import MLPRegressor


# Notebook metadata: author, and a version label that is a term name rather than a number.
__author__ = 'Ezra Jones'
__version__ = 'Fall 2024'

In [None]:
# Read the HW6 tweet export, which still carries the `type` column.
# `id` is loaded as text rather than being parsed as a number.

df = pd.read_csv(
    filepath_or_buffer='data/comp3800f24_tweets.txt',
    dtype=dict(id=str),
)
df.head(5)

In [None]:
# Sanity-check the raw load: distinct ids, non-null ids, then (rows, columns).

id_column = df['id']
for figure in (id_column.nunique(), id_column.count(), df.shape):
    print(figure)

In [None]:
# Drop misaligned rows (anything whose `type` is not 'tweet') and then remove
# the helper `type` column.
# The guard keeps the cell re-runnable: once `type` has been dropped, running
# the cell a second time used to fail with a KeyError on df['type'].

if 'type' in df.columns:
    # .drop(...) returns a new frame, so nothing is mutated in place on a slice.
    df = df.loc[df['type'] == 'tweet'].drop(columns='type')
print(df.shape)

In [None]:
# Build three corpora: every tweet, the tweets with polarity above 0.5, and the
# tweets with polarity below -0.5. Each corpus is also joined into one string.

# `twitter` is an alias of `df` (same DataFrame object), not a copy.
twitter = df
tweets = twitter['text'].tolist()
positive_tweets = []
negative_tweets = []
for tweet in tweets:
    # Score each tweet once; the original built a TextBlob and ran the
    # sentiment analysis twice per tweet (once for each comparison).
    polarity = TextBlob(tweet).sentiment.polarity
    if polarity > 0.5:
        positive_tweets.append(tweet)
    elif polarity < -0.5:
        negative_tweets.append(tweet)
text = ' '.join(tweets)
negative_text = ' '.join(negative_tweets)
positive_text = ' '.join(positive_tweets)
# A leftover debugging f-string that formatted the whole corpus into a second,
# immediately discarded string was removed here.

In [None]:
# Turn each joined corpus into a flat list of NLTK word tokens.
# The same tokenizer settings are used for all three corpora.

tokenizer_options = {'language': 'english', 'preserve_line': True}
tokens, negative_tokens, positive_tokens = (
    nltk.word_tokenize(joined_corpus, **tokenizer_options)
    for joined_corpus in (text, negative_text, positive_text)
)

In [None]:
# Keep only ASCII letters and digits inside each token, then discard tokens
# that end up empty (e.g. pure punctuation).


def _alnum_only(raw_tokens):
    """Return the tokens with every non-alphanumeric run removed, skipping empties."""
    stripped = (re.sub(r'[^A-Za-z0-9]+', '', token) for token in raw_tokens)
    return [word for word in stripped if word]


words = _alnum_only(tokens)
negative_words = _alnum_only(negative_tokens)
positive_words = _alnum_only(positive_tokens)

In [None]:
# Lower-case every word and drop English stop words.
# Previously only the stop-word comparison was case-insensitive; the kept words
# retained their original casing, contrary to the stated intent, so the
# exact-match 'https' filter in the stemming cell missed 'HTTPS' / 'Https'.

stops = set(stopwords.words('english'))


def _without_stop_words(candidates):
    """Return the lower-cased words from `candidates` that are not stop words."""
    lowered = (word.lower() for word in candidates)
    return [word for word in lowered if word not in stops]


terms = _without_stop_words(words)
negative_terms = _without_stop_words(negative_words)
positive_terms = _without_stop_words(positive_words)

In [None]:
# Stem every term (Porter2 / Snowball) and drop the URL artefact 'https'.
# The filter is applied to the stem, case-insensitively, and to all three
# lists. The original skipped only the exact token 'https' before stemming and
# cleaned up just the overall frequency table afterwards, so variants such as
# 'HTTPS' could still reach the negative/positive stem lists.

stemmer = SnowballStemmer(language='english')


def _stems_without_https(candidates):
    """Return the stems of `candidates`, leaving out any stem equal to 'https'."""
    stemmed = (stemmer.stem(word) for word in candidates)
    return [stem for stem in stemmed if stem.lower() != 'https']


stems = _stems_without_https(terms)
negative_stems = _stems_without_https(negative_terms)
positive_stems = _stems_without_https(positive_terms)
stem_freq_dist = nltk.FreqDist(stems)
df_words = pd.DataFrame(list(stem_freq_dist.items()), columns=['stem', 'freq'])
# No separate 'https' clean-up is needed here: it was removed from `stems` above.
df_sorted = df_words.sort_values(by=['freq'], ascending=False)

# Bar chart visualizing the frequency of the top 20 most used words/stems.

freq = df_sorted.head(20)
plt.bar(freq["stem"], freq["freq"])
plt.xlabel('Stem')
plt.ylabel('Frequency')
plt.title('Frequency Distribution')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Word cloud of stem frequencies across the tweets with polarity below -0.5.

negative_stem_freq_dist = nltk.FreqDist(negative_stems)
negative_frequencies = dict(negative_stem_freq_dist)
cloud_canvas = wc.WordCloud(width=1920, height=1080)
wordcloud = cloud_canvas.generate_from_frequencies(negative_frequencies)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Word cloud of stem frequencies across the tweets with polarity above 0.5.

positive_stem_freq_dist = nltk.FreqDist(positive_stems)
positive_frequencies = dict(positive_stem_freq_dist)
cloud_canvas = wc.WordCloud(width=1920, height=1080)
wordcloud = cloud_canvas.generate_from_frequencies(positive_frequencies)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Polarity vs. subjectivity for the most retweeted tweets (strictly more than
# 25000 retweets). Each point is labelled with its retweet count.

retweet_counts = pd.to_numeric(twitter['retweetCount'], errors='coerce')
popular_df = twitter[retweet_counts > 25000]
popular_tweets = list(popular_df['text'])

sentiments = [TextBlob(tweet).sentiment for tweet in popular_tweets]
polarities = [scored.polarity for scored in sentiments]
subjectivities = [scored.subjectivity for scored in sentiments]

plt.figure(figsize=(12, 6))
# Red marks positive polarity; everything else (zero or negative) is blue.
colors = ['blue' if not polarity > 0 else 'red' for polarity in polarities]
# Marker area grows with the strength of the polarity.
sizes = [abs(polarity) * 1000 for polarity in polarities]
plt.scatter(polarities, subjectivities, c=colors, s=sizes, alpha=0.5)

points = zip(polarities, subjectivities)
for point, retweets in zip(points, popular_df['retweetCount']):
    plt.annotate(retweets, point, fontsize=8, alpha=0.7)

plt.title('Sentiment Analysis of Popular Tweets (25K+ retweets)')
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.grid(True)
plt.show()
print(f'Average Polarity: {np.mean(polarities)}')

In [None]:
# Prepare the frame for the polarity-over-time plot: parse the timestamps, keep
# tweets dated 2020-01-01 or later, lower-case the text, keep only tweets that
# mention "ai", and score each remaining tweet's polarity.

df['createdAt'] = pd.to_datetime(df['createdAt'], errors='coerce')
df['Date'] = df['createdAt'].dt.date
df = df.loc[df['Date'] >= pd.to_datetime('2020-01-01').date()]
df['text'] = df['text'].str.lower()
# Match "ai" as a whole word. The former plain substring test also kept every
# tweet containing words such as "said", "again" or "train". Compounds like
# "openai" are not matched by this pattern; widen it if those should count.
# na=False treats a missing text as "no match" instead of yielding NaN.
df = df.loc[df['text'].str.contains(r'\bai\b', na=False)]

# NOTE: the names below are reused from the popular-tweets cell, but here they
# describe every tweet that survived the filters above.
popular_tweets = df['text'].tolist()

sentiments = [TextBlob(tweet).sentiment for tweet in popular_tweets]
polarities = [sentiment.polarity for sentiment in sentiments]

df['Polarity'] = polarities

In [None]:
# Polarity over time, annotated with the mean polarity and tweet count of three
# periods: before the buyout date, between the buyout date and the
# terms-of-service (AI training) date, and from that date onward.

buyout_day = '2022-4-14'
ai_train_day = '2024-10-15'
buyout_cutoff = pd.to_datetime(buyout_day).date()
announcement_cutoff = pd.to_datetime(ai_train_day).date()

# Row masks for the three periods (the `Date` column holds datetime.date values).
is_before_buyout = df['Date'] < buyout_cutoff
is_between = (df['Date'] >= buyout_cutoff) & (df['Date'] < announcement_cutoff)
is_after_announcement = df['Date'] >= announcement_cutoff

before_buyout_sentiment = df.loc[is_before_buyout, 'Polarity'].mean()
before_buyout_count = df.loc[is_before_buyout, 'id'].count()
before_buyout_message = f'Before Buyout: {before_buyout_sentiment:.2f} ({before_buyout_count}) tweets'

after_buyout_sentiment = df.loc[is_between, 'Polarity'].mean()
after_buyout_count = df.loc[is_between, 'id'].count()
after_buyout_message = f'After Buyout, Before Announcement: {after_buyout_sentiment:.2f} ({after_buyout_count}) tweets'

after_announcement_sentiment = df.loc[is_after_announcement, 'Polarity'].mean()
after_announcement_count = df.loc[is_after_announcement, 'id'].count()
after_announcement_message = f'After Announcement: {after_announcement_sentiment:.2f} ({after_announcement_count}) tweets'

plt.figure(figsize=(15,3))
sns.lineplot(x='Date', y='Polarity', data=df)
# Period labels sit at y=1.1, i.e. just above the plotted range set by ylim below.
label_style = dict(rotation=0, horizontalalignment='center')
plt.text(pd.to_datetime('2020-01-01').date(), 1.1, before_buyout_message, **label_style)
grok_date = pd.to_datetime(ai_train_day)
plt.axvline(x=grok_date, color='red', linestyle='--')
plt.text(grok_date, 1.1, after_announcement_message, **label_style)
X_date = pd.to_datetime(buyout_day)
plt.axvline(x=X_date, color='red', linestyle='--')
plt.text(X_date, 1.1, after_buyout_message, **label_style)
plt.title('Sentiment over Time', y=1.1)
plt.ylabel('Polarity Score', fontsize=12)
plt.xlabel('Date', fontsize=12)
plt.ylim(-1, 1)
plt.show()

In [None]:
# Word2Vec features for the regression model. This cell starts a new frame from
# the `text` column of the main twitter frame, tokenises it, trains Word2Vec on
# it, averages the word vectors per tweet, reduces them with PCA, and finally
# computes the TextBlob sentiment of every tweet.

# Explicit copy: the columns added below must never touch `twitter`.
w2v_df = twitter[['text']].copy()
w2v_df['tokenized_text'] = w2v_df['text'].apply(lambda x: x.lower().split())
corpus = w2v_df['tokenized_text'].tolist()

# seed + a single worker thread make training repeatable; with the default
# multi-threaded training the embeddings (and every number derived from them)
# change from run to run. gensim additionally requires a fixed PYTHONHASHSEED
# for reproducibility across interpreter launches.
model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, sg=0,
                 seed=SEED, workers=1)

def get_vector(text):
    """Return the mean Word2Vec embedding of one tokenised tweet.

    Parameters
    ----------
    text : iterable of str
        The tokens of a single tweet (a list of words, not a raw string).

    Returns
    -------
    numpy.ndarray
        Element-wise average of the vectors of every token found in the
        module-level Word2Vec ``model``. When no token is found (for example
        an empty token list) a zero vector of length ``model.vector_size`` is
        returned instead.
    """
    vectors = [model.wv[word] for word in text if word in model.wv]
    if not vectors:
        # Return an ndarray here too; the fallback used to be a plain list of
        # ints, so callers received two different types from one function.
        return np.zeros(model.vector_size)
    return sum(vectors) / len(vectors)  # Average word vectors

# One averaged embedding per tweet.
w2v_df['word2vec'] = w2v_df['tokenized_text'].apply(get_vector)

# Project the tweet embeddings onto their first 10 principal components.
# random_state pins the result whenever scikit-learn selects a randomized
# SVD solver; it is ignored by the deterministic solvers.
pca = PCA(n_components=10, random_state=SEED)
pca_result = pca.fit_transform(w2v_df['word2vec'].tolist())
# Columns pca_1 ... pca_10, one per component (replaces ten copy-pasted lines).
for component in range(pca_result.shape[1]):
    w2v_df[f'pca_{component + 1}'] = pca_result[:, component]
# Only the PCA columns are kept; drop the intermediates without inplace mutation.
w2v_df = w2v_df.drop(columns=['text', 'tokenized_text', 'word2vec'])

# TextBlob sentiment for every tweet, in the same order as the rows of w2v_df.
sentiments = [TextBlob(tweet).sentiment for tweet in tweets]
polarities = [sentiment.polarity for sentiment in sentiments]
subjectivities = [sentiment.subjectivity for sentiment in sentiments]

In [None]:
# Attach each tweet's subjectivity and polarity to the feature frame.
# Subjectivity is the regression target; the ten PCA components plus polarity
# are the inputs. The data is then split 80/20 into train and test sets.

w2v_df['Subjectivity'] = subjectivities
w2v_df['Polarity'] = polarities

target_label = 'Subjectivity'
pca_features = [f'pca_{rank}' for rank in range(1, 11)]
(feature1, feature2, feature3, feature4, feature5,
 feature6, feature7, feature8, feature9, feature10) = pca_features
feature11 = 'Polarity'
features = [*pca_features, feature11]


X, y = w2v_df[features], w2v_df[target_label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=SEED, test_size=0.2
)

In [None]:
# Train the model, print evaluation metrics, and plot predicted against actual
# values. The model appears to underfit somewhat: it fails to capture the
# relationship between the inputs for some tweets. Even so it performs
# moderately well (an R2 score of roughly 0.54 in the author's run; with the
# seed now fixed the exact figure may differ slightly). The graph loosely shows
# that as the actual value increases, most predicted values increase as well.

# NOTE: this rebinds `model`, replacing the Word2Vec model that get_vector
# reads; re-run the Word2Vec cell before calling get_vector again.
# random_state makes weight initialisation and batch sampling repeatable,
# matching the seeded train/test split.
model = MLPRegressor(random_state=SEED)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

df_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Plot actual vs. predicted values
df_results.plot(kind='scatter', x='Actual', y='Predicted')
plt.title('Actual vs. Predicted Values')
plt.show()

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')