# DID REDDITORS INFLUENCE CREDIT SUISSE COLLAPSE?
This research aims to determine whether the Reddit financial community had any influence on the decline of Credit Suisse's stock price. It was conducted as a sentiment analysis of Reddit posts, using Python, NLP and machine-learning techniques.

## Environment set up

In [None]:
## Import packages

!pip install emoji arch

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import re
import os
import emoji
import nltk
import arch

from PIL import Image
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder, QuadgramCollocationFinder
from nltk.collocations import BigramAssocMeasures, TrigramAssocMeasures, QuadgramAssocMeasures
from nltk.sentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud, ImageColorGenerator
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import RFE
from sklearn.neighbors import KNeighborsClassifier
from gensim.models import Word2Vec
from matplotlib.colors import LinearSegmentedColormap, ListedColormap
from matplotlib.patches import Ellipse
from matplotlib.patches import Rectangle
from sklearn.inspection import DecisionBoundaryDisplay
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Fetch the NLTK resources used further down: tokenizers (sent_tokenize / word_tokenize),
# the stop-word list, the VADER lexicon, the POS tagger and WordNet (lemmatizer).
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' — confirm against the installed version.
nltk.download(['punkt', 'stopwords', 'vader_lexicon', 'averaged_perceptron_tagger', 'wordnet'], quiet=True)



True

In [None]:
## Matplotlib style for figures

# Start from Matplotlib's defaults, then apply the house style in one update
plt.rcdefaults()
plt.rcParams.update({
    # Fonts
    'font.family': 'serif',
    'font.size': 12,
    'axes.labelsize': 12,
    'axes.titlesize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    # Axes: open frame (no top/right spine) and a single dark-blue colour cycle
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.prop_cycle': plt.cycler(color=['#003662']),
    # Ticks and lines
    'ytick.direction': 'in',
    'lines.color': 'k',
    # Grid: light grey, dashed, thin, slightly transparent
    'grid.color': '0.8',
    'grid.linestyle': '--',
    'grid.linewidth': 0.5,
    'grid.alpha': 0.7,
})

In [None]:
def StandScal(df):
    """Z-score every numeric column of ``df`` in place and return ``df``.

    Each numeric column is replaced by ``(x - mean) / std``, where ``mean`` and
    ``std`` (sample standard deviation, ddof=1) are computed ignoring NaNs.
    NaN cells stay NaN. Non-numeric columns (strings, dates, ...) are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardise. It is modified in place; pass ``df.copy()`` to
        keep the original values.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for col in df.columns:
        # Skip text/date columns: only numeric data can be standardised
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue
        # mean()/std() skip NaNs by default; whole-column arithmetic keeps NaNs
        # as NaN and upcasts integer columns to float in one step
        mean = df[col].mean()
        std = df[col].std()
        df[col] = (df[col] - mean) / std
    return df

## 1. Data Processing

### Removal of irrelevant posts

In [None]:
## Removing irrelevant posts

# Load the raw Reddit dump (first CSV column is the row index) and preview it
df = pd.read_csv('Rawdata.csv', index_col = 0)
display(df.head())

# Merge title and body into a single 'text' column; missing cells become ''
df = df.fillna('')
df['text'] = df['title'] + ' ' + df['content']
df = df.drop(['title', 'content'], axis=1)
df['karma'] = df['karma'] + 1

# Number of whitespace-separated words in each post
df['length'] = df['text'].str.split().str.len()

# Keep posts strictly between 2022-08-01 00:00 and 2023-03-20 00:00
df['date'] = pd.to_datetime(df['date'])
in_window = (df['date'] > pd.to_datetime('2022-08-01')) & (df['date'] < pd.to_datetime('2023-03-20'))
df = df[in_window].reset_index(drop=True)

# Remove automated stock-news digests: any post containing one of these markers,
# except the post ids explicitly whitelisted in `to_save`
forbidden_words = ['CLICK HERE', 'Downgrades:', 'volume leaders', 'Q4 2022 Letters & Reports', 'Holdings Inc']
to_save = ['11ushdl', 'zt4t1y', 'yckj9o', 'ybqjek']
to_drop = [row for row, post, post_id in zip(df.index, df['text'], df['post_id'])
           if post_id not in to_save and any(marker in post for marker in forbidden_words)]

df = df.drop(to_drop).reset_index(drop=True)

Unnamed: 0,title,post_id,date,karma,upvote,content,subreddit
0,I thought of you guys when I listened to this ...,ek5jql,2020-01-05 00:58:56,0,0.5,Bloomberg View columnist Barry Ritholtz interv...,algotrading
1,Historical analyst target prices,gtz5kk,2020-05-31 13:36:45,0,0.33,Analysts' stock reports from investment banks ...,algotrading
2,The Swiss government just borrowed $9 billion ...,y83ebs,2022-10-19 13:55:33,3431,0.95,,amcstock
3,"REALITY CHECK: Peter Hahn, one of the major fi...",xuht8x,2022-10-03 12:15:15,1957,0.89,I think it's important for us to manage expect...,amcstock
4,*** Ultimate AMC Timeline (Updated April 25) ***,mygdc3,2021-04-25 20:05:47,2756,0.99,# [CLICK HERE to go to the April 28 update.](h...,amcstock


### Sentiment Extraction

In [None]:
## Sentiment extraction

# Emoticon extraction: keep each post's emojis as space-separated ':name:' tokens
df['emojis'] = [' '.join(emoji.demojize(c) for c in post if c in emoji.EMOJI_DATA)
                for post in df['text']]

# Custom emoji lexicon read from the spreadsheet: ':name:' -> sentiment score
emoj = pd.read_excel('Emojissent.xlsx')
emoj['Description'] = [emoji.demojize(c) for c in emoj['Symbol']]
emoj.drop('Symbol', axis=1, inplace=True)
emoj.set_index('Description', inplace=True)
new_words=emoj['Sentiment'].to_dict()

# Text cleaning
def _clean_sentence(sentence):
    """Return one sentence lower-cased and stripped of markup, links and noise.

    Only letters, apostrophes, '?' and '!' survive. Domain abbreviations are
    expanded first, and the word 'credit' is removed at the end.
    """
    clean = sentence.lower()
    # Normalise curly apostrophes first so the contraction rules below also
    # catch "don’t" / "doesn’t"
    clean = re.sub(r'\’', "'", clean)
    clean = re.sub(r'\bcs\b', 'credit suisse', clean)
    clean = re.sub(r'\bdd\b', 'double down', clean)
    clean = re.sub(r'\bcds\b', 'default swap', clean)
    clean = re.sub(r'u\.s\.', 'usa', clean)
    clean = re.sub(r'&#[A-Za-z0-9]+', '', clean)          # HTML numeric entities
    clean = re.sub(r'tl(;|)dr', '', clean)
    clean = re.sub(r'\([^()]*\)', '', clean)              # parenthesised text (incl. markdown link targets)
    clean = re.sub(r'\[[^\[\]]*\]', '', clean)            # bracketed text
    clean = re.sub(r"(?:\@|http?|https?|www)\S+", "", clean)
    clean = re.sub(r"don\'t", 'do not', clean)
    clean = re.sub(r"doesn\'t", 'does not', clean)
    clean = re.sub(r"[^A-Za-z\'\?\!]", ' ', clean)
    # Drop stray single letters, but keep the "t" of "n't" contractions
    # ("isn't", "can't", ...) so the negation is not destroyed before scoring
    clean = re.sub(r"(?<!n')\b\w\b", '', clean)
    clean = re.sub(r'\bcredit\b', '', clean)
    return ' '.join(clean.split())

# Clean sentence by sentence, then glue the sentences of a post back together
df['text'] = [' '.join(_clean_sentence(sentence) for sentence in sent_tokenize(post))
              for post in df['text']]

# Computing sentiment scores (VADER compound score on text + emoji tokens)
# NOTE(review): some NLTK versions strip leading/trailing punctuation from tokens,
# which would turn ':rocket:' into 'rocket' and bypass the emoji keys added
# below — verify that emoji scores are actually picked up.
sia = SentimentIntensityAnalyzer()
sia.lexicon.update(new_words)
sentiment_scores = [sia.polarity_scores(doc)['compound'] for doc in df['text'] + ' ' + df['emojis']]
df['sentiment'] = sentiment_scores
df.to_csv('Processeddata.csv')

### Exploratory analysis: figures

In [None]:
## Exploratory analysis: length, subreddit, sentiment distribution

# Length of the posts: word count at each decile (10%, ..., 90%)
perc = np.arange(10, 100, 10)
quant = np.percentile(df['length'], perc)
positions = np.arange(1, len(perc) + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(positions, quant)
ax.set_title("Percentile distribution of the posts' length")
ax.set_xticks(positions, labels=[str(i)+'%' for i in perc])
ax.set_axisbelow(True)
ax.set_xlabel('Percentile')
# The bars show word counts, not frequencies
ax.set_ylabel('Number of words')
ax.grid()
# Print the value just above each bar
for x_pos, value in zip(positions, quant):
  ax.text(x_pos, value+4, int(value),
             fontsize=9, horizontalalignment='center')
mean_max_box = f'Mean: {int(df["length"].mean())}\n\nMax: {int(df["length"].max())}'
ax.text(1.7, 230, mean_max_box, fontsize=12, color='black', bbox=dict(facecolor=(0.95, 0.95, 0.95), edgecolor='black'))
fig.savefig("Post length", dpi=300)
plt.close()

# Top 10 Subreddit: number of posts per subreddit, ten largest, shown alphabetically
post_counts = df[['subreddit', 'karma']].groupby('subreddit').count()
subr = post_counts.sort_values('karma', ascending=False).head(10).sort_index()
bar_positions = np.arange(len(subr))
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_positions, subr['karma'])
ax.set_title('Posts downloaded for the 10 major subreddits')
ax.set_xticks(bar_positions, subr.index, rotation=45)
ax.set_ylabel('Number of posts')
fig.savefig("Subreddit", dpi=300, bbox_inches='tight')
plt.close()

# Frequency distribution of sentiment
df = pd.read_csv('Processeddata.csv', index_col=0)
df['periodo'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m')
# Monthly mean sentiment
periodo = df[['periodo', 'sentiment']].groupby('periodo').mean()
aggr_sent = periodo.copy()
# Polarity flags per post, then monthly shares of positive / negative / neutral posts
df['pos'] = df['sentiment'] > 0
df['neg'] = df['sentiment'] < 0
df['neu'] = df['sentiment'] == 0
aggr_pol = df[['periodo', 'pos', 'neg', 'neu']].groupby('periodo').sum()
aggr_pol['tot'] = aggr_pol.sum(axis=1)
# Row-wise division works for any number of months (no hard-coded shape)
aggr_pol = aggr_pol.div(aggr_pol['tot'], axis=0)

# Histogram of the scores just reloaded from disk, so the figure cannot
# depend on a stale in-memory list from another cell
scores = df['sentiment'].to_numpy()
fig, ax = plt.subplots(figsize=(10,6))
weights = np.ones_like(scores) / len(scores)   # bar heights become relative frequencies
perc = np.arange(0, 0.301, 0.05)
ax.hist(scores, weights=weights, bins=11, edgecolor='white')
ax.set_title('Density distribution of \nsentiment scores')
ax.set_yticks(perc)
ax.set_yticklabels([str(int(i))+'%' for i in perc*100])
ax.set_ylabel('Relative frequency')
ax.set_xlabel('Sentiment scores')
fig.savefig("Sentiment distribution", dpi=300)
plt.close()

# Monthly distribution of sentiment: stacked shares of positive / negative / neutral posts
n_months = len(aggr_pol)
months = np.arange(n_months)
# Plain arrays so that [i] is always positional (the frame is indexed by 'YYYY-MM' strings)
pos_share = aggr_pol['pos'].to_numpy()
neg_share = aggr_pol['neg'].to_numpy()
neu_share = aggr_pol['neu'].to_numpy()

fig, ax = plt.subplots(figsize=(10,6))
ax.bar(months, height=pos_share)
ax.bar(months, height=neg_share, bottom=pos_share, color='#FF4500')
ax.bar(months, height=neu_share, bottom=pos_share+neg_share, color='#B2BEB5')
# Share labels centred inside each segment
for i in months:
  ax.text(i, pos_share[i]/2,
          np.round(pos_share[i],2), horizontalalignment='center', color='white')
  ax.text(i, neg_share[i]/2 + pos_share[i],
          np.round(neg_share[i],2), horizontalalignment='center')
  ax.text(i, neu_share[i]/2 + neg_share[i] + pos_share[i] - 0.01,
          np.round(neu_share[i], 2), horizontalalignment='center')
# Monthly mean sentiment printed in a strip above the bars
for i, sentmean in enumerate(periodo['sentiment']):
  ax.text(i, 1.11, np.round(sentmean, 2), horizontalalignment='center', fontsize=10)
ax.set_title('Percentage sentiment score per month')
ax.set_xticks(months)
ax.set_xticklabels(periodo.index, rotation=45)
ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.13])
ax.set_yticklabels([0, 0.2, 0.4, 0.6, 0.8, 1.0,'Mean of \nsentiment'])
ax.set_xlabel('Time')
ax.set_ylabel('Sentiment scores (%)')
ax.legend(['Positive', 'Negative', 'Neutral'], loc='center right', reverse=True, fontsize=10)
# Separator under the mean strip; extra room on the right is left for the legend
ax.plot([-1, n_months - 0.5], [1.06, 1.06], color='black')
ax.set_xlim([-0.8, n_months + 1.5])
ax.set_ylim([0, 1.2])
fig.savefig("Monthly sentiment", dpi=300,  bbox_inches='tight')
plt.close()

# Stock prices graphs: nominal and return
stock = pd.read_csv('CS.csv')
stock['Date'] = pd.to_datetime(stock['Date']).dt.date
stock['return'] = stock['Price'].pct_change()
stock = stock.dropna()   # the first day has no return

# Both figures share the same time window on the x axis
window_start = pd.Timestamp(2022,7,25)
window_end = pd.Timestamp(2023,3,30)

# Price level
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(stock['Date'], stock['Price'])
ax.set_xlim([window_start, window_end])
ax.set_title("Credit Suisse's nominal price across time")
ax.set_ylabel('Stock price')
ax.set_xlabel('Time', labelpad=10)
fig.savefig('Nominal prices', dpi = 300)
plt.close()

# Daily returns with a thin zero reference line
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(stock['Date'], stock['return'])
ax.plot([window_start, pd.Timestamp(2023,3,15)], [0, 0], color='#FF4500', linewidth=0.5)
ax.set_xlim([window_start, window_end])
ax.set_title('Percentage returns across time')
ax.set_xlabel('Time', labelpad=10)
ax.set_ylabel('Return')
fig.savefig('Returns', dpi = 300)
plt.close()

# ACF of returns (left) and squared returns (right), lags 0..10 shown
fig, ax = plt.subplots(1, 2, figsize=(10,8))
ret = stock['return']
ret_sq = ret**2
# Approximate 95% band for a white-noise series: +/- 2/sqrt(n)
critical = 2/np.sqrt(len(ret))

# Axes.acorr does not remove the mean by default; without demeaning, a strictly
# positive series such as squared returns shows spuriously large "autocorrelation"
legs = ax[0].acorr(ret - ret.mean())
points = ax[0].scatter(legs[0], legs[1])
conf_int = ax[0].plot([-1, 11], [critical, critical], color='red')
ax[0].plot([-1, 11], [-critical, -critical], color='red')
ax[0].set_xlim((-0.5, 10.5))
ax[0].set_title('ACF for $u_t$')

legs_squared = ax[1].acorr(ret_sq - ret_sq.mean())
ax[1].scatter(legs_squared[0], legs_squared[1])
ax[1].plot([-1, 11], [critical, critical], color='red')
ax[1].plot([-1, 11], [-critical, -critical], color='red')
ax[1].set_xlim((-0.5, 10.5))
ax[1].set_ylim(-0.25, 1.05)
ax[1].set_title('ACF for $u_t^2$')
# One shared legend placed below and between the two panels
ax[1].legend(handles=[points, conf_int[0]], labels=['autocorrelation', 'confidence interval'], loc=(-0.7, -0.2), ncols=2)
plt.savefig('Autocorrelation', dpi=300, bbox_inches='tight')
plt.close()

### NLP textual processing

In [None]:
## NLP Processing: Tokenization, stopwords removal, lemmatization
# Reload the cleaned posts; empty cells come back as NaN from CSV, so turn them into ''
df = pd.read_csv('Processeddata.csv', index_col=0).fillna('')

# Part Of Speech Tagging
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the WordNet POS constant used for lemmatization.

    The decision is made on the first letter of ``treebank_tag``:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty one) falls back to 'n' (noun).
    """
    initial = treebank_tag[:1]
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'N':
        return wordnet.NOUN
    if initial == 'R':
        return wordnet.ADV
    return 'n'

# Lemmatization
lemmatizer = WordNetLemmatizer()

# Stopwords
# Domain words added to the English list (bank/ticker names, analyst-rating vocabulary, ...)
more_stopwords = ['credit', 'suisse', 'bank', 'stock', 'click', 'chart', 'market',
                  'link', 'tgt', 'price', 'swiss', 'amc', 'ape', 'gme', 'upgrade', 'downgrade',
                  'upgraded', 'downgraded', 'outperform', 'underperform', 'perform',
                  'overweight', 'equal', 'weight', 'underweight', 'neutral', 'jp', 'morgan',
                  'minute', 'topaz', 'meltzer', 'sc', 'basket', 'creditsuisse']
# Standard stop words that are deliberately kept in the text
less_stopwords = ['not', 'no', 'nor', 'up', 'down', 'out']
# Tokens that must not be lemmatized
untouchable = ['moass', 'usa']

stop_words_list = stopwords.words("english") + more_stopwords
for kept_word in less_stopwords:
    # .index raises ValueError if the word is not in the list
    del stop_words_list[stop_words_list.index(kept_word)]
stop_words = set(stop_words_list)

# Text processing
def _normalise_post(text):
    """Tokenize, POS-tag and lemmatize one post, then drop stop words."""
    letters_only = re.sub(r"[^A-Za-z]", ' ', text)
    kept = []
    for token, tag in pos_tag(word_tokenize(letters_only)):
        # 'untouchable' tokens are kept verbatim; everything else is lemmatized
        if token in untouchable:
            lemma = token
        else:
            lemma = lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
        if lemma not in stop_words:
            kept.append(lemma)
    # Collapse any repeated whitespace
    return ' '.join(' '.join(kept).split())

df['text'] = [_normalise_post(post) for post in df['text']]

# Posts with nothing left after filtering are removed before saving
empties = df.index[df['text'].str.strip() == ''].tolist()
df = df.drop(empties).reset_index(drop=True)
df.to_csv('Sentdata.csv')

### Regression figures

In [None]:
## Regression figures: LASSO geometric interpretation and Regression line

# LASSO geometric interpretation: a rotated square (the L1 constraint set)
# and concentric ellipses (RSS contours) around the unconstrained estimate
lato = 5
square = Rectangle((0, -lato*0.5*np.sqrt(2)), lato, lato, edgecolor='#003662', angle=45, alpha=0.5)
fig, ax = plt.subplots()
ax.add_patch(square)
for i in [0.8,1.5,2.5,4]:
  ellipse = Ellipse((4, 10.4), width=2*i, height=4*i, angle=145, edgecolor='#FF4500', facecolor='none', linewidth=0.5)
  ax.add_patch(ellipse)
# Hand-drawn coordinate axes (the Matplotlib frame is switched off below)
ax.arrow(0, -5, 0, 22, head_width=0.3, head_length=0.8, fc='black', ec='black', linewidth=0.5)
ax.arrow(-5, 0, 18, 0, head_width=0.8, head_length=0.3, fc='black', ec='black', linewidth=0.5)
ax.set_xlim(-5, 18)
ax.set_ylim(-5, 18)
ax.axis('off')
# Centre of the ellipses and axis labels
ax.scatter(4, 10.4, color='black', s=5)
ax.text(4.2, 10.2, r"$\hat{\beta}$")
ax.text(-1.5, 16, r"$β_2$")
ax.text(12, -1.5, r"$β_1$")
# Labels are matched to the first two patches added: the square, then the first ellipse
ax.legend(['Constraint set', 'RSS contour'], reverse=True, loc='center right')
ax.set_title('Geometric interpretation of LASSO regularisation', pad=10)
fig.savefig('LASSO', dpi=300)
plt.close()

# Linear regression illustration: a line, noisy observations and their residuals
np.random.seed(3)
x = np.linspace(1, 10, 25)
y = 0.5*x + 3                      # points on the regression line
e = np.random.uniform(-3, 3, 25)   # simulated noise
y_hat = 0.5*x + 3 + e              # simulated observations (line + noise)
fig, ax = plt.subplots()
line, = ax.plot(x, y)
# The first and last points are left out of the drawing
observed = ax.scatter(x[1:-1], y_hat[1:-1], color='#FF4500')
for i in range(1, len(x) - 1):
  residual, = ax.plot([x[i], x[i]], [y[i], y_hat[i]], color='black', linewidth=0.5)
ax.set_xlim(0, 12)
ax.set_ylim(0, 12)
ax.set_title('Linear Regression with residuals')
ax.set_xlabel('x', labelpad=10)
ax.set_ylabel('y', rotation=0, labelpad=15)
# Tick labels are rescaled to a 0-1 range for presentation
ax.set_xticks(np.linspace(0, 12, 6), labels=np.round(np.linspace(0, 1, 6), 2))
ax.set_yticks(np.linspace(2, 12, 5), labels=np.round(np.linspace(0.2, 1, 5), 2))
# Explicit handles: a bare label list is matched to the line artists first,
# which would label two residual segments as 'Observed values' and 'Residuals'
ax.legend(handles=[line, observed, residual],
          labels=['Regression line', 'Observed values', 'Residuals'], loc=(1, 0.7))
fig.savefig('Regression', dpi=300, bbox_inches='tight')
plt.close()

## 3. Analysis

### Correlation matrix

In [None]:
# Correlation across features (numeric columns of the daily panel, complete rows only)
daily = pd.read_csv('Dailydata.csv')
daily.dropna(inplace=True)
fig, ax = plt.subplots(figsize=(10,5))
corr_matrix = daily.corr(numeric_only=True)
# Fix the colour scale to [-1, 1] so the diverging map is centred on zero
# and its two colours really mean negative and positive correlation
im = ax.imshow(corr_matrix, cmap='PuOr', interpolation='nearest', vmin=-1, vmax=1)
ax.set_yticks(np.arange(corr_matrix.shape[0]), labels=corr_matrix.columns)
ax.set_xticks(np.arange(corr_matrix.shape[0]), labels=corr_matrix.columns, rotation=90)
ax.figure.colorbar(im)
ax.set_title('Correlation across the features', pad=10)
fig.savefig('Heatmap', dpi=300, bbox_inches='tight')
plt.close()

### WordClouds

In [None]:
## WordClouds for N-grams

# Load the lemmatized posts; empty cells read back as NaN become ''
df = pd.read_csv('Sentdata.csv', index_col=0).fillna('')

# Converting collocations to single units
def tuple_keys_to_unique_strings(dictionary):
    """Return a copy of ``dictionary`` whose tuple keys are joined with spaces.

    ``{('a', 'b'): 1}`` becomes ``{'a b': 1}``. If two keys join to the same
    string, the value of the later one wins.
    """
    return {' '.join(words): score for words, score in dictionary.items()}

# One long token stream made of every post
corpus = word_tokenize(' '.join(df['text']))

# Mask image that shapes the clouds, and the two-colour palette used to draw them
image_path = "circle.png"
image = Image.open(image_path)
imask = np.array(image)
colors = [(1.0, 0.270588, 0.2), (0.0, 0.133333, 0.384314)]
cmap = LinearSegmentedColormap.from_list('custom', colors, N=2)

# Figure titles, with the collocation finder / measure used for n >= 2
ngram = ['Single words', 'Bigrams', 'Trigrams', 'Quadgrams']
coll_finder = [BigramCollocationFinder, TrigramCollocationFinder, QuadgramCollocationFinder]
assoc_measure = [BigramAssocMeasures(), TrigramAssocMeasures(), QuadgramAssocMeasures()]

for i, label in enumerate(ngram):
  wordcloud = WordCloud(background_color="white", mask=imask, random_state=1, colormap=cmap)
  if i == 0:
    # Single words: WordCloud counts the words of the raw text itself
    wordcloud.generate(' '.join(corpus))
  else:
    # n-grams: relative frequency of every n-gram in the token stream
    finder = coll_finder[i-1].from_words(corpus)
    fdist = finder.score_ngrams(assoc_measure[i-1].raw_freq)
    wordcloud.generate_from_frequencies(tuple_keys_to_unique_strings(dict(fdist)))
  fig, ax = plt.subplots(figsize = (8,5))
  ax.set_title(f'{label} Frequency', fontsize=15)
  ax.imshow(wordcloud, interpolation="bilinear")
  # Crop the image to the 50..1350 pixel window (y axis inverted, as for images)
  ax.set_ylim([1350, 50])
  ax.set_xlim([50, 1350])
  ax.axis('off')
  fig.savefig(label, dpi=300)
  plt.close()

### OLS: Returns vs posts data & posts vs financial data

In [None]:
# Two OLS regressions on the daily panel (complete rows only, standardised regressors):
#   1. returns on the posts data
#   2. number of posts on the financial data
daily = pd.read_csv('Dailydata.csv')
daily.dropna(inplace=True)

ols_specs = [
    ('returns', ['sentiment', 'length', 'posts', 'karma', 'upvote']),
    ('posts', ['price', 'volume', 'volatility', 'cds', 'returns']),
]
for target, regressors in ols_specs:
    # Scale a copy so `daily` itself keeps its raw values
    X = sm.add_constant(StandScal(daily[regressors].copy()))
    Y = daily[target]
    reg = sm.OLS(Y, X)
    model = reg.fit()
    print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.052
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     1.399
Date:                Mon, 20 Nov 2023   Prob (F-statistic):              0.229
Time:                        22:09:39   Log-Likelihood:                 228.51
No. Observations:                 134   AIC:                            -445.0
Df Residuals:                     128   BIC:                            -427.6
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0071      0.004     -1.828      0.0

### KNN & Word2Vec: Returns vs post data

In [None]:
# KNN classification of the daily return direction from
#   (1) aggregated post statistics, (2) Word2Vec embeddings, (3) both combined.
# Train = days before 2023-01-01, test = days from 2023-01-01 onwards.
df = pd.read_csv('Sentdata.csv', index_col=0)
daily = pd.read_csv('Dailydata.csv')
daily.dropna(inplace=True)

split_date = pd.to_datetime('2023-01-01')
post_features = ['posts', 'karma', 'upvote', 'length', 'sentiment']

def _knn_grid(X_train, Y_train, X_test, Y_test, k_values):
    """Fit one KNN classifier per k, print the best k and its test accuracy.

    Returns the list of test accuracies, in the order of ``k_values``.
    NOTE: k is selected on the test set itself, so the printed score is an
    optimistic estimate of out-of-sample accuracy.
    """
    scores = []
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors = k)
        knn.fit(X_train, Y_train)
        scores.append(knn.score(X_test, Y_test))
    print('K optimal:', k_values[np.argmax(scores)], '\nScore:', np.max(scores))
    return scores

# Posts data
# The class label must come from the RAW returns: after standardisation
# '>= 0' would mean 'above the sample mean', not 'price went up'
price_up = daily['returns'] >= 0
dataset = StandScal(daily)          # scales `daily` in place; `dataset` is the same object
dataset['returns'] = price_up
dataset['date'] = pd.to_datetime(dataset['date'])
train = dataset[dataset['date']<split_date]
test = dataset[dataset['date']>=split_date]
X_train = train[post_features].values
Y_train = train['returns'].values
X_test = test[post_features].values
Y_test = test['returns'].values

scores = _knn_grid(X_train, Y_train, X_test, Y_test, np.arange(3, 52, 2))

#  Word2vec embedding
corpus = [word_tokenize(text) for text in df['text']]
# Fixed seed + single worker so that re-running the cell gives the same vectors
# (full determinism may also require PYTHONHASHSEED — see the gensim docs)
w2v = Word2Vec(corpus, min_count=1, vector_size=300, seed=0, workers=1)

# A post is represented by the sum of its word vectors
post_w2v_df = []
for text in df['text']:
  # Start from a zero vector so that an empty post still yields a full-length row
  post_score = np.zeros(w2v.wv.vector_size, dtype=np.float32)
  for word in text.split():
    post_score += w2v.wv[word]
  post_w2v_df.append(post_score)
post_w2v_df = pd.DataFrame(post_w2v_df, index=pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d'))
post_w2v_df.columns=[str(j) for j in post_w2v_df.columns]

# W2v: a day is the mean of its post vectors, aligned with the rows of `daily`
post_w2v_df.sort_index(inplace=True)
ts_word = post_w2v_df.groupby(post_w2v_df.index).mean()
ts_word = ts_word.loc[daily['date'].astype('str')]
ts_word['returns'] = dataset['returns'].values
ts_word['date'] = pd.to_datetime(ts_word.index)
train = ts_word[ts_word['date']<split_date]
test = ts_word[ts_word['date']>=split_date]
X_train = train.drop(['returns', 'date'], axis=1)
X_test = test.drop(['returns', 'date'], axis=1)
Y_train = train['returns'].values
Y_test = test['returns'].values

scores = _knn_grid(X_train, Y_train, X_test, Y_test, np.arange(3, 51, 2))

# Posts + w2v: append every column of the daily panel, then remove the
# label, the date and the financial variables from the feature matrix
ts_word.drop(['date', 'returns'], axis=1, inplace=True)
for c in dataset.columns:
  ts_word[c] = dataset[c].values
non_features = ['returns', 'date', 'price', 'volume', 'volatility', 'cds']
train = ts_word[ts_word['date']<split_date]
test = ts_word[ts_word['date']>=split_date]
X_train = train.drop(non_features, axis=1)
X_test = test.drop(non_features, axis=1)
Y_train = train['returns'].values
Y_test = test['returns'].values

scores = _knn_grid(X_train, Y_train, X_test, Y_test, np.arange(3, 51, 2))

K optimal: 39 
Score: 0.6666666666666666
K optimal: 9 
Score: 0.6
K optimal: 9 
Score: 0.6


In [None]:
# KNN decision boundaries in the (sentiment, posts) plane
df = pd.read_csv('Sentdata.csv', index_col=0)
daily = pd.read_csv('Dailydata.csv')
daily.dropna(inplace=True)
# Take the class from the RAW returns: once standardised, '>= 0' would mean
# 'above the sample mean' and the 'Price Up' legend entry would be wrong
price_up = daily['returns'] >= 0
dataset = StandScal(daily)          # scales `daily` in place; `dataset` is the same object
dataset['returns'] = price_up
dataset['date'] = pd.to_datetime(dataset['date'])
train = dataset[dataset['date']<pd.to_datetime('2023-01-01')]
test = dataset[dataset['date']>=pd.to_datetime('2023-01-01')]
X = dataset[['sentiment', 'posts']]
Y = dataset['returns'].values
X_train = train[['sentiment', 'posts']]
Y_train = train['returns'].values
X_test = test[['sentiment', 'posts']]
Y_test = test['returns'].values
knn = KNeighborsClassifier(n_neighbors = 9)
knn.fit(X_train, Y_train)

fig, ax = plt.subplots(figsize=(10, 8))

# Coloured background = predicted class on a grid spanning the test data;
# the points drawn on top are ALL days (train and test), coloured by true class
disp = DecisionBoundaryDisplay.from_estimator(knn, X_test, response_method="predict",
                                              plot_method="pcolormesh", shading="auto", alpha=0.65, ax=ax)
scatter = disp.ax_.scatter(X.iloc[:, 0], X.iloc[:, 1], c=Y, edgecolors="k", s=50)
disp.ax_.legend(scatter.legend_elements()[0], ['Price Down', 'Price Up'], loc="upper left", title="Classes")
ax.set_ylim((-1, 3))
ax.set_title('KNN decision boundaries')
plt.savefig('KNN boundaries', dpi=300)
plt.close()

### Post traffic analysis

In [None]:
# Post traffic analysis: daily number of posts with the activity peaks highlighted
peaks = ['2022-10-03', '2022-10-27', '2022-11-23', '2023-03-15']
daily = pd.read_csv('Dailydata.csv')
daily['date'] = pd.to_datetime(daily['date']).dt.date

# Locate the peaks in the data instead of hard-coding their row numbers and heights
peak_rows = daily[daily['date'].isin(list(pd.to_datetime(peaks).date))]
i_coord = peak_rows.index.tolist()
y_coord = peak_rows['posts'].tolist()

fig, ax = plt.subplots(figsize=(10,6))
ax.plot(daily.index, daily['posts'])
ax.scatter(i_coord, y_coord, c='red', s=70)
# Date label just above each marker
for row, n_posts in zip(i_coord, y_coord):
    ax.text(row, n_posts+3, daily.loc[row, 'date'], fontsize=10)
ax.set_ylim(-1, 150)
ax.set_title('Peaks of online activity', fontsize=15)
ax.set_ylabel('Number of posts')
# One tick per first-of-month, labelled with the month name (every row is scanned)
month_starts = [row for row in daily.index if daily.loc[row, 'date'].day == 1]
ax.set_xticks(month_starts,
              labels=[pd.Timestamp(daily.loc[row, 'date']).month_name() for row in month_starts])
fig.savefig("Peaks of activity", dpi=300)
plt.close()

### Pre-peak and post-peak activity

In [None]:
## Pre and post peak analysis

daily = pd.read_csv('Dailydata.csv')
daily.dropna(inplace=True)
daily['date'] = pd.to_datetime(daily['date']).dt.date
daily.sort_values('date', inplace=True)
# NOTE(review): the last peak here is 2023-03-09, while the traffic figure
# uses 2023-03-15 — confirm which date is intended.
peaks = pd.to_datetime(['2022-10-03', '2022-10-27', '2022-11-23', '2023-03-09']).date
window = 5

# Row selectors, in output order: the `window` days before a peak, then the
# `window` days starting on the peak day itself
window_selectors = [
  lambda p: daily[daily['date']<p].tail(window),
  lambda p: daily[daily['date']>=p].head(window),
]

df_prepost = []
for select in window_selectors:
  # Pool the windows of all peaks into one sample
  df_window = pd.concat([select(p) for p in peaks], ignore_index=True)
  coef_window = np.round(np.corrcoef(df_window['sentiment'], df_window['returns'])[0,1], 2)
  desc_window = [df_window[col].agg(stat) for stat in ('mean', 'std') for col in ('sentiment', 'returns')]
  desc_window.append(coef_window)
  df_prepost.append(desc_window)

  # OLS of returns on the standardised posts data within the pooled window
  # (fitted, but the summary is not printed)
  X = sm.add_constant(StandScal(df_window[['sentiment', 'posts', 'karma', 'upvote']].copy()))
  Y = df_window['returns']
  reg = sm.OLS(Y, X)
  model = reg.fit()
  #print(model.summary())

desc_df = pd.DataFrame(df_prepost[:2], index = ['Pre-peak', 'Post-peak'],
              columns=['Average sentiment', 'Average return', 'Std sentiment', 'std returns', 'Correlation sentiment-returns'])
display(desc_df)

Unnamed: 0,Average sentiment,Average return,Std sentiment,std returns,Correlation sentiment-returns
Pre-peak,0.171874,-0.00522,0.412206,0.028259,0.29
Post-peak,-0.001396,-0.020031,0.172191,0.07601,0.07


### Short selling

In [None]:
## Short selling
# OLS of the number of shares on posts data and price.
# StandScal z-scores every numeric column of `bim` once (target included);
# the regressors are therefore not standardised a second time.
bim = pd.read_csv('Shortdata.csv')
bim = StandScal(bim)
X = bim[['karma', 'upvote', 'length', 'sentiment', 'posts', 'price']]
X = sm.add_constant(X)
Y = bim['shares']
reg = sm.OLS(Y, X)
model = reg.fit()
#print(model.summary())

### Embedding + Lasso

In [None]:
## Embedding representation + Lasso Regression for Variable Selection

# Colors for wordclouds
def get_color(value):
    """Return the hex colour for a coefficient: blue when ``value >= 0``, orange-red otherwise."""
    if value >= 0:
        return '#003662'
    return '#FF4500'

# Load the daily panel, keep complete rows only and z-score its numeric columns
daily = pd.read_csv('Dailydata.csv')
daily = StandScal(daily.dropna().copy())

# Daily Aggregation + WordCloud
def LASSOed(wordembed, figname, min_lambda, max_lambda):
  """Select words with LASSO and draw the surviving coefficients as a word cloud.

  The per-post word features are summed per day, aligned with the rows of the
  global ``daily`` frame and regressed on ``daily['returns']``. Days before
  2023-01-01 are the training set, later days the test set. 25 penalties
  evenly spaced in [min_lambda, max_lambda] are tried and the one with the
  highest test R² is kept.

  Parameters
  ----------
  wordembed : pandas.DataFrame
      One row per post, indexed by the post date as a 'YYYY-MM-DD' string,
      one column per word. It is not modified.
  figname : str
      Figure title prefix and output file name.
  min_lambda, max_lambda : float
      Bounds of the penalty grid.

  Relies on the module-level names ``daily``, ``imask`` (word-cloud mask) and
  ``get_color``. Prints the selected penalty and its score; returns None.
  NOTE: the penalty is chosen on the test set, so the printed score is optimistic.
  """
  # Daily totals; groupby sorts the dates and leaves the caller's frame untouched
  ts_word = wordembed.groupby(wordembed.index).sum()
  ts_word = ts_word.loc[daily['date'].astype('str')]
  # 'CS_' prefix: these helper columns must not clash with word columns
  ts_word['CS_returns'] = daily['returns'].values
  ts_word['CS_date'] = pd.to_datetime(ts_word.index)
  train = ts_word[ts_word['CS_date']<pd.to_datetime('2023-01-01')]
  test = ts_word[ts_word['CS_date']>=pd.to_datetime('2023-01-01')]
  X_train = train.drop(['CS_returns', 'CS_date'], axis=1)
  X_test = test.drop(['CS_returns', 'CS_date'], axis=1)
  Y_train = train['CS_returns'].values
  Y_test = test['CS_returns'].values

  # Grid search over the penalty; keep every fitted model so the best one
  # does not have to be refitted
  lambda_array = np.linspace(min_lambda, max_lambda, 25)
  models = []
  scores = []
  for l in lambda_array:
      reg = Lasso(alpha=l, max_iter=100000)
      reg.fit(X_train, Y_train)
      models.append(reg)
      scores.append(reg.score(X_test, Y_test))
  best = int(np.argmax(scores))
  best_l = lambda_array[best]
  print('Lasso optimal:', best_l, '\nScore:', np.max(scores))
  reg = models[best]

  # Words whose coefficient was not shrunk to zero
  survivors = pd.DataFrame(reg.coef_, X_train.columns, columns=['coef'])
  coeffic = survivors[survivors['coef']!=0]['coef']
  if coeffic.empty:
    # Nothing to draw: WordCloud cannot be built from an empty frequency table
    print(f'{figname}: every coefficient was shrunk to zero, no word cloud drawn')
    return

  # Word size = squared coefficient, word colour = sign of the coefficient
  word_colors = {word: get_color(value) for word, value in zip(coeffic.index, coeffic)}
  fig, ax = plt.subplots(figsize = (8,5))
  wordcloud = WordCloud(background_color="white", mask=imask, random_state=1,
                        color_func=lambda word, *args, **kwargs: word_colors[word])
  wordcloud.generate_from_frequencies(dict(coeffic**2))
  ax.imshow(wordcloud, interpolation="bilinear")
  ax.set_title(f'{figname}: LASSO coefficients', fontsize=15)
  ax.axis('off')
  legend_elements = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='#003662', label='Positive Coefficient'),
                    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='#FF4500', label='Negative Coefficient')]
  ax.legend(handles=legend_elements, loc =(0.8,0.85), fontsize='x-small')
  fig.savefig(figname, dpi=300, bbox_inches='tight')
  plt.close()

In [None]:
## Bag-of-word

# Load the data
df = pd.read_csv('Sentdata.csv', index_col=0)

corpus = df['text'].tolist()
# Every post row is indexed by its date as a 'YYYY-MM-DD' string
post_dates = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')

# Parallel settings, one entry per vectorizer: figure name, LASSO penalty
# bounds and the minimum column total a word needs to be kept
vec_name = ['Bag-of-word', 'tf-idf']
vectorizers = [CountVectorizer(), TfidfVectorizer()]
min_lambda_array = [0.05, 0.001]
max_lambda_array = [0.15, 0.01]
penalties = [50, 5]

for name, vectorizer, lambda_lo, lambda_hi, min_total in zip(
    vec_name, vectorizers, min_lambda_array, max_lambda_array, penalties):
  encoded = vectorizer.fit_transform(corpus)
  features = vectorizer.get_feature_names_out()
  onehot = pd.DataFrame(encoded.toarray(), columns=features, index=post_dates)
  # Discard words whose total weight over all posts is below the threshold
  rare_words = onehot.columns[onehot.sum()<min_total]
  onehot = onehot.drop(columns=rare_words)
  LASSOed(onehot, name, lambda_lo, lambda_hi)

Lasso optimal: 0.15 
Score: -1.4538223124226524
Lasso optimal: 0.01 
Score: -0.8063408660959512


### GARCH

In [None]:
# Daily percentage returns of the share price
stock = pd.read_csv('CS.csv')
stock['Date'] = pd.to_datetime(stock['Date']).dt.date
stock['returns'] = 100*stock['Price'].pct_change()
stock = stock.dropna()   # the first day has no return

# Constant-mean ARCH(3) model of the returns
np.random.seed(0)
model = arch.arch_model(stock['returns'], vol='Arch', p=3)
results = model.fit(update_freq=0, disp='off')
print(results.summary())

                      Constant Mean - ARCH Model Results                      
Dep. Variable:                returns   R-squared:                       0.000
Mean Model:             Constant Mean   Adj. R-squared:                  0.000
Vol Model:                       ARCH   Log-Likelihood:               -453.930
Distribution:                  Normal   AIC:                           917.860
Method:            Maximum Likelihood   BIC:                           933.173
                                        No. Observations:                  158
Date:                Mon, Nov 20 2023   Df Residuals:                      157
Time:                        22:09:53   Df Model:                            1
                                Mean Model                                
                 coef    std err          t      P>|t|    95.0% Conf. Int.
--------------------------------------------------------------------------
mu            -0.5866      0.337     -1.743  8.136e-02 [ -1.246,

### Backward selection

In [None]:
# Backward selection: print the adjusted R² of the full model, then of the
# models kept by recursive feature elimination with 4, 3, 2 and 1 features
df = pd.read_csv('Dailydata.csv', index_col=0)
df.dropna(inplace=True)
X = StandScal(df[['sentiment', 'upvote', 'posts', 'length', 'karma']].copy())
Y = df['returns']
n = X.shape[0]

def _adjusted_r2(r2, n_obs, n_features):
  """Adjusted R² of a model with an intercept and ``n_features`` regressors.

  Uses 1 - (1 - R²)(n - 1)/(n - p - 1), the definition that matches the
  'Adj. R-squared' line of an OLS summary.
  """
  return 1 - (1 - r2)*(n_obs - 1)/(n_obs - n_features - 1)

# Full model
reg = LinearRegression()
reg.fit(X, Y)
p = reg.n_features_in_
print(reg.feature_names_in_, _adjusted_r2(reg.score(X, Y), n, p))

# Drop one feature at a time, down to a single feature
reg = LinearRegression()
for n_keep in range(X.shape[1] - 1, 0, -1):
  sfs = RFE(reg, n_features_to_select=n_keep)
  sfs.fit(X, Y)
  p = sfs.n_features_
  print(sfs.get_feature_names_out(), _adjusted_r2(sfs.score(X, Y), n, p))


['sentiment' 'upvote' 'posts' 'length' 'karma'] 0.007022778225361459
['sentiment' 'posts' 'length' 'karma'] 0.014413661074284989
['sentiment' 'posts' 'karma'] 0.01877223273493589
['sentiment' 'posts'] 0.024637781137960224
['posts'] 0.02084657906736853
