In [None]:
import pandas as pd
import numpy as np 
import matplotlib
import seaborn as sns
import plotly.express as px
import text2emotion as te
import matplotlib.pyplot as plt
%matplotlib inline 
from wordcloud import ImageColorGenerator, WordCloud, STOPWORDS
from textblob import TextBlob
from IPython.display import clear_output
from time import sleep
import missingno
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
nltk.download('vader_lexicon')
import re
import string

In [None]:
data = pd.read_csv('reddit_wsb.csv')
data.tail()

In [None]:
data.info()

In [None]:
missingno.matrix(data)

In [None]:
def wordcloud(data, title=""):
    """Draw a word cloud built from the non-null entries of ``data``.

    Parameters
    ----------
    data : object exposing ``dropna()`` that yields strings (e.g. a pandas Series)
        Text entries; they are joined with single spaces into one corpus.
    title : str, optional
        Text placed above the figure via ``suptitle``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Library default stop words plus the extra tokens this notebook excludes.
    ignored = set(STOPWORDS) | {
        "t", "co", "https", "U", "fuck", "amp", "fucking", "dope", "guy",
    }
    corpus = " ".join(data.dropna())
    cloud = WordCloud(
        stopwords=ignored,
        scale=5,
        max_font_size=55,
        max_words=550,
        background_color="white",
    ).generate(corpus)

    fig = plt.figure(1, figsize=(18, 18))
    plt.axis('off')
    fig.suptitle(title, fontsize=20)
    fig.subplots_adjust(top=2.3)
    plt.imshow(cloud, interpolation='bilinear')
    plt.show()

In [None]:
wordcloud(data['title'], title ="Most used words in titles")
wordcloud(data['body'], title ="Most used words in body")

In [None]:
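# Uncover emotions. NOTE: this cell is disabled, but the next cell reads `emotions`, which is only built by the loop below.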
#data['OverallText'] = data.title + " " + data.body.astype(str)
#emotions = []
#from tqdm import tqdm_notebook as tqdm
#tqdm().pandas()
#for text in tqdm(data.OverallText):
    #emotions.append(te.get_emotion(str(text)))

In [None]:
# NOTE: `emotions` is not assigned in any active cell; it is only built by the
# commented-out text2emotion loop in the previous cell, which must run first.
emotions_data = pd.DataFrame.from_dict(emotions)

# Copy every emotion score onto `data` under its lower-case column name.
for emotion_name in ("Happy", "Angry", "Sad", "Fear", "Surprise"):
    data[emotion_name.lower()] = emotions_data[emotion_name]

data.head()

In [None]:
emotions_data.head()

In [None]:
# Column label of the highest score in each row of emotions_data, as a plain list.
dominant_emotion = list(emotions_data.idxmax(axis=1))

# A list is assigned positionally, one label per row of `data`.
data['dominant'] = dominant_emotion
data.head()

In [None]:
data.to_csv('WSB_Sentiment_Analysis', index=False)

In [None]:
data = pd.read_csv('WSB_Sentiment_Analysis')

In [None]:
data.tail()

In [None]:
# Violin plot with the dominant-emotion label on x and the DataFrame row index on y,
# i.e. it shows where in the row order each dominant emotion occurs.
ax = sns.violinplot(x=data['dominant'], y=data.index, split=True, data=data)
ax.set_title('Dominant Emotions per Post')

In [None]:
#Set up the time
day_name= ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday','Sunday']
data['timestamp']=pd.to_datetime(data['timestamp'])
# Use the full calendar date rather than the day-of-month: with `.dt.day`, posts
# from different months that share a day number land in the same group and the
# groups sort 1..31 instead of chronologically from the first post.
data['date']=data['timestamp'].dt.date
# Map the 0-6 weekday number to its name; `.map` leaves missing timestamps as NaN
# instead of raising the way list indexing with NaN would.
data['weekday']=data['timestamp'].dt.weekday.map(dict(enumerate(day_name)))
data['hour']=data['timestamp'].dt.hour

In [None]:
data

In [None]:
px.histogram(data,x='weekday',color='weekday')

In [None]:
#data["date"] = pd.to_datetime(data.timestamp).dt.date
# Average only the numeric columns per (date, hour) group. `data` also holds
# text and datetime columns, and a plain `.mean()` raises TypeError on them
# with pandas >= 2.0.
by_hour = data.groupby(["date", "hour"]).mean(numeric_only=True)

In [None]:
pd.options.plotting.backend = "plotly"

In [None]:
# Base figure: the grouped "happy" mean plotted against the reset row index.
# The add_scatter/show calls below assume the plotly plotting backend selected
# in the previous cell.
fig = by_hour.reset_index().plot(
    y="happy",
    labels={
        "happy": "Score ratio of each emotion",
        "index": "Hours since 9:00 AM, 1/28/2021",
    },
    title="Emotions expressed by hour",
)

# Overlay one named line per emotion score.
for score_column, legend_name in (
    ("happy", "Happy"),
    ("sad", "Sad"),
    ("angry", "Angry"),
    ("surprise", "Surprise"),
    ("fear", "Fear"),
):
    fig.add_scatter(y=by_hour[score_column], mode='lines', name=legend_name)

fig.show()

In [None]:
# Rows whose dominant emotion label contains "Fear". na=False makes rows with a
# missing label count as non-matches instead of producing NaN in the mask,
# which boolean indexing rejects.
df1 = data[data['dominant'].str.contains("Fear", na=False)]

wordcloud(df1['title'], title= 'Most used fear words in title')
# The second cloud is built from the post bodies, so it is titled accordingly.
wordcloud(df1['body'], title= 'Most used fear words in body')

In [None]:
# Independent two-column frames (text + timestamp) with rows containing a
# missing value dropped and the text column lower-cased.
title_data = (
    data[['title', 'timestamp']]
    .dropna()
    .assign(title=lambda frame: frame['title'].str.lower())
)
body_data = (
    data[['body', 'timestamp']]
    .dropna()
    .assign(body=lambda frame: frame['body'].str.lower())
)

title_data

In [None]:
def _clean_text(text):
    """Normalise one post string: strip URLs, handles, lone letters and punctuation.

    Parameters
    ----------
    text : str
        Raw (already lower-cased) title or body.

    Returns
    -------
    str
        The remaining word tokens joined by single spaces.
    """
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Remove handles. This has to run before punctuation is stripped; once the
    # "@" is gone there is nothing left for the pattern to match.
    text = re.sub(r"@\S+", "", text)
    # Remove single letters surrounded by whitespace. Replace with a space, not
    # an empty string, so the neighbouring words are not glued together.
    text = re.sub(r"\s+[a-zA-Z]\s+", " ", text)
    # Keep only word tokens; joining with one space also collapses any runs of
    # whitespace, so no separate whitespace pass is needed.
    return " ".join(re.findall(r"\w+", text))


title_data.title = title_data.title.apply(_clean_text)
body_data.body   = body_data.body.apply(_clean_text)

#Remove Time From Timestamp
title_data.timestamp = pd.to_datetime(title_data.timestamp).dt.date
body_data.timestamp = pd.to_datetime(body_data.timestamp).dt.date

In [None]:
print(body_data.shape)
print(title_data.shape)

In [None]:
title_data

In [None]:
# Module-level accumulator kept for cells that still read `results` directly.
results = []

def Vader_SIA(data):
    """Score every text in ``data`` with the VADER sentiment analyser.

    Parameters
    ----------
    data : iterable of str (typically a pandas Series)
        Texts to score.

    Returns
    -------
    pandas.DataFrame
        One row per input text holding the keys returned by
        ``polarity_scores``. When ``data`` is a Series the rows carry its index.
        Returning a DataFrame (instead of the raw, ever-growing list) is what
        lets callers use ``.head()`` on the result.

    Side effects
    ------------
    The scores of this call are also appended to the module-level ``results``
    list, as before.
    """
    # Build the analyser once rather than once per text.
    analyzer = SIA()
    scores = [analyzer.polarity_scores(text) for text in data]
    results.extend(scores)

    row_index = data.index if isinstance(data, pd.Series) else None
    return pd.DataFrame(scores, index=row_index)

In [None]:
Vader_SIA(title_data['title']).head()

In [None]:
Vader_SIA(body_data['body']).head()

In [None]:
# Score each frame's own text and align the scores on that frame's index.
# The shared `results` list is not used here: it holds title and body scores
# back to back under a fresh 0..n index, so assigning from it gives body rows
# the scores of unrelated titles.
_vader = SIA()
_score_keys = ['neg', 'neu', 'pos', 'compound']
_column_for = {'compound': 'compound', 'positive': 'pos', 'negative': 'neg', 'neutral': 'neu'}

for _frame, _text_column in ((body_data, 'body'), (title_data, 'title')):
    _scores = pd.DataFrame(
        [_vader.polarity_scores(_text) for _text in _frame[_text_column]],
        index=_frame.index,
        columns=_score_keys,  # keeps the columns present even for an empty frame
    )
    for _target, _source in _column_for.items():
        _frame[_target] = _scores[_source]

title_data

In [None]:
# Module-level list kept for cells that read `sentiment` after calling the function.
sentiment = []
def Final_Sentiment(data):
    """Turn compound scores into 'Positive' / 'Negative' / 'Neutral' labels.

    Parameters
    ----------
    data : iterable of float
        Compound scores. ``>= 0.05`` is 'Positive', ``<= -0.05`` is 'Negative',
        everything else (including NaN) is 'Neutral'.

    Returns
    -------
    list of str
        One label per input score, in input order.

    Side effects
    ------------
    The module-level ``sentiment`` list is overwritten in place with the labels
    of this call, so labels from an earlier call (or an earlier run of the
    cell) no longer pile up in front of the current ones.
    """
    labels = []
    for score in data:
        if score >= 0.05:
            labels.append('Positive')
        elif score <= -0.05:
            labels.append('Negative')
        else:
            labels.append('Neutral')
    # Slice assignment keeps the identity of the shared list for existing readers.
    sentiment[:] = labels
    return labels

In [None]:
# Empty the shared list first so it holds only the labels of this call.
sentiment.clear()
Final_Sentiment(title_data['compound'])
# Attach the labels row by row using title_data's own index. A bare
# pd.DataFrame(sentiment) carries a fresh 0..n index and would be matched by
# label against title_data's filtered index.
title_data['sentiment'] = pd.Series(sentiment, index=title_data.index)
title_data

In [None]:
# Empty the shared list first; otherwise labels from an earlier call sit in
# front of the body labels.
sentiment.clear()
Final_Sentiment(body_data['compound'])
# Attach the labels row by row using body_data's own index. A bare
# pd.DataFrame(sentiment) carries a fresh 0..n index and would be matched by
# label against body_data's filtered index.
body_data['sentiment'] = pd.Series(sentiment, index=body_data.index)
body_data

In [None]:
def sentiment_plot(data, feature, title):
    """Show side-by-side bar charts of label counts and label shares.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column named by ``feature``.
    feature : str
        Column whose distinct values are counted.
    title : str
        Word inserted into both y-axis labels.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    counts = data[feature].value_counts()
    percent = counts / counts.sum()
    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 6))
    # Force the matplotlib backend: the notebook switches
    # pd.options.plotting.backend to "plotly" earlier, and drawing onto an
    # existing Axes via `ax=` only works with matplotlib.
    counts.plot(kind='bar', ax=ax1, color='orange', backend='matplotlib')
    percent.plot(kind='bar', ax=ax2, color='red', backend='matplotlib')
    ax1.set_ylabel(f'Counts : {title} sentiment', size=12)
    ax2.set_ylabel(f'Percentage : {title} sentiment', size=12)
    plt.tight_layout()
    plt.show()

In [None]:
sentiment_plot(title_data, 'sentiment', 'Title')
# The second figure is drawn from body_data, so its axis labels say 'Body'.
sentiment_plot(body_data, 'sentiment', 'Body')

In [None]:
wordcloud(title_data.loc[title_data['sentiment']=='Positive', 'title'], 
               title = 'Most used positive sentiment words in titles')
wordcloud(title_data.loc[title_data['sentiment']=='Negative', 'title'], 
               title = 'Most used negative sentiment words in titles')

In [None]:
wordcloud(body_data.loc[body_data['sentiment']=='Positive', 'body'], 
               title = 'Most used positive sentiment words in bodies')
wordcloud(body_data.loc[body_data['sentiment']=='Negative', 'body'], 
               title = 'Most used negative sentiment words in bodies')

In [None]:
ax = sns.boxplot(x=title_data['sentiment'], y=title_data['compound'], data=title_data)
ax.set_title('Sentiment strength')

In [None]:
ax = sns.boxplot(x=body_data['sentiment'], y=body_data['compound'], data=body_data)
ax.set_title('Sentiment strength')

In [None]:
def Textblob_Polarity(text):
    """Return the sum of the TextBlob polarity of every sentence in ``text``.

    The per-sentence values are added, not averaged; a text with no sentences
    yields ``0``.
    """
    return sum(sentence.sentiment.polarity for sentence in TextBlob(text).sentences)

def TextBlob_Subjectivity(text):
    """Return the sum of the TextBlob subjectivity of every sentence in ``text``.

    The per-sentence values are added, not averaged; a text with no sentences
    yields ``0``.
    """
    return sum(sentence.sentiment.subjectivity for sentence in TextBlob(text).sentences)

In [None]:
# TextBlob scores for the titles (helpers passed directly, no lambda wrapper).
title_data['polarity'] = title_data['title'].apply(Textblob_Polarity)
title_data['subjectivity'] = title_data['title'].apply(TextBlob_Subjectivity)

# Same two scores for the post bodies.
body_data['polarity'] = body_data['body'].apply(Textblob_Polarity)
body_data['subjectivity'] = body_data['body'].apply(TextBlob_Subjectivity)

In [None]:
title_data

In [None]:
def textblob_sentiment(df, feature, title):
    """Show density plots of the 'polarity' and 'subjectivity' columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'polarity' and 'subjectivity' columns.
    feature : str
        Not used by the function; kept so existing calls keep working.
    title : str
        Word inserted into both y-axis labels.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    polarity = df['polarity']
    subjectivity = df['subjectivity']
    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))
    # Force the matplotlib backend: the notebook switches
    # pd.options.plotting.backend to "plotly" earlier, and both kind='kde' and
    # drawing onto an existing Axes via `ax=` need matplotlib.
    polarity.plot(kind='kde', ax=ax1, color='magenta', backend='matplotlib')
    subjectivity.plot(kind='kde', ax=ax2, color='green', backend='matplotlib')
    ax1.set_ylabel(f'Sentiment polarity in {title}', size=12)
    ax2.set_ylabel(f'Sentiment subjectivity in {title}', size=12)
    plt.tight_layout()
    plt.show()

In [None]:
textblob_sentiment(title_data, 'title', 'title')
textblob_sentiment(body_data, 'body', 'body')

In [None]:
# Group by full calendar date rather than day-of-month: with `.dt.day`, posts
# from different months that share a day number are averaged together and the
# groups sort 1..31 instead of chronologically, contradicting the axis label.
# Assumes data['timestamp'] was already converted to datetime in the
# time-setup cell.
title_data['date']=data['timestamp'].dt.date
title_data['hour']=data['timestamp'].dt.hour
# numeric_only=True: title_data also holds text and date columns, and a plain
# `.mean()` raises TypeError on them with pandas >= 2.0.
by_day = title_data.groupby("date").mean(numeric_only=True)

# The add_scatter/show calls assume the plotly plotting backend selected earlier.
fig = by_day.reset_index().plot(y = "compound", labels={
                     "compound": "Score ratios of each sentiment", 
                     "index": "Days from 1/28/2021"
                 },
                title="Title Sentiment scores by day")
fig.add_scatter(y=by_day['subjectivity'], mode='lines', name = "Subjectivity")
fig.add_scatter(y=by_day['compound'], mode='lines', name = "Compound")
fig.add_scatter(y=by_day['polarity'], mode='lines', name = "Polarity")
fig.add_scatter(y=by_day['positive'], mode='lines', name = "Positive")
fig.add_scatter(y=by_day['negative'], mode='lines', name = "Negative")

fig.show()

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
text = 'Hello my dear algotraders! I hope that this article is enjoyable to read. I wish you the best of luck in your endeavours'

sentence_tokenization = sent_tokenize(text)
sentence_tokenization

In [None]:
word_tokenization = word_tokenize(text)
word_tokenization

In [None]:
freq_dist = FreqDist(word_tokenization)
freq_dist.most_common(1)

In [None]:
stop_words=set(stopwords.words("english"))
stop_words

In [None]:
ps = PorterStemmer()
example_words = ["python","pythoner","pythoning","pythoned"]

for w in example_words:
    print(ps.stem(w))

In [None]:
lem = WordNetLemmatizer()

word = "frying"
print("Lemmatization:",lem.lemmatize(word,"v"))
print("Stemming:",ps.stem(word))

In [None]:
model_data = body_data[['body', 'sentiment']].copy()

def recoding(data):
    """Map a sentiment label to an integer class.

    'Positive' -> 1, 'Negative' -> -1, any other value -> 0.
    """
    if data == 'Positive':
        return 1
    if data == 'Negative':
        return -1
    return 0

model_data['sentiment'] = model_data['sentiment'].apply(recoding)
model_data.tail()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer(stop_words='english')
text_matrix = cv.fit_transform(model_data['body'])
text_matrix

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    text_matrix, model_data['sentiment'], test_size=0.2, random_state=42)

In [None]:
clf = MultinomialNB().fit(X_train, y_train)
# Score on the held-out split. Predicting the rows the model was fitted on
# reports training accuracy, which overstates how well the classifier
# generalises and leaves X_test / y_test unused.
predicted = clf.predict(X_test)
print("Accuracy of classifier:",metrics.accuracy_score(y_test, predicted))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tf_id = TfidfVectorizer()
text_td_id_matrix = tf_id.fit_transform(model_data['body'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    text_td_id_matrix, model_data['sentiment'], test_size=0.2, random_state=42)

In [None]:
clf = MultinomialNB().fit(X_train, y_train)
# Score on the held-out split. Predicting the rows the model was fitted on
# reports training accuracy, which overstates how well the classifier
# generalises and leaves X_test / y_test unused.
predicted = clf.predict(X_test)
print("Accuracy of classifier:",metrics.accuracy_score(y_test, predicted))

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# TF-IDF features, 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    text_td_id_matrix, model_data['sentiment'], test_size=0.2, random_state=42)

# `multi_class='auto'` is dropped: it equals the default and the argument is
# deprecated in recent scikit-learn releases.
logreg = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Score on the held-out split rather than on the rows the model was fitted on.
predicted = logreg.predict(X_test)
print("Accuracy of classifier:",metrics.accuracy_score(y_test, predicted))

In [None]:
# Bag-of-words count features, 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    text_matrix, model_data['sentiment'], test_size=0.2, random_state=42)

# `multi_class='auto'` is dropped: it equals the default and the argument is
# deprecated in recent scikit-learn releases.
logreg = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Score on the held-out split rather than on the rows the model was fitted on.
predicted = logreg.predict(X_test)
print("Accuracy of classifier:",metrics.accuracy_score(y_test, predicted))

In [None]:
import scikitplot as skplt
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import plot_confusion_matrix

In [None]:
# Held-out predictions from the most recently fitted logistic regression.
logistic_pred = logreg.predict(X_test)

print("Accuracy:")
response = accuracy_score(y_test, logistic_pred)
print(response)

# Predictions keyed by model name.
prediction = {'Logistic': logistic_pred}

confusion_matrix = cm(y_test, prediction['Logistic'])
print(confusion_matrix)

# Raw-count confusion matrix first, then the normalised one.
for use_normalised in (False, True):
    skplt.metrics.plot_confusion_matrix(
        y_test, prediction['Logistic'], normalize=use_normalised)
    plt.show()

print(classification_report(y_test, prediction['Logistic']))

In [None]:
model_data = title_data[['title', 'sentiment']].copy()
model_data['sentiment'] = model_data['sentiment'].apply(recoding)
model_data = model_data[model_data.sentiment != 0]
model_data

In [None]:
title_matrix = cv.fit_transform(model_data['title'])
title_matrix

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    title_matrix, model_data['sentiment'], test_size=0.2, random_state=42)

In [None]:
# Fit on the title training split. `multi_class='auto'` is dropped: it equals
# the default and the argument is deprecated in recent scikit-learn releases.
logreg = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Predict the held-out split once and reuse the result below.
logistic_pred = logreg.predict(X_test)

print("Accuracy:")
response = accuracy_score(y_test, logistic_pred)
print(response)

# Predictions keyed by model name.
prediction = {'Logistic': logistic_pred}

confusion_matrix = cm(y_test, prediction['Logistic'])
print(confusion_matrix)

# Raw-count confusion matrix first, then the normalised one.
for use_normalised in (False, True):
    skplt.metrics.plot_confusion_matrix(
        y_test, prediction['Logistic'], normalize=use_normalised)
    plt.show()

print(classification_report(y_test, prediction['Logistic']))