<a href="https://colab.research.google.com/github/BachNguyenT/Project-Program-W23/blob/main/Project_Program.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Establishing a Reddit Instance with Praw**

In [None]:

!pip install praw
!pip install emoji
!pip install re


In [None]:
#For gathering data from Reddit
import praw #Python Reddit API wrapper
import pandas as pd #DataFrame etc.
from praw.models import MoreComments
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import FreqDist
import emoji #Remove emojis
import re #Remove links
import en_core_web_sm
import spacy
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
#Create a reddit connection with reddit API details
# The client id/secret are read from the environment (REDDIT_CLIENT_ID /
# REDDIT_CLIENT_SECRET) and prompted for when unset, so they are never stored
# in the notebook. Credentials that were previously committed here should be
# revoked in the Reddit app settings.
import os
from getpass import getpass

reddit = praw.Reddit(
    client_id=os.environ.get('REDDIT_CLIENT_ID') or getpass('Reddit client id: '),
    client_secret=os.environ.get('REDDIT_CLIENT_SECRET') or getpass('Reddit client secret: '),
    user_agent='Project Program UW',
)

**2. Obtaining Comments From A Post**

In [None]:
# Print the title and ID of up to 100 "hot" r/uwaterloo submissions,
# so that one of the IDs can be picked for analysis.
subreddit = reddit.subreddit('uwaterloo')
hot_submissions = subreddit.hot(limit=100)
for submission in hot_submissions:
  title, submission_id = submission.title, submission.id
  print(title) #Output: the submissions title
  print('Submission ID =', submission_id, '\n') #Output: the submission's id

**Defining a Submission Object**

In [None]:
# ID of the submission whose comments are analysed (IDs are printed by the listing cell).
POST_ID = '103fj53'
Post1 = reddit.submission(id=POST_ID)

**Storing all Comments Scraped in a List**

In [None]:
# Expand the comment tree first (limit = 100 is passed to replace_more),
# then collect the body text of every comment in the flattened tree.
Post1.comments.replace_more(limit=100)
Comments_All = [comment.body for comment in Post1.comments.list()]

print(Comments_All, '\n')
print('Total Comments Scraped = ', len(Comments_All))

**3. Preprocess The Comments**
- removing emojis
- tokenizing, removing links, etc
- removing stopwords
- normalizing words via lemmatizing


**Preprocessing String - Convert To A String Object**

In [None]:
# Coerce every scraped comment to str, then glue them into one text blob
# using ' , ' as the separator between comments.
List1 = list(map(str, Comments_All))
string_uncleaned = ' , '.join(List1)
string_uncleaned

Converting emojis to text codes (and optionally removing them)

In [None]:
# demojize() rewrites each emoji as a text code wrapped in colons; nothing is
# deleted from the string at this point.
string_emojiless = emoji.demojize(string_uncleaned)
#To strip the resulting emoji codes as well, uncomment the re.sub line below.
#example: 😊 becomes :smiling_face_with_smiling_eyes:
#string_emojiless = re.sub(r':[a-z_]+:', '', string_emojiless)

In [None]:
#Tokenizing splits the string into individual words (tokens).
#Each token is later scored on its own as positive or negative
#by the sentiment analyzer.

Tokenizing & Cleaning Strings

In [None]:
# Token pattern, written as a raw string so the backslashes reach the regex
# engine unchanged. Alternatives are tried left to right, so the URL branch
# must come before \w+; otherwise \w+ consumes "https" first and the link is
# split into ordinary word tokens instead of being recognised as a link.
#   https?://\S+  a whole link
#   \$[\d.]+      a dollar amount such as $12.50
#   \w+           a plain word
token_pattern = r'https?://\S+|\$[\d.]+|\w+'
tokenizer = RegexpTokenizer(token_pattern)
raw_tokens = tokenizer.tokenize(string_emojiless)

# Links carry no sentiment, so drop them once they are isolated as single tokens.
tokenized_string = [
    token for token in raw_tokens
    if not token.startswith(('http://', 'https://'))
]
print(tokenized_string)

Converting Tokens Into Lowercase

In [None]:
# Lower-case every token so later lookups (stop words, lexicon) are case-insensitive.
lower_string_tokenized = list(map(str.lower, tokenized_string))
print(lower_string_tokenized)

Removing Stopwords

In [None]:
# Load the spaCy English model and take its default stop-word collection.
nlp = en_core_web_sm.load()
all_stopwords = nlp.Defaults.stop_words

# Keep only the tokens that are not stop words.
text = lower_string_tokenized
tokens_without_sw = list(filter(lambda word: word not in all_stopwords, text))

print(tokens_without_sw)

Normalizing Words via Lemmatizing

In [None]:
import nltk
nltk.download('wordnet')  # resource fetched before WordNetLemmatizer is used

# Reduce each remaining token to its lemma.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens_without_sw))
print(lemmatized_tokens)

Normalizing Words via Stemming

In [None]:
# Alternative normalisation: Porter-stem the same stop-word-free tokens.
stemmer = PorterStemmer()
stem_tokens = list(map(stemmer.stem, tokens_without_sw))
print(stem_tokens)

In [None]:
# The lemmatized tokens (not the stemmed ones) are what the sentiment step iterates over.
cleaned_output = lemmatized_tokens

In [None]:
# Size of the data after each preprocessing stage.
# The first two values are len() of a *string*, i.e. character counts, so they
# are labelled as characters; the remaining values are lengths of token lists.
print("Number of characters in the original text = ", (len(string_uncleaned)))
print("Number of characters after converting emojis = ", (len(string_emojiless)))
print("Number of words after removing tokenizing and cleaning = ", (len(tokenized_string)))
print("Number of words after removing tokenizing, cleaning and removing stop words = ", (len(tokens_without_sw)))
print("Number of words after removing tokenizing, cleaning, removing stop words and lemmatized = ", (len(lemmatized_tokens)))
print("Number of words for final output = ", (len(cleaned_output)))

**4. Apply A Sentiment Analyzer (VADER)**

In [None]:
"""
Calculate each tokenized word's polarity scores using the VADER 
(Valence Aware Dictionary for Sentiment Reasoning) model
The polarity scores measure the positivity and negativity for each word.
Compound score: normalized to be between -1 (neg) and +1 (pos).
This provides a single unidimensional measure of sentiment for a given word.

We would then store this as a data frame object.
"""

Polarity Score of Words

In [None]:
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

sia = SIA()

#Define custom polarity scores
# Entries here are merged into the analyzer's lexicon before any scoring happens.
custom_polarity = {'lol': -0.2}
sia.lexicon.update(custom_polarity)

# One record per token: the analyzer's polarity scores plus the token itself
# under the 'words' key.
results = [
    {**sia.polarity_scores(token), 'words': token}
    for token in cleaned_output
]

pd.set_option('display.max_columns', None, 'max_colwidth', None)
df = pd.DataFrame.from_records(results)
df

In [None]:
# Label every token from its compound score:
#   compound > 0.10  ->  1 (positive)
#   compound < 0     -> -1 (negative)
#   anything else    ->  0 (neutral)
# NOTE(review): the two cut-offs are asymmetric. Earlier notes in this cell
# mentioned both +-0.10 and +-0.05; confirm which thresholds were intended.
POSITIVE_CUTOFF = 0.10
NEGATIVE_CUTOFF = 0

compound = df['compound']
df['label'] = 0
df.loc[compound > POSITIVE_CUTOFF, 'label'] = 1
df.loc[compound < NEGATIVE_CUTOFF, 'label'] = -1
df.head()

**Representation of Sentiment Results**

In [None]:
# How many tokens fall under each label value.
label_counts = df['label'].value_counts()
print(label_counts)

Representation Of Sentiments of Words

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Share of tokens per sentiment label, as a percentage of all tokens.
fig, ax = plt.subplots(figsize=(8,8))

# Bar names are derived from the label values and the bar order is fixed
# explicitly, so a bar can never be mislabelled when one of the classes is
# absent from the data (an absent class is drawn as 0%).
label_names = {-1: 'Negative', 0: 'Neutral', 1: 'Positive'}
counts = (df.label.value_counts(normalize = True) * 100).reindex(list(label_names), fill_value = 0)

sns.barplot(x = [label_names[value] for value in counts.index], y = counts.values, ax = ax)

ax.set_xlabel("Sentiment")
ax.set_ylabel("Percentage")
ax.set_title("Sentiment Of Words")

plt.show()

In [None]:
# Keep only the rows whose label is not 0 (i.e. drop the neutral tokens).
is_not_neutral = df['label'] != 0
df_positive_negative = df[is_not_neutral]
df_positive_negative.head()

In [None]:
# Token counts per label once neutral rows are excluded.
polar_label_counts = df_positive_negative['label'].value_counts()
print(polar_label_counts)

In [None]:
# Share of positive vs. negative tokens, as a percentage of the non-neutral tokens.
fig, ax = plt.subplots(figsize = (8,8))

# Bar names come from the label values and the order is fixed explicitly, so
# the bars stay correctly labelled even if one class has no tokens (drawn as 0%).
label_names = {-1: 'Negative', 1: 'Positive'}
counts = (df_positive_negative.label.value_counts(normalize = True)*100).reindex(list(label_names), fill_value = 0)

sns.barplot(x=[label_names[value] for value in counts.index], y=counts.values, ax=ax)

ax.set_xlabel("Sentiment")
ax.set_ylabel("Percentage")
ax.set_title("Positive vs. Negative Words")

plt.show()

**Simple Visualizations**

Frequency Distribution of Most Common Positive Words

In [None]:
# All tokens whose label is 1, as a plain Python list.
is_positive = df['label'] == 1
positive_words = df.loc[is_positive, 'words'].tolist()
print(positive_words)

In [None]:
# Count how often each positive token occurs and keep the 30 most common
# as (word, count) pairs.
positive_frequency = FreqDist()
positive_frequency.update(positive_words)
pos_freq = positive_frequency.most_common(30)
pos_freq

Frequency Distribution of Most Common Negative Words

In [None]:
# All tokens whose label is -1, as a plain Python list.
is_negative = df['label'] == -1
negative_words = df.loc[is_negative, 'words'].tolist()
print(negative_words)

In [None]:
# Count how often each negative token occurs and keep the 30 most common
# as (word, count) pairs.
negative_frequency = FreqDist()
negative_frequency.update(negative_words)
neg_freq = negative_frequency.most_common(30)
neg_freq

**Visualization Via WordCloud**

In [None]:
# Render each (word, count) pair as its str() form and join the pieces with ' , '.
Pos_words = list(map(str, pos_freq))
Pos_words_string = ' , '.join(Pos_words)

In [None]:
# Render each (word, count) pair as its str() form and join the pieces with ' , '.
Neg_words = list(map(str, neg_freq))
Neg_words_string = ' , '.join(Neg_words)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

#Create and generate a word cloud image
# The clouds are built directly from the (word, count) pairs. A text made by
# joining the str() of those pairs mentions every word exactly once, so the
# counts would never reach the cloud and word sizes would carry no information.
wordcloud_positive = WordCloud(background_color = "white").generate_from_frequencies(dict(pos_freq))
wordcloud_negative = WordCloud().generate_from_frequencies(dict(neg_freq))

#Display the generated image for Positive words
plt.imshow(wordcloud_positive, interpolation="bilinear")
plt.axis("off")
plt.title("Most Common Positive Words")
plt.show()

#Display the generated image for Negative words
plt.imshow(wordcloud_negative, interpolation = 'bilinear')
plt.axis("off")
plt.title("Most Common Negative Words")
plt.show()

**Bar Chart Of Most Common Positive and Negative Words By Count**




In [None]:
import plotly.express as px


def _frequency_bar_chart(freq_pairs, title):
    """Build the frequency table and bar chart for one sentiment class.

    Parameters:
        freq_pairs: list of (word, count) pairs, e.g. the result of
            FreqDist.most_common().
        title: chart title.

    Returns:
        (freq_df, chart): the two-column DataFrame and the plotly figure
        drawn from it.
    """
    # Naming the columns at construction keeps them present even when
    # freq_pairs is empty (a later rename of columns 0/1 would find nothing).
    freq_df = pd.DataFrame(freq_pairs, columns = ['Bar Graph Of Frequent Words', 'Count'])
    chart = px.bar(freq_df, x = 'Bar Graph Of Frequent Words', y = 'Count', title = title)
    return freq_df, chart


pos_freq_df, fig = _frequency_bar_chart(pos_freq, 'Commonly Used Positive Words By Count')
fig.show()

neg_freq_df, fig = _frequency_bar_chart(neg_freq, 'Commonly Used Negative Words By Count')
fig.show()