In [None]:
!pip install vaderSentiment



In [None]:
import pandas as pd
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import spacy
from datetime import datetime
import re
import string
import unicodedata
import random
from gensim import corpora, models

### Sentiment analysis
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

### Load Reddit Data

In [None]:
# Read the r/datascience submissions + comments CSV from the tutorial repository,
# discard rows that have no body text, and keep the remaining bodies as a plain list.
url = 'https://github.com/EunCheolChoi0123/COMM557Tutorial/raw/main/Tutorial%205%20NLP%20(2)%20Topic%20Modeling/rdatascience_submission_comments_df.csv'
df = pd.read_csv(url).dropna(subset=['body'])
docs = df['body'].tolist()

### Preprocessing text

In [None]:
# remove HTML links and special characters
# choose the functions that suit your needs

# Compiled once instead of on every call, and written as a raw string so the
# regex escapes (\w, \d, \+, \.) are no longer invalid Python string escapes.
# The character class is unchanged (including its `+`-to-`=` range), so the
# same substrings are recognised as links.
_LINK_REGEX = re.compile(r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)')


def strip_links(text):
    """Replace every http/https link in ``text`` with a single space.

    Args:
        text: The raw string to clean.

    Returns:
        A copy of ``text`` in which each matched link is replaced by ``' '``.
        Text without links is returned unchanged.
    """
    # One regex substitution replaces each match where it occurs. The previous
    # find-then-str.replace loop broke when one link was a prefix of a later
    # one: replacing the shorter link first left the tail of the longer link
    # (e.g. "/page") behind in the text.
    return _LINK_REGEX.sub(' ', text)

def remove_special_characters(text, remove_digits=False):
    """Reduce ``text`` to ASCII letters, whitespace and (optionally) digits.

    Args:
        text: The string to clean.
        remove_digits: When True, digits are removed as well; by default they
            are kept.

    Returns:
        The cleaned string. Removed characters are deleted outright (not
        replaced by a space), so "don't" becomes "dont".
    """
    # Fold to ASCII *before* filtering. NFKD splits an accented letter into
    # its base letter plus a combining mark, and the ASCII encode drops the
    # mark, so "café" becomes "cafe". Running the ASCII-only regex first (as
    # before) deleted the whole accented letter and left nothing to fold.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [None]:
def process_posts(post):
    """Clean a single post: strip links first, then remove special characters."""
    return remove_special_characters(strip_links(post))

In [None]:
# Run the cleaning pipeline over every document, preserving order.
processed_docs = list(map(process_posts, docs))

In [None]:
processed_docs[30]

'Have you tried using SMOTE or something to generate synthetic examples of the less represented classes'

### Tokenizing and Corpus Creation

### Stemming/Lemmatization

In [None]:
# Stemming / Lemmatization

### Load the spaCy language model that `lemmatize_text` below runs text through.
# Install it once with: python -m spacy download en_core_web_sm
# Model details: https://spacy.io/models/en

nlp = spacy.load('en_core_web_sm')

def simple_stemmer(text):
    """Return ``text`` with every whitespace-separated word replaced by its Porter stem.

    Words are re-joined with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

def lemmatize_text(text):
    """Return ``text`` with each spaCy token replaced by its lemma, joined by spaces."""
    lemmas = []
    for token in nlp(text):
        # If the reported lemma is the literal '-PRON-', keep the token's
        # original text instead of the placeholder.
        lemmas.append(token.text if token.lemma_ == '-PRON-' else token.lemma_)
    return ' '.join(lemmas)

In [None]:
# Show the same cleaned post three ways: as-is, Porter-stemmed, and lemmatized.
print(processed_docs[30])
print('------\nstemmed post:')
print(simple_stemmer(processed_docs[30]))
print('------\nlemmatized post:')
print(lemmatize_text(processed_docs[30]))

Have you tried using SMOTE or something to generate synthetic examples of the less represented classes
------
stemmed post:
have you tri use smote or someth to gener synthet exampl of the less repres class
------
lemmatized post:
have you try use SMOTE or something to generate synthetic example of the less represented class


In [None]:
### Run this the first time
# Downloads the NLTK 'stopwords' corpus that is read via nltk.corpus.stopwords in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Tokenizer that remove_stopwords uses to split text into tokens.
tokenizer = ToktokTokenizer()
# NLTK's English stopword list (needs the 'stopwords' corpus downloaded above).
stopword_list = nltk.corpus.stopwords.words('english')

def remove_stopwords(text, is_lower_case=False):
    """Drop stopwords from ``text`` and return the remaining tokens joined by spaces.

    Args:
        text: The string to filter.
        is_lower_case: When True, tokens are compared against the stopword
            list as-is; otherwise each token is lower-cased for the comparison.
            Kept tokens retain their original casing either way.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        lookup_key = token if is_lower_case else token.lower()
        if lookup_key not in stopword_list:
            kept.append(token)
    return ' '.join(kept)

In [None]:
# Create the corpus: one list of lower-cased tokens per document.
# Each document has its stopwords removed, is lemmatized, and is then split on whitespace.
# NOTE(review): stopwords are filtered before lemmatization, so lemmas that are
# themselves stopwords can still end up in the corpus - confirm this is intended.
words_corpus = [
    lemmatize_text(remove_stopwords(doc)).lower().split()
    for doc in processed_docs
]
print(len(words_corpus))

# Map every distinct token to an integer id.
dictionary = corpora.Dictionary(words_corpus)
print(len(dictionary))

579
3333


In [None]:
dictionary.num_docs, dictionary.num_pos

(579, 15666)

In [None]:
# get token-id mapping
dictionary.token2id

{'academic': 0,
 'available': 1,
 'big': 2,
 'chicago': 3,
 'city': 4,
 'company': 5,
 'data': 6,
 'datum': 7,
 'definitely': 8,
 'develop': 9,
 'different': 10,
 'environmental': 11,
 'etc': 12,
 'every': 13,
 'expect': 14,
 'expertise': 15,
 'find': 16,
 'footprint': 17,
 'get': 18,
 'gig': 19,
 'heavy': 20,
 'highly': 21,
 'informatic': 22,
 'informatics': 23,
 'innovation': 24,
 'interesting': 25,
 'learn': 26,
 'like': 27,
 'london': 28,
 'look': 29,
 'machine': 30,
 'morphology': 31,
 'name': 32,
 'nonprofit': 33,
 'ny': 34,
 'other': 35,
 'package': 36,
 'phd': 37,
 'planning': 38,
 'private': 39,
 'publicly': 40,
 'python': 41,
 'r': 42,
 'reach': 43,
 'refer': 44,
 'regional': 45,
 'require': 46,
 'research': 47,
 'role': 48,
 's': 49,
 'science': 50,
 'specialize': 51,
 'start': 52,
 'student': 53,
 'technique': 54,
 'there': 55,
 'think': 56,
 'time': 57,
 'ton': 58,
 'urban': 59,
 'value': 60,
 'well': 61,
 'work': 62,
 'adnauseum': 63,
 'album': 64,
 'also': 65,
 'analysis

In [None]:
dictionary.most_common(10)

[('datum', 245),
 ('not', 219),
 ('model', 202),
 ('work', 146),
 ('do', 143),
 ('get', 135),
 ('job', 131),
 ('data', 107),
 ('good', 105),
 ('like', 103)]

In [None]:
# get bag-of-words representation of a document: list of (token_id, token_count) tuples
dictionary.doc2bow(words_corpus[0])

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 6),
 (5, 2),
 (6, 2),
 (7, 2),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 2),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 2),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 1),
 (50, 2),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 1),
 (59, 4),
 (60, 1),
 (61, 1),
 (62, 3)]

## Topic Modeling
Reference: https://radimrehurek.com/gensim/models/ldamodel.html

In [None]:
?dictionary.filter_extremes

In [None]:
# Rebuild the dictionary from the full corpus so this cell can be re-run safely,
# then prune the vocabulary: drop tokens that appear in fewer than 10 documents
# or in more than 20% of documents, and keep at most the 1000 most frequent of the rest.
dictionary = corpora.Dictionary(words_corpus)
dictionary.filter_extremes(no_below=10, no_above=0.2, keep_n=1000)

# Bag-of-words vectors built against the pruned vocabulary.
corpus_bow = [dictionary.doc2bow(text) for text in words_corpus]

# Term Frequency - Inverse Document Frequency
# (`corpora` and `models` come from the imports cell at the top of the notebook,
# so they are not re-imported here.)
tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

In [None]:
# `.corpus` is the bag-of-words input wrapped by the TF-IDF transform, i.e.
# (token_id, count) pairs - not the TF-IDF weights themselves.
corpus_tfidf.corpus

[[(0, 1),
  (1, 2),
  (2, 2),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 2),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 3)],
 [(28, 1),
  (29, 1),
  (30, 3),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 2),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 2),
  (49, 1),
  (50, 1)],
 [(28, 2),
  (33, 1),
  (36, 1),
  (51, 1),
  (52, 2),
  (53, 1),
  (54, 1),
  (55, 2),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1)],
 [(2, 1),
  (4, 1),
  (7, 1),
  (8, 1),
  (18, 8),
  (19, 1),
  (20, 1),
  (23, 1),
  (25, 1),
  (29, 1),
  (34, 3),
  (46, 1),
  (50, 2),
  (58, 2),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 2),
  (66, 1),
  (67, 1),
  (68, 5),
  (69, 1),
  (70, 1),
  (71, 1),

In [None]:
# Only 5 topics this time for simplicity
num_topics = 5
# LDA training is stochastic: without a fixed seed the topics change on every
# run and the printed output cannot be reproduced. With several workers the
# result may still vary slightly between runs - TODO confirm on your setup.
lda_model_tfidf = models.LdaMulticore(
    corpus_tfidf,
    num_topics=num_topics,
    id2word=dictionary,
    passes=4,
    workers=4,
    random_state=42,
)
# Print the 20 highest-weighted words of every topic.
for idx, topic in lda_model_tfidf.print_topics(-1, num_words=20):
    print('Topic: {} Word: {}'.format(idx, topic))



Topic: 0 Word: 0.027*"model" + 0.017*"good" + 0.015*"job" + 0.014*"yeah" + 0.013*"need" + 0.012*"read" + 0.012*"data" + 0.012*"course" + 0.012*"do" + 0.011*"ds" + 0.011*"bad" + 0.010*"think" + 0.010*"i" + 0.010*"get" + 0.010*"use" + 0.009*"tune" + 0.009*"m" + 0.009*"be" + 0.009*"like" + 0.009*"almost"
Topic: 1 Word: 0.015*"work" + 0.013*"use" + 0.011*"get" + 0.011*"ai" + 0.011*"matter" + 0.011*"job" + 0.010*"people" + 0.010*"data" + 0.009*"thing" + 0.009*"time" + 0.009*"science" + 0.009*"model" + 0.009*"say" + 0.008*"ds" + 0.008*"well" + 0.008*"math" + 0.008*"would" + 0.008*"see" + 0.008*"want" + 0.007*"try"
Topic: 2 Word: 0.015*"would" + 0.015*"s" + 0.014*"that" + 0.013*"get" + 0.013*"like" + 0.013*"good" + 0.012*"maybe" + 0.012*"project" + 0.012*"boss" + 0.011*"find" + 0.011*"look" + 0.011*"something" + 0.010*"do" + 0.010*"year" + 0.010*"job" + 0.010*"one" + 0.009*"other" + 0.009*"i" + 0.009*"help" + 0.009*"seem"
Topic: 3 Word: 0.044*"yes" + 0.026*"skill" + 0.017*"science" + 0.017*"l

## Visualizing Topic Modeling

In [None]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [None]:
import pyLDAvis

# pyLDAvis 3.3 renamed its gensim adapter from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`, so the old import fails on the 3.4.1 release that
# the install cell pulls in. Import whichever module exists and expose it under
# the legacy attribute so `pyLDAvis.gensim.prepare(...)` works on both.
try:
    import pyLDAvis.gensim_models as _gensim_adapter
except ImportError:  # older pyLDAvis (< 3.3)
    import pyLDAvis.gensim as _gensim_adapter
pyLDAvis.gensim = _gensim_adapter

In [None]:
# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()
# Build the visualisation from the trained model, its training corpus and the
# dictionary; as the last expression of the cell, the result is displayed.
pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, dictionary)

  and should_run_async(code)


# On Your Own
1. Scrape 1,000 submissions from a subreddit using the Tutorial 2 script. You don't have to scrape comments, user data, etc. for this session.
2. Conduct LDA topic modeling. How many topics make the most sense in your opinion?
3. (Optional) Name each topic.