In [1]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [2]:
import pandas as pd
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import spacy
from datetime import datetime
import re
import string
import unicodedata
import random
from gensim import corpora

### Sentiment analysis
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

### Load Reddit data

In [3]:
df = pd.read_csv("/content/rdatascience_submission_comments_df.csv", index_col=[0])
df['body'] = df['body'].astype('str')
df['body']

0                           I think it’s just a bad time.
1       I’m on the hunt as well atm but with 3 YOE and...
2       Data engineering is not really a natural progr...
3       If you haven't yet check out r/dataengineering...
4       Contractor at FAANg is definitely not somethin...
                              ...                        
1136    In most cases, the company extends these offer...
1137    I wanna hear more about the consultant route o...
1138    My thought process was if a company is willing...
1139                                             Yea sure
1140    An intern is temporary. A minor waste of resou...
Name: body, Length: 1141, dtype: object

### Preprocessing text

In [4]:
# remove HTML links and special characters
# choose functions that suit your neeeds

def strip_links(text):
    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links         = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ' ')
    return text

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [5]:
def process_posts(post):
    post = strip_links(post)
    post = remove_special_characters(post)
    return post

In [6]:
df.loc[:, 'body'] = df.body.apply(process_posts)
df.body

0                             I think its just a bad time
1       Im on the hunt as well atm but with 3 YOE and ...
2       Data engineering is not really a natural progr...
3       If you havent yet check out rdataengineering\n...
4       Contractor at FAANg is definitely not somethin...
                              ...                        
1136    In most cases the company extends these offers...
1137    I wanna hear more about the consultant route o...
1138    My thought process was if a company is willing...
1139                                             Yea sure
1140    An intern is temporary A minor waste of resour...
Name: body, Length: 1141, dtype: object

In [7]:
# generate a random submission for sentiment analysis:
i = random.choice(df.index)
post = df.loc[i, 'body']
print(post)

Also the program in GT starts in a year if I applied now Is there a way to start before that Is it difficult to get accepted into the program


### Sentiment Analysis
#### TextBlob

In [8]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')
nltk.download('movie_reviews')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [9]:
blob = TextBlob(post)
blob.sentiment

Sentiment(polarity=-0.5, subjectivity=1.0)

Textblob sentiment output:

Polarity in [-1, 1] := [most negative, most positive]

Subjectivity in [0, 1] := [factual, personal opinion]

In [10]:
blob.tokens

WordList(['Also', 'the', 'program', 'in', 'GT', 'starts', 'in', 'a', 'year', 'if', 'I', 'applied', 'now', 'Is', 'there', 'a', 'way', 'to', 'start', 'before', 'that', 'Is', 'it', 'difficult', 'to', 'get', 'accepted', 'into', 'the', 'program'])

In [11]:
blob.tags

[('Also', 'RB'),
 ('the', 'DT'),
 ('program', 'NN'),
 ('in', 'IN'),
 ('GT', 'NNP'),
 ('starts', 'NNS'),
 ('in', 'IN'),
 ('a', 'DT'),
 ('year', 'NN'),
 ('if', 'IN'),
 ('I', 'PRP'),
 ('applied', 'VBD'),
 ('now', 'RB'),
 ('Is', 'VBZ'),
 ('there', 'EX'),
 ('a', 'DT'),
 ('way', 'NN'),
 ('to', 'TO'),
 ('start', 'VB'),
 ('before', 'IN'),
 ('that', 'DT'),
 ('Is', 'VBZ'),
 ('it', 'PRP'),
 ('difficult', 'JJ'),
 ('to', 'TO'),
 ('get', 'VB'),
 ('accepted', 'VBN'),
 ('into', 'IN'),
 ('the', 'DT'),
 ('program', 'NN')]

In [12]:
blob.noun_phrases

WordList(['gt'])

In [13]:
blob.words.count('pandemic')

0

In [14]:
test_msg1 = 'this is not the best football team'

In [15]:
blob = TextBlob(test_msg1)
blob.sentiment

Sentiment(polarity=1.0, subjectivity=0.3)

In [16]:
test_msg2 = 'hey this is not too bad'

In [17]:
blob = TextBlob(test_msg2)
blob.sentiment

Sentiment(polarity=-0.6999999999999998, subjectivity=0.6666666666666666)

In [18]:
# NaiveBayesAnalyzer option, trained on movie reviews
from textblob.sentiments import NaiveBayesAnalyzer

print(test_msg1)
blob = TextBlob(test_msg1, analyzer=NaiveBayesAnalyzer())
print(blob.sentiment)

print(test_msg2)
blob = TextBlob(test_msg2, analyzer=NaiveBayesAnalyzer())
print(blob.sentiment)

this is not the best football team
Sentiment(classification='neg', p_pos=0.41870201702509297, p_neg=0.5812979829749073)
hey this is not too bad
Sentiment(classification='neg', p_pos=0.21032203786065207, p_neg=0.7896779621393482)


In [19]:
blob = TextBlob(post, analyzer=NaiveBayesAnalyzer())
blob.sentiment

Sentiment(classification='pos', p_pos=0.8462095023744807, p_neg=0.15379049762552024)

Train your own classifers:
https://textblob.readthedocs.io/en/dev/api_reference.html#module-textblob.classifiers

#### VADER (Valence Aware Dictionary and Sentiment Reasoner)

In [20]:
#find sentiment vader
analyser = SentimentIntensityAnalyzer()

In [21]:
help(analyser.polarity_scores)

Help on method polarity_scores in module vaderSentiment.vaderSentiment:

polarity_scores(text) method of vaderSentiment.vaderSentiment.SentimentIntensityAnalyzer instance
    Return a float for sentiment strength based on the input text.
    Positive values are positive valence, negative value are negative
    valence.



Compound scores in [-1, 1] := [most negative, most positive]

neg, neu, pos in [0, 1]

In [22]:
sentiment = analyser.polarity_scores('This is an example of a happy post')
print(sentiment)

{'neg': 0.0, 'neu': 0.654, 'pos': 0.346, 'compound': 0.5719}


In [23]:
sentiment['neg'] + sentiment['neu'] + sentiment['pos']

1.0

In [24]:
# impact of punctuation
sentiment = analyser.polarity_scores('This is an example of a happy post!')
print(sentiment)

{'neg': 0.0, 'neu': 0.637, 'pos': 0.363, 'compound': 0.6114}


In [25]:
# impact of emoji
sentiment = analyser.polarity_scores('This is an example of a happy 😁 ❤️ post! ')
print(sentiment)

{'neg': 0.0, 'neu': 0.65, 'pos': 0.35, 'compound': 0.7901}


In [26]:
print(test_msg1)
sentiment = analyser.polarity_scores(test_msg1)
print(sentiment)

print(test_msg2)
sentiment = analyser.polarity_scores(test_msg2)
print(sentiment)

this is not the best football team
{'neg': 0.36, 'neu': 0.64, 'pos': 0.0, 'compound': -0.5216}
hey this is not too bad
{'neg': 0.0, 'neu': 0.637, 'pos': 0.363, 'compound': 0.431}


In [27]:
print(post)
sentiment = analyser.polarity_scores(post)
print(sentiment)

Also the program in GT starts in a year if I applied now Is there a way to start before that Is it difficult to get accepted into the program
{'neg': 0.073, 'neu': 0.784, 'pos': 0.143, 'compound': 0.347}


#### Extension

Twitter roBERTa

https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment

Multilingual Twitter roBERTa: 8 languages (Ar, En, Fr, De, Hi, It, Sp, Pt)

https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment


### Stemming/Lemming

In [28]:
# Stemming / Lemming

### loading a spacy language model
# python -m spacy download en_core_web_sm
# https://spacy.io/models/en

nlp = spacy.load('en_core_web_sm')

def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [29]:
print(post)
print('------\nstemmed post:')
print(simple_stemmer(post))
print('------\nlemmatized post:')
print(lemmatize_text(post))

Also the program in GT starts in a year if I applied now Is there a way to start before that Is it difficult to get accepted into the program
------
stemmed post:
also the program in gt start in a year if i appli now is there a way to start befor that is it difficult to get accept into the program
------
lemmatized post:
also the program in GT start in a year if I apply now be there a way to start before that be it difficult to get accept into the program


### Tokenizing and Corpus Creation

In [30]:
### Run this the first time
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [31]:
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [32]:
#Create corupus of all words
words_corpus = []
for elem in df.body:
    # remove stop words
    elem = remove_stopwords(elem)
    # lemmatize text
    elem = lemmatize_text(elem)
    words_corpus.append(elem.lower().split())
print(len(words_corpus))

dictionary = corpora.Dictionary(words_corpus)
print(len(dictionary))

1141
4855


In [33]:
dictionary.num_docs, dictionary.num_pos

(1141, 32087)

In [34]:
# get token-id mapping
dictionary.token2id

{'bad': 0,
 'think': 1,
 'time': 2,
 '3': 3,
 '400500': 4,
 'ago': 5,
 'application': 6,
 'atm': 7,
 'ats': 8,
 'back': 9,
 'bsc': 10,
 'call': 11,
 'case': 12,
 'get': 13,
 'hunt': 14,
 'i': 15,
 'issue': 16,
 'm': 17,
 'number': 18,
 'offer': 19,
 'pass': 20,
 'place': 21,
 'probably': 22,
 'put': 23,
 'resume': 24,
 'rework': 25,
 'scanner': 26,
 'serious': 27,
 'significant': 28,
 'start': 29,
 'tough': 30,
 've': 31,
 'week': 32,
 'well': 33,
 'yoe': 34,
 'analyst': 35,
 'code': 36,
 'computer': 37,
 'confidently': 38,
 'cron': 39,
 'datum': 40,
 'engineering': 41,
 'etc': 42,
 'fake': 43,
 'infrastructre': 44,
 'job': 45,
 'kafka': 46,
 'know': 47,
 'knowledge': 48,
 'kubernetes': 49,
 'natural': 50,
 'need': 51,
 'progression': 52,
 'pull': 53,
 'really': 54,
 'require': 55,
 'sas': 56,
 'say': 57,
 'science': 58,
 'spark': 59,
 'specialized': 60,
 'stream': 61,
 'thing': 62,
 'work': 63,
 '35': 64,
 '45': 65,
 'airflow': 66,
 'also': 67,
 'another': 68,
 'aws': 69,
 'check': 70

In [35]:
dictionary.most_common(10)

[('not', 486),
 ('datum', 363),
 ('i', 342),
 ('work', 339),
 ('get', 319),
 ('do', 306),
 ('use', 274),
 ('job', 252),
 ('like', 222),
 ('would', 214)]

In [36]:
# get bag-of-words repesentation of documents: list of (token_id, token_count) tuples
dictionary.doc2bow(words_corpus[0])

[(0, 1), (1, 1), (2, 1)]

## Topic Modeling

In [37]:
help(dictionary.filter_extremes)

Help on method filter_extremes in module gensim.corpora.dictionary:

filter_extremes(no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None) method of gensim.corpora.dictionary.Dictionary instance
    Filter out tokens in the dictionary by their frequency.
    
    Parameters
    ----------
    no_below : int, optional
        Keep tokens which are contained in at least `no_below` documents.
    no_above : float, optional
        Keep tokens which are contained in no more than `no_above` documents
        (fraction of total corpus size, not an absolute number).
    keep_n : int, optional
        Keep only the first `keep_n` most frequent tokens.
    keep_tokens : iterable of str
        Iterable of tokens that **must** stay in dictionary after filtering.
    
    Notes
    -----
    This removes all tokens in the dictionary that are:
    
    #. Less frequent than `no_below` documents (absolute number, e.g. `5`) or 
    
    #. More frequent than `no_above` documents (fraction of th

In [38]:
dictionary = corpora.Dictionary(words_corpus)
# dictionary.filter_extremes(no_below=10, no_above=0.2, keep_n=10000)

corpus_bow = [dictionary.doc2bow(text) for text in words_corpus]

# Term Frequency - Inverse Document Frequency

from gensim import corpora, models

# https://radimrehurek.com/gensim/models/ldamodel.html

tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

In [45]:
corpus_tfidf.corpus[100]

[(67, 1),
 (103, 1),
 (116, 2),
 (150, 1),
 (283, 1),
 (299, 1),
 (310, 1),
 (467, 1),
 (577, 1),
 (809, 2),
 (821, 1),
 (914, 1),
 (1226, 1),
 (1227, 1)]

In [40]:
# Only 5 topics this time for simplicity
num_topics = 5
lda_model_tfidf = models.LdaMulticore(corpus_tfidf, num_topics=num_topics, id2word=dictionary, passes=4, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))



Topic: 0 Word: 0.003*"use" + 0.002*"i" + 0.002*"not" + 0.002*"model" + 0.002*"datum" + 0.002*"]" + 0.002*"[" + 0.002*"m" + 0.002*"get" + 0.002*"would"
Topic: 1 Word: 0.003*"job" + 0.003*"not" + 0.003*"know" + 0.003*"datum" + 0.003*"]" + 0.003*"[" + 0.003*"use" + 0.003*"do" + 0.003*"work" + 0.002*"go"
Topic: 2 Word: 0.004*"thank" + 0.003*"datum" + 0.003*"job" + 0.003*"i" + 0.002*"think" + 0.002*"start" + 0.002*"try" + 0.002*"get" + 0.002*"science" + 0.002*"not"
Topic: 3 Word: 0.003*"not" + 0.003*"work" + 0.003*"experience" + 0.003*"do" + 0.003*"i" + 0.003*"year" + 0.002*"get" + 0.002*"job" + 0.002*"think" + 0.002*"s"
Topic: 4 Word: 0.003*"datum" + 0.002*"i" + 0.002*"like" + 0.002*"think" + 0.002*"get" + 0.002*"job" + 0.002*"work" + 0.002*"not" + 0.002*"would" + 0.002*"m"


## Parts of Speech Tagging

In [41]:
sentence = 'London is the capital and most populous city of England.'
# sentence = post
sentence_nlp = nlp(sentence)

In [42]:
from spacy import displacy
displacy.render(sentence_nlp, jupyter=True,
                options={'distance': 110,
                         'arrow_stroke': 2,
                         'arrow_width': 8})

In [43]:
# print named entities in article
print([(word, word.ent_type_) for word in sentence_nlp if word.ent_type_])

# visualize named entities
displacy.render(sentence_nlp, style='ent', jupyter=True)

[(London, 'GPE'), (England, 'GPE')]
