In [1]:
import nltk as nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [3]:
# Sample text and showing sentences
# https://twitter.com/MediaSequitur/status/1513815410701127680

textsample = """'RT @NeedsMother: HAPPENING👇AGAIN RIGHT NOW\nALLL BASED OFF BLM LIES TAKING OUR BEAUTIFUL PRECIOUS MUSTANGS AWAY\nTAKING OUR SWEET BURROS TOO…'"""

from nltk.tokenize import TweetTokenizer, sent_tokenize

tokenizer_words = TweetTokenizer()
tokens_sentences = [tokenizer_words.tokenize(t) for t in 
nltk.sent_tokenize(textsample)]
print(tokens_sentences)

[["'", 'RT', '@NeedsMother', ':', 'HAPPENING', '👇', 'AGAIN', 'RIGHT', 'NOW', 'ALLL', 'BASED', 'OFF', 'BLM', 'LIES', 'TAKING', 'OUR', 'BEAUTIFUL', 'PRECIOUS', 'MUSTANGS', 'AWAY', 'TAKING', 'OUR', 'SWEET', 'BURROS', 'TOO', '…', "'"]]


In [4]:
# Getting relevant words
words_in_tweet = TweetTokenizer().tokenize(textsample)

stop_words = set(stopwords.words("english"))

filtered_list = [word for word in words_in_tweet if word.casefold() not in stop_words]

In [5]:
# POS
nltk.pos_tag(filtered_list)

[("'", 'POS'),
 ('RT', 'NNP'),
 ('@NeedsMother', 'NN'),
 (':', ':'),
 ('HAPPENING', 'NN'),
 ('👇', 'NN'),
 ('RIGHT', 'NNP'),
 ('ALLL', 'NNP'),
 ('BASED', 'NNP'),
 ('BLM', 'NNP'),
 ('LIES', 'NNP'),
 ('TAKING', 'NNP'),
 ('BEAUTIFUL', 'NNP'),
 ('PRECIOUS', 'NNP'),
 ('MUSTANGS', 'NNP'),
 ('AWAY', 'NNP'),
 ('TAKING', 'NNP'),
 ('SWEET', 'NNP'),
 ('BURROS', 'NNP'),
 ('…', 'NNP'),
 ("'", 'POS')]

In [6]:
# Lemmatizing for sentiment
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_list]

lemmatized_words

["'",
 'RT',
 '@NeedsMother',
 ':',
 'HAPPENING',
 '👇',
 'RIGHT',
 'ALLL',
 'BASED',
 'BLM',
 'LIES',
 'TAKING',
 'BEAUTIFUL',
 'PRECIOUS',
 'MUSTANGS',
 'AWAY',
 'TAKING',
 'SWEET',
 'BURROS',
 '…',
 "'"]

In [26]:
# May want to look into concordance and dispersion/frequency plots

In [9]:
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
sia.polarity_scores(textsample)

{'neg': 0.106, 'neu': 0.51, 'pos': 0.384, 'compound': 0.8825}

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Jacob\AppData\Roaming\nltk_data...


True