In [4]:
import re
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer

In [2]:
with open('./reddit_sarcasm.txt') as reddit_file:
  content = reddit_file.read()
with open('./stop_words.txt') as stopword_file:
  stop_words = stopword_file.readlines()

raw_tokens = [ele.lower() for ele in word_tokenize(content)]
stop_words = set([word[:-1] for word in stop_words])

In [3]:
raw_dict = dict(Counter(raw_tokens))
no_punc_dict = raw_dict.copy()
no_stop_dict = raw_dict.copy()
no_stop_punc_dict = raw_dict.copy()
n_once = 0

for word, count in raw_dict.items():

  if count == 1: n_once += 1

  if re.match('\w+', word) == None:
    del no_punc_dict[word]
    del no_stop_punc_dict[word]

  if word in stop_words:
    del no_stop_dict[word]
    del no_stop_punc_dict[word]

n_tokens_type = len(raw_dict)
n_tokens_type_no_punc = len(no_punc_dict)
n_tokens_type_no_stop = len(no_stop_dict)
n_tokens_type_no_punc_stop = len(no_stop_punc_dict)
n_tokens = sum(count for count in raw_dict.values())
n_tokens_no_stop = sum(count for count in no_stop_dict.values())
n_tokens_no_punc = sum(count for count in no_punc_dict.values())
n_tokens_no_stop_punc = sum(count for count in no_stop_punc_dict.values())
raw_sorted = dict(sorted(raw_dict.items(), key=lambda x: x[1], reverse=True))
no_punc_sorted = dict(sorted(no_punc_dict.items(), key=lambda x: x[1], reverse=True))
no_stop_sorted = dict(sorted(no_stop_dict.items(), key=lambda x: x[1], reverse=True))
no_punc_stop_sorted = dict(sorted(no_stop_punc_dict.items(), key=lambda x: x[1], reverse=True))

print(f'# of tokens: { n_tokens }')
print(f'# of types: { n_tokens_type }')
print(f'type/token ratio: { n_tokens_type / n_tokens }')
print(f'# of words appearing once: {n_once}')
print(f'# of words (excluding punctuation): { n_tokens_no_punc }')
print(f'type/token ratio (excluding punctuation): { n_tokens_type_no_punc / n_tokens_no_punc }')
print(f'Top 3 most frequent words and their frequencies: { list(raw_sorted.items())[:3] }')
print(f'type/token ratio (excluding punctuation and stopwords): { n_tokens_type_no_punc_stop/ n_tokens_no_stop_punc }')
print(f'Top 3 most frequent words and their frequencies (excluding stopwords): { list(no_punc_stop_sorted.items())[:3] }')

# of tokens: 2915727
# of types: 83226
type/token ratio: 0.028543824576169167
# of words appearing once: 44380
# of words (excluding punctuation): 2431544
type/token ratio (excluding punctuation): 0.033083917050236396
Top 3 most frequent words and their frequencies: [('.', 122712), ('the', 92876), ("''", 85048)]
type/token ratio (excluding punctuation and stopwords): 0.07924873707884822
Top 3 most frequent words and their frequencies (excluding stopwords): [("n't", 22721), ('people', 8276), ('think', 5718)]


In [14]:
with open('./reddit_sarcasm.txt') as reddit_file:
  sentences = reddit_file.readlines()
no_punc_tokenizer = RegexpTokenizer(r'\w+')
bigrams = []
for sentence in sentences:
    filtered_sent = []
    words = no_punc_tokenizer.tokenize(sentence)
    words = [ele.lower() for ele in words]
    for word in words:
        if word not in stop_words:
            filtered_sent.append(word)
    bigrams += list(zip(words, words[1:]))
bigram_counts = dict(Counter(bigrams))
bigram_counts_sorted = dict(sorted(bigram_counts.items(), key=lambda x:x[1], reverse=True))
print(f'Top 3 most frequent bigrams and their frequencies: {list(bigram_counts_sorted.items())[:3]}')

[(('it', 's'), 8979), (('don', 't'), 7518), (('in', 'the'), 7386)]
