In [33]:
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from typing import Iterator


def replace_characters(text: str) -> str:
    """
    Replace tricky punctuations that can mess up sentence tokenizers
    :param text: text with non-standard punctuations
    :return: text with standardized punctuations
    """
    replacement_rules = {'“': '"', '”': '"', '’': "'", '--': ','}
    for symbol, replacement in replacement_rules.items():
        text = text.replace(symbol, replacement)
    return text


def generate_tokenized_sentences(paragraph: str) -> Iterator[str]:
    """
    Tokenize each sentence in paragraph.
    For each sentence, tokenize each words and return the tokenized sentence one at a time.
    :param paragraph: text of paragraph
    """
    word_tokenizer = RegexpTokenizer(r'[-\'\w]+')

    for sentence in sent_tokenize(paragraph):
        tokenized_sentence = word_tokenizer.tokenize(sentence)
        if tokenized_sentence:
            tokenized_sentence.append('[END]')
            yield tokenized_sentence


def tokenize_raw_text(raw_text_path: str, token_text_path: str) -> None:
    """
    Read a input text file and write its content to an output text file in the form of tokenized sentences
    :param raw_text_path: path of raw input text file
    :param token_text_path: path of tokenized output text file
    """
    with open(raw_text_path) as read_handle, open(token_text_path, 'w') as write_handle:
        for paragraph in read_handle:
            paragraph = paragraph.lower()
            paragraph = replace_characters(paragraph)

            for tokenized_sentence in generate_tokenized_sentences(paragraph):
                write_handle.write(','.join(tokenized_sentence))
                write_handle.write('\n')


def get_tokenized_sentences(file_name: str) -> Iterator[str]:
    """
    Return tokenized sentence one at a time from a tokenized text
    :param file_name: path of tokenized text
    """
    # with open(file_name) as file_handle:
    #     for sentence in file_handle.read().splitlines():
    #         tokenized_sentence = sentence.split(',')
    #         yield tokenized_sentence

    for sent in file_name:
        tokenized_sentence = sent.split(',')
        yield tokenized_sentence

In [8]:
from math import log2


class UnigramCounter:
    def __init__(self, file_name: str) -> None:
        """
        Initialize unigram counter from tokenized text and count number of unigrams in text
        :param file_name: path of tokenized text. Each line is a sentence with tokens separated by comma.
        """
        self.sentence_generator = get_tokenized_sentences(file_name)
        self.count()

    def count(self) -> None:
        """
        Count number of unigrams in text, one sentence at a time
        """
        self.sentence_count = 0
        self.token_count = 0
        self.counts = {}

        for sentence in self.sentence_generator:
            self.sentence_count += 1
            self.token_count += len(sentence)
            for unigram in sentence:
                self.counts[unigram] = self.counts.get(unigram, 0) + 1


class UnigramModel:
    def __init__(self, train_counter: UnigramCounter) -> None:
        """
        Initialize unigram model from unigram counter, count the number of unique unigrams (vocab)
        :param train_counter: counted unigram counter
        """
        self.counter = train_counter
        self.counts = train_counter.counts.copy()
        self.counts['[UNK]'] = 0
        self.vocab = set(self.counts.keys())
        self.vocab_size = len(self.vocab)

    def train(self, k: int = 1) -> None:
        """
        For each unigram in the vocab, calculate its probability in the text
        :param k: smoothing pseudo-count for each unigram
        """
        self.probs = {}
        for unigram, unigram_count in self.counts.items():
            prob_nom = unigram_count + k
            prob_denom = self.counter.token_count + k * self.vocab_size
            self.probs[unigram] = prob_nom / prob_denom

    def evaluate(self, evaluation_counter: UnigramCounter) -> float:
        """
        Calculate the average log likelihood of the model on the evaluation text
        :param evaluation_counter: unigram counter for the text on which the model is evaluated on
        :return: average log likelihood that the unigram model assigns to the evaluation text
        """
        test_log_likelihood = 0
        test_counts = evaluation_counter.counts

        for unigram, test_count in test_counts.items():
            if unigram not in self.vocab:
                unigram = '[UNK]'
            train_prob = self.probs[unigram]
            log_likelihood = test_count * log2(train_prob)
            test_log_likelihood += log_likelihood

        avg_test_log_likelihood = test_log_likelihood / evaluation_counter.token_count
        return avg_test_log_likelihood

In [24]:
para = "Even if CB is not extended, like what you said I doubt things will go back to normal so quickly. If anything, I hope that the government implements some kind of stage by stage lifting of the CB measures. God knows what's going to happen when govt lifts CB in one go. Can you imagine? Hi, so as long as it is cohabitation for the full one month? I'm wondering if measures will be put into place to catch people not residing in their home address? This is -- perfect!"

text1 = "Why is TCM included in the first wave of things to be allowed to open? Genuinely curious. See you guys after 29 days guys! Meanwhile, please stay safe, stay healthy &amp; STAY AT HOME! We shall see who gains the most weight(probably me heheh) after this circuit breaker thingy 🤪🤪 #ayuatwork #monéquipe @… https://t.co/9aJ9l45Rwn."

In [32]:
para_replaced = replace_characters(para)
txt = []

for tokenized_sentence in generate_tokenized_sentences(para_replaced):
    sent = ','.join(tokenized_sentence)
    txt.append(sent)
print(txt)

['Even,if,CB,is,not,extended,like,what,you,said,I,doubt,things,will,go,back,to,normal,so,quickly,[END]', 'If,anything,I,hope,that,the,government,implements,some,kind,of,stage,by,stage,lifting,of,the,CB,measures,[END]', "God,knows,what's,going,to,happen,when,govt,lifts,CB,in,one,go,[END]", 'Can,you,imagine,[END]', 'Hi,so,as,long,as,it,is,cohabitation,for,the,full,one,month,[END]', "I'm,wondering,if,measures,will,be,put,into,place,to,catch,people,not,residing,in,their,home,address,[END]", 'This,is,perfect,[END]']


In [34]:
train_counter = UnigramCounter(txt)

In [35]:
print(train_counter.counts)

{'Even': 1, 'if': 2, 'CB': 3, 'is': 3, 'not': 2, 'extended': 1, 'like': 1, 'what': 1, 'you': 2, 'said': 1, 'I': 2, 'doubt': 1, 'things': 1, 'will': 2, 'go': 2, 'back': 1, 'to': 3, 'normal': 1, 'so': 2, 'quickly': 1, '[END]': 7, 'If': 1, 'anything': 1, 'hope': 1, 'that': 1, 'the': 3, 'government': 1, 'implements': 1, 'some': 1, 'kind': 1, 'of': 2, 'stage': 2, 'by': 1, 'lifting': 1, 'measures': 2, 'God': 1, 'knows': 1, "what's": 1, 'going': 1, 'happen': 1, 'when': 1, 'govt': 1, 'lifts': 1, 'in': 2, 'one': 2, 'Can': 1, 'imagine': 1, 'Hi': 1, 'as': 2, 'long': 1, 'it': 1, 'cohabitation': 1, 'for': 1, 'full': 1, 'month': 1, "I'm": 1, 'wondering': 1, 'be': 1, 'put': 1, 'into': 1, 'place': 1, 'catch': 1, 'people': 1, 'residing': 1, 'their': 1, 'home': 1, 'address': 1, 'This': 1, 'perfect': 1}


In [36]:
train_model = UnigramModel(train_counter)
train_model.train(k=1)
print(train_model.probs)

{'Even': 0.012048192771084338, 'if': 0.018072289156626505, 'CB': 0.024096385542168676, 'is': 0.024096385542168676, 'not': 0.018072289156626505, 'extended': 0.012048192771084338, 'like': 0.012048192771084338, 'what': 0.012048192771084338, 'you': 0.018072289156626505, 'said': 0.012048192771084338, 'I': 0.018072289156626505, 'doubt': 0.012048192771084338, 'things': 0.012048192771084338, 'will': 0.018072289156626505, 'go': 0.018072289156626505, 'back': 0.012048192771084338, 'to': 0.024096385542168676, 'normal': 0.012048192771084338, 'so': 0.018072289156626505, 'quickly': 0.012048192771084338, '[END]': 0.04819277108433735, 'If': 0.012048192771084338, 'anything': 0.012048192771084338, 'hope': 0.012048192771084338, 'that': 0.012048192771084338, 'the': 0.024096385542168676, 'government': 0.012048192771084338, 'implements': 0.012048192771084338, 'some': 0.012048192771084338, 'kind': 0.012048192771084338, 'of': 0.018072289156626505, 'stage': 0.018072289156626505, 'by': 0.012048192771084338, 'lif

In [37]:
text1_replaced = replace_characters(text1)
txt1 = ''

for tokenized_sentence in generate_tokenized_sentences(text1_replaced):
    sent = ','.join(tokenized_sentence)
    txt1 += sent

test1_counter = UnigramCounter(txt1)

In [38]:
text1_avg_log_likelihood = train_model.evaluate(test1_counter)

In [39]:
text1_avg_log_likelihood

-7.375039431346924

# News corpus unigram model

In [1]:
import pandas as pd
import numpy as np

In [66]:
df = pd.read_csv("/Users/chenjianyu/Downloads/News Article/articles1.csv")

In [48]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [67]:
df = df[['content','id', 'publication']]

In [68]:
corpus = []

for i, row in df.iterrows():
    article = row['content']

    article_replaced = replace_characters(article)

    for tokenized_sentence in generate_tokenized_sentences(article_replaced):
        s = ','.join(tokenized_sentence)
        corpus.append(s)

In [71]:
train_counter = UnigramCounter(corpus)

train_model = UnigramModel(train_counter)
train_model.train(k=1)

In [75]:
train_model.vocab_size

243313

In [3]:
from thoughtfulness import *

news_unigram = news_articles_unigram("./Data/News Article/articles1.csv")

In [4]:
news_unigram.vocab_size

243313

# Comment length

In [5]:
comments = pd.read_excel("./Data/Thoughtful Comments/Thoughtful Comments Labeled.xlsx", engine='openpyxl')

In [24]:
import re
def remove_emoji(text):
    emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE)

    return emoji_pattern.sub(r'', text)

def remove_hashtag_mentions_urls(text):
    return re.sub(r"(?:\@|\#|https?\://)\S+", "", text)


# Thoughtful comment feature 1
def get_sentence_length(text: str) -> int:
    text = replace_characters(text)
    text = remove_hashtag_mentions_urls(text)
    text = remove_emoji(text)

    word_tokenizer = RegexpTokenizer(r'[-\'\w]+')
    tokenized_text = word_tokenizer.tokenize(text)

    return len(tokenized_text)

In [25]:
comments_len = []
for i, row in comments.iterrows():
    comment = row['Comment']

    comment_len = get_sentence_length(comment)

    comments_len.append(comment_len)

In [27]:
comments['length'] = comments_len

In [30]:
comments.to_csv("./Data/Thoughtful Comments/Thoughtful Comments Features.csv")

# Comment loglikelihood

In [31]:
comments = pd.read_csv("./Data/Thoughtful Comments/Thoughtful Comments Features.csv")

In [36]:
comments.drop(comments[comments.length == 0].index, inplace=True)

In [37]:
comments_likelihood = []
for i, row in comments.iterrows():
    comment = row['Comment']
    
    comment_avg_loglikelihood = comment_likelihood(comment, news_unigram)

    comments_likelihood.append(comment_avg_loglikelihood)

In [38]:
comments['Avg_loglikelihood'] = comments_likelihood

In [39]:
comments.head()

Unnamed: 0.1,Unnamed: 0,Comment,Thoughtful?,length,Avg_loglikelihood
0,0,Herd immunity kicks in at around 70% so we don...,1,65,-18.838243
1,1,straight up get struck by lightning better.\n ...,0,7,-20.329141
2,2,The guide to getting the COVID-19 vaccine out ...,0,11,-17.852038
3,3,Other countries sud learn from Singapore,0,6,-18.359116
4,4,Catholic priest developing COVID-19 vaccine fo...,0,8,-18.318246


In [40]:
comments.rename(columns={'length':'Length'})

Unnamed: 0.1,Unnamed: 0,Comment,Thoughtful?,Length,Avg_loglikelihood
0,0,Herd immunity kicks in at around 70% so we don...,1,65,-18.838243
1,1,straight up get struck by lightning better.\n ...,0,7,-20.329141
2,2,The guide to getting the COVID-19 vaccine out ...,0,11,-17.852038
3,3,Other countries sud learn from Singapore,0,6,-18.359116
4,4,Catholic priest developing COVID-19 vaccine fo...,0,8,-18.318246
...,...,...,...,...,...
1995,1995,Day 11 of circuit breaker period. #safedistanc...,0,18,-18.061384
1996,1996,Day 6 of circuit breaker: I have an air freshe...,0,24,-18.815028
1997,1997,"FYI, Starbucks is opened, I guess CBTL is too....",0,26,-18.604685
1998,1998,I'm expecting to see dormitory cases skyrocket...,0,62,-18.520687


In [41]:
comments.to_csv("./Data/Thoughtful Comments/Thoughtful Comments Features.csv")

# Number of verbs

In [45]:
import nltk
from nltk.tokenize import word_tokenize

In [46]:
comments = pd.read_csv("./Data/Thoughtful Comments/Thoughtful Comments Features.csv")

In [47]:
verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

In [53]:
num_verbs = []
for i, row in comments.iterrows():
    text = row['Comment']

    text = replace_characters(text)
    text = remove_hashtag_mentions_urls(text)
    text = remove_emoji(text)

    word_tokenizer = RegexpTokenizer(r'[-\'\w]+')
    tokenized_text = word_tokenizer.tokenize(text)

    comment_tags = nltk.pos_tag(tokenized_text)

    count = 0
    for tag in comment_tags:
        if tag[1] in verb_tags:
            count += 1
    
    num_verbs.append(count)

In [58]:
comments['Num verbs'] = num_verbs

In [62]:
comments.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], inplace=True)

In [63]:
comments.to_csv("./Data/Thoughtful Comments/Thoughtful Comments Features.csv")

# Number of discourse relations

In [78]:
x = 'although, as though, but, by comparison, even if, even though, however, nevertheless, on the other hand, still, then, though, while, yet, and, meanwhile, in turn, next, ultimately, meantime, also, as if, even as, even still, even then, regardless, when, by contrast, conversely, if, in contrast, instead, nor, or, rather, whereas, while, yet, even after, by contrast, nevertheless, besides, much as, as much as, whereas, neither, nonetheless, even when, on the one hand indeed, finally, in fact, separately, in the end, on the contrary, while, accordingly, additionally, after, also although, and, as, as it, as if besides, but, by comparison, finally, first, for example, for one thing, however, in addition, in fact, in other words, in particular, in response, in sum, in the end, in turn, incidentally, indeed, instead, likewise, meanwhile, nevertheless, on the one hand, on the whole, overall, plus, separately, much as, whereas, ultimately, as though, rather, at the same time, or, then, if, in turn, furthermore, in short, turns out, while, yet, that is, so, what’s more as a matter of fact, further, in return, moreover, similarly, specifically, and, when, typically, as long as, especially if, even if, even when, if, so, when if only, lest,once, only if, only when, particularly if, at least partly because, especially as, especially because, especially since, in large part because, just because, largely because, merely because, not because, not only because, particularly as, particularly because, particularly since, partly because, because, simply because, since, then, after, one day after, reportedly after, consequently, mainly because, for, thus, apparently, in the end, in turn, primarily because, largely as a result, as, because, therefore, only because, particularly, when, so that, thereby, presumably, hence, as a result, if and when, unless, until, in part because, now that, perhaps because, only after, accordingly'

x = x.split(', ')
x = list(set(x))

In [92]:
for i, ele in enumerate(x):
    x[i] = ' ' + ele + ' '

In [98]:
x.sort(key = lambda x: len(x), reverse=True)
x

[' what’s more as a matter of fact ',
 ' at least partly because ',
 ' on the one hand indeed ',
 ' in large part because ',
 ' particularly because ',
 ' largely as a result ',
 ' particularly since ',
 ' especially because ',
 ' on the other hand ',
 ' primarily because ',
 ' at the same time ',
 ' especially since ',
 ' not only because ',
 ' reportedly after ',
 ' on the contrary ',
 ' in part because ',
 ' largely because ',
 ' particularly as ',
 ' perhaps because ',
 ' particularly if ',
 ' on the one hand ',
 ' simply because ',
 ' merely because ',
 ' in other words ',
 ' partly because ',
 ' mainly because ',
 ' in particular ',
 ' for one thing ',
 ' as if besides ',
 ' one day after ',
 ' especially as ',
 ' by comparison ',
 ' especially if ',
 ' also although ',
 ' consequently ',
 ' on the whole ',
 ' when if only ',
 ' particularly ',
 ' incidentally ',
 ' just because ',
 ' only because ',
 ' nevertheless ',
 ' specifically ',
 ' additionally ',
 ' not because ',
 ' fo

In [66]:
comments = pd.read_csv("./Data/Thoughtful Comments/Thoughtful Comments Features.csv")

In [106]:
num_discourse = []
for i, row in comments.iterrows():
    text = row['Comment']

    text = remove_hashtag_mentions_urls(text)
    text = remove_emoji(text)
    text = replace_characters(text)

    count = 0
    for sentence in sent_tokenize(text):
        word_tokenizer = RegexpTokenizer(r'[-\'\w]+')
        tokenized_text = word_tokenizer.tokenize(sentence)
        tokenized_text = [w.lower() for w in tokenized_text]

        text_final = " ".join(tokenized_text)

        for ele in x:
            if ele in text_final:
                count += 1
        
    num_discourse.append(count)

In [108]:
comments['Nume Discourse Relations'] = num_discourse

In [110]:
comments.to_csv("./Data/Thoughtful Comments/Thoughtful Comments Features.csv")