In [2]:
import os

from convokit import Corpus
import pandas as pd

In [3]:
# Load the ConvoKit corpus stored at 'long-reddit-corpus'; the path is relative
# to the notebook's working directory.
corpus = Corpus(filename='long-reddit-corpus')

In [4]:
from convokit import TextCleaner

In [7]:
from cleantext import clean

In [8]:
def clean_str(s):
    """Normalize one piece of utterance text with ``cleantext.clean``.

    The text is unicode-fixed, transliterated to ASCII, lowercased and has its
    line breaks stripped. URLs, e-mail addresses, phone numbers and currency
    symbols are replaced by placeholder tokens; numbers, digits and punctuation
    are left as they are.

    Args:
        s: The raw text to clean.

    Returns:
        Whatever ``clean`` returns for ``s`` under the settings below (the
        cleaned text).
    """
    return clean(
        s,
        fix_unicode=True,          # fix various unicode errors
        to_ascii=True,             # transliterate to closest ASCII representation
        lower=True,                # lowercase text
        no_line_breaks=True,       # fully strip line breaks as opposed to only normalizing them
        no_urls=True,              # replace all URLs with a special token
        no_emails=True,            # replace all email addresses with a special token
        no_phone_numbers=True,     # replace all phone numbers with a special token
        no_numbers=False,          # keep numbers (True would replace them with a special token)
        no_digits=False,           # keep digits (True would replace them with a special token)
        no_currency_symbols=True,  # replace all currency symbols with a special token
        no_punct=False,            # keep punctuation (True would remove it entirely)
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",  # unused while no_numbers=False
        replace_with_digit="0",          # unused while no_digits=False
        replace_with_currency_symbol="<CUR>",
        lang="en",
    )

In [9]:
# Wrap clean_str in a ConvoKit TextCleaner transformer. With verbosity=100000 a
# progress line is printed every 100,000 utterances (see the transform output).
tc = TextCleaner(text_cleaner=clean_str, verbosity=100000)

In [10]:
# Clean every utterance in the corpus. The spot-check cells that follow show
# utt.text holding the cleaned string and utt.meta['original'] holding the raw
# one, so `corpus` itself is modified by this call.
tc.transform(corpus)

100000/1085877 utterances processed
200000/1085877 utterances processed
300000/1085877 utterances processed
400000/1085877 utterances processed
500000/1085877 utterances processed
600000/1085877 utterances processed
700000/1085877 utterances processed
800000/1085877 utterances processed
900000/1085877 utterances processed
1000000/1085877 utterances processed
1085877/1085877 utterances processed


<convokit.model.corpus.Corpus at 0x1284fa990>

In [11]:
# Pull one utterance at random to spot-check the cleaning; the pick (and so the
# outputs of the next two cells) will generally differ from run to run.
utt = corpus.random_utterance()

In [12]:
# Text of the sampled utterance after cleaning.
utt.text

"if you haven't listened to breaking all illusions off of adtoe you should check it out. one of the best songs they've ever released i agree that there's a lot of 'filler' from their recent albums though"

In [13]:
# Raw text of the same utterance as kept under meta['original'], for comparison
# with the cleaned utt.text.
utt.meta['original']

"if you haven't listened to breaking all illusions off of ADTOE you should check it out. one of the best songs they've ever released\n\ni agree that there's a lot of 'filler' from their recent albums though"

In [19]:
# Cap on how many utterances (earliest first) are exported per conversation.
MAX_UTTS_PER_CONVO = 20

# Parallel lists: utt_ids[i] is the id of the utterance whose text is utt_texts[i].
utt_ids = []
utt_texts = []

for convo in corpus.iter_conversations():
    # Only the chronologically first MAX_UTTS_PER_CONVO utterances are kept.
    # The loop variable is deliberately not called `utt`, so the sampled `utt`
    # inspected earlier in the notebook is not silently overwritten.
    for convo_utt in convo.get_chronological_utterance_list()[:MAX_UTTS_PER_CONVO]:
        utt_ids.append(convo_utt.id)
        utt_texts.append(convo_utt.text)

In [20]:
# Sanity check: total number of utterances collected for export.
len(utt_ids)

589740

In [21]:
# Build the export table in one step: one row per utterance, with named
# columns 'id' and 'text'. Constructing from a dict avoids creating a 2-row,
# N-column frame, transposing it and then renaming/deleting columns, and it
# keeps this cell safe to re-run (the old `del df[0]` failed on a second run).
df = pd.DataFrame({'id': utt_ids, 'text': utt_texts})

In [24]:
# Write the table to utts.xlsx in the working directory. to_excel also writes
# the row index as the first column by default.
# NOTE(review): later cells refer to 'Source (A)/(B)/(C)' columns, presumably
# LIWC's names for the three columns exported here - confirm before changing
# this layout (e.g. before passing index=False).
df.to_excel('utts.xlsx')

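After LIWC processing: `utts.xlsx` was run through LIWC outside this notebook, and the cells below load the resulting `utts_liwc.xlsx`.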

In [26]:
# Load the LIWC output and reduce it to the feature columns, indexed by
# utterance id. Done as one chain so the cell is idempotent: the previous
# in-place `del df[...]` steps raised KeyError when re-run.
#   'Source (B)' - holds the utterance ids (used below with corpus.get_utterance)
#   'Source (A)', 'Source (C)' - presumably the exported row index and the
#       utterance text; not LIWC features, so dropped. TODO confirm.
df = (
    pd.read_excel('utts_liwc.xlsx')
    .set_index('Source (B)')
    .drop(columns=['Source (A)', 'Source (C)'])
)

In [61]:
# Store each row of LIWC scores on its utterance under meta['liwc'].
for r in df.iterrows():
    # iterrows() yields (index label, row Series); the index label is the
    # utterance id.
    utt_id, liwc_row = r
    corpus.get_utterance(utt_id).meta['liwc'] = liwc_row.to_dict()

In [66]:
# Save the LIWC-annotated corpus under the name 'long-reddit-corpus-liwc',
# with the current directory as base path.
corpus.dump('long-reddit-corpus-liwc', base_path="./")

In [65]:
# List the working directory to see which input/output files are present.
os.listdir()

['.DS_Store',
 'utts.xlsx',
 'liwc_annotation.ipynb',
 'long-reddit-corpus',
 'quick_stats.ipynb',
 '.ipynb_checkpoints',
 'utts_liwc.xlsx',
 '~$utts_liwc.xlsx']

In [58]:
# Spot-check: show the text of the utterance with id 'dnxhx8v'.
corpus.get_utterance('dnxhx8v').text

'do yourself a favor and make it the last time you step into "any" church. great story though, goes to show the insincerity and true nature of these so called church leaders.'

In [56]:
# Index label of the (label, row) pair most recently bound to `r` by a
# `for r in df.iterrows()` loop.
# NOTE(review): relies on `r` still being set in the kernel from such a loop.
r[0]

'dnxhx8v'

In [44]:
# Peek at the first (index label, row Series) pair only; restricting the frame
# to its first row replaces the explicit `break`.
for r in df.head(1).iterrows():
    print(r)

('dnxhx8v', WC              32.00
Analytic        64.83
Clout           96.95
Authentic       35.37
Tone            99.00
WPS             16.00
Sixltr           9.38
Dic             96.88
function        46.88
pronoun         12.50
ppron            6.25
i                0.00
we               0.00
you              6.25
shehe            0.00
they             0.00
ipron            6.25
article          9.38
prep             9.38
auxverb          3.12
adverb           6.25
conj            12.50
negate           0.00
verb            12.50
adj              6.25
compare          0.00
interrog         0.00
number           0.00
quant            3.12
affect          12.50
                ...  
focuspresent     9.38
focusfuture      0.00
relativ         15.62
motion           6.25
space            3.12
time             6.25
work             3.12
leisure          0.00
home             0.00
money            0.00
relig            6.25
death            0.00
informal         0.00
swear            0.0