In [None]:
from processing.database import Database
import pandas as pd
import matplotlib.pyplot as plt

# Let pandas show up to 100 rows and 100 columns before truncating its output.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

In [None]:
# Load the whole `comments` table into `cmt` through the project's Database helper.
# The cells below use `cmt` as a pandas DataFrame (describe(), info(), boolean masks).
# NOTE(review): the path is relative, so this presumably expects the notebook to be
# run from its own directory — confirm.
db = Database('../data/database.db')
cmt = db.query('SELECT * FROM comments')

In [None]:
# Column overview followed by the first rows.
# DataFrame.info() writes its report to stdout itself and returns None, so it must
# not be wrapped in print() (that only appends a stray "None" line).
cmt.info(verbose = True)
print(cmt.head())

In [None]:
# One row per described column, then keep only the columns whose standard deviation
# is exactly 0: they hold a single value everywhere and are likely useless.
out = cmt.describe().T
out.loc[out['std'] == 0].T

In [None]:
# Summary statistics and a 100-bin histogram of the per-comment sentence count.
sentence_counts = cmt['num_sentences']
print(sentence_counts.describe())
sentence_counts.hist(bins = 100)

In [None]:
# Fraction of comments (with a known sentence count) that have at least 3 sentences.
(cmt['num_sentences'] >= 3).sum() / cmt['num_sentences'].count()

In [None]:
# Comment(s) with the lowest score; print the rows and the first one's permalink.
lowest_score = cmt['score'].min()
v = cmt.loc[cmt['score'] == lowest_score]
print(v)
print(v['permalink'].iloc[0])

In [None]:
# Permalink of the first comment that has the maximum controversiality value.
cmt.loc[cmt['controversiality'] == cmt['controversiality'].max(), 'permalink'].iloc[0]

In [None]:
# Comment(s) with the largest sentence count; show the first one's permalink.
v = cmt.loc[cmt['num_sentences'] == cmt['num_sentences'].max()]
v['permalink'].iloc[0]

In [None]:
# Frequency of each `distinguished` value: overall, then with AutoModerator excluded.
distinguished = cmt['distinguished']
print(distinguished.value_counts())
print(distinguished[cmt['author'] != 'AutoModerator'].value_counts())

# Rows whose `distinguished` value equals 2.
print(cmt.loc[distinguished == 2])

In [None]:
import stanza
import simpletransformers.classification as cl
import re

# NOTE(review): this pipeline is not used by any cell visible here; it is kept in case
# another cell relies on `p` — confirm and drop it if it is dead.
p = stanza.Pipeline(lang="en", processors="tokenize")

# A sentence ends at '.', '?' or '!' followed by whitespace or the end of the text, or
# at the last non-space character of a line. The capturing group makes re.split()
# return the terminators too, so punctuation is kept and re-attached to its sentence;
# only the surrounding whitespace is dropped. Compiled once, outside the loop.
sentenceDelimiter = re.compile(r'((?:\.|\?|!|\S$)(?:\s|$))', flags = re.MULTILINE)


def _split_sentences(body):
    """Split one comment body into a list of sentences.

    re.split() with a capturing group yields text, terminator, text, terminator, ...
    and always ends with a text piece. Each text piece is glued back to the
    terminator that follows it.

    Args:
        body: the comment text (str).

    Returns:
        A list of sentence strings, each with its terminator attached. The list is
        empty for an empty or whitespace-only body.
    """
    parts = [x.strip() for x in sentenceDelimiter.split(body)]

    # The final text piece is empty when the body ends in a terminator.
    if not parts[-1]:
        del parts[-1]

    # A non-empty final piece has no terminator. This happens when the last non-space
    # character is followed by a space, tab or '\r'. Keep it as a last sentence with
    # an empty terminator instead of aborting the whole run.
    if len(parts) % 2 == 1:
        parts.append('')

    return [parts[k] + parts[k + 1] for k in range(0, len(parts), 2)]


count = len(cmt['body'])

# Fetch the two columns once instead of on every iteration.
comment_sentence_counts = cmt['num_sentences']
comment_bodies = cmt['body']

buf_docid = []
buf_id = []
buf_text = []

for n in range(count):
    if n % 10_000 == 0:
        print(f'\rCompletion = {n / count}', end='')

    # Only comments recorded as having at least 3 sentences are split.
    if comment_sentence_counts[n] < 3:
        continue

    sentences = _split_sentences(comment_bodies[n])

    # doc_id is the 1-based row label of the comment in `cmt`;
    # id is the 1-based position of the sentence inside that comment.
    buf_docid.extend([n + 1] * len(sentences))
    buf_id.extend(range(1, len(sentences) + 1))
    buf_text.extend(sentences)

sentences_final = pd.DataFrame({
    'doc_id': buf_docid,
    'id': buf_id,
    'text': buf_text
})

# Release the buffers now that the DataFrame holds the data.
buf_docid = []
buf_id = []
buf_text = []



In [None]:
# Wrangle extracted sentences into sentence trigrams
u_docs = sentences_final["doc_id"].unique()

cp = sentences_final['doc_id']
tt = sentences_final['text']
jj = len(cp)
count = len(u_docs)

buf_tri, buf_docid, buf_id = [], [], []

# `j` is a cursor into the sentence rows and only ever moves forward, so this relies
# on all rows of one document sitting next to each other.
j = 0
for n, doc_id in enumerate(u_docs):
    if n % 10_000 == 0:
        print(f'\rCompletion = {n / count}', end='')

    # Collect the consecutive rows that belong to this document.
    text = []
    while j < jj and cp[j] == doc_id:
        text.append(tt[j])
        j += 1

    if len(text) >= 3:
        # sliding window over three consecutive sentences
        sentence_trigram = [' '.join(text[i:i + 3]) for i in range(len(text) - 2)]
    else:
        # too short for a window: keep all available sentences as a single unit
        sentence_trigram = [' '.join(text)]

    # one output row per trigram, numbered from 1 within the document
    buf_tri.extend(sentence_trigram)
    buf_id.extend(range(1, len(sentence_trigram) + 1))
    buf_docid.extend([doc_id] * len(sentence_trigram))

# assemble the collected columns into the final DataFrame
final = pd.DataFrame({
    'sent_trigram': buf_tri,
    'id': buf_id,
    'doc_id': buf_docid,
})

# release the buffers now that the DataFrame holds the data
buf_tri = []
buf_docid = []
buf_id = []

In [None]:
# Size of the prepared data set: documents that produced sentences, and trigram rows.
# The inner subscript uses the other quote type: reusing the enclosing quote inside
# an f-string is a SyntaxError on Python versions before 3.12.
print(f"Number of comments: {len(u_docs)}")
print(f'Number of trigrams: {len(final["sent_trigram"])}')

In [None]:
# Load authdetect model and predict (don't use multiprocessing for larger text since it causes stalling)
# NOTE(review): the remark above advises against multiprocessing for large inputs, yet
# "use_multiprocessing_for_evaluation" is enabled below — confirm which one is intended.
model = cl.ClassificationModel("roberta",
                               "mmochtak/authdetect",
                               args={"use_multiprocessing_for_evaluation": True,},
                               use_cuda=False
                               )

# Annotate the prepared trigrams with the authdetect model.
prediction = model.predict(to_predict = final["sent_trigram"].tolist())

# Use the second element of the result when predict() returns a tuple (Simple
# Transformers documents it as (predictions, raw_outputs)); otherwise use the result
# as is. This guard has to run before the values are attached to the frame.
scores = prediction[1] if isinstance(prediction, tuple) and len(prediction) >= 2 else prediction

anno_df = final.assign(predict = scores)

# One row per document:
#   demo        - mean prediction over the document's trigrams
#   auth        - 1 - demo
#   auth_sent   - share of trigrams with a prediction <= 0.5
#   num_sent    - number of trigram rows of the document
#   predict_std - population standard deviation (ddof=0) of the predictions
anno_df_speech = anno_df.groupby('doc_id').agg(
    demo=('predict', 'mean'),
    auth=('predict', lambda x: 1 - x.mean()),
    auth_sent=('predict', lambda x: (x <= 0.5).mean()),
    num_sent=('predict', 'size'),
    predict_std=('predict', lambda x: x.std(ddof=0)),
).reset_index()

In [None]:
# 100-bin histogram of the per-document `auth` value.
anno_df_speech['auth'].hist(bins=100)

In [None]:
# Attach the original comment id to every aggregated document. doc_id is 1-based,
# so doc_id - 1 is the row label of the comment in `cmt`.
row_labels = (anno_df_speech['doc_id'].astype(int) - 1).tolist()
dd = cmt.loc[row_labels, 'id'].tolist()

anno_df_speech = anno_df_speech.assign(cmt_id = dd)


In [None]:
import sqlite3

# Target database file -> DataFrame to store in its `main` table.
out = {
    'anno_df.db': anno_df,
    'anno_df_speech.db': anno_df_speech
}

for file, data in out.items():
    # Truncate any existing file so the export starts from an empty database
    # (to_sql would otherwise fail because the `main` table already exists).
    with open(file, 'w'):
        pass

    con = sqlite3.connect(file)
    try:
        # The DataFrame index is written too; it shows up as an `index` column on read.
        data.to_sql('main', con)
    finally:
        # Always release the file handle, even when the write fails.
        con.close()

In [4]:
import sqlite3
import pandas as pd
from processing.database import Database

# Reload the per-document results written by the export cell, so the analysis below
# can run without repeating the model prediction.
con = sqlite3.connect('anno_df_speech.db')
try:
    anno_df_speech = pd.read_sql('SELECT * FROM main;', con)
finally:
    # Close the connection once the table is in memory.
    con.close()

# Raw comments, needed to look up the text behind an aggregated row.
cmt = Database('../data/database.db').query('SELECT * FROM comments')

              index        doc_id          demo          auth     auth_sent  \
count  25431.000000  25431.000000  25431.000000  25431.000000  25431.000000   
mean   12715.000000  50005.606386      0.652021      0.347979      0.133679   
std     7341.441684  28996.542979      0.125004      0.125004      0.293252   
min        0.000000      1.000000      0.115170      0.067089      0.000000   
25%     6357.500000  24780.500000      0.588456      0.259382      0.000000   
50%    12715.000000  50026.000000      0.673541      0.326459      0.000000   
75%    19072.500000  75004.000000      0.740618      0.411544      0.000000   
max    25430.000000  99996.000000      0.932911      0.884830      1.000000   

           num_sent   predict_std  
count  25431.000000  25431.000000  
mean       3.703197      0.038484  
std        5.243605      0.047764  
min        1.000000      0.000000  
25%        1.000000      0.000000  
50%        2.000000      0.021162  
75%        4.000000      0.062102  


In [5]:
# Overall statistics, then drill into the document with the lowest `auth` value.
print(anno_df_speech.describe())
m = anno_df_speech['auth'].min()
v = anno_df_speech.loc[anno_df_speech['auth'] == m]
print(v)

# Look up the raw comment through the stored comment id (first row if several tie),
# then show its text and link.
most = cmt.loc[cmt['id'] == v['cmt_id'].iloc[0]]
print(most)
print(most['body'].iloc[0])
print(most['permalink'].iloc[0])

              index        doc_id          demo          auth     auth_sent  \
count  25431.000000  25431.000000  25431.000000  25431.000000  25431.000000   
mean   12715.000000  50005.606386      0.652021      0.347979      0.133679   
std     7341.441684  28996.542979      0.125004      0.125004      0.293252   
min        0.000000      1.000000      0.115170      0.067089      0.000000   
25%     6357.500000  24780.500000      0.588456      0.259382      0.000000   
50%    12715.000000  50026.000000      0.673541      0.326459      0.000000   
75%    19072.500000  75004.000000      0.740618      0.411544      0.000000   
max    25430.000000  99996.000000      0.932911      0.884830      1.000000   

           num_sent   predict_std  
count  25431.000000  25431.000000  
mean       3.703197      0.038484  
std        5.243605      0.047764  
min        1.000000      0.000000  
25%        1.000000      0.000000  
50%        2.000000      0.021162  
75%        4.000000      0.062102  
