In [None]:
import os
import pandas as pd

new_path = os.path.split(os.getcwd())[0]

import json
import re

from emoji import demojize
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

### convert monthly api output (.jsonl) to dataframes (.pkl)

In [None]:
# All patterns are written as raw strings: '\s' inside a normal string literal is an
# invalid escape sequence and triggers a SyntaxWarning on recent Python versions.
# The compiled patterns are unchanged.

# Any occurrence of the letters 'bot', in any capitalisation.
re_total = re.compile(r'[Bb][Oo][Tt]')
# An '@' followed by word characters that contain 'bot' (any capitalisation).
re_mentions = re.compile(r'@{1}[a-zA-Z0-9_]*[Bb][Oo][Tt][a-zA-Z0-9_]*')

# An '@' followed by word characters, plus at most one whitespace character on either side.
re_sub_mentions = re.compile(r'\s{0,1}@{1}[a-zA-Z0-9_]+\s{0,1}')
# A https://t.co/ link, plus at most one whitespace character on either side.
re_sub_urls = re.compile(r'\s{0,1}https://t.co/[a-zA-Z0-9]+\s{0,1}')

# "you are"/"you're" followed, anywhere later on the same line, by 'bot'.
re_bot_lax = re.compile(r"you are .*bot|you're .*bot")
# "you are a"/"you're a" directly followed by one lowercase word ending in 'bot'.
re_bot_strict = re.compile(r"you are a [a-z]*bot|you're a [a-z]*bot")

def bot_in_text_t2(row):
    """Classify where 'bot' appears in ``row['t2_text']``.

    Returns a tuple ``(in_text, in_mentions)`` of 0/1 flags, decided by comparing
    the number of ``re_total`` matches with the number of ``re_mentions`` matches:

    * ``(0, 0)`` - no 'bot' at all
    * ``(0, 1)`` - as many bot-containing mentions as 'bot' occurrences
    * ``(1, 0)`` - 'bot' occurs, but in no mention
    * ``(1, 1)`` - 'bot' occurs more often than there are bot-containing mentions
    """
    n_total = len(re_total.findall(row['t2_text']))
    n_in_mentions = len(re_mentions.findall(row['t2_text']))

    if n_total == 0:
        return (0, 0)
    if n_total == n_in_mentions:
        return (0, 1)
    # From here on at least one 'bot' is not accounted for by a mention.
    return (1, 0) if n_in_mentions == 0 else (1, 1)
            
def get_monologue(row):
    """Return 1 when t1 and t2 have the same author id, otherwise 0."""
    same_author = row['t1_author_id'] == row['t2_author_id']
    if same_author:
        return 1
    return 0

def clean_text_t1(row):
    """Return ``row['t1_text']`` with mentions and t.co links stripped out.

    Each mention / link (together with at most one whitespace character on
    either side) is replaced by a single space, then the result is stripped.

    Returns None when the text is unavailable: the column is missing
    (KeyError) or the value is not a string, e.g. None (TypeError).
    """
    try:
        text = re_sub_mentions.sub(' ', row['t1_text'])
        text = re_sub_urls.sub(' ', text)
    except (KeyError, TypeError):
        # Only the "no usable text" cases are swallowed; a bare ``except`` would
        # also trap KeyboardInterrupt and hide unrelated programming errors.
        return None
    return text.strip()
    
def clean_text_t2(row):
    """Return ``row['t2_text']`` with mentions and t.co links stripped out.

    Each mention / link (together with at most one whitespace character on
    either side) is replaced by a single space, then the result is stripped.

    Returns None when the text is unavailable: the column is missing
    (KeyError) or the value is not a string, e.g. None (TypeError).
    """
    try:
        text = re_sub_mentions.sub(' ', row['t2_text'])
        text = re_sub_urls.sub(' ', text)
    except (KeyError, TypeError):
        # Only the "no usable text" cases are swallowed; a bare ``except`` would
        # also trap KeyboardInterrupt and hide unrelated programming errors.
        return None
    return text.strip()

def match_bot_pattern_lax_t1(row):
    """Return 1 if the lower-cased ``t1_text_cleaned`` matches ``re_bot_lax``, else 0.

    A missing (None) cleaned text counts as no match.
    """
    cleaned = row['t1_text_cleaned']
    if cleaned is None:
        return 0
    return 1 if re_bot_lax.search(cleaned.lower()) else 0

def match_bot_pattern_strict_t1(row):
    """Return 1 if the lower-cased ``t1_text_cleaned`` matches ``re_bot_strict``, else 0.

    A missing (None) cleaned text counts as no match.
    """
    cleaned = row['t1_text_cleaned']
    if cleaned is None:
        return 0
    return 1 if re_bot_strict.search(cleaned.lower()) else 0
    
def match_bot_pattern_lax_t2(row):
    """Return 1 if the lower-cased ``t2_text_cleaned`` matches ``re_bot_lax``, else 0.

    A missing (None) cleaned text counts as no match.
    """
    cleaned = row['t2_text_cleaned']
    if cleaned is None:
        return 0
    return 1 if re_bot_lax.search(cleaned.lower()) else 0
    
def match_bot_pattern_strict_t2(row):
    """Return 1 if the lower-cased ``t2_text_cleaned`` matches ``re_bot_strict``, else 0.

    A missing (None) cleaned text counts as no match.
    """
    cleaned = row['t2_text_cleaned']
    if cleaned is None:
        return 0
    return 1 if re_bot_strict.search(cleaned.lower()) else 0
    
def _extract_fields(record, field_names):
    """Return the values of ``field_names`` from ``record``; None for anything missing."""
    if record is None:
        return [None] * len(field_names)
    return [record.get(name) for name in field_names]


def load_and_process(filename):
    """Read a .jsonl file of tweet pairs and flatten it into a DataFrame.

    Every line is expected to hold one JSON object with the keys 't1', 't2',
    't1_author' and 't2_author' (each either an object or null). Lines that are
    not valid JSON, or that do not decode to an object, are skipped.

    Parameters
    ----------
    filename : str
        Path of the .jsonl file.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 't2_id' (rows without a t2 id are dropped), with the
        columns 't1_<field>', 't2_<field>', 't1_author_<field>' and
        't2_author_<field>'. Empty strings are replaced by None, rows are sorted
        by 't2_created_at' and the index is reset.
    """
    tweets = []
    # Explicit encoding so the result does not depend on the platform default.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Best effort: malformed (or blank) lines are skipped, nothing else is hidden.
                continue
            if isinstance(record, dict):
                tweets.append(record)

    tweet_fields = ['conversation_id','id','created_at','geo','in_reply_to_user_id','public_metrics','source','text']
    user_fields = ['created_at','description','id','location','name','profile_image_url','public_metrics','url','username','verified']

    # (top-level key, fields to pull from it); the order fixes the column order.
    parts = (
        ('t1', tweet_fields),
        ('t2', tweet_fields),
        ('t1_author', user_fields),
        ('t2_author', user_fields),
    )

    rows = []
    for tweet in tweets:
        row = []
        for key, names in parts:
            # .get(): an absent top-level key is treated like an explicit null.
            row.extend(_extract_fields(tweet.get(key), names))
        rows.append(row)

    columns = [f'{key}_{name}' for key, names in parts for name in names]

    pd_tweets = pd.DataFrame(rows, columns=columns)
    pd_tweets = pd_tweets[pd_tweets['t2_id'].notnull()]
    pd_tweets = pd_tweets.drop_duplicates('t2_id')
    # Dict form on purpose: with pandas < 1.4, replace('', None) ignores the explicit
    # None and forward-fills the previous value instead.
    pd_tweets = pd_tweets.replace({'': None})
    pd_tweets = pd_tweets.sort_values('t2_created_at')
    pd_tweets = pd_tweets.reset_index(drop=True)

    return pd_tweets

def augment(pd_tweets):
    """Add the filter, cleaned-text and date columns to a frame of tweet pairs.

    Columns added, in order: 'filter_bot_in_text', 'filter_bot_in_mentions',
    'filter_monologue', 't1_text_cleaned', 't2_text_cleaned', the four
    'filter_t*_bot_pattern_*' flags, 't1_datetime', 't2_datetime', and the
    year / month of both datetimes.

    The first three assignments write into the frame that was passed in; the
    returned frame is a new object (created by the ``drop`` call).
    """
    # bot_in_text_t2 yields a (text, mentions) tuple per row; unpack it via a scratch column.
    pd_tweets['filter_dummy'] = pd_tweets.apply(bot_in_text_t2, axis=1)
    pd_tweets['filter_bot_in_text'] = [flags[0] for flags in pd_tweets['filter_dummy']]
    pd_tweets['filter_bot_in_mentions'] = [flags[1] for flags in pd_tweets['filter_dummy']]
    pd_tweets = pd_tweets.drop('filter_dummy', axis=1)

    # Order matters: the pattern filters read the *_text_cleaned columns.
    row_features = (
        ('filter_monologue', get_monologue),
        ('t1_text_cleaned', clean_text_t1),
        ('t2_text_cleaned', clean_text_t2),
        ('filter_t1_bot_pattern_lax', match_bot_pattern_lax_t1),
        ('filter_t1_bot_pattern_strict', match_bot_pattern_strict_t1),
        ('filter_t2_bot_pattern_lax', match_bot_pattern_lax_t2),
        ('filter_t2_bot_pattern_strict', match_bot_pattern_strict_t2),
    )
    for column, row_function in row_features:
        pd_tweets[column] = pd_tweets.apply(row_function, axis=1)

    for prefix in ('t1', 't2'):
        pd_tweets[prefix + '_datetime'] = pd.to_datetime(pd_tweets[prefix + '_created_at'])

    for prefix in ('t1', 't2'):
        pd_tweets[prefix + '_year'] = [stamp.year for stamp in pd_tweets[prefix + '_datetime']]
        pd_tweets[prefix + '_month'] = [stamp.month for stamp in pd_tweets[prefix + '_datetime']]

    return pd_tweets

conversion and augmentation of monthly tweet files from .jsonl to .pkl

In [None]:
# Walk <new_path>/_data/_raw/<year>/ and turn every monthly .jsonl file that has
# no .pkl next to it yet into an augmented, pickled DataFrame.
for year in os.listdir(os.path.join(new_path,'_data','_raw')):
    print(year)
    if year.startswith('.'):
        continue
    for month in range(1, 13):
        pkl_path = os.path.join(new_path,'_data','_raw',year,f'{year}_{str(month).zfill(2)}.pkl')
        jsonl_path = os.path.join(new_path,'_data','_raw',year,f'{year}_{str(month).zfill(2)}.jsonl')
        # Already converted, or nothing to convert for this month.
        if os.path.exists(pkl_path) or not os.path.exists(jsonl_path):
            continue
        pd_tweets = load_and_process(jsonl_path)
        pd_tweets = augment(pd_tweets)
        pd_tweets.to_pickle(pkl_path)
        print(f'Created .pkl for {year}_{str(month).zfill(2)}')

### assemble monthly dataframes into datasets

#### bot_direct

In [None]:
# Build bot_direct: from every monthly pickle keep the rows that are not monologues,
# match the strict t2 bot pattern and have 'bot' in the t2 text. Per month, one line
# of counts is appended to accusations_overview.txt.
# NOTE: the overview file is opened in append mode, so re-running this cell adds the
# same lines again.
monthly_frames = []  # concatenated once after the loop instead of once per month

for year in os.listdir(os.path.join(new_path,'_data','_raw')):
    if year.startswith('.'):
        continue
    for month in range(1, 13):
        pkl_path = os.path.join(new_path,'_data','_raw',year,f'{year}_{str(month).zfill(2)}.pkl')
        if not os.path.exists(pkl_path):
            continue
        pd_month = pd.read_pickle(pkl_path)

        # Each mask is computed once and reused for both the filter and the counts.
        bot_in_text = pd_month['filter_bot_in_text'] == 1
        no_monologue = pd_month['filter_monologue'] == 0
        lax_hits = no_monologue & (pd_month['filter_t2_bot_pattern_lax'] == 1) & bot_in_text
        strict_hits = no_monologue & (pd_month['filter_t2_bot_pattern_strict'] == 1) & bot_in_text

        pd_month_filtered = pd_month[strict_hits]
        monthly_frames.append(pd_month_filtered)

        n_total = len(pd_month)
        n_bot_in_text = int(bot_in_text.sum())
        n_bot_pattern_lax = int(lax_hits.sum())
        n_bot_pattern_strict = len(pd_month_filtered)

        # year[1:] drops the first character of the directory name -
        # presumably a prefix such as '_'; TODO confirm against the data layout.
        with open(os.path.join(new_path,'accusations_overview.txt'), 'a') as f:
            f.write(f'{year[1:]};{month};{n_total};{n_bot_in_text};{n_bot_pattern_lax};{n_bot_pattern_strict}\n')
        print(f'Processed {year[1:]} {month}')

bot_direct = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()
bot_direct = bot_direct.sort_values('t2_datetime', ascending=True)
bot_direct = bot_direct.reset_index(drop=True)
bot_direct.to_pickle(os.path.join(new_path,'_data','bot_direct.pkl'))

In [None]:
# Show the number of rows in bot_direct.
len(bot_direct)

#### bot_all

In [None]:
# Build bot_all: from every monthly pickle keep the rows with 'bot' in the t2 text,
# reduced to the id / author / datetime / text columns of both tweets.
KEPT_COLUMNS = ['t1_id','t1_author_id','t1_author_username','t1_datetime','t1_text','t1_text_cleaned']+['t2_id','t2_author_id','t2_author_username','t2_datetime','t2_text','t2_text_cleaned']
monthly_frames = []  # concatenated once after the loop instead of once per month

for year in os.listdir(os.path.join(new_path,'_data','_raw')):
    if year.startswith('.'):
        continue
    for month in range(1, 13):
        pkl_path = os.path.join(new_path,'_data','_raw',year,f'{year}_{str(month).zfill(2)}.pkl')
        if not os.path.exists(pkl_path):
            continue
        pd_month = pd.read_pickle(pkl_path)
        pd_month_bottexts = pd_month[(pd_month['filter_bot_in_text']==1)]
        pd_month_bottexts = pd_month_bottexts[KEPT_COLUMNS]
        # 'int64' rather than int: plain int is 32-bit on some platforms (Windows with
        # NumPy < 2), which is too narrow for tweet ids.
        pd_month_bottexts = pd_month_bottexts.astype({'t1_author_id': 'category', 't1_author_username': 'category', 't2_id': 'int64', 't2_author_id': 'category','t2_author_username': 'category'})
        monthly_frames.append(pd_month_bottexts)
        # year[1:] drops the first character of the directory name -
        # presumably a prefix such as '_'; TODO confirm against the data layout.
        print(f'Processed {year[1:]} {month}')

bot_all = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()
bot_all = bot_all.sort_values('t2_datetime', ascending=True)
bot_all = bot_all.reset_index(drop=True)
bot_all.to_pickle(os.path.join(new_path,'_data','bot_all.pkl'))

In [None]:
# Show the number of rows in bot_all.
len(bot_all)

### preprocess bot_all for classifier training and deployment

In [None]:
def clean_text(row):
    """Anonymise the mentions in ``row['t2_text']`` and prefix the addressee.

    Mentions of the t1 author (``row['t1_author_username']``) become '@t1_user',
    every other mention becomes '@other_user'. The result is prefixed with
    'To @t1_user: ', or with 'To @unknown_user: ' when the t1 username is None.

    NOTE(review): the mention pattern also consumes up to one whitespace
    character on each side, so a mention in the middle of a sentence ends up
    glued to the preceding word ('hi @x ok' -> 'hi@other_user ok'). Left
    unchanged here because downstream consumers may depend on this exact format.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    re_mention = re.compile(r'\s{0,1}@{1}[a-zA-Z0-9_]+\s{0,1}')
    text = row['t2_text']
    t1_user = row['t1_author_username']

    if t1_user is None:
        text = re_mention.sub('@other_user ', text)
        return 'To @unknown_user: ' + text

    # Park the addressee's handle behind an '@'-less placeholder so the generic
    # mention substitution leaves it alone, then restore the '@'.
    text = text.replace('@' + t1_user, 't1_user ')
    text = re_mention.sub('@other_user ', text)
    text = text.replace('t1_user', '@t1_user')
    return 'To @t1_user: ' + text
    
def normalizeToken(token):
    """Map a single token to its normalised form.

    * tokens starting with '@'                      -> '@USER'
    * tokens starting with 'http' / 'www' (any case) -> 'HTTPURL'
    * single-character tokens                        -> ``demojize(token)``
    * everything else is returned unchanged
    """
    if token.startswith("@"):
        return "@USER"
    if token.lower().startswith(("http", "www")):
        return "HTTPURL"
    if len(token) == 1:
        return demojize(token)
    # The separate "’" -> "'" and "…" -> "..." cases can never apply at this point:
    # both are single characters and were already handled by the branch above.
    return token

def _normalize_text(tweet):
    """Tokenise ``tweet``, normalise every token and re-join with single spaces.

    Uses the module-level ``tokenizer`` (expected to provide ``.tokenize(str)``)
    and ``normalizeToken``. After joining, contractions and 'a.m.' / 'p.m.' are
    re-spaced by the fixed replacement chains below, and whitespace is collapsed.
    """
    tokens = tokenizer.tokenize(tweet.replace("’", "'").replace("…", "..."))
    normTweet = " ".join([normalizeToken(token) for token in tokens])

    normTweet = (
        normTweet.replace("cannot ", "can not ")
        .replace("n't ", " n't ")
        .replace("n 't ", " n't ")
        .replace("ca n't", "can't")
        .replace("ai n't", "ain't")
    )
    normTweet = (
        normTweet.replace("'m ", " 'm ")
        .replace("'re ", " 're ")
        .replace("'s ", " 's ")
        .replace("'ll ", " 'll ")
        .replace("'d ", " 'd ")
        .replace("'ve ", " 've ")
    )
    normTweet = (
        normTweet.replace(" p . m .", "  p.m.")
        .replace(" p . m ", " p.m ")
        .replace(" a . m .", " a.m.")
        .replace(" a . m ", " a.m ")
    )

    return " ".join(normTweet.split())


def normalizeTweet(row):
    """Return the normalised form of ``row['t2_text']``.

    Raises if the text is missing or not a string.
    """
    return _normalize_text(row['t2_text'])


def normalizeTweet_t1(row):
    """Return the normalised form of ``row['t1_text']``, or None if that fails.

    Best effort: any error during normalisation (e.g. the text is None) yields
    None.
    """
    try:
        return _normalize_text(row['t1_text'])
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt /
        # SystemExit still stop a long-running ``apply``.
        return None

In [None]:
# Load bot_all from <new_path>/_data/bot_all.pkl.
bot_all = pd.read_pickle(os.path.join(new_path,'_data','bot_all.pkl'))

In [None]:
# Normalise every row's t2_text with normalizeTweet and store it as 't2_text_processed'.
bot_all['t2_text_processed'] = bot_all.apply(normalizeTweet, axis=1)

In [None]:
# Write bot_all to <new_path>/_data/bot_all.pkl, overwriting any existing file.
bot_all.to_pickle(os.path.join(new_path,'_data','bot_all.pkl'))

### deploy accusation classifier to bot_all

load final accusation classification model

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(os.path.join(new_path,'_model','accusation_classification_model'))
# Deliberately NOT named `tokenizer`: the tweet normalisation functions defined earlier
# in this notebook look up the global `tokenizer` (the nltk TweetTokenizer) at call
# time, so rebinding that name here would silently change their behaviour whenever a
# normalisation cell is re-run after this one.
model_tokenizer = AutoTokenizer.from_pretrained(os.path.join(new_path,'_model','tokenizer'), model_max_length=512)
# NOTE(review): device is hard-coded to the first CUDA GPU.
pipe = TextClassificationPipeline(model=model, tokenizer=model_tokenizer, device='cuda:0')

run model on bot_all
- 6min 41s for 100,000 instances
- ~24h 45min for 22,275,139 instances

In [None]:
# Load bot_all from <new_path>/_data/bot_all.pkl.
bot_all = pd.read_pickle(os.path.join(new_path,'_data','bot_all.pkl'))

In [None]:
# Classify every processed t2 text; inputs are truncated to 128 tokens.
model_predictions = pipe(
    bot_all['t2_text_processed'].tolist(),
    batch_size=256,
    truncation=True,
    max_length=128,
)

In [None]:
# 1 where the pipeline returned 'LABEL_1', 0 for any other label.
bot_all['predicted_label'] = [
    int(prediction['label'] == 'LABEL_1') for prediction in model_predictions
]

In [None]:
# Compact dtypes before pickling. 't2_id' is cast to an explicit 'int64': plain `int`
# is 32-bit on some platforms (Windows with NumPy < 2), which is too narrow for tweet ids.
bot_all = bot_all.astype({'t1_author_username': 'category', 't2_id': 'int64', 't2_author_username': 'category', 'predicted_label': 'category'})

In [None]:
# Write bot_all to <new_path>/_data/bot_all.pkl, overwriting any existing file.
bot_all.to_pickle(os.path.join(new_path,'_data','bot_all.pkl'))

### extract bot_general from bot_all

In [None]:
# Load bot_all from <new_path>/_data/bot_all.pkl.
bot_all = pd.read_pickle(os.path.join(new_path,'_data','bot_all.pkl'))

In [None]:
# bot_general: the rows of bot_all whose predicted_label is 1, re-indexed from 0.
bot_general = bot_all.loc[bot_all['predicted_label'] == 1].reset_index(drop=True)

In [None]:
# Show the number of rows in bot_general.
len(bot_general)

In [None]:
# Write bot_general to <new_path>/_data/bot_general.pkl.
bot_general.to_pickle(os.path.join(new_path,'_data','bot_general.pkl'))

### augment bot_direct with info on accusers and accusations

In [None]:
def get_n_accusations(pd_):
    """Return ``pd_`` with an added 'n_accusations' column.

    'n_accusations' is the number of rows of ``pd_`` that share the row's
    't1_author_id'. The count table is left-merged onto ``pd_``, so a new
    frame is returned.
    """
    rows_per_t1_author = pd_.groupby('t1_author_id').size()
    count_rows = [list(pair) for pair in rows_per_t1_author.items()]
    pd_accusations = pd.DataFrame(count_rows, columns=['t1_author_id','n_accusations'])
    return pd_.merge(pd_accusations, on='t1_author_id', how='left')

def get_n_accusers(pd_):
    """Add an 'n_accusers' column to ``pd_`` (in place) and return ``pd_``.

    'n_accusers' is the number of distinct 't2_author_id' values among all rows
    that share the row's 't1_author_id'. Rows whose 't1_author_id' is missing
    get 0.

    Computed with a single groupby instead of filtering the whole frame once per
    row, which was quadratic in the number of rows and assigned into a column
    slice of ``pd_``.
    """
    # dropna=False: a missing t2_author_id still counts as one (distinct) value.
    accusers_per_t1_author = pd_.groupby('t1_author_id')['t2_author_id'].nunique(dropna=False)
    # groupby drops missing keys, so rows without a t1_author_id map to NaN -> 0.
    pd_['n_accusers'] = (
        pd_['t1_author_id'].map(accusers_per_t1_author).fillna(0).astype('int64')
    )
    return pd_

def get_n_accusing(pd_):
    """Return ``pd_`` with an added 'n_accusing' column.

    'n_accusing' is the number of rows of ``pd_`` that share the row's
    't2_author_id'. The count table is left-merged onto ``pd_``, so a new
    frame is returned.
    """
    rows_per_t2_author = pd_.groupby('t2_author_id').size()
    count_rows = [list(pair) for pair in rows_per_t2_author.items()]
    pd_temp = pd.DataFrame(count_rows, columns=['t2_author_id','n_accusing'])
    return pd_.merge(pd_temp, on='t2_author_id', how='left')

def get_n_accusees(pd_):
    """Return ``pd_`` with an added 'n_accusees' column.

    For every 't2_author_id' that has at least one row with ``n_accusing > 1``,
    'n_accusees' is the number of distinct non-missing 't1_author_id' values
    across all rows of that t2 author. All other rows get NaN (left merge).

    Requires the 'n_accusing' column, i.e. ``get_n_accusing`` must have been
    applied first.

    Fix: the list of t2 authors to evaluate used to be built after a
    ``drop_duplicates('t1_author_id')``. That step removed every row of a t2
    author whose t1 authors had all appeared earlier in rows of other t2
    authors, so such authors ended up with NaN despite ``n_accusing > 1``.
    The per-author full-frame scan is replaced by a single groupby as well.
    """
    # nunique() ignores missing t1_author_id values by default.
    accusees_per_t2_author = (
        pd_.groupby('t2_author_id')['t1_author_id']
        .nunique()
        .rename('n_accusees')
        .reset_index()
    )
    repeat_t2_authors = pd_.loc[pd_['n_accusing'] > 1, 't2_author_id'].unique()
    query_list = accusees_per_t2_author[
        accusees_per_t2_author['t2_author_id'].isin(repeat_t2_authors)
    ]
    return pd_.merge(query_list, how='left', on='t2_author_id')

In [None]:
# Load bot_direct from <new_path>/_data/bot_direct.pkl.
bot_direct = pd.read_pickle(os.path.join(new_path,'_data','bot_direct.pkl'))

In [None]:
# Each helper returns the frame with one more column. get_n_accusees has to run
# after get_n_accusing because it filters on the 'n_accusing' column.
bot_direct = get_n_accusations(bot_direct)
bot_direct = get_n_accusers(bot_direct)
bot_direct = get_n_accusing(bot_direct)
bot_direct = get_n_accusees(bot_direct)

In [None]:
# Write bot_direct to <new_path>/_data/bot_direct.pkl, overwriting any existing file.
bot_direct.to_pickle(os.path.join(new_path,'_data','bot_direct.pkl'))