In [1]:
import os
import pandas as pd

new_path = os.path.split(os.getcwd())[0]

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.utils import tokenize
from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords
from contextlib import redirect_stdout

### word embeddings over time

In [2]:
bot_general = pd.read_pickle(os.path.join(new_path,'_data','bot_general.pkl'))

In [3]:
datetime_t2 =pd.to_datetime(bot_general['t2_datetime'])
bot_general['year'] = datetime_t2.dt.year

for i in range(9): # aggregate all tweets from 2007 to 2016 into a single class
    bot_general['year'] = bot_general['year'].replace(2007+i, 2016)

train 3 word embeddings per year; extract most similar tokens to term 'bot'

In [4]:
for year in [2016, 2017, 2018, 2019, 2020, 2021, 2022]:
    subset = bot_general[bot_general['year']==year]
    train_data = [list(tokenize(x.lower())) for x in subset['t2_text_processed']]
    erg = []
    for i in range(3):
        model = None
        model = Word2Vec(vector_size=200, window=5, min_count=3, workers=20, sg=0)
        model.build_vocab(train_data)
        model.train(train_data, total_examples=model.corpus_count, epochs=10)
        model.save(os.path.join(new_path,'_model','w2v_models',f'w2vmodel_{year}_run_{i+1}.model'))
        print(f'{year} run: {i+1} - model saved')
        erg.append([x[0] for x in model.wv.most_similar(positive=['bot'], topn=30) if x[0] not in stopwords.words()])
    print(list(set(erg[0]) & set(erg[1])& set(erg[2])))

2016 run: 1 - model saved
2016 run: 2 - model saved
2016 run: 3 - model saved
['user', 'guy', 'machine', 'tool', 'acct', 'script', 'database', 'software', 'comments', 'company', 'person']
2017 run: 1 - model saved
2017 run: 2 - model saved
2017 run: 3 - model saved
['moron', 'account', 'paid', 'troll', 'tool', 'idiot', 'real', 'human', 'product', 'person', 'probably']
2018 run: 1 - model saved
2018 run: 2 - model saved
2018 run: 3 - model saved
['moron', 'account', 'paid', 'troll', 'tool', 'supporter', 'idiot', 'fool', 'human', 'russian', 'shill', 'product', 'person', 'joke', 'probably']
2019 run: 1 - model saved
2019 run: 2 - model saved
2019 run: 3 - model saved
['moron', 'account', 'troll', 'tool', 'supporter', 'idiot', 'fool', 'product', 'human', 'parody', 'asset', 'russian', 'probably', 'person', 'joke', 'trumpster', 'caricature']
2020 run: 1 - model saved
2020 run: 2 - model saved
2020 run: 3 - model saved
['user', 'foreigner', 'moron', 'hacker', 'teenager', 'troll', 'tool', 'sup

collect results

In [5]:
results = {}

years = [2016, 2017, 2018, 2019, 2020, 2021, 2022]
for year in years:
    erg = []
    for i in range(3):
        model = Word2Vec.load(os.path.join(new_path,'_model','w2v_models',f'w2vmodel_{year}_run_{i+1}.model'))
        erg.append([x[0] for x in model.wv.most_similar(positive=['bot'], topn=30) if x[0] not in stopwords.words()])
    results[year]=list(set(erg[0]) & set(erg[1])& set(erg[2]))

In [6]:
print(results)

{2016: ['user', 'guy', 'program', 'machine', 'software', 'script', 'hashtags', 'acct', 'database', 'comments', 'person'], 2017: ['moron', 'account', 'paid', 'troll', 'tool', 'entity', 'idiot', 'real', 'human', 'probably', 'person'], 2018: ['writer', 'moron', 'account', 'paid', 'troll', 'tool', 'entity', 'idiot', 'supporter', 'fool', 'human', 'dolt', 'russian', 'shill', 'probably', 'person', 'joke'], 2019: ['idiot', 'parody', 'person', 'joke', 'tool', 'asset', 'moron', 'robot', 'account', 'human', 'probably', 'caricature', 'troll', 'supporter', 'fool', 'definitely', 'russian', 'product', 'stooge'], 2020: ['user', 'robot', 'foreigner', 'moron', 'teenager', 'troll', 'tool', 'supporter', 'idiot', 'trumper', 'fool', 'human', 'parody', 'russian', 'probably', 'person', 'joke'], 2021: ['paid', 'idiot', 'real', 'parody', 'shill', 'person', 'joke', 'chinese', 'foreigner', 'tool', 'moron', 'robot', 'account', 'human', 'probably', 'troll', 'fool', 'definitely', 'russian'], 2022: ['idiot', 'parody'

generate latex-table from results

In [7]:
political_terms = ['software', 'program', 'account','acct','comments','script','machine','hashtags']
dehum_terms = ['moron','stupid', 'idiot','shill','dolt','fool','foreigner','russian', 'trumper','supporter','chinese','simpleton','stooge','propagandist']

## now we create the table
table_string = '\\begin{table}[]\n\
\\caption{Nearest embedding vectors to the term \textit{bot} over the years. We highlight terms associated with mechanics for automation in blue and dehumanizing/insulting/political terms in red.}\
\\begin{tabular}{ll}\n\
\n'

for year in results.keys():

    row_year  = '2007-2016&' if year==2016 else str(year)+'&'
    row_terms = ''
    for idx, x in enumerate(results[year]):
        #
        if (idx+1) % 6 == 0:
            row_year+='\\\ \n'+row_year
        if x not in political_terms and x not in dehum_terms:
            row_year+=x+', '
        if x in dehum_terms:
            row_year+='{\\color{red}\\colorbox{pink}{\\vphantom{pd}'+x+'}}, '
        if x in political_terms:
            row_year+='{\\color{blue}\\colorbox{blue!20}{\\vphantom{pd}'+x+'}}, '
        
    row_terms=row_terms + ' \\\ \n \hline' 

    row = row_year + row_terms
    table_string+=row

table_string+='\end{tabular}\n\
\label{fig:embeddings}\n\
\end{table}'
print(table_string)

with open(os.path.join(new_path,'_results','tab2_embeddings.txt'), 'w') as f:
    with redirect_stdout(f):
        print(table_string)

\begin{table}[]
\caption{Nearest embedding vectors to the term 	extit{bot} over the years. We highlight terms associated with mechanics for automation in blue and dehumanizing/insulting/political terms in red.}\begin{tabular}{ll}

2007-2016&user, guy, {\color{blue}\colorbox{blue!20}{\vphantom{pd}program}}, {\color{blue}\colorbox{blue!20}{\vphantom{pd}machine}}, {\color{blue}\colorbox{blue!20}{\vphantom{pd}software}}, \\ 
2007-2016&user, guy, {\color{blue}\colorbox{blue!20}{\vphantom{pd}program}}, {\color{blue}\colorbox{blue!20}{\vphantom{pd}machine}}, {\color{blue}\colorbox{blue!20}{\vphantom{pd}software}}, {\color{blue}\colorbox{blue!20}{\vphantom{pd}script}}, {\color{blue}\colorbox{blue!20}{\vphantom{pd}hashtags}}, {\color{blue}\colorbox{blue!20}{\vphantom{pd}acct}}, database, {\color{blue}\colorbox{blue!20}{\vphantom{pd}comments}}, person,  \\ 
 \hline2017&{\color{red}\colorbox{pink}{\vphantom{pd}moron}}, {\color{blue}\colorbox{blue!20}{\vphantom{pd}account}}, paid, troll, tool, \\ 