# Phase 5. Analysis of political social bots

## Contents
- [Configuration](#Configuration)
  - [Imports](#Imports)
  - [Variables](#Variables)
  - [Support Functions](#Support-Functions)
- [Cumulative bots' appearance](#Cumulative-bots'-appearance)
  - [Appearence Graphs](#Appearence-Graphs)
- [Bots daily activities](#Bots-daily-activities)
  - [Active bots per day](#Active-bots-per-day)
  - [Bots' activity windows](#Bots'-activity-windows)
  - [Bots daily activity with cumulative metric](#Bots-daily-activity-with-cumulative-metric)
- [Generated traffic](#Generated-traffic)
  - [Bot's traffic volumes per political party](#Bot's-traffic-volumes-per-political-party)
- [Sentiment analysis towards party themes](#Sentiment-analysis-towards-party-themes)
  - [Sentiment score per political party](#Sentiment-score-per-political-party)
  - [Sentiment score per political party (boxplot)](#Sentiment-score-per-political-party-(boxplot))

## Configuration

### Imports

In [None]:
# Utilities
from IPython.display import display
from fastprogress import master_bar, progress_bar
import os
from datetime import datetime
from datetime import timedelta
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import csv
from collections.abc import MutableMapping

# Graphics
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import matplotlib.ticker as ticker
import matplotlib.gridspec as gridspec
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.dates import date2num


# DB management
from pymongo import MongoClient

### Variables

In [None]:
# If true exports vectorial PDFs instead of JPG.
VECTORIAL_FIGURES = True
FIG_EXTENSION = ".pdf" if VECTORIAL_FIGURES else ".jpg"

# Directories where CSV data and generated figures are stored.
# os.path.join(..., "") keeps the trailing separator the rest of the notebook
# relies on (file names are appended with '+') and works whether or not
# ROOT_DIR itself ends with a separator.
ROOT_DIR = "ABOSLUTE_PATH_TO_ROOT_FOLDER"
DATA_DIR = os.path.join(ROOT_DIR, "data", "")
GRAPHICS_DIR = os.path.join(ROOT_DIR, "graphics", "")

# Change path to root
os.chdir(ROOT_DIR)

# Create the figures directory; nothing happens if it already exists
os.makedirs(GRAPHICS_DIR, exist_ok=True)

# MongoDB parameters ('IP_ADDRESS', PORT and DATABASE_NAME are placeholders
# that must be replaced before running the notebook)
mongoclient = MongoClient('IP_ADDRESS', PORT)
db = mongoclient.DATABASE_NAME

### Support Functions

In [None]:
def make_objid(text):
    """Builds a MongoDB ObjectId from a (possibly short) hexadecimal string.

    The text is left-padded with zeros up to the 24 characters that are
    passed to the ObjectId constructor.

    Keyword arguments:
    text -- value to be converted into an ObjectId (it is cast to str first)

    Returns None when text is None or contains only whitespace.
    """
    if text is None:
        return None
    text = str(text)
    if not text.strip():
        return None
    # ObjectId is not imported at the top of the notebook; bson is the package
    # distributed together with pymongo.
    from bson import ObjectId
    return ObjectId(text.rjust(24, "0"))


def flatten(d, parent_key='', sep='_'):
    """Collapses a nested dictionary (e.g. a MongoDB document) into a single level.

    Keys of nested mappings are joined to the key of their parent with sep,
    so {'a': {'b': 1}} becomes {'a_b': 1}.

    Keyword arguments:
    d -- (possibly nested) dictionary to be flattened
    parent_key -- prefix prepended to every key of d (empty at the top level)
    sep -- separator placed between a parent key and a child key
    """
    flat = {}
    for key, value in d.items():
        if parent_key:
            full_key = parent_key + sep + key
        else:
            full_key = key

        if isinstance(value, MutableMapping):
            # nested document: merge its flattened content under the composed key
            flat.update(flatten(value, full_key, sep=sep))
        else:
            flat[full_key] = value
    return flat

## Cumulative bots' appearance

In [None]:
def get_first_appearances(user_collection):
    """Builds a DataFrame with the creation date of the first tweet of each bot and its political party.

    Keyword arguments:
    user_collection -- MongoDB Users' Collection

    Returns a DataFrame with one row per bot built from the aggregation
    result (fields '_id', 'created_at' and 'bot_political_party').
    """
    # minimum 'scores.scores.universal' value for a user to be treated as a bot
    # (presumably the 95th percentile of the score, given the original name p95)
    bot_score_threshold = 0.6908019160064479

    # keep only the users whose score reaches the threshold
    select_bots = {
        '$match': {'scores.scores.universal': {'$gte': bot_score_threshold}}
    }
    # attach to every user the tweets whose user_id matches the user's _id
    attach_tweets = {
        '$lookup': {
            'from': 'tweets',
            'localField': '_id',
            'foreignField': 'user_id',
            'as': 'tweets'
        }
    }
    # one document per (user, tweet) pair
    one_per_tweet = {'$unwind': {'path': '$tweets'}}
    # oldest tweets first, so that '$first' below picks the earliest one
    oldest_first = {'$sort': {'tweets.created_at': 1}}
    # back to one document per user, keeping the date of its first tweet
    first_tweet_per_user = {
        '$group': {
            '_id': '$_id',
            'created_at': {'$first': '$tweets.created_at'},
            'bot_political_party': {'$first': '$bot_political_party'}
        }
    }
    stages = [select_bots, attach_tweets, one_per_tweet, oldest_first, first_tweet_per_user]

    print("Query", end=" ")
    cursor = user_collection.aggregate(stages, allowDiskUse=True)
    print("OK; List", end=" ")
    records = list(cursor)
    print("OK; Total records:", len(records))
    return pd.DataFrame(records)

In [None]:
%%time
# Query MongoDB for the first tweet date and the party label of every bot
df = get_first_appearances(db.users)

In [None]:
# Bots without a party label are grouped under 'Unknown'.
# The result is assigned back to the column: calling fillna(inplace=True) on an
# attribute-accessed column is chained assignment, which is not guaranteed to
# modify df (it does not under pandas Copy-on-Write).
df['bot_political_party'] = df['bot_political_party'].fillna('Unknown')

### Appearence Graphs

Generate three graphs:
1. all bots,
2. bots with two-party affinity,
3. bots with single-party affinity (paper version).

In [None]:
def preprocess_dataframe(data,version):
    """Preprocesses and filters dataframe according to version of appearance. 
    
    Keyword arguments:
    data -- DataFrame with dates of creation of the first tweet of each bot and associated political party.
    version -- possible values: 'all' (no filtering, political parties are classified as 'Unknown', 'Unclear', 'Two-party affinity' or 'One-party affinity','two parties')
                                'two parties' (filters only bots with two-party affinity, grouping partyA-partyB and partyB-partyA in the same group)
                                'one party' (filters only bots with one-party affinity)
    """
    if version == 'all':
        data['bot_political_party'] = df.bot_political_party.replace({'Unknown':'Unknown', # bots not labeled: insufficient information
                                                'Unclear':'Unclear', # bots labeled as unclear: not able to categorize in one or two parties
                                                # bots labeled with two parties
                                                'VOX-Ciudadanos':'Two-party affinity', 
                                                'Ciudadanos-VOX':'Two-party affinity', 
                                                'PSOE-UP':'Two-party affinity',
                                                'UP-PSOE':'Two-party affinity',
                                                'VOX-UP':'Two-party affinity',
                                                'UP-VOX':'Two-party affinity',        
                                                'UP-Ciudadanos':'Two-party affinity',
                                                'Ciudadanos-UP':'Two-party affinity',
                                                'VOX-PP':'Two-party affinity',
                                                'PP-VOX':'Two-party affinity',
                                                'UP-PP':'Two-party affinity',
                                                'PP-UP':'Two-party affinity',
                                                'Ciudadanos-PSOE':'Two-party affinity',
                                                'PSOE-Ciudadanos':'Two-party affinity',
                                                'Ciudadanos-PP':'Two-party affinity',
                                                'PP-Ciudadanos':'Two-party affinity',
                                                'PSOE-VOX':'Two-party affinity',
                                                'VOX-PSOE':'Two-party affinity', 
                                                'PSOE-PP':'Two-party affinity',
                                                'PP-PSOE':'Two-party affinity',
                                                # bots labelled with two parties,
                                                'Ciudadanos':'One-party affinity',
                                                'VOX':'One-party affinity',
                                                'UP':'One-party affinity',
                                                'PSOE':'One-party affinity',
                                                'PP':'One-party affinity',
                                            })
        
    elif version == 'two_parties':
        data = data[data.bot_political_party.isin(['UP-VOX', 'Ciudadanos-UP','UP-Ciudadanos', 'VOX-Ciudadanos', 'PSOE-UP', 'UP-PP',
                                                   'VOX-UP', 'Ciudadanos-VOX', 'Ciudadanos-PSOE', 'PP-UP', 'PP-Ciudadanos', 'VOX-PP',
                                                   'UP-PSOE', 'PSOE-Ciudadanos', 'PSOE-VOX','Ciudadanos-PP', 'PP-VOX', 'VOX-PSOE', 
                                                   'PP-PSOE', 'PSOE-PP'])]
        
        data['bot_political_party'] = data.bot_political_party.replace({'VOX-Ciudadanos':'VOX-Ciudadanos',
                                                                       'Ciudadanos-VOX':'VOX-Ciudadanos',
                                                                       'PSOE-UP': 'UP-PSOE',
                                                                       'UP-PSOE': 'UP-PSOE',
                                                                       'VOX-UP': 'VOX-UP',
                                                                       'UP-VOX': 'VOX-UP',        
                                                                       'UP-Ciudadanos': 'UP-Ciudadanos',
                                                                       'Ciudadanos-UP': 'UP-Ciudadanos',
                                                                       'VOX-PP':'VOX-PP',
                                                                       'PP-VOX':'VOX-PP',
                                                                       'UP-PP':'UP-PP',
                                                                       'PP-UP':'UP-PP',
                                                                       'Ciudadanos-PSOE':'PSOE-Ciudadanos',
                                                                       'PSOE-Ciudadanos':'PSOE-Ciudadanos',
                                                                       'Ciudadanos-PP':'PP-Ciudadanos',
                                                                       'PP-Ciudadanos':'PP-Ciudadanos',
                                                                       'PSOE-VOX':'VOX-PSOE',
                                                                       'VOX-PSOE':'VOX-PSOE', 
                                                                       'PSOE-PP':'PSOE-PP',
                                                                       'PP-PSOE':'PSOE-PP'})  
    else:
        data = data[data.bot_political_party.isin(['UP','VOX','PSOE','PP','Ciudadanos'])]
        data['bot_political_party'] = data.bot_political_party.replace({'Ciudadanos':'CS'})
    
    return data

def zfil(text):
    """Left-pads a string with zeros until it is at least two characters wide.

    Keyword arguments:
    text -- string to be padded
    """
    padded = text.zfill(2)
    return padded


def calculate_appearances(data):
    """Generates an ordered dataframe by date, containing political party occurrences within each date
    
    Keyword arguments:
    data -- DataFrame with dates of creation of the first tweet of each bot and associated political party.
    """
    
    # create day and month columns from created_at
    data['day'] = data['created_at'].dt.day
    data['month'] = data['created_at'].dt.month
    #data.drop(['_id','created_at'],axis='columns',inplace=True)
    
    # calculate ocurrences by month, day and political party
    ocurrences = data.groupby(by=['month','day','bot_political_party']).size()
    ocurrences = ocurrences.reset_index()
    ocurrences.columns = ['month','day','political party','count']
    
    # force days to have two digits
    ocurrences['day'] = ocurrences['day'].astype(str)
    ocurrences['day'] = ocurrences['day'].apply(zfil)
    
    # sort rows by date
    ocurrences['orden'] = ocurrences['month'].astype(str)  + ocurrences['day'].astype(str)
    ocurrences['orden'] = ocurrences['orden'].astype(int)
    ocurrences.sort_values(['orden','political party'],ascending=[True,True])
    ocurrences['date'] = ocurrences['month'].astype(str) + "-" + ocurrences['day'].astype(str)
    ocurrences.drop(['month','day'],axis='columns',inplace=True)
    
    return ocurrences

In [None]:
# Colors used in the appearance graphs, keyed by the label found in the
# 'political party' column after preprocessing
party_colors = {
    # one_party_version
    'Ciudadanos':'#fa5000',
    'PP':'#0bb2ff',
    'PSOE':'#f41c14',
    'UP':'#6b2d64',
    'VOX':'#7cbd2a',
    # all
    'Unknown':'#3c4245',
    'Unclear':'#719192',
    'Two-party affinity':'#fda50f',
    'One-party affinity':'#3f0403',
    # two_parties_version (one entry per pair of parties)
    'VOX-Ciudadanos':'#bb8715',
    'UP-PSOE':'#b0243c',
    'VOX-UP':'#747547',
    'UP-Ciudadanos':'#b33f32',
    'VOX-PP':'#43b795',
    'UP-PP':'#3b6fb2',
    'PSOE-Ciudadanos':'#f7360a',
    'PP-Ciudadanos':'#828180',
    'VOX-PSOE':'#b86d1f',
    'PSOE-PP':'#80678a'
}


# Draw the three versions of the cumulative appearance of bots
for version in ['all','two_parties','one_party']:
    data = preprocess_dataframe(df.copy(), version)
    print(version,"; number of bots:",len(data),"; possible classes:",len(data['bot_political_party'].unique()))

    # first appearances per date and political party
    data = calculate_appearances(data)

    # x-axis labels (dates) and series labels (political parties), in order of appearance
    dates = list(data['date'].astype(str).unique())
    parties = list(data['political party'].unique())

    # one cumulative series per political party: a date without new bots of a
    # party contributes 0, so the series keeps the value of the previous date
    cumulative_counts = (data.pivot(index='date', columns='political party', values='count')
                             .reindex(index=dates, columns=parties)
                             .fillna(0)
                             .cumsum()
                             .astype(int))
    parties_ocurrences = [cumulative_counts[party].tolist() for party in parties]

    colors = [party_colors[party] for party in parties]

    # Draw graphics
    plt.figure(figsize=(12,3))
    plt.xticks(rotation='vertical')
    plt.stackplot(dates,parties_ocurrences,labels=parties,colors=colors,edgecolor='black')
    plt.title("Cumulative distribution of bots according to the first day of detection ("+version+")")
    plt.legend(loc='upper left')
    # honor the export format selected through VECTORIAL_FIGURES
    plt.savefig(GRAPHICS_DIR+version+"_cumulative_distributions_bots_first_detection"+FIG_EXTENSION,bbox_inches = "tight")
    plt.show()

## Bots daily activities

In [None]:
def load_bot_users(user_collection):
    """Retrieves the ObjectID and political party of every user labeled as a bot.

    Keyword arguments:
    user_collection -- MongoDB Users' Collection

    Returns a list with one dictionary ('_id', 'bot_political_party') per user
    that has the 'bot_political_party' field.
    """
    has_bot_label = {'bot_political_party': {'$exists': True}}
    wanted_fields = {'_id': 1, 'bot_political_party': 1}

    print("Query", end=" ")
    cursor = user_collection.find(has_bot_label, wanted_fields)
    print("OK; List", end=" ")
    users = list(cursor)
    print("OK; Total bot users:", len(users))
    return users

def load_tweets(tweet_collection):
    """Retrieves the ObjectID, creation date and user id of every tweet.

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection

    Returns a list with one dictionary ('_id', 'created_at', 'user_id') per tweet.
    """
    every_tweet = {}
    wanted_fields = {'_id': 1, 'created_at': 1, 'user_id': 1}

    print("Query", end=" ")
    cursor = tweet_collection.find(every_tweet, wanted_fields)
    print("OK; List", end=" ")
    found = list(cursor)
    print("OK; Total tweets:", len(found))
    return found

def reindex_by_date(df):
    """Reindexes a DataFrame over every day of the plotted period.

    The period goes from xlim[0] to xlim[1] (global variable with the limits
    of the x-axis); days missing from df are filled with 0.

    Keyword arguments:
    df -- DataFrame to be reindexed by date
    """
    first_day = xlim[0]
    last_day = xlim[1]
    every_day = pd.date_range(first_day, last_day, name='created_at')
    return df.reindex(every_day, fill_value=0)

def daterange(start_date, end_date):
    """Generates consecutive days, to be used in loops.

    Keyword arguments:
    start_date -- First date to be generated
    end_date -- Date at which the generation stops (it is not generated itself)
    """
    total_days = int((end_date - start_date).days)
    offset = 0
    while offset < total_days:
        yield start_date + timedelta(offset)
        offset += 1

# Official party colors ('Ciudadanos' and its acronym 'CS' share the same color)
party_colors = {
    'Ciudadanos':'#fa5000', 'CS':'#fa5000',
    'PP':'#0bb2ff',
    'PSOE':'#f41c14',
    'UP':'#6b2d64',
    'VOX':'#7cbd2a'
}

In [None]:
%%time
# Load the labeled bots and keep only those with affinity for a single party
bot_users = load_bot_users(db.users)
df_bot_users = pd.DataFrame(bot_users)
df_bot_users = df_bot_users[df_bot_users.bot_political_party.isin(['VOX','Ciudadanos','PP','PSOE','UP'])]
# positional renaming: assumes the columns arrive in the order ('_id', 'bot_political_party')
df_bot_users.columns = ['_id','political_party']
display(df_bot_users.head(2))

In [None]:
%%time
# Load every tweet (id, creation date and author) into a DataFrame
tweets = load_tweets(db.tweets)
# nested documents are collapsed into flat dictionaries before building the DataFrame
tweets = [flatten(t) for t in tweets]
df_tweets = pd.DataFrame(tweets)

In [None]:
%%time
df_bot_tweets = df_tweets[df_tweets['user_id'].isin(df_bot_users['_id'])]
df_bot_tweets = df_bot_tweets.join(df_bot_users.set_index('_id'), on='user_id') ## append political party to interaction
df_bot_tweets.dropna(axis='index',inplace=True)
df_bot_tweets.created_at = df_bot_tweets.created_at.astype("datetime64")
display(df_bot_tweets.head(2))

### Active bots per day

In [None]:
# day of every bot tweet
tweet_days = df_bot_tweets.created_at.dt.date

# limits of x-axis: first and last day with bot tweets
xlim = (tweet_days.min(), tweet_days.max())

# number of distinct bots that tweeted per date and political party
data = df_bot_tweets.assign(date=tweet_days)
data = data.groupby(['date','political_party'])['user_id'].nunique().reset_index()
data.columns = ['Date','Political party','Active bots']

In [None]:
# plot: one line per political party with the number of bots active on each day
plt.figure(figsize=(15,5))
ax = sns.lineplot(data=data, x='Date', y='Active bots', hue='Political party', hue_order=['UP','PSOE','Ciudadanos','PP','VOX'],palette=party_colors)
plt.legend(loc='upper left')
plt.title('Bot activity')
plt.xlim(left=xlim[0], right=xlim[1])
ax.set_ylabel("Number of active bots")
# one tick per day, rotated so that the labels do not overlap
ax.xaxis.set_major_locator(mdates.DayLocator())
ax.xaxis.set_tick_params(rotation=75)
#plt.savefig(GRAPHICS_DIR+"timeline/"+version+"-traffic-timeline-original.pdf", bbox_inches = "tight")
plt.tight_layout()
plt.show()

### Bots' activity windows

In [None]:
# first and last tweet of every bot (its activity window)
display(df_bot_tweets.head())
activity_span = df_bot_tweets.groupby(['user_id','political_party'])['created_at'].agg(['min','max'])
data2 = activity_span.reset_index()
data2.columns = ['user_id','political_party','min','max']
data2.info()

In [None]:
# keep only the calendar day of the first and last tweet (timestamp -> date)
for bound in ('min', 'max'):
    data2[bound] = data2[bound].dt.date
display(data2.head())

In [None]:
# get start date and end date from dataframe
start = min(data2['min'])
end = max(data2['max'])

# activity windows (first day, last day) of the bots of each political party;
# grouped once here instead of regrouping the dataframe for every day
party_windows = {party: list(zip(party_group['min'], party_group['max']))
                 for party, party_group in data2.groupby('political_party')}

# dictionary of dates; each date has another dictionary with the number of active bots of each political party
# in this case, user activity is the range from the date of the first interaction (min_date) to the last one (max_date)
# a bot is considered to be active in a specific day d if min_date <= d <= max_date
activity = {}
for d in daterange(start,end+timedelta(1)):
    activity[d] = {party: sum(1 for min_date, max_date in windows if min_date <= d <= max_date)
                   for party, windows in party_windows.items()}

In [None]:
# activity dictionary is converted into a DataFrame with active bots per date and political party
# NOTE(review): this rebinds the global name df, which earlier cells use for the bots' first appearances
df = pd.DataFrame.from_dict(data=activity,orient='index')
df.reset_index(inplace=True)
# wide (one column per party) to long format (one row per date and party)
df = df.melt(id_vars=['index'],value_vars=['Ciudadanos','PP','PSOE','UP','VOX'])
df.columns = ['Date','Political party','Active Social Bots']
df.head(2)

In [None]:
# plot
plt.figure(figsize=(15,5))
# horizontal offset of the bars of the first party; it grows by one bar width per party
# so that the five bars of a day are drawn side by side
w = -0.3

# bot daily activity in barplots
for party in ['UP','PSOE','Ciudadanos','PP','VOX']:
    plt.bar(date2num(data[data['Political party']==party]['Date'])+w,height=data[data['Political party']==party]['Active bots'],color=party_colors[party],width=0.15,align='center')
    w=0.15+w
    
# bot range activity in lineplots
ax = sns.lineplot(data=df, x='Date', y='Active Social Bots', hue='Political party', hue_order=['UP','PSOE','Ciudadanos','PP','VOX'], palette=party_colors)
plt.legend(loc='upper left')
plt.title('Bot activity')
plt.xlim(left=xlim[0], right=xlim[1])
ax.set_ylabel("Number of active bots")
# one tick per day, rotated so that the labels do not overlap
ax.xaxis.set_major_locator(mdates.DayLocator())
ax.xaxis.set_tick_params(rotation=75)
#plt.savefig(GRAPHICS_DIR+"timeline/"+version+"-traffic-timeline-original.pdf", bbox_inches = "tight")
plt.tight_layout()
plt.show()

### Bots daily activity with cumulative metric

In [None]:
# Number of distinct bots that tweeted per date and political party
df_activity_per_day = df_bot_tweets.assign(date=df_bot_tweets.created_at.dt.date)
df_activity_per_day = df_activity_per_day.groupby(['date','political_party'])['user_id'].nunique().reset_index()
df_activity_per_day.columns = ['Date','Political party','Active Social Bots']

In [None]:
# Cumulative activity per political party and date
df_cumulative = df_bot_tweets.copy()
# only the calendar day of each tweet is needed
df_cumulative['created_at']= df_cumulative['created_at'].dt.date
# first and last day with bot tweets
start = min(df_cumulative['created_at'])
end = max(df_cumulative['created_at'])
print(start,end)
# day of the first tweet of every bot
df_cumulative = df_cumulative.groupby(['user_id','political_party']).agg({'created_at':'min'})
df_cumulative = pd.DataFrame(df_cumulative.to_records())
df_cumulative.columns = ['user_id','political_party','min']

In [None]:
# number of bots of each political party whose first interaction happened on each date;
# computed once here instead of regrouping the dataframe for every day
new_bots_per_day = {party: party_group['min'].value_counts().to_dict()
                    for party, party_group in df_cumulative.groupby('political_party')}

# the dictionary cumulative is filled with active users
# in cumulative activity, the presence of a bot is considered from its first interaction (min_date)
# a bot is considered to be active in a specific day d if min_date <= d
cumulative = {}
running_total = dict.fromkeys(new_bots_per_day, 0)
for d in daterange(start,end+timedelta(1)):
    for party, new_bots in new_bots_per_day.items():
        running_total[party] += new_bots.get(d, 0)
    # snapshot of the totals reached at the end of day d
    cumulative[d] = dict(running_total)

In [None]:
# cumulative dictionary is transformed into a dataframe with the cumulative bots activity per date and political party
df2 = pd.DataFrame.from_dict(data=cumulative,orient='index')
df2.reset_index(inplace=True)
# wide (one column per party) to long format (one row per date and party)
df2 = df2.melt(id_vars=['index'],value_vars=['Ciudadanos','PP','PSOE','UP','VOX'])
df2.columns = ['Date','Political party','Cumulative Social Bots Activity']

In [None]:
# we replace the word 'Ciudadanos' with its acronym
# (assigned back to the column: replace(inplace=True) on a selected column is
# chained assignment and is not guaranteed to modify the dataframe)
df2['Political party'] = df2['Political party'].replace({'Ciudadanos':'CS'})
df_activity_per_day['Political party'] = df_activity_per_day['Political party'].replace({'Ciudadanos':'CS'})

# plot bots activity per day in barplots
plt.figure(figsize=(15,5))
# horizontal offset of the bars of the first party; it grows by one bar width
# per party so that the five bars of a day are drawn side by side
w = -0.3
for party in ['UP','PSOE','CS','PP','VOX']:
    party_activity = df_activity_per_day[df_activity_per_day['Political party']==party]
    plt.bar(date2num(party_activity['Date'])+w,height=party_activity['Active Social Bots'],color=party_colors[party],width=0.15,align='center')
    w=0.15+w

# plot bots accumulative activity per day in lineplots
ax = sns.lineplot(data=df2, x='Date', y='Cumulative Social Bots Activity', hue='Political party', hue_order=['UP','PSOE','CS','PP','VOX'], palette=party_colors)
plt.legend(loc='upper left')
# title_font is not defined in the notebook, so the default title style is used
# (as in the other 'Bot activity' figures)
plt.title('Bot activity')
plt.xlim(left=start-timedelta(1), right=end+timedelta(1))
ax.set_ylabel("Number of active bots")
# one tick per day, rotated so that the labels do not overlap
ax.xaxis.set_major_locator(mdates.DayLocator())
ax.xaxis.set_tick_params(rotation=75)
plt.tight_layout()
# honor the export format selected through VECTORIAL_FIGURES
plt.savefig(GRAPHICS_DIR+"activity-timeline"+FIG_EXTENSION, bbox_inches = "tight")
plt.show()

## Generated traffic

In [None]:
def get_bot_tweets(user_collection):
    """Retrieves the interactions (tweets) of the bots.

    Every returned dictionary holds the user id, the user's political party,
    the tweet type, the date of creation, the retweet count and the favourite
    count of one interaction.

    Keyword arguments:
    user_collection -- MongoDB Users' Collection
    """
    # minimum 'scores.scores.universal' value for a user to be treated as a bot
    # (presumably the 95th percentile of the score, given the original name p95)
    bot_score_threshold = 0.6908019160064479

    # keep only the users whose score reaches the threshold
    select_bots = {
        '$match': {'scores.scores.universal': {'$gte': bot_score_threshold}}
    }
    # attach to every user the tweets whose user_id matches the user's _id
    attach_tweets = {
        '$lookup': {
            'from': 'tweets',
            'localField': '_id',
            'foreignField': 'user_id',
            'as': 'tweets'
        }
    }
    # one document per (user, tweet) pair
    one_per_tweet = {'$unwind': {'path': '$tweets'}}
    # keep only the fields needed for the traffic analysis
    keep_fields = {
        '$project': {
            '_id': 0,
            'user_id': '$_id',
            'bot_political_party': True,
            'tweet_type': '$tweets.tweet_type',
            'date': '$tweets.created_at',
            'retweet_count': '$tweets.retweet_count',
            'favorite_count': '$tweets.favorite_count'
        }
    }
    stages = [select_bots, attach_tweets, one_per_tweet, keep_fields]

    print("Processing MONGO request",end="; ")
    cursor = user_collection.aggregate(stages)
    print("OK",end="; ")
    interactions = list(cursor)
    print("Length",len(interactions),end=";")
    return interactions

def reindex_by_date(df):
    """Reindexes a DataFrame over every day of the plotted period.

    The period goes from xlim[0] to xlim[1] (global variable with the limits
    of the x-axis); days missing from df are filled with 0.

    Keyword arguments:
    df -- DataFrame to be reindexed by date
    """
    first_day = xlim[0]
    last_day = xlim[1]
    every_day = pd.date_range(first_day, last_day, name='created_at')
    return df.reindex(every_day, fill_value=0)

In [None]:
# Fetch the interactions of the users above the bot-score threshold (one dictionary per tweet)
bot_tweets = get_bot_tweets(db.users)

In [None]:
df = pd.DataFrame(bot_tweets)
# pd.to_datetime is used instead of astype("datetime64"): casting to a
# unit-less datetime64 dtype is rejected by pandas >= 2.0
df['date'] = pd.to_datetime(df['date'])
# bots without a party label are grouped under 'Unknown'; the result is
# assigned back because fillna(inplace=True) on an attribute-accessed column
# is chained assignment and is not guaranteed to modify df
df['bot_political_party'] = df['bot_political_party'].fillna('Unknown')
display(df.head(6))
df.info()

In [None]:
# Get limits of x axis: first and last day with bot interactions
# (xlim is also read by reindex_by_date)
xlim = (df.date.dt.date.min(), df.date.dt.date.max())

### Bot's traffic volumes per political party

Draw three charts:
1. all bots,
2. bots with two political parties,
3. bots with single political party (paper version)

In [None]:
# Colors used in the traffic graphs, keyed by the party label obtained after preprocessing
party_colors = {
    # one_party_version ('Ciudadanos' appears under its acronym 'CS')
    'CS':'#fa5000',
    'PP':'#0bb2ff',
    'PSOE':'#f41c14',
    'UP':'#6b2d64',
    'VOX':'#7cbd2a',
    # all
    'Unknown':'#3c4245',
    'Unclear':'#719192',
    'Two-party affinity':'#fda50f',
    'One-party affinity':'#3f0403',
    # two_parties_version (one entry per pair of parties)
    'VOX-Ciudadanos':'#bb8715',
    'UP-PSOE':'#b0243c',
    'VOX-UP':'#747547',
    'UP-Ciudadanos':'#b33f32',
    'VOX-PP':'#43b795',
    'UP-PP':'#3b6fb2',
    'PSOE-Ciudadanos':'#f7360a',
    'PP-Ciudadanos':'#828180',
    'VOX-PSOE':'#b86d1f',
    'PSOE-PP':'#80678a'
}

In [None]:
# Plot three versions of generated traffic
#     all: no filtering, political parties are classified as 'Unknown', 'Unclear', 'Two-party affinity' or 'One-party affinity'
#     two_parties: filters only bots with two-party affinity, grouping partyA-partyB and partyB-partyA in the same group
#     one_party: filters only bots with one-party affinity
# For each version three charts are drawn: number of originals/replies/quotes,
# retweets received by those interactions and number of retweets made by the bots.
# The figures are displayed but not written to disk.


def _daily_traffic(party_group, tweet_types):
    """Returns, per day, the number of interactions of the given types and the sum of their retweet counts."""
    selected = party_group[party_group.tweet_type.isin(tweet_types)]
    daily = selected.groupby(selected.date.dt.date).agg({'bot_political_party':'count', 'retweet_count':'sum'})
    # days without interactions are added with value 0
    return daily.apply(reindex_by_date)


def _plot_traffic(data, version, tweet_types, column, title, fill=False):
    """Draws one line per political party with the daily values of column for the given tweet types."""
    plt.figure(figsize=(15,3))
    ax = None
    # grouping by the column name (not by a one-element list) keeps the group
    # key a plain label, which is what party_colors is indexed by
    for party, party_group in data.groupby('bot_political_party'):
        daily = _daily_traffic(party_group, tweet_types)
        ax = sns.lineplot(data=daily, x=daily.index, y=daily[column], label=party, color=party_colors[party])
        if fill:
            ax.fill_between(daily.index, 0, daily[column], color=party_colors[party], alpha=0.2)

    if version == 'one_party':
        plt.legend(loc='upper left', title="Political party")
    else:
        # many classes: the legend is placed outside the plotting area
        plt.legend(bbox_to_anchor=(1,1), title="Party")

    plt.title(version+" "+title)
    plt.xlim(left=xlim[0], right=xlim[1])
    plt.tight_layout()
    if ax is not None:
        # one tick per day, rotated so that the labels do not overlap
        ax.xaxis.set_major_locator(mdates.DayLocator())
        ax.xaxis.set_tick_params(rotation=75)
    plt.show()


for version in ['all','two_parties','one_party']:
    data = preprocess_dataframe(df.copy(), version)

    _plot_traffic(data, version, ['original','reply','quote'], 'bot_political_party', "original, reply, quote")
    _plot_traffic(data, version, ['original','reply','quote'], 'retweet_count', "associated retweets", fill=True)
    _plot_traffic(data, version, ['retweet'], 'bot_political_party', "total retweets")

In [None]:
### Pretty representation for PAPER
# Only bots with one-party affinity are drawn ('Ciudadanos' is shown as 'CS')
data = df.copy()
data = preprocess_dataframe(data,"one_party")

# Figure 1: daily number of originals, replies and quotes per political party
plt.figure(figsize=(15,3))

for party in ['UP','PSOE','CS','PP','VOX']:
    grp = data[data.bot_political_party==party].copy()
    grp = grp[grp.tweet_type.isin(['original','reply','quote'])]
    grp.set_index(grp.date, drop=True, inplace=True)
    # per-day count of interactions and sum of their retweets; days without data are filled with 0
    k1 = grp.groupby(grp.date.dt.date).agg({'bot_political_party':'count', 'retweet_count':'sum'}).apply(reindex_by_date)
    ax1 = sns.lineplot(data=k1, x=k1.index, y=k1.bot_political_party, label=party, color=party_colors[party])

plt.legend(loc='upper left', title="Political party")
plt.title('Originals, quotes and replies')
plt.ylabel("Number of interactions")
plt.xlabel("Date")
plt.xlim(left=xlim[0], right=xlim[1])
plt.tight_layout()
ax1.xaxis.set_major_locator(mdates.DayLocator())
ax1.xaxis.set_tick_params(rotation=75)
plt.savefig(GRAPHICS_DIR+"traffic-timeline-1.pdf", bbox_inches = "tight")
plt.show()

# Figure 2: daily sum of the retweet counts of the originals, replies and quotes
plt.figure(figsize=(15,3))
for party in ['UP','PSOE','CS','PP','VOX']:
    grp = data[data.bot_political_party==party].copy()
    grp = grp[grp.tweet_type.isin(['original','reply','quote'])]
    grp.set_index(grp.date, drop=True, inplace=True)
    k1 = grp.groupby(grp.date.dt.date).agg({'bot_political_party':'count', 'retweet_count':'sum'}).apply(reindex_by_date)
    ax2 = sns.lineplot(data=k1, x=k1.index, y=k1.retweet_count, label=party, color=party_colors[party])
    # shade the area under each line
    ax2.fill_between(k1.index, 0, k1.retweet_count, color=party_colors[party], alpha=0.2)

plt.legend(loc='upper left', title="Political party")
#plt.legend().set_visible(False)
plt.title('Originals, quotes and replies\' retweets')
plt.ylabel("Number of associated retweets")
plt.xlim(left=xlim[0], right=xlim[1])
plt.xlabel("Date")
plt.tight_layout()
ax2.xaxis.set_major_locator(mdates.DayLocator())
ax2.xaxis.set_tick_params(rotation=75)
plt.savefig(GRAPHICS_DIR+"traffic-timeline-2.pdf", bbox_inches = "tight")
plt.show()

# Figure 3: daily number of retweets made by the bots (displayed only, not saved)
plt.figure(figsize=(15,3))
for party in ['UP','PSOE','CS','PP','VOX']:
    grp2 = data[data.bot_political_party==party].copy()
    grp2 = grp2[grp2.tweet_type=='retweet']
    grp2.set_index(grp2.date, drop=True, inplace=True)
    k2 = grp2.groupby(grp2.date.dt.date).agg({'bot_political_party':'count'}).apply(reindex_by_date)
    ax3 = sns.lineplot(data=k2, x=k2.index, y=k2.bot_political_party, label=party, color=party_colors[party])

plt.legend(loc='upper left', title="Political party")
#plt.legend().set_visible(False)
plt.title("Total retweets")
plt.xlim(left=xlim[0], right=xlim[1])
plt.ylabel("Number of retweets")
plt.xlabel("Date")
plt.tight_layout()
ax3.xaxis.set_major_locator(mdates.DayLocator())
ax3.xaxis.set_tick_params(rotation=75)
#plt.savefig(GRAPHICS_DIR+"traffic-timeline-3.pdf", bbox_inches = "tight")
plt.show()

## Sentiment analysis towards party themes

In [None]:
def load_labeled_users(user_collection):
    """Fetches the ObjectID and political party of every user that has a 'political_party' field
    (the manually labeled users) and returns them as a list of documents.
    
    Progress messages are printed before the query, before the cursor is materialised and at the end.
    
    Keyword arguments:
    user_collection -- MongoDB Users' Collection
    """
    has_party = {'political_party':{'$exists':True}}
    wanted_fields = {'_id':1,'political_party':1}

    print("Query", end=" ")
    cursor = user_collection.find(has_party, wanted_fields)
    print("OK; List", end=" ")
    documents = list(cursor)
    print("OK; Total labeled users:", len(documents))
    return documents

def load_bot_users(user_collection):
    """Extracts the ObjectID and political party of bots
    
    A user is selected when its document has a 'bot_political_party' field; no score
    threshold is applied here. Progress messages are printed along the way.
    
    Keyword arguments:
    user_collection -- MongoDB Users' Collection
    
    Returns a list of documents with the '_id' and 'bot_political_party' fields.
    """
    print("Query", end=" ")
    bot_users = user_collection.find({'bot_political_party':{'$exists':True}},
                                  {'_id':1,'bot_political_party':1})
    print("OK; List", end=" ")
    bot_users = list(bot_users)
    print("OK; Total bot users:", len(bot_users))
    return bot_users

def load_tweets(tweet_collection):
    """Fetches every tweet document, keeping only the ObjectID, tweet type, keyword summary
    (bag-of-words), sentiment score and user id, and returns them as a list.
    
    Progress messages are printed before the query, before the cursor is materialised and at the end.
    
    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection
    """
    every_tweet = {}
    wanted_fields = {'_id':1,'tweet_type':1,'keywords_summary':1,'sentiment_score':1,'user_id':1}

    print("Query", end=" ")
    cursor = tweet_collection.find(every_tweet, wanted_fields)
    print("OK; List", end=" ")
    documents = list(cursor)
    print("OK; Total tweets:", len(documents))
    return documents

In [None]:
%%time
# Pull the labeled users from MongoDB, tabulate them and preview the first two rows
labeled_users = load_labeled_users(user_collection=db.users)
df_labeled_users = pd.DataFrame(data=labeled_users)
display(df_labeled_users.iloc[:2])

In [None]:
%%time
# Pull the bot users from MongoDB and tabulate them; naming the columns explicitly keeps the
# frame well-formed (and the filter below working) even when the query returns no documents
bot_users = load_bot_users(db.users)
df_bot_users = pd.DataFrame(bot_users, columns=['_id','bot_political_party'])
# keep only bots assigned to one of the five parties under study
df_bot_users = df_bot_users[df_bot_users.bot_political_party.isin(['VOX','Ciudadanos','PP','PSOE','UP'])]
# rename by label (not by position) so the party column shares its name with the labeled-users frame
df_bot_users = df_bot_users.rename(columns={'bot_political_party':'political_party'})
display(df_bot_users.head(2))

In [None]:
%%time
# Load every tweet, flatten each document with flatten() and tabulate the result
tweets = list(map(flatten, load_tweets(db.tweets)))
df_tweets = pd.DataFrame(data=tweets)

In [None]:
%%time
# Keep the tweets whose author is a manually labeled user and attach the author's
# political party by looking up 'user_id' in the labeled-users table
labeled_author_mask = df_tweets['user_id'].isin(df_labeled_users['_id'])
labeled_party_by_user = df_labeled_users.set_index('_id')
df_labeled_tweets = df_tweets[labeled_author_mask].join(labeled_party_by_user, on='user_id')
# discard every row that has a missing value in any column
df_labeled_tweets = df_labeled_tweets.dropna(axis='index')
display(df_labeled_tweets.iloc[:2])

In [None]:
%%time
# Keep the tweets whose author is a bot and attach the bot's political party
# by looking up 'user_id' in the bot-users table
bot_author_mask = df_tweets['user_id'].isin(df_bot_users['_id'])
bot_party_by_user = df_bot_users.set_index('_id')
df_bot_tweets = df_tweets[bot_author_mask].join(bot_party_by_user, on='user_id')
# discard every row that has a missing value in any column
df_bot_tweets = df_bot_tweets.dropna(axis='index')
display(df_bot_tweets.iloc[:2])

In [None]:
# Parties under study and the colour used for each of them in the sentiment plots
parties = ['VOX','PP','Ciudadanos','PSOE','UP']
colors ={'VOX':'#7cbd2a',
        'PP':'#0bb2ff',
        'Ciudadanos':'#fa5000',
        'PSOE':'#f41c14',
        'UP':'#6b2d64'}

def filter_single_party_interactions(df):
    '''
    Filters the DataFrame to maintain only tweets dealing with a single political party. The mentioned party is inserted in a new column 'passive_political_party'
    
    A tweet deals with party P when 'keywords_summary_P' is True and the summary columns of
    all the other parties are False. The DataFrame is modified in place: the new column is
    added, every row containing a missing value in any column is dropped (which removes the
    tweets that match no single party) and the new column is converted to a categorical.
    
    Keyword arguments:
    df -- DataFrame of tweets containing at least the ObjectID (_id) and keyword summaries of parties (keywords_summary_P, where P in parties)
    
    Returns the same (mutated) DataFrame object.
    '''
    print("Dataframe with",len(df),"records",end="; ")

    # Positional boolean arrays, so the labelling does not depend on the index being unique
    mentioned = {p: df['keywords_summary_'+p].eq(True).to_numpy(dtype=bool) for p in parties}
    not_mentioned = {p: df['keywords_summary_'+p].eq(False).to_numpy(dtype=bool) for p in parties}

    # None marks the rows that match no single party; dropna() removes them below
    labels = np.full(len(df), None, dtype=object)
    for p in parties:
        only_p = mentioned[p].copy()
        for p2 in parties:
            if p2 != p:
                only_p &= not_mentioned[p2]
        labels[only_p] = p

    # Always create the column, even when nothing matched, so the steps below cannot fail
    df['passive_political_party'] = labels
    df.dropna(axis='index',inplace=True)
    print("Filtered dataframe with",len(df),"records")
    df['passive_political_party'] = df.passive_political_party.astype('category')
    return df

In [None]:
# manually labeled sentiments
# NOTE: the filter adds the column and drops rows on df_labeled_tweets itself and returns
# that same object, so `data` and `df_labeled_tweets` refer to one DataFrame from here on
data = filter_single_party_interactions(df_labeled_tweets)

In [None]:
# bots sentiments
# NOTE: the filter adds the column and drops rows on df_bot_tweets itself and returns
# that same object, so `data2` and `df_bot_tweets` refer to one DataFrame from here on
data2 = filter_single_party_interactions(df_bot_tweets)

### Sentiment score per political party

In [None]:
# Make a stripplot with originals, replies and quotes per political party and party theme for:
#  - manually labeled users
#  - bots
#
# In X-axis, the political party of users
# In Y-axis, the sentiment score of each tweet
# The color indicates which party the tweet is about
stripplot_sources = {'Manually labeled': data, 'Bot': data2}
stripplot_party_order = ['UP','PSOE','Ciudadanos','PP','VOX']

for d, paint in stripplot_sources.items():
    plt.figure(figsize=(15,4))
    sns.stripplot(x="political_party", 
                  y="sentiment_score", 
                  hue="passive_political_party", 
                  order=stripplot_party_order,
                  hue_order=stripplot_party_order,
                  palette=colors, 
                  data=paint[paint.tweet_type.isin(['original','reply','quote'])],
                  jitter=0.25,
                  dodge=True)
    
    plt.legend(bbox_to_anchor=(1,1), title="Party theme")
    plt.title(d+' users: generated originals, replies and quotes')
    plt.xlabel(d+" political party")
    plt.ylabel("Sentiment score")
    # honour the notebook-wide VECTORIAL_FIGURES switch instead of hard-coding ".pdf"
    plt.savefig(GRAPHICS_DIR+d+"-sentiment-distribution"+FIG_EXTENSION, bbox_inches = "tight")
    plt.show()

### Sentiment score per political party (boxplot)

In [None]:
# Make boxplots of original, reply and quote sentiments per political party and party theme for:
#  - manually labeled users (top panel)
#  - bots (bottom panel)
#
# In X-axis, the political party of users
# In Y-axis, the sentiment score of each tweet
# The color indicates which party the tweet is about

fig, axs = plt.subplots(nrows=2, ncols=1,figsize=(15,8))

boxplot_party_order = ['UP','PSOE','CS','PP','VOX']
short_names = {'Ciudadanos':'CS'}
# `colors` is keyed by 'Ciudadanos'; the palette must be keyed by the labels actually plotted
boxplot_palette = {short_names.get(party, party): colour for party, colour in colors.items()}

# (title prefix, dataset, X-axis label, title padding) for each panel, top to bottom
panels = [
    ('Manually labeled', data, '', 30),
    ('Social Bots', data2, 'User\'s political party', 8),
]

for ax, (d, paint, xlb, pad) in zip(axs, panels):
    # Assign the renamed columns back: a chained `paint.col.replace(..., inplace=True)` does not
    # update the DataFrame when pandas Copy-on-Write is active
    paint['political_party'] = paint['political_party'].replace(short_names)
    # 'passive_political_party' is categorical, so rename the category itself;
    # labels missing from the categories (e.g. on a re-run) are ignored
    paint['passive_political_party'] = paint['passive_political_party'].cat.rename_categories(short_names)

    sns.boxplot(x="political_party", 
                  y="sentiment_score", 
                  hue="passive_political_party", 
                  order=boxplot_party_order,
                  hue_order=boxplot_party_order,
                  palette=boxplot_palette, 
                  data=paint[paint.tweet_type.isin(['original','reply','quote'])],
                  showfliers=False,
                  dodge=True,
                  ax=ax)
    
    ax.set_xlabel(xlb)
    ax.set_ylabel("Sentiment score")
    ax.set_title(d+' users',loc='center', pad=pad, fontweight='bold')

fig.tight_layout(pad=2.0)
# a single legend, placed above the top panel, serves both panels
axs[1].get_legend().remove()
axs[0].legend(loc='upper right',bbox_to_anchor=(1,1.25), title="Party theme", fancybox=True, ncol=5)
#plt.savefig(GRAPHICS_DIR+"sentiment-boxplots.pdf", bbox_inches = "tight")
#plt.tight_layout()
plt.show()