# Phase 3. Statistical Information

## Contents
- [Configuration](#Configuration)
  - [Imports](#Imports)
  - [Variables](#Variables)
  - [Support functions](#Support-functions)
- [Botscore distribution](#Botscore-distribution)
- [Daily total traffic](#Daily-total-traffic)
- [Tweet type distributions per botscore authorship](#Tweet-type-distributions-per-botscore-authorship)
- [Number of users per number of interactions](#Number-of-users-per-number-of-interactions)
- [Interactions between humans and bots](#Interactions-between-humans-and-bots)

## Configuration

### Imports

In [None]:
from IPython.display import display
from fastprogress import master_bar, progress_bar
import os
import pickle

# Graphic utilities
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=0.9)
sns.set_style("whitegrid")
sns.set_style({'font.family':'monospace'})
from pylab import *

# Dataframes
import pandas as pd
from collections.abc import MutableMapping
import numpy as np

# MongoDB functionality
from pymongo.errors import BulkWriteError
from pymongo import MongoClient
from bson import ObjectId


### Variables

In [None]:
# If true exports vectorial PDFs instead of JPG.
VECTORIAL_FIGURES = True
FIG_EXTENSION = ".pdf" if VECTORIAL_FIGURES else ".jpg"

# Root folder of the project (placeholder: replace it with an absolute path)
ROOT_DIR = "ABOSLUTE_PATH_TO_ROOT_FOLDER"
# Directories where CSV data and exported figures are stored.
# os.path.join(..., "") always ends the path with exactly one separator, so the
# later `GRAPHICS_DIR + "file-name"` concatenations are valid whether or not
# ROOT_DIR itself ends with a separator.
DATA_DIR = os.path.join(ROOT_DIR, "data", "")
GRAPHICS_DIR = os.path.join(ROOT_DIR, "graphics", "")

# Change path to root
os.chdir(ROOT_DIR)

# Create the graphics folder if needed; an already existing directory is fine
os.makedirs(GRAPHICS_DIR, exist_ok=True)

# MongoDB parameters (placeholders: replace IP_ADDRESS, PORT and DATABASE_NAME)
mongoclient = MongoClient('IP_ADDRESS', PORT)
db = mongoclient.DATABASE_NAME
# It will automatically create the tweets' and users' collections.

### Support functions

In [None]:
def make_objid(text):
    """Builds an ObjectId from a value, left-padding it with zeros

    Keyword arguments:
    text -- value to convert; it is stringified and padded to 24 characters

    Returns None when the value is blank or when the ObjectId cannot be
    built (in that case the value and the error are printed).
    """
    raw = str(text)
    if raw.strip() == "":
        return None

    padded = raw.rjust(24, "0")
    try:
        object_id = ObjectId(padded)
    except Exception as ex:
        print(raw, ex)
        return None
    return object_id

def flatten(d, parent_key='', sep='_'):
    """Collapses a nested mapping into a single-level dictionary

    Keyword arguments:
    d -- dictionary whose values may themselves be mappings
    parent_key -- prefix prepended to every key (empty at the top level)
    sep -- separator placed between the prefix and each key

    Returns a new dictionary; nested keys are joined with `sep`
    (e.g. {'a': {'b': 1}} becomes {'a_b': 1}).
    """
    flat = {}
    for key, value in d.items():
        path = key if not parent_key else parent_key + sep + key
        if isinstance(value, MutableMapping):
            # nested mapping: merge its flattened entries under the current path
            flat.update(flatten(value, path, sep=sep))
        else:
            flat[path] = value
    return flat

# color palette: hex codes that the plots pick by position
colors = [
    "#C44E52",  # red
    "#55A868",  # green
    "#FFC400",  # yellow
    "#4C72B0",  # blue
    "#DD8452",  # orange
    "#8172B3",  # purple
    "#64B5CD",  # cyan
    "#937860",  # brown
    "#8C8C8C",  # gray
]

## Botscore distribution

In [None]:
%%time
def load_users(user_collection):
    """Fetches the id and universal botscore of every scored user

    Keyword arguments:
    user_collection -- MongoDB Users' Collection

    Returns the list of documents produced by the aggregation: users that
    have a `scores` field, projected to `_id` and `scores.scores.universal`.
    """
    only_scored = {'$match': {'scores': {'$exists': True}}}
    id_and_score = {'$project': {'_id': True, 'scores.scores.universal': True}}

    print("Query", end=" ")
    cursor = user_collection.aggregate([only_scored, id_and_score], allowDiskUse=True)
    print("OK; List", end=" ")
    # the cursor is consumed entirely into memory
    found = list(cursor)
    print("OK; Total users:", len(found))
    return found

In [None]:
%%time
# query the scored users and flatten every nested document into a flat record
users = load_users(db.users)
users = list(map(flatten, users))
df_users = pd.DataFrame(users)
df_users.columns = ['uid', 'universal']

# botscore thresholds: 75th and 95th percentiles of the universal score
p75, p95 = np.percentile(df_users.universal, [75, 95])
print("Limits:", p75, p95)

In [None]:
# accounts at or below the 75th percentile / at or above the 95th percentile
is_human = df_users.universal <= p75
is_bot = df_users.universal >= p95
print('Humans:', int(is_human.sum()))
print('Bots:', int(is_bot.sum()))

In [None]:
# discretize the botscore by keeping two decimals
df_users['universal'] = df_users['universal'].round(2)

In [None]:
# plot botscore distribution: histogram, cumulative histogram and boxen plot

fig, axs = plt.subplots(3, 1, figsize=(5,10))
score_label = 'User\'s Universal Score'

# subfigure 1: number of accounts per score bin
g = sns.distplot(df_users.universal,
                 bins=50,
                 hist=True,
                 kde=False,
                 ax=axs[0],
                 color=colors[3])
g.set_ylabel(fontsize="small", ylabel='Number of accounts')
g.set_xlabel(fontsize="small", xlabel=score_label)
g.title.set_text("All users")

# subfigure 2: cumulative number of accounts per score bin
h = sns.distplot(df_users.universal,
                 bins=50,
                 hist=True,
                 kde=False,
                 ax=axs[1],
                 hist_kws={'cumulative': True},
                 color=colors[3])
h.set_ylabel(fontsize="small", ylabel='Cumulative number of accounts')
h.set_xlabel(fontsize="small", xlabel=score_label)

# subfigure 3: boxen plot of the scores
# The scores are passed as `x=`: newer seaborn releases read the first
# positional argument as `data`, which would no longer place the scores on the
# X-axis, where the labels and the percentile lines below expect them.
i = sns.boxenplot(x=df_users.universal,
                  ax=axs[2],
                  color=colors[3])
i.set_ylabel(fontsize="small", ylabel='Score distribution')
i.set_xlabel(fontsize="small", xlabel=score_label)


# vertical lines for percentiles, spanning the current Y-range of each subfigure
g_ylims = g.get_ylim()
h_ylims = h.get_ylim()
i_ylims = i.get_ylim()
prs = [50,75,90,95,99]
ps = np.percentile(a=df_users.universal, q=prs)
for pr,p in zip(prs,ps):
    # only the lines of the first subfigure carry a label for the legend
    label = str(pr) + "th Perc."
    g.vlines(x=p, ymin=g_ylims[0], ymax=g_ylims[1], linewidth = 2, label=label, linestyles='dotted')
    h.vlines(x=p, ymin=h_ylims[0], ymax=h_ylims[1], linewidth = 2, linestyles='dotted')
    i.vlines(x=p, ymin=i_ylims[0], ymax=i_ylims[1], linewidth = 2, linestyles='dotted')

# common legend
g.legend(loc="upper right")
h.legend(loc="lower right")

# figure
fig.suptitle("Botscore distribution", y=1.01)
plt.tight_layout()
plt.savefig(GRAPHICS_DIR + "Botscores-distribution" + FIG_EXTENSION, bbox_inches = "tight")
plt.show()

## Daily total traffic

In [None]:
def load_tweets(tweet_collection):
    """Fetches the id and creation timestamp of every tweet

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection

    Returns the list of documents, projected to `_id` and `created_at`.
    """
    no_filter = {}
    wanted_fields = {'_id':1,'created_at':1}

    print("Query", end=" ")
    cursor = tweet_collection.find(no_filter, wanted_fields)
    print("OK; List", end=" ")
    # the cursor is consumed entirely into memory
    found = list(cursor)
    print("OK; Total tweets:", len(found))
    return found

In [None]:
# run the query defined in the previous cell against the tweets' collection
tweets = load_tweets(db.tweets)

In [None]:
df_tweets = pd.DataFrame(tweets)
# calendar date of each tweet, taken from its timestamp (created_at)
df_tweets['date'] = df_tweets['created_at'].dt.date
df_tweets.head(2)

In [None]:
# first and last day of the collecting period
start = min(df_tweets['date'])
end = max(df_tweets['date'])

# count of interactions per day: one row per date, the count stored in `_id`
df_dates = df_tweets.groupby('date').agg({'_id':'count'}).reset_index()

In [None]:
# plot traffic per day, blue color according to traffic volume
pal = sns.color_palette("Blues_d", len(df_dates))
# rank the days by volume (position 0 = busiest day) and go back to chronological
# order: the index of each row is then the palette position of that day
df_dates2 = (df_dates.sort_values(by='_id',axis='index',ascending=False)
                     .reset_index()
                     .sort_values(by='date',axis='index'))
pal2 = [pal[rank] for rank in df_dates2.index]

plt.figure(figsize=(15,5))
ax = sns.barplot(x=df_dates['date'], y=df_dates['_id'], palette=pal2)
ax.xaxis.set_tick_params(rotation=75)
plt.tight_layout()
ax.set_xlabel('Date')
ax.set_ylabel('Number of interactions')
ax.set_title("Total traffic during collecting period")
plt.savefig(GRAPHICS_DIR + "total-traffic-timeline" + FIG_EXTENSION,bbox_inches = "tight")
plt.show()

## Tweet type distributions per botscore authorship

In [None]:
%%time
def load_tweets(tweet_collection):
    """
    Fetches the tweet type and the author's universal botscore of every tweet

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection

    Returns the list of documents produced by the aggregation, each with
    `tweet_type` and `user_bot_score`; tweets whose author is not found in
    `users` or has no `scores` field are left out.
    """
    # attach the author's document (users._id == tweets.user_id) as `usr`
    join_author = {
        '$lookup': {
            'from': 'users',
            'localField': 'user_id',
            'foreignField': '_id',
            'as': 'usr'
        }
    }
    # one document per matched author; tweets without a match are dropped
    single_author = {
        '$unwind': {
            'path': '$usr',
            'preserveNullAndEmptyArrays': False
        }
    }
    scored_author = {'$match': {'usr.scores': {'$exists': True}}}
    type_and_score = {
        '$project': {
            '_id': False,
            'tweet_type': True,
            'user_bot_score': '$usr.scores.scores.universal',
        }
    }
    stages = [join_author, single_author, scored_author, type_and_score]

    print("Query", end=" ")
    cursor = tweet_collection.aggregate(stages, allowDiskUse=True)
    print("OK; List", end=" ")
    found = list(cursor)
    print("OK; Total records:", len(found))
    return found

In [None]:
%%time
# run the aggregation defined in the previous cell against the tweets' collection
tweets = load_tweets(db.tweets)

In [None]:
print("DF:",len(tweets))
# flatten every record before building the dataframe
tweets = list(map(flatten, tweets))
df_tweets = pd.DataFrame(tweets)
display(df_tweets.head(2))

In [None]:
# we round the botscore to discretize (for the graphic)
df_tweets['user_score_bin'] = np.around(df_tweets.user_bot_score,2)

# plot tweet type distributions
# each column is a tweet type, each row a type of graphic

# Grouping by a single (non-list) key keeps the group names plain strings; with
# a one-element list, recent pandas versions yield 1-tuples and
# `tweet_type.capitalize()` would fail.
grps = df_tweets.groupby(by='tweet_type')

# One column per actual group (missing tweet types do not form a group);
# squeeze=False keeps `axs` two-dimensional even with a single tweet type.
fig, axs = plt.subplots(3, grps.ngroups, figsize=(15,10), squeeze=False)

# percentiles of the users' botscores: identical for every column, computed once
prs = [50,75,90,95,99]
ps = np.percentile(df_users.universal, q=prs)
score_label = 'User\'s Universal Score'

# each iteration creates a column regarding a tweet type
for ax_idx, (tweet_type, grp) in enumerate(grps):
    # cycle through the palette if there are more tweet types than colors
    color = colors[ax_idx % len(colors)]

    # first plot of the tweet type column: interactions per score bin
    g = sns.distplot(grp.user_score_bin,
                     bins=50,
                     hist=True,
                     kde=False,
                     ax=axs[0][ax_idx],
                     label=tweet_type.capitalize(),
                     color=color)
    g.set_ylabel(fontsize="small", ylabel='Number of interactions')
    g.set_xlabel(fontsize="small", xlabel=score_label)
    g.title.set_text("Type:" + tweet_type)

    # second plot of the tweet type column: cumulative interactions
    h = sns.distplot(grp.user_score_bin,
                     bins=50,
                     hist=True,
                     kde=False,
                     ax=axs[1][ax_idx],
                     label=tweet_type.capitalize(),
                     hist_kws={'cumulative': True},
                     color=color)
    h.set_ylabel(fontsize="small", ylabel='Cumulative number of interactions')
    h.set_xlabel(fontsize="small", xlabel=score_label)

    # third plot of the tweet type column: boxen plot
    # `x=` keeps the scores on the X-axis; newer seaborn releases read the
    # first positional argument as `data`.
    i = sns.boxenplot(x=grp.user_score_bin,
                      ax=axs[2][ax_idx],
                      color=color)
    i.set_ylabel(fontsize="small", ylabel='Interactions distribution')
    i.set_xlabel(fontsize="small", xlabel=score_label)

    # vertical lines for percentiles, spanning the current Y-range of each plot
    g_ylims = g.get_ylim()
    h_ylims = h.get_ylim()
    i_ylims = i.get_ylim()
    for pr,p in zip(prs,ps):
        label = str(pr) + "th Perc."
        g.vlines(x=p, ymin=g_ylims[0], ymax=g_ylims[1], linewidth = 2, label=label, linestyles='dotted')
        h.vlines(x=p, ymin=h_ylims[0], ymax=h_ylims[1], linewidth = 2, linestyles='dotted')
        i.vlines(x=p, ymin=i_ylims[0], ymax=i_ylims[1], linewidth = 2, linestyles='dotted')

    # legends
    g.legend(loc="upper right")

# figure parameters
fig.suptitle("Interactions distributions per tweet type and botscores", y=1.01)
plt.tight_layout()
plt.savefig(GRAPHICS_DIR + "Interactions_distribution_per_scores" + FIG_EXTENSION, bbox_inches = "tight")
plt.show()

## Number of users per number of interactions

In [None]:
def load_human_users(user_collection, p75=0.23633691139538376):
    """Extracts the ObjectID of human users

    Keyword arguments:
    user_collection -- MongoDB Users' Collection
    p75 -- highest universal botscore still considered human (inclusive);
           the default is the 75th percentile value hard-coded in the notebook

    Returns the list of matching documents, each holding only `_id`.
    """
    # we consider humans to those users with a botscore equal or smaller than the threshold
    print("Query", end=" ")
    human_users = user_collection.find({'scores.scores.universal': {'$lte': p75}},
                                       {'_id':1})
    print("OK; List", end=" ")
    human_users = list(human_users)
    print("OK; Total human users:", len(human_users))
    return human_users

def load_bot_users(user_collection, p95=0.6908019160064479):
    """Extracts the ObjectID of bot users

    Keyword arguments:
    user_collection -- MongoDB Users' Collection
    p95 -- lowest universal botscore already considered bot (inclusive);
           the default is the 95th percentile value hard-coded in the notebook

    Returns the list of matching documents, each holding only `_id`.
    """
    # we consider bots to those users with a botscore equal or greater than the threshold
    print("Query", end=" ")
    bot_users = user_collection.find({'scores.scores.universal': {'$gte': p95}},
                                     {'_id':1})
    print("OK; List", end=" ")
    bot_users = list(bot_users)
    print("OK; Total bot users:", len(bot_users))
    return bot_users

def load_tweets(tweet_collection):
    """Fetches the id, tweet type and author id of every tweet

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection

    Returns the list of documents, projected to `_id`, `tweet_type` and `user_id`.
    """
    no_filter = {}
    wanted_fields = {'_id':1,'tweet_type':1,'user_id':1}

    print("Query", end=" ")
    cursor = tweet_collection.find(no_filter, wanted_fields)
    print("OK; List", end=" ")
    # the cursor is consumed entirely into memory
    found = list(cursor)
    print("OK; Total tweets:", len(found))
    return found

In [None]:
%%time
# ids of the users considered human, as a one-column (`_id`) dataframe
human_users = load_human_users(db.users)
df_human_users = pd.DataFrame(human_users)
# preview of the first two rows
display(df_human_users.head(2))

In [None]:
%%time
# ids of the users considered bots, as a one-column (`_id`) dataframe
bot_users = load_bot_users(db.users)
df_bot_users = pd.DataFrame(bot_users)
# preview of the first two rows
display(df_bot_users.head(2))

In [None]:
%%time
# every tweet (id, type and author id), flattened and loaded into a dataframe
tweets = load_tweets(db.tweets)
tweets = list(map(flatten, tweets))
df_tweets = pd.DataFrame(tweets)

In [None]:
%%time
# getting human tweets
# Tweets authored by human users, without rows that have missing values.
# dropna() is chained on the selection so the result is an independent frame,
# instead of dropping rows in place on a slice of df_tweets.
authored_by_human = df_tweets['user_id'].isin(df_human_users['_id'])
df_human_tweets = df_tweets[authored_by_human].dropna(axis='index')
display(df_human_tweets.head(2))

In [None]:
%%time
# getting bot tweets
# Tweets authored by bot users, without rows that have missing values.
# dropna() is chained on the selection so the result is an independent frame,
# instead of dropping rows in place on a slice of df_tweets.
authored_by_bot = df_tweets['user_id'].isin(df_bot_users['_id'])
df_bot_tweets = df_tweets[authored_by_bot].dropna(axis='index')
display(df_bot_tweets.head(2))

In [None]:
%%time
"""
two subplots sharing Y-axis will show how many users there are per number of created interactions
we consider humans and bots users
"""
fig, axs = plt.subplots(1, 2, figsize=(16,3), sharey=True)

# one panel per population: (title, tweets of that population, upper limit of the X-axis)
panels = [('Humans', df_human_tweets, 1000),
          ('Bots', df_bot_tweets, 600)]

for j, (case, data, x_max) in enumerate(panels):
    # interactions generated by each user (per tweet type)
    counts = data.groupby(['tweet_type','user_id']).agg({'user_id':'count'})
    counts.columns = ['interactions']
    counts = pd.DataFrame(counts.to_records())

    # users sharing the same number of interactions (per tweet type)
    d = counts.groupby(['tweet_type','interactions']).agg({'user_id':'count'})
    d.columns = ['users_count']
    d = pd.DataFrame(d.to_records())

    # plot
    g = sns.scatterplot(data=d, x='interactions', y='users_count', hue='tweet_type', ax=axs[j])
    g.set_xlim(left=0, right=x_max)
    g.set_title(case)
    g.set_yscale('log')
    g.set_xlabel("Number of interactions")
    g.set_ylabel("Number of users")
    # the Y-axis is shared: show its tick labels on every panel
    g.tick_params(axis='y',labelleft=True)

# figure parameters
plt.tight_layout()
plt.savefig(GRAPHICS_DIR+"users-per-number-of-interactions" + FIG_EXTENSION, bbox_inches = "tight")
plt.show()

## Interactions between humans and bots

In [None]:
%%time
def load_users(user_collection):
    """Fetches the id and universal botscore of every scored user

    Keyword arguments:
    user_collection -- MongoDB Users' Collection

    Returns the list of documents produced by the aggregation: users that
    have a `scores` field, projected to `_id` and `botscore`
    (taken from `scores.scores.universal`).
    """
    only_scored = {'$match': {'scores': {'$exists': True}}}
    id_and_score = {'$project': {'_id': True, 'botscore':'$scores.scores.universal'}}

    print("Query", end=" ")
    cursor = user_collection.aggregate([only_scored, id_and_score], allowDiskUse=True)
    print("OK; List", end=" ")
    # the cursor is consumed entirely into memory
    found = list(cursor)
    print("OK; Total users:", len(found))
    return found

def load_referencing_tweets(tweet_collection):
    """Extracts the ObjectID, tweet type, user id, retweeted/quoted user id and replied user id of retweets, replies and quotes

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection

    Returns the list of matching documents (the cursor is fully consumed).
    """
    # only tweets that reference another tweet
    referencing_types = ['retweet', 'reply', 'quote']
    referencing_tweets = list(tweet_collection.find(
        filter={'tweet_type': {'$in': referencing_types}},
        projection={'_id':1,
                    'tweet_type':1,
                    'user_id':1,
                    'retweet_or_quote_user_id':1,
                    'in_reply_to_user_id':1},
    ))

    print("Referencing tweets extracted:",len(referencing_tweets))
    return referencing_tweets

In [None]:
%%time
# scored users (id and botscore) as a dataframe
users = load_users(db.users)
df_users = pd.DataFrame(users)

# botscore thresholds: 75th and 95th percentiles
p75, p95 = np.percentile(df_users.botscore, [75, 95])
display(df_users.sample(3))

In [None]:
%%time
# retweets, replies and quotes with their active and referenced user ids
tweets = load_referencing_tweets(db.tweets)

In [None]:
%%time
"""
Each tweet is complemented with the botscore of the active user (user who creates the interaction) and the botscore of the passive user (user who creates the referenced interaction)
Note that, in this analysis, we should only consider those retweets, replies and quotes with known active and passive user botscore.
"""
df_tweets = pd.DataFrame(tweets)

# Columns are selected and renamed by NAME. Assigning a positional list of
# names only works when the columns happen to arrive in alphabetical order;
# otherwise the ids get mislabelled without any error. reindex() also creates
# (as all-missing) any field that no tweet carries.
source_columns = ['_id', 'in_reply_to_user_id', 'retweet_or_quote_user_id', 'tweet_type', 'user_id']
df_tweets = df_tweets.reindex(columns=source_columns).rename(columns={
    '_id': 'tweet_id',
    'in_reply_to_user_id': 'passive_id_reply',
    'retweet_or_quote_user_id': 'passive_id',
    'user_id': 'active_id',
})

# the passive user is the retweeted/quoted user or, when missing, the replied one;
# the column is reassigned explicitly instead of filled in place through `df.col`
df_tweets['passive_id'] = df_tweets['passive_id'].fillna(df_tweets['passive_id_reply'])
df_tweets = df_tweets.drop(columns=['passive_id_reply'])

# botscore of each user, indexed by user id, attached once per role
botscores = df_users.set_index('_id')['botscore']
df_tweets = df_tweets.join(botscores.rename('passive_id_botscore'), on='passive_id')
df_tweets = df_tweets.join(botscores.rename('active_id_botscore'), on='active_id')

print('Refering interactions (replies,quotes,retweets):',len(df_tweets))
# keep only the interactions whose two users have a known botscore
df_tweets = df_tweets.dropna(axis='index').reset_index(drop=True)
print('Refering interactions (replies,quotes,retweets) with botscore in source and destination:',len(df_tweets))

In [None]:
%%time  
"""
Given the active and passive botscores of each tweet, we derive the active and passive categories:
- A botscore from 0 to 75th percentile is mapped to human category
- A botscore from 75th percentile to 95th percentile is mapped to unclear category
- A botscore from 95th percentile to 1 is mapped to bot category
"""
category_bins = [0, p75, p95, 1]
category_labels = ['Human', 'Unclear', 'Bot']

# same binning for both roles: <role>_id_botscore -> <role>_user_category
for role in ('active', 'passive'):
    df_tweets[role + '_user_category'] = pd.cut(df_tweets[role + '_id_botscore'],
                                                bins=category_bins,
                                                right=True,
                                                include_lowest=True,
                                                labels=category_labels)

In [None]:
# Interactions in which the active or the passive user is uncertain
involves_unclear = (df_tweets.active_user_category=='Unclear') | (df_tweets.passive_user_category=='Unclear')
print("Replies/Retweets/Quotes involving unclear users:", int(involves_unclear.sum()))

In [None]:
# We remove those tweets involving uncertain users: we consider only human and bot interactions
both_clear = (df_tweets.active_user_category!="Unclear") & (df_tweets.passive_user_category!="Unclear")
# .copy() makes the filtered frame independent, so its columns can be
# reassigned without writing into a slice of the original frame
df_tweets = df_tweets[both_clear].copy()

# rebuild the categoricals from their string values, so the categories are
# only the values still present
for column in ('active_user_category', 'passive_user_category'):
    df_tweets[column] = df_tweets[column].astype(str).astype('category')

df_tweets.reset_index(drop=True, inplace=True)
print("Replies/Retweets/Quotes involving humans and bots users:",len(df_tweets))

In [None]:
# We calculate the total count and log count of the combinations: passive category, active category and tweet type
group_keys = ['active_user_category','passive_user_category','tweet_type']
# assign() returns a new frame with the helper column, instead of adding the
# column in place to a slice of df_tweets
df_heatplot = df_tweets[group_keys].assign(count=1)
df_heatplot = df_heatplot.groupby(group_keys).count()
df_heatplot['log_count'] = np.log10(df_heatplot['count'])
# the grouping keys go back to regular columns
df_heatplot.reset_index(inplace=True)
df_heatplot

In [None]:
def generate_heatmap(data,fmt,title):
    """
    Draws an annotated heatmap of interaction volumes, saves it and shows it

    Keywords argument:
    data -- DataFrame with the values of the cells
    fmt -- format applied to the cells' annotations
    title -- suffix used in the heatmap title and in the name of the saved file
    """
    heat = sns.heatmap(data=data,cmap=sns.light_palette("purple"),annot=True,linecolor='black',fmt=fmt)

    # move the lower Y-limit by +0.5 and the upper one by -0.5
    # NOTE(review): looks like a workaround for half-cropped first/last rows in
    # some matplotlib versions -- confirm it is still needed
    y_low, y_high = heat.get_ylim()
    heat.set_ylim(y_low + 0.5, y_high - 0.5)

    heat.set_xlabel("Passive user category")
    heat.set_ylabel("Active user category & tweet type")
    heat.set_title("Interaction volumes ("+title+")")

    # frames of 1 column x 2 rows: left column first, then right column,
    # each from top to bottom
    frame_color = "#1b5e20"
    frame_w, frame_h = 1, 2
    for col in (0, 1):
        for row in (0, 2, 4):
            heat.hlines(y=row, xmin=col, xmax=col+frame_w, colors=frame_color, linewidth = 3)
            heat.hlines(y=row+frame_h, xmin=col, xmax=col+frame_w, colors=frame_color, linewidth = 2)

            heat.vlines(x=col, ymin=row, ymax=row+frame_h, colors=frame_color, linewidth = 3)
            heat.vlines(x=col+frame_w, ymin=row, ymax=row+frame_h, colors=frame_color, linewidth = 3)

    plt.savefig(GRAPHICS_DIR + "interaction-volumes-" + title + FIG_EXTENSION, bbox_inches = "tight")
    plt.show()

In [None]:
# Dataframe management for plotting heatmaps
# The index is built from the keys of each row (as plain strings) instead of a
# hard-coded product of categories: every count stays attached to its own
# combination, whatever the number and order of the grouped rows.
key_columns = ['active_user_category','passive_user_category','tweet_type']
index = pd.MultiIndex.from_arrays([df_heatplot[column].astype(str) for column in key_columns],
                                  names=['Active','Passive','TweetType'])
df_heatplot.set_index(index,inplace=True)
df_heatplot.drop(columns=key_columns,inplace=True)

# rows: tweet type and active category; columns: passive category
df_heatplot_count = pd.pivot_table(df_heatplot,values='count',index=['TweetType','Active'],columns=['Passive'])
df_heatplot_log = pd.pivot_table(df_heatplot,values='log_count',index=['TweetType','Active'],columns=['Passive'])

# plot heatmaps
generate_heatmap(df_heatplot_count,'d','count')
generate_heatmap(df_heatplot_log,'.2g','log')