# Phase 4. User classification

## Contents
- [Configuration](#Configuration)
  - [Imports](#Imports)
  - [Variables](#Variables)
  - [Support functions](#Support-functions)
- [Users' features](#Users'-features)
  - [Manual labels for training purposes](#Manual-labels-for-training-purposes)
  - [Map bots to evaluation format](#Map-bots-to-evaluation-format)
- [Ensemble Classifier](#Ensemble-Classifier)
  - [Preprocessing](#Preprocessing)
  - [Evaluation](#Evaluation)
  - [Graphical representation](#Graphical-representation)
- [Accounts projections](#Accounts-projections)
  - [Get humans and bots interactions](#Get-humans-and-bots-interactions)
  - [Projection features](#Projection-features)
- [Friendship graph](#Friendship-graph)
  - [Create nodes csv (Gephi input 1)](#Create-nodes-csv-(Gephi-input-1))
  - [Create edges csv (Gephi input 2)](#Create-edges-csv-(Gephi-input-2))

## Configuration

### Imports

In [None]:
# Imports for data management
from fastprogress import master_bar, progress_bar
import sys
from pymongo import MongoClient, UpdateOne
import os 
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import pickle
import csv
from collections.abc import MutableMapping
from bson import ObjectId

# Imports for plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=0.9)
sns.set_style("whitegrid")
sns.set_style({'font.family':'monospace'})

In [None]:
# If true, figures are exported as vector PDFs instead of JPG.
VECTORIAL_FIGURES = True
FIG_EXTENSION = ".pdf" if VECTORIAL_FIGURES else ".jpg"

# Directories where CSV data and figures are stored.
# ROOT_DIR must end with a path separator, since the sub-paths are appended by plain concatenation.
ROOT_DIR = "ABOSLUTE_PATH_TO_ROOT_FOLDER"
DATA_DIR = ROOT_DIR + "data/"
GRAPHICS_DIR = ROOT_DIR + "graphics/"

# Change the working directory to the root folder
os.chdir(ROOT_DIR)

# Create the graphics folder; exist_ok makes re-running the cell harmless
os.makedirs(GRAPHICS_DIR, exist_ok=True)

# MongoDB parameters (placeholders: set the host, PORT and database name before running)
mongoclient = MongoClient('IP_ADDRESS', PORT)
db = mongoclient.DATABASE_NAME

### Support functions

In [None]:
def make_objid(text):
    """Builds an ObjectId from a value, left-padding it with zeros

    The value is converted to a string and right-justified to 24 characters
    with "0" before being passed to ObjectId.

    Keyword arguments:
    text -- value to be converted into an ObjectId

    Returns None when text is None or blank once converted to a string.
    """
    if text is None:
        return None
    as_string = str(text)
    if as_string.strip() == "":
        return None
    padded = as_string.rjust(24, "0")
    return ObjectId(padded)


def flatten(d, parent_key='', sep='_'):
    """Flattens a nested dictionary into a single-level dictionary

    Nested mappings are expanded recursively; the key of every nested value is
    the concatenation of the keys that lead to it, joined by sep.

    Keyword arguments:
    d -- dictionary (possibly with nested mappings) to flatten
    parent_key -- prefix prepended to every key of d (empty for no prefix)
    sep -- separator placed between a parent key and a child key
    """
    flat = {}
    for key, value in d.items():
        full_key = key if not parent_key else parent_key + sep + key
        if not isinstance(value, MutableMapping):
            flat[full_key] = value
            continue
        # nested mapping: merge its flattened content under the composed key
        flat.update(flatten(value, full_key, sep=sep))
    return flat

## Users' features

### Manual labels for training purposes

In [None]:
def load_labeled_users(user_collection):
    """Retrieves the manually labeled users as a list of documents

    Only users having a 'political_party' field are selected, and only the
    '_id' and 'political_party' fields are requested. Progress is printed.

    Keyword arguments:
    user_collection -- MongoDB Users' Collection
    """
    only_labeled = {'political_party': {'$exists': True}}
    wanted_fields = {'_id': 1, 'political_party': 1}

    print("Query", end=" ")
    cursor = user_collection.find(only_labeled, wanted_fields)
    print("OK; List", end=" ")
    documents = list(cursor)
    print("OK; Total labeled users:", len(documents))
    return documents

def load_tweets(tweet_collection):
    """Retrieves all tweets as a list of documents

    Every tweet is selected; the 'tweet_type', 'keywords_summary',
    'sentiment_score' and 'user_id' fields are requested and '_id' is excluded.
    Progress is printed.

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection
    """
    wanted_fields = {'_id':0,'tweet_type':1,'keywords_summary':1,'sentiment_score':1,'user_id':1}

    print("Query", end=" ")
    cursor = tweet_collection.find({}, wanted_fields)
    print("OK; List", end=" ")
    documents = list(cursor)
    print("OK; Total tweets:", len(documents))
    return documents

In [None]:
%%time
# Fetch the manually labeled users from the users collection and load them into a DataFrame
labeled_users = load_labeled_users(db.users)
df_labeled_users = pd.DataFrame(labeled_users)
# preview of the first two rows
display(df_labeled_users.head(2))

In [None]:
# distinct values of the political_party label among the manually labeled users
df_labeled_users.political_party.unique()

In [None]:
%%time
# Load every tweet, flatten nested sub-documents into "parent_child" keys and build a DataFrame
tweets = load_tweets(db.tweets)
tweets = [flatten(t) for t in tweets]
df_tweets = pd.DataFrame(tweets)

In [None]:
# keep only the tweets whose 'user_id' matches the '_id' of a manually labeled user
df_tweets = df_tweets[df_tweets['user_id'].isin(df_labeled_users['_id'])]
print("Tweets associated with labeled users:",len(df_tweets))
df_tweets = df_tweets.join(df_labeled_users.set_index('_id'), on='user_id') # append the user's political party to each tweet
before = len(df_tweets)
# rows with any missing value are discarded; the difference is reported below
df_tweets.dropna(axis='index',inplace=True)
print("Invalid interactions with missing information (multimedia content):",before-len(df_tweets))
display(df_tweets.head(2))

In [None]:
# rename columns
# NOTE(review): the names are assigned by position, so this assumes the flattened tweet columns
# arrive in exactly this order (with 'political_party' last, appended by the join) — TODO confirm
df_tweets.columns=['tweet_type', 'user_id', 'sentiment','vox', 'pp', 'cs','psoe', 'up',
                   'elections', 'exhumacion', 'cataluña', 'debates', 'abascaleh', 'political_party']

display(df_tweets.head(2))

In [None]:
%%time

''' Build the feature vector of every manually labeled user.
    For each tweet type, the vector holds the mean sentiment score towards each political party
    and towards each (political party, subject theme) combination.
'''

parties = ['vox', 'pp', 'cs', 'psoe', 'up']
themes = ['elections', 'exhumacion', 'cataluña', 'debates', 'abascaleh']
vectors = []   # one dictionary of features per user

# one iteration per manually labeled user, with all of that user's tweets
for user, user_group in df_tweets.groupby('user_id'):

    # the vector starts with the user id and every feature set to its default, 0.0
    user_features = {'user_id': user}
    for typ in ('original', 'retweet', 'quote', 'reply'):
        for p in parties:
            prefix = typ + "-" + p
            user_features[prefix] = 0.0              # tweet type + party
            for t in themes:
                user_features[prefix + "-" + t] = 0.0  # tweet type + party + theme

    # CLASS (target variable for the training process): first party found for the user
    user_features['political_party'] = user_group.political_party.unique()[0]

    # overwrite the defaults with the observed means, one tweet type at a time
    for tweet_type, tweet_type_group in user_group.groupby('tweet_type'):
        for p in parties:
            # tweets that mention p and none of the remaining parties
            party_group = tweet_type_group[tweet_type_group[p] == True]
            for p2 in [other for other in parties if other != p]:
                party_group = party_group[party_group[p2] == False]

            feature_name = tweet_type + "-" + p

            # mean(tweet type, party), regardless of the theme
            if len(party_group) > 0:
                user_features[feature_name] = party_group['sentiment'].mean()

            # mean(tweet type, party, theme), for each theme mentioned at least once
            for theme in themes:
                theme_group = party_group[party_group[theme] == True]
                if len(theme_group) > 0:
                    user_features[feature_name + "-" + theme] = theme_group['sentiment'].mean()

    # the finished vector of the user is stored
    vectors.append(user_features)

In [None]:
# the list of feature dictionaries becomes a DataFrame (one row per user, one column per feature)
df_classifier = pd.DataFrame.from_dict(vectors,orient='columns')
display(df_classifier.head(5))
# column names, dtypes and non-null counts
df_classifier.info()

In [None]:
# the training sample keeps, at most, the first 200 users of each political party
df_csv = df_classifier.groupby(by=['political_party']).head(200)
# the sample is exported without the DataFrame index
df_csv.to_csv(DATA_DIR+"classification/labeled_users_features.csv", index=False)

### Map bots to evaluation format

In [None]:
def load_bot_users(user_collection, threshold=0.6908019160064479):
    """Extracts the ObjectID of bot users

    A user is selected as a bot when its 'scores.scores.universal' field is
    greater than or equal to threshold. Only the '_id' field is requested.
    Progress is printed.

    Keyword arguments:
    user_collection -- MongoDB Users' Collection
    threshold -- minimum universal score for a user to be considered a bot.
                 The default is the constant the original code named p95
                 (presumably the 95th percentile of the scores -- confirm).
    """
    print("Query", end=" ")
    bot_users = user_collection.find({'scores.scores.universal':{'$gte':threshold}},
                                  {'_id':1})
    print("OK; List", end=" ")
    bot_users = list(bot_users)
    print("OK; Total bot users:", len(bot_users))
    return bot_users

def load_tweets(tweet_collection):
    """Retrieves all tweets as a list of documents

    No filter is applied; the 'tweet_type', 'keywords_summary',
    'sentiment_score' and 'user_id' fields are requested and '_id' is excluded.
    Progress is printed.

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection
    """
    projection = {'_id':0,'tweet_type':1,'keywords_summary':1,'sentiment_score':1,'user_id':1}
    print("Query", end=" ")
    result = tweet_collection.find({}, projection)
    print("OK; List", end=" ")
    result = list(result)
    print("OK; Total tweets:", len(result))
    return result

In [None]:
%%time
# Fetch the ids of the bot users and load them into a DataFrame
bot_users = load_bot_users(db.users)
df_bot_users = pd.DataFrame(bot_users)

In [None]:
%%time
# Reload every tweet, flatten nested sub-documents into "parent_child" keys and build a DataFrame
tweets = load_tweets(db.tweets)
tweets = [flatten(t) for t in tweets]
df_tweets = pd.DataFrame(tweets)

In [None]:
%%time
# rows with any missing value are discarded first
df_tweets.dropna(axis='index',inplace=True)
print("Tweets with sentiment and BoW:", len(df_tweets))
# then only the tweets whose 'user_id' matches the '_id' of a bot are kept
df_tweets = df_tweets[df_tweets['user_id'].isin(df_bot_users['_id'])]
print("Tweets associated with bots users:",len(df_tweets))
print("Bots with valid interactions:",len(df_tweets.user_id.unique()))

In [None]:
# rename of columns
# NOTE(review): the names are assigned by position, so this assumes the flattened tweet columns
# arrive in exactly this order — TODO confirm against the stored documents
df_tweets.columns=['tweet_type', 'user_id', 'sentiment','vox', 'pp', 'cs', 'psoe', 'up',
                   'elections', 'exhumacion', 'cataluña', 'debates', 'abascaleh']

display(df_tweets.head(2))

In [None]:
%%time

''' Build the feature vector of every bot user.
    For each tweet type, the vector holds the mean sentiment score towards each political party
    and towards each (political party, subject theme) combination.
'''

parties = ['vox', 'pp', 'cs', 'psoe', 'up']
themes = ['elections', 'exhumacion', 'cataluña', 'debates', 'abascaleh']
vectors = []      # one dictionary of features per bot

# one iteration per bot, with all of that bot's tweets
for user, user_group in df_tweets.groupby('user_id'):

    # the vector starts with the user id and every feature set to its default, 0.0
    user_features = {'user_id': user}
    for typ in ('original', 'retweet', 'quote', 'reply'):
        for p in parties:
            base = typ + "-" + p
            user_features[base] = 0.0                # tweet type + party
            for t in themes:
                user_features[base + "-" + t] = 0.0  # tweet type + party + theme

    # overwrite the defaults with the observed means, one tweet type at a time
    for tweet_type, tweet_type_group in user_group.groupby('tweet_type'):
        for p in parties:
            # tweets that mention p and none of the remaining parties
            party_group = tweet_type_group[tweet_type_group[p] == True]
            other_parties = [candidate for candidate in parties if candidate != p]
            for p2 in other_parties:
                party_group = party_group[party_group[p2] == False]

            base = tweet_type + "-" + p

            # mean(tweet type, party), regardless of the theme
            if len(party_group) > 0:
                user_features[base] = party_group['sentiment'].mean()

            # mean(tweet type, party, theme), for each theme mentioned at least once
            for theme in themes:
                theme_group = party_group[party_group[theme] == True]
                if len(theme_group) > 0:
                    user_features[base + "-" + theme] = theme_group['sentiment'].mean()

    # the finished vector of the bot is stored
    vectors.append(user_features)

In [None]:
# the list of feature dictionaries becomes a DataFrame (one row per bot, one column per feature)
df_classifier = pd.DataFrame.from_dict(vectors,orient='columns')
display(df_classifier.head(5))
# column names, dtypes and non-null counts
df_classifier.info() 

In [None]:
%%time
'''
Bots are filtered to keep those with, at least, one direct interaction towards a political party
'''

def check_direct_mention_political_parties(bot_vector):
    ''' Tells whether a bot has at least one direct interaction towards a political party

    The "<tweet type>-<party>" features are inspected; any value different from 0
    (the default assigned when there is no interaction) makes the bot valid.
    Note that a mean that is exactly 0 is indistinguishable from the default.

    Keywords arguments:
    bot_vector -- the vector of features of a bot (indexable by feature name, holding means of sentiments)

    Returns True if a non-zero feature is found, False otherwise.
    '''
    tweet_types = ('original', 'reply', 'quote', 'retweet')
    party_names = ('vox', 'pp', 'cs', 'up', 'psoe')

    # features regarding direct interaction with political party themes
    feature_names = [kind + '-' + party for party in party_names for kind in tweet_types]

    # stops at the first mean that differs from the default value
    return any(float(bot_vector[name]) != 0 for name in feature_names)

# feature vectors of bots are filtered: the check is applied row by row (axis=1)
print("Analyzed bot users:",len(df_classifier))
df_classifier_filtered = df_classifier[df_classifier.apply(check_direct_mention_political_parties, axis=1)]
# NOTE(review): reset_index() without drop=True keeps the previous index as an 'index' column,
# so that column travels with the features from here on — confirm this is intended
df_classifier_filtered.reset_index(inplace=True)
print("Filtered bot users (at least one directed mention to political parties):",len(df_classifier_filtered))

In [None]:
# the filtered bots (the ones to be predicted) are exported without the DataFrame index
df_classifier_filtered.to_csv(DATA_DIR+'classification/bots_features.csv',index=False)

## Ensemble Classifier

### Preprocessing

In [None]:
'''
Preprocessing of classifier ensemble results, considering the output of six models:
Random Forest (RF), Multilayer perceptron (NN), Support Vector Machine (SVM), Naive Bayes (NB), k-Nearest Neighbor (kNN), and AdaBoost (AB)

Each row contains the result of a bot classification: the user id, the predicted party per model (that is, the model with more probability) 
and the probability of belonging to each party per model.
'''
# the probabilities file is not produced by this notebook; it is only read here
# low_memory=False makes pandas read the file in one pass instead of in chunks
df_probabilities = pd.read_csv(DATA_DIR+"classification/bots_probabilities.csv", low_memory=False)
# preview of the first two rows
df_probabilities.head(2)

In [None]:
# original column names of the probabilities file, before they are renamed
df_probabilities.columns

In [None]:
# rename of columns: user id, then one prediction column per model,
# then one "<model>-<party>" probability column per model and party
models = ["6. AB", "5. kNN", "4. NB", "3. SVM", "2. NN", "1. RF"]
parties = ['ciudadanos','pp','psoe','up','vox']
rename_columns = ['user_id'] + list(models)
rename_columns.extend(model+"-"+party for model in models for party in parties)
df_probabilities.columns = rename_columns

In [None]:
'''
For each user, the probabilities given by the models to each party are averaged.
Five columns are added, named after the parties, holding those means.
'''
for party in parties:
    # the "<model>-<party>" columns of this party, one per model
    probabilities = [model+"-"+party for model in models]
    party_columns = df_probabilities[probabilities].astype(float)
    df_probabilities[party] = party_columns.mean(axis=1)

df_probabilities.head(2)

### Evaluation

In [None]:
def get_political_party(party_probabilities,threshold=4/5):
    ''' Gets the predicted political party of a bot. In particular, returns:
    a) The political party with the highest probability, if it reaches the threshold
    b) The couple "first-second" made of the two most probable parties, if the sum
       of their probabilities reaches the threshold
    c) "Unclear" otherwise, meaning that the bot is not clearly classified in one or two parties

    Keywords arguments:
    party_probabilities -- object exposing the probability of each party through the
                           attributes pp, psoe, vox, up and ciudadanos
    threshold -- the value to determine the final political inclination depending on a), b) or c)
    '''
    # label of every party next to its probability
    labelled = [('PP', party_probabilities.pp),
                ('PSOE', party_probabilities.psoe),
                ('VOX', party_probabilities.vox),
                ('UP', party_probabilities.up),
                ('Ciudadanos', party_probabilities.ciudadanos)]

    # ascending order by probability: the two most probable parties come last
    ranked = sorted(labelled, key=lambda pair: pair[1])
    (second_party, second_probability), (top_party, top_probability) = ranked[-2:]

    # case a)
    if top_probability >= threshold:
        return top_party
    # case b)
    if top_probability + second_probability >= threshold:
        return top_party + "-" + second_party
    # case c)
    return "Unclear"

In [None]:
# a new 'predicted' column is added: get_political_party is applied row by row (axis=1)
# with a confidence threshold of 4/5, and the result is stored as a categorical column
df_probabilities['predicted'] = df_probabilities.apply(get_political_party,axis=1,args=(4/5,)).astype('category')
df_probabilities.head(2)

In [None]:
# number of distinct bots (user_id) for each value of the prediction
df_probabilities.groupby('predicted')['user_id'].nunique()

In [None]:
# Save party predictions in MongoDB: one update per bot, setting 'bot_political_party'
# on the user whose _id matches; users that do not exist are not created (upsert=False)
operations = [
    UpdateOne({'_id': make_objid(user_id)},
              {'$set': {'bot_political_party': political_party}},
              upsert=False)
    for user_id, political_party in zip(df_probabilities['user_id'], df_probabilities['predicted'])
]

print("Inserting political party of",len(operations),"bots...")

if operations:
    results = db.users.bulk_write(operations)
    # matched, inserted and upserted counters reported by the bulk operation
    matched = str(results.matched_count).rjust(8, " ")
    inserted = str(results.inserted_count).rjust(8, " ")
    upserted = str(results.upserted_count).rjust(8, " ")
    print("M:", matched, " I:", inserted, " U:", upserted)

### Graphical representation

In [None]:
# drop unclear
before = len(df_probabilities)
# .copy() makes the filtered frame independent, so the assignment below is unambiguous
df_probabilities = df_probabilities[df_probabilities.predicted != 'Unclear'].copy()
print("Bots with unclear ideology:", before-len(df_probabilities))
# the 'Unclear' category is no longer used; remove_unused_categories returns a new Series
# (its inplace argument no longer exists in pandas 2.x), so the column is reassigned
df_probabilities['predicted'] = df_probabilities['predicted'].cat.remove_unused_categories()

In [None]:
'''
Plot heatmaps of those bots with one or two predicted political parties. In particular:

a) Square heatmap: differentiating between partyA-partyB and partyB-partyA (primary vs. secondary party)
b) Triangular heatmap: summing up the couples partyA-partyB with partyB-partyA
'''
# canonical name of every couple; used by the triangular version to merge each couple with its homologous
replace = {'VOX-Ciudadanos':'VOX-Ciudadanos',
           'Ciudadanos-VOX':'VOX-Ciudadanos',
           'PSOE-UP': 'UP-PSOE',
           'UP-PSOE': 'UP-PSOE',
           'VOX-UP': 'VOX-UP',
           'UP-VOX': 'VOX-UP',
           'UP-Ciudadanos': 'UP-Ciudadanos',
           'Ciudadanos-UP': 'UP-Ciudadanos',
           'VOX-PP':'VOX-PP',
           'PP-VOX':'VOX-PP',
           'UP-PP':'UP-PP',
           'PP-UP':'UP-PP',
           'Ciudadanos-PSOE':'PSOE-Ciudadanos',
           'PSOE-Ciudadanos':'PSOE-Ciudadanos',
           'Ciudadanos-PP':'PP-Ciudadanos',
           'PP-Ciudadanos':'PP-Ciudadanos',
           'PSOE-VOX':'VOX-PSOE',
           'VOX-PSOE':'VOX-PSOE',
           'PSOE-PP':'PSOE-PP',
           'PP-PSOE':'PSOE-PP'
          }

for triangular in [False,True]:

    # independent copy: the relabelling of the triangular version must not alter df_probabilities
    df_heatmap = df_probabilities[['user_id', 'predicted']].copy()
    # plain strings, so that only the observed predictions are counted and replaced
    df_heatmap['predicted'] = df_heatmap['predicted'].astype(str)

    # in triangular version, we exchange factors in some couples to fit them to their homologous
    if triangular:
        df_heatmap['predicted'] = df_heatmap['predicted'].replace(replace)

    # count number of bots per predicted political party
    df_heatmap = df_heatmap.groupby('predicted').agg({'user_id':'count'})
    df_heatmap.columns = ['num_bots']
    df_heatmap.reset_index(inplace=True)

    # transform single column (party1-party2) in two columns (party1,party2);
    # reindex guarantees that both columns exist even when no couple was predicted
    party_columns = df_heatmap['predicted'].str.split("-",expand=True).reindex(columns=[0, 1])
    df_heatmap['party1'] = party_columns[0]
    # if party2 is missing (the bot has a single political party), party1 is used for both axes
    df_heatmap['party2'] = party_columns[1].fillna(party_columns[0])
    df_heatmap.drop(columns=['predicted'],inplace=True)

    # draw heatmap
    ax = sns.heatmap(df_heatmap.pivot(columns='party2', index='party1', values='num_bots'),cmap=sns.light_palette("purple"),annot=True,fmt="g")
    # NOTE(review): widening the y limits by half a cell looks like the workaround for heatmaps
    # whose first and last rows were cut with some matplotlib versions — confirm it is still needed
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
    if triangular:
        ax.set_xlabel("Political party")
        ax.set_ylabel("Political party")
    else:
        ax.set_xlabel("Secondary political party")
        ax.set_ylabel("Primary political party")

    ax.set_title("Volumes of bots per political parties")
    shape = "triangular" if triangular else "square"
    plt.savefig(GRAPHICS_DIR + shape + "-filtered-bots-per-political-party" + FIG_EXTENSION, bbox_inches = "tight")
    plt.show()

## Accounts projections

In [None]:
def load_labeled_users(user_collection):
    """Retrieves the manually labeled users as a list of documents

    Users having a 'political_party' field are selected; only the '_id' and
    'political_party' fields are requested. Progress is printed.

    Keyword arguments:
    user_collection -- MongoDB Users' Collection
    """
    selection = {'political_party': {'$exists': True}}
    projection = {'_id': 1, 'political_party': 1}
    print("Query", end=" ")
    found = user_collection.find(selection, projection)
    print("OK; List", end=" ")
    found = list(found)
    print("OK; Total labeled users:", len(found))
    return found

def load_bot_users(user_collection):
    """Retrieves the bots with a predicted political party as a list of documents

    Users having a 'bot_political_party' field are selected; only the '_id'
    and 'bot_political_party' fields are requested. Progress is printed.

    Keyword arguments:
    user_collection -- MongoDB Users' Collection
    """
    selection = {'bot_political_party': {'$exists': True}}
    projection = {'_id': 1, 'bot_political_party': 1}
    print("Query", end=" ")
    found = user_collection.find(selection, projection)
    print("OK; List", end=" ")
    found = list(found)
    print("OK; Total bot users:", len(found))
    return found

def load_tweets(tweet_collection):
    """Retrieves all tweets as a list of documents

    No filter is applied; the 'keywords_summary', 'sentiment_score' and
    'user_id' fields are requested and '_id' is excluded. Progress is printed.

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection
    """
    projection = {'_id':0,'keywords_summary':1,'sentiment_score':1,'user_id':1}
    print("Query", end=" ")
    found = tweet_collection.find({}, projection)
    print("OK; List", end=" ")
    found = list(found)
    print("OK; Total tweets:", len(found))
    return found

### Get humans and bots interactions

In [None]:
%%time
# Fetch the manually labeled users from the users collection and load them into a DataFrame
labeled_users = load_labeled_users(db.users)
df_labeled_users = pd.DataFrame(labeled_users)
# preview of the first two rows
display(df_labeled_users.head(2))

In [None]:
%%time
# Fetch the bots that already have a predicted party and load them into a DataFrame
bot_users = load_bot_users(db.users)
df_bot_users = pd.DataFrame(bot_users)
df_bot_users = df_bot_users[df_bot_users.bot_political_party.isin(['VOX','Ciudadanos','PP','PSOE','UP'])]  # only bots with one predicted party are considered
# same column names as df_labeled_users, so both groups can be handled alike
# (assigned by position: assumes '_id' is the first column — TODO confirm)
df_bot_users.columns = ['_id','political_party']
display(df_bot_users.head(2))

In [None]:
%%time
# Load every tweet, flatten nested sub-documents into "parent_child" keys and build a DataFrame
tweets = load_tweets(db.tweets)
tweets = [flatten(t) for t in tweets]
df_tweets = pd.DataFrame(tweets)

In [None]:
# we get the tweets of manually labeled users and append the associated political party to them
df_labeled_tweets = df_tweets[df_tweets['user_id'].isin(df_labeled_users['_id'])]
print("Tweets associated with labeled users:",len(df_labeled_tweets))
df_labeled_tweets = df_labeled_tweets.join(df_labeled_users.set_index('_id'), on='user_id') # append political party to interaction
before = len(df_labeled_tweets)  # row count before dropping; not used afterwards in this cell
# rows with any missing value are discarded
df_labeled_tweets.dropna(axis='index',inplace=True)
display(df_labeled_tweets.head(2))

In [None]:
# we get the tweets of bot users and append the predicted political party to them
df_bot_tweets = df_tweets[df_tweets['user_id'].isin(df_bot_users['_id'])]
print("Tweets associated with bot users:",len(df_bot_tweets))
df_bot_tweets = df_bot_tweets.join(df_bot_users.set_index('_id'), on='user_id') # append political party to interaction
before = len(df_bot_tweets)  # row count before dropping; not used afterwards in this cell
# rows with any missing value are discarded
df_bot_tweets.dropna(axis='index',inplace=True)
display(df_bot_tweets.head(2))

### Projection features

In [None]:
def get_projection_features(df_tweets):
    '''
    Gets the mean of sentiment towards each political party per user

    For every user, only the tweets that mention exactly one of the five parties
    contribute to the mean of that party; a party without such tweets keeps 0.0.

    Keyword arguments:
    df_tweets -- a DataFrame with tweets interactions, holding the columns 'user_id',
                 'sentiment', 'political_party' and one boolean column per party
                 ('VOX', 'PP', 'Ciudadanos', 'PSOE', 'UP')

    Returns a DataFrame with one row per user and the columns 'user_id',
    'political_party' and the five party means (also when df_tweets is empty).
    '''
    parties = ['VOX', 'PP', 'Ciudadanos', 'PSOE', 'UP']
    projection_vectors = []

    for user, user_group in df_tweets.groupby('user_id'):

        # the party of the user is taken from the first of its tweets
        political_party = user_group.iloc[0]['political_party']
        user_vector = {'user_id':user,
                        'political_party':political_party}

        for p in parties:
            # by default, the sentiment towards a political party is 0
            user_vector[p] = 0.0

            # party_group holds the tweets of the user mentioning, at least, p
            party_group = user_group[user_group[p]==True]

            # we filter party_group to remove those tweets that mention p and another party p2
            for p2 in parties:
                if p2!=p:
                    party_group = party_group[party_group[p2]==False]

            # the mean of sentiment of tweets mentioning only p is calculated
            if len(party_group) > 0:
                user_vector[p] = party_group['sentiment'].mean()

        projection_vectors.append(user_vector)

    # each row contains the user id, the political party and five means of sentiment associated with parties;
    # the explicit column list keeps the layout stable even when there are no users
    return pd.DataFrame(projection_vectors, columns=['user_id', 'political_party'] + parties)

In [None]:
%%time
# we get the feature vectors of bots for projection
# NOTE(review): the names are assigned by position, so this assumes the flattened tweet columns
# arrive in exactly this order (with 'political_party' last, appended by the join) — TODO confirm
df_bot_tweets.columns=['user_id', 'sentiment','VOX', 'PP', 'Ciudadanos', 'PSOE', 'UP','elections', 'exhumacion', 'cataluña', 'debates', 'abascaleh','political_party']
# the theme columns are removed before computing the features
df_bot_tweets.drop(['elections', 'exhumacion', 'cataluña', 'debates', 'abascaleh'],axis='columns',inplace=True)
df_bot_features = get_projection_features(df_bot_tweets)
df_bot_features['category'] = 'bot'   # we add the category of 'bot' to these users
display(df_bot_features.head(2))

In [None]:
%%time
# we get the feature vectors of manually labeled users for projection
# NOTE(review): the names are assigned by position, so this assumes the flattened tweet columns
# arrive in exactly this order (with 'political_party' last, appended by the join) — TODO confirm
df_labeled_tweets.columns=['user_id','sentiment','VOX','PP', 'Ciudadanos', 'PSOE', 'UP', 'elections', 'exhumacion', 'cataluña', 'debates', 'abascaleh','political_party']
# the theme columns are removed before computing the features
df_labeled_tweets.drop(['elections', 'exhumacion', 'cataluña', 'debates', 'abascaleh'],axis='columns',inplace=True)
df_labeled_features = get_projection_features(df_labeled_tweets)
df_labeled_features['category'] = 'manually labeled'     # we add the category of 'manually labeled' to these users
display(df_labeled_features.head(2))

In [None]:
# number of feature vectors (one per user) in each group
print("Manually labeled users:",len(df_labeled_features))
print("Bots users:",len(df_bot_features))

In [None]:
# bots and manually labeled users are joined before plotting
# (pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x);
# ignore_index=True gives the result a fresh 0..n-1 index
df_unified = pd.concat([df_bot_features, df_labeled_features], ignore_index=True)
# parties present in the unified set
df_unified['political_party'].unique()

In [None]:
# preview of the first two rows of the unified set
df_unified.head(2)

In [None]:
# vectors are saved without the DataFrame index
# (DATA_DIR already ends with "/", so the sub-path must not start with one)
df_unified.to_csv(DATA_DIR+'classification/projection.csv',index=False)

## Friendship graph

In [None]:
def get_bot_followings(user_collection):
    """
    Extracts the ObjectID, predicted party and followings of bot users with only
    one identified political inclination.

    Users having both the 'bot_political_party' and 'friends' fields are queried;
    those whose predicted party is not exactly one of the five parties are discarded.
    Progress is printed.

    Keyword arguments:
    user_collection - MongoDB Users' collection

    Returns a DataFrame indexed by 'Id' (the user '_id') with the columns
    'Followings' (the 'friends' field) and 'Party' (the 'bot_political_party' field).
    """

    print("Query", end=" ")
    bot_friendships = user_collection.find(
        {'bot_political_party': {'$exists':True},'friends': {'$exists':True}},
        {'_id': 1, 'friends': 1, 'bot_political_party': 1})

    print("OK; List", end=" ")
    bot_friendships = list(bot_friendships)
    print("OK; Total bots:", len(bot_friendships), end="; ")

    # columns are selected and renamed by name, not by position: the order in which the
    # fields come back is not fixed by the query, and an empty result must still work
    df_bots = pd.DataFrame(bot_friendships, columns=['_id', 'friends', 'bot_political_party'])
    df_bots = df_bots.rename(columns={'_id': 'Id',
                                      'friends': 'Followings',
                                      'bot_political_party': 'Party'})
    df_bots = df_bots[df_bots.Party.isin(['VOX','UP','Ciudadanos','PP','PSOE'])]
    df_bots = df_bots.set_index('Id')
    print("Bots of one-affinity party with friends:", len(df_bots))

    return df_bots

In [None]:
%%time
# DataFrame of the one-party bots, indexed by user id, with their followings and predicted party
bot_friendships = get_bot_followings(db.users)

### Create nodes csv (Gephi input 1)

In [None]:
# Creation of a CSV with the nodes of the graph. It holds the index "Id" (user id, the identifier) and the column "Party" (political party, the attribute for coloring)
bot_friendships[['Party']].to_csv(DATA_DIR+"gephi/nodes.csv",index=True)

### Create edges csv (Gephi input 2)

In [None]:
%%time
'''Creation of a CSV with the edges of the graph. Each row represents an edge, that is, a particular following
Each row has the column "Source" (the following user) and "Target" (the followed user)
'''
# ids of the one-party bots, as a set: every membership test below is a constant-time lookup
bot_ids = set(bot_friendships.index)

# newline='' is what the csv module requires of files it writes to (otherwise blank rows may appear on some platforms)
with open(DATA_DIR+'gephi/edges.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Source','Target'])

    # we iterate through the followings of each user (user_id); the column is read by name,
    # and the pairs are materialised in a list so that the progress bar can be sized
    for user_id, followings in progress_bar(list(bot_friendships['Followings'].items())):
        print("Processing", user_id, end="; ")
        print("Total followings:",len(followings), end="; ")

        # we filter them to get only those who are bots with only a political party
        bot_followings = [following for following in followings if following in bot_ids]
        print("Bot followings:",len(bot_followings))

        # for each following, a new row in the CSV is created (that is, an edge)
        writer.writerows([user_id, following] for following in bot_followings)