# Phase 1. Data import

## Contents
- [Configuration](#Configuration)
  - [Imports](#Imports)
  - [Variables](#Variables)
  - [Support functions](#Support-functions)
- [Load SFMs' data in MongoDB](#Load-SFMs'-data-in-MongoDB)
  - [Preliminaries](#Preliminaries)
  - [Execute](#Execute)
- [Complete the data](#Complete-the-data)
  - [Support functions](#Support-functions)
  - [Complete database with missing references](#Complete-database-with-missing-references)
  - [Remove tweets without references](#Remove-tweets-without-references)
- [Update users collection according to new database status](#Update-users-collection-according-to-new-database-status)
  - [Remove those users with no interactions](#Remove-those-users-with-no-interactions)
  - [Create the new users](#Create-the-new-users)

## Configuration

### Imports

In [None]:
# Utilities
from IPython.display import display
from fastprogress import master_bar, progress_bar
from datetime import datetime
from unidecode import unidecode
import os
import ntpath
import numpy as np
import statistics 
import re
import math
import random
import datetime
import numbers
from collections.abc import MutableMapping
import pandas as pd

# Botometer API
import botometer

# MongoDB functionality
from pymongo.errors import BulkWriteError
from pymongo import MongoClient, InsertOne, UpdateOne, DeleteOne
from pymongo.bulk import BulkOperationBuilder
from bson import ObjectId

### Variables

In [None]:
# Directories where CSV data is stored.
# ROOT_DIR is a placeholder and must be replaced with the absolute path of the
# project root. DATA_DIR is built by plain string concatenation, so ROOT_DIR
# has to end with a path separator for "data/" to resolve to a sub-folder.
ROOT_DIR = "ABOSLUTE_PATH_TO_ROOT_FOLDER"
DATA_DIR = ROOT_DIR + "data/"
# Change the working directory to the root folder
os.chdir(ROOT_DIR)

# Botometer and Twitter keys for parallel processing.
# The dictionary is indexed by consecutive integers starting at 0;
# get_original_tweet() picks an instance by that index and reads its
# consumer_key / consumer_secret attributes. All credentials are placeholders.
keys = {
     0: botometer.Botometer(wait_on_ratelimit=True, rapidapi_key='RAPID_API_KEY', **{'consumer_key':'TWITTER_DEV_CONSUMER_KEY', 'consumer_secret':'TWITTER_DEV_CONSUMER_SECRET'}),
     1: botometer.Botometer(wait_on_ratelimit=True, rapidapi_key='RAPID_API_KEY', **{'consumer_key':'TWITTER_DEV_CONSUMER_KEY', 'consumer_secret':'TWITTER_DEV_CONSUMER_SECRET'}),
}

# MongoDB parameters.
# NOTE(review): PORT is not defined in the visible part of this notebook; it
# must be assigned (an integer port number) before this cell is run.
mongoclient = MongoClient('IP_ADDRESS', PORT)
db = mongoclient.botbusters
# The code below reads and writes db.tweets, db.users_metadata and db.users.
# Presumably MongoDB creates each collection on first write -- TODO confirm.

### Support functions

In [None]:
def make_objid(text):
    """Builds a MongoDB ObjectId from an identifier.

    The identifier is converted to a string and left-padded with zeros up to
    24 characters, which is the hexadecimal length (12 bytes) of an ObjectId.

    Keyword arguments:
    text -- value to be converted into an ObjectId (e.g. a tweet or user id)

    Returns the ObjectId, or None when the value is missing (None), blank or
    cannot be converted. Conversion failures are printed; missing and blank
    values are skipped silently.
    """
    # A missing identifier is a normal case (e.g. a tweet that is not a reply).
    # Without this guard None would be stringified to "None", fail the
    # conversion and print a spurious error for every missing id.
    if text is None:
        return None

    text = str(text)
    if not text.strip():
        return None
    try:
        return ObjectId(text.rjust(24, "0"))
    except Exception as ex:
        # Best effort: report the offending value and carry on with None
        print(text, ex)
        return None

def remove_retweet_text(tweet_type, text):
    """Returns the text to store for an interaction.

    Retweets carry no text of their own, so None is returned for them.
    For any other tweet type the text is passed through unidecode.

    Keyword arguments:
    tweet_type -- tweet type of the interaction being processed
    text -- text of the interaction
    """
    is_retweet = tweet_type == 'retweet'
    return None if is_retweet else unidecode(text)


def flatten(d, parent_key='', sep='_'):
    """Flattens a nested dictionary into a single-level dictionary.

    Nested keys are joined with `sep`, e.g. {'a': {'b': 1}} -> {'a_b': 1}.

    Keyword arguments:
    d -- dictionary whose values may themselves be dictionaries
    parent_key -- prefix prepended to every key (used by the recursion; an
                  empty prefix leaves top-level keys untouched)
    sep -- separator placed between a parent key and a child key
    """
    flat = {}
    for key, value in d.items():
        full_key = parent_key + sep + key if parent_key else key
        if isinstance(value, MutableMapping):
            # Descend into the nested mapping, carrying the accumulated prefix
            flat.update(flatten(value, full_key, sep=sep))
        else:
            flat[full_key] = value
    return flat

## Load SFMs' data in MongoDB

### Preliminaries

Hardcoding the CSV column names and their types lets the parser skip its type-inference heuristic, which speeds up processing.

In [None]:
# Columns to be imported.
# This mapping (column name -> type) is passed to pd.read_csv in process_csv()
# both as `dtype` and, through its keys, as `usecols`.
tweets_users_columns = {
    # tweets
    'text':                         str,
    'tweet_type':                   str,
    'favorite_count':               int,
    'in_reply_to_status_id':        str,
    'in_reply_to_user_id':          str,
    'lang':                         str,
    'place':                        str,
    'retweet_count':                int,
    'retweet_or_quote_id':          str,
    'retweet_or_quote_user_id':     str,
    'parsed_created_at':            str,   # read by process_csv() and dropped there without being used
    # both tweets and timeline
    'id':                           str,   # tweet id
    'created_at':                   str,   # tweet date
    # users timeline
    'user_id':                      str,
    'user_screen_name':             str,
    'user_created_at':              str,   # this should be constant for each user
    'user_default_profile_image':   bool,
    'user_description':             str,
    'user_favourites_count':        int,
    'user_followers_count':         int,
    'user_friends_count':           int,
    'user_listed_count':            int,
    'user_location':                str,
    'user_name':                    str,
    'user_statuses_count':          int,
    'user_time_zone':               str,
    'user_urls':                    str,
    'user_verified':                bool,
}

# Columns to be ignored.
# NOTE(review): this dictionary is not referenced by any code visible in this
# notebook; the columns are effectively skipped because they are absent from
# `usecols`. 'created_at' is listed here although it IS imported above --
# confirm which of the two is intended.
drop_columns = {
    'tweet_url':                    str,
    'in_reply_to_screen_name':      str,
    'retweet_or_quote_screen_name': str,
    'source':                       str,
    'hashtags':                     str,
    'urls':                         str,
    'created_at':                   str,
    'coordinates':                  str,
    'media':                        str,
    'possibly_sensitive':           str,
}

In [None]:
def process_csv(filename):
    """Parses a CSV with SFM format returning two DataFrames with the tweet and user info.

    Keyword arguments:
    filename -- name of the CSV

    Returns a tuple (df_tweets, df_users). Both DataFrames are indexed by the
    tweet ObjectId and also keep it in the '_id' column; df_users holds the
    author's metadata as it was at the time of each tweet.
    """
    print("Processing", ntpath.basename(filename), end="\t")

    df = pd.read_csv(filename, low_memory=False, keep_default_na=False,
                     dtype=tweets_users_columns,
                     usecols=list(tweets_users_columns.keys()))

    print("CSV", end=" ")

    # Make index as ObjectID
    df['_id'] = df['id'].apply(make_objid)
    df.set_index('_id', drop=False, inplace=True)
    df.drop(columns=['id'], inplace=True)

    # Create ObjectIDs to avoid any potential issue
    for column in ('user_id', 'retweet_or_quote_user_id', 'retweet_or_quote_id',
                   'in_reply_to_status_id', 'in_reply_to_user_id'):
        df[column] = df[column].apply(make_objid)

    # Make datetime objects. An explicit format is given, so the deprecated
    # `infer_datetime_format` flag (it was False, its default) is not passed.
    twitter_date_format = "%a %b %d %H:%M:%S %z %Y"
    for column in ('created_at', 'user_created_at'):
        df[column] = pd.to_datetime(df[column], format=twitter_date_format)

    df.drop(columns=['parsed_created_at'], inplace=True)

    # Remove text related to retweets.
    # A plain comprehension is used instead of np.vectorize: vectorize infers
    # the output dtype from the FIRST row, so when that row is not a retweet
    # the result is coerced to a string array and every retweet ends up with
    # the literal text 'None' instead of None. It also raises on empty input.
    df['text'] = [remove_retweet_text(tweet_type, text)
                  for tweet_type, text in zip(df['tweet_type'], df['text'])]

    # Make None instead of empty strings.
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
    # whichever the installed version provides.
    elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
    df = elementwise(lambda x: None if not str(x).strip() else x)

    print("OK", end="; ")

    print("#:", len(df), end=" entries; ")

    # Separate tweet info from user info
    df_tweets = df[['_id', 'created_at', 'text', 'tweet_type', 'favorite_count',
                    'in_reply_to_status_id', 'in_reply_to_user_id', 'lang', 'place',
                    'retweet_count', 'retweet_or_quote_id', 'retweet_or_quote_user_id', 'user_id']]
    df_users = df[['_id', 'created_at', 'user_id', 'user_screen_name', 'user_created_at', 'user_default_profile_image',
                   'user_description', 'user_favourites_count', 'user_followers_count', 'user_friends_count',
                   'user_listed_count', 'user_location', 'user_name', 'user_statuses_count',
                   'user_time_zone', 'user_urls', 'user_verified']]

    return df_tweets, df_users


def _build_upserts(df):
    """Returns one upsert UpdateOne per row of `df`, keyed by the row's '_id'."""
    return [UpdateOne({'_id': record['_id']}, {'$set': record}, upsert=True)
            for record in df.to_dict('records')]


def csv_to_mongodb(filename, tweet_collection, user_metadata_collection):
    """Saves a CSV with SFM format in MongoDB.

    Every row is upserted twice: the tweet fields go to `tweet_collection`
    and the author's metadata goes to `user_metadata_collection`, both keyed
    by the tweet ObjectId.

    Any exception is caught and printed so that one faulty file does not stop
    the import of the remaining ones.

    Keyword arguments:
    filename -- name of the CSV
    tweet_collection -- MongoDB Tweets' Collection
    user_metadata_collection -- MongoDB Users' Metadata Collection (containing users metadata)
    """
    try:
        df_tweets, df_users = process_csv(filename)

        # Guard first: the previews below would fail on a missing DataFrame
        if df_tweets is None or df_users is None:
            return

        display(df_tweets.head(5))
        display(df_users.head(5))

        print("Preparing DB operations...", end=" ")

        users_operations = _build_upserts(df_users)
        print("Users OK", end="; ")

        tweets_operations = _build_upserts(df_tweets)
        print("Tweets OK", end="; ")

        # bulk_write() rejects an empty list of operations, which is what a
        # CSV holding only a header would produce
        if not tweets_operations and not users_operations:
            print("nothing to write")
            return

        print("READY to BULK", end="; ")

        results = tweet_collection.bulk_write(tweets_operations)
        print("Tweets M:", str(results.matched_count).rjust(8, " "),
              " I:", str(results.inserted_count).rjust(8, " "),
              " U:", str(results.upserted_count).rjust(8, " "),
              end="; "
             )
        results = user_metadata_collection.bulk_write(users_operations)
        print("Users M:", str(results.matched_count).rjust(8, " "),
              " I:", str(results.inserted_count).rjust(8, " "),
              " U:", str(results.upserted_count).rjust(8, " ")
             )

    except Exception as e:
        # Deliberate best effort: report and continue with the next file
        print("Exception. Message:", e)

### Execute

In [None]:
# Import into MongoDB every CSV with SFM format found in the given folders
# (paths are relative to DATA_DIR)
data_folders = ['harvester/processed/hashtags']
df = None  # NOTE(review): not read by any code visible in this notebook
for folder in data_folders:
    folder_path = DATA_DIR + folder
    for file in progress_bar(os.listdir(folder_path)):
        # Anything that is not a CSV is skipped
        if not file.endswith(".csv"):
            continue
        csv_to_mongodb(os.path.join(folder_path, file), db.tweets, db.users_metadata)

## Complete the data
Collect the tweets that belong to the observation period and are referenced by the tweets already collected. Discard every tweet that originated outside the observation period.

### Support functions

In [None]:
def get_referenced_tweets(tweet_collection):
    """Returns the tweets that other tweets may reference.

    Queries replies, quotes and originals, projecting only the ObjectID, and
    returns the result as a list of {'_id': ...} documents.

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection
    """
    referable_types = ['reply', 'quote', 'original']
    cursor = tweet_collection.find({'tweet_type': {'$in': referable_types}},
                                   {'_id': 1})
    return list(cursor)

def get_referencing_tweets(tweet_collection):
    """Returns the tweets that reference another tweet.

    Queries replies, quotes and retweets and returns a list of documents with
    the ObjectID, the tweet type, both reference fields and the retweet count.

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection
    """
    referencing_types = ['reply', 'quote', 'retweet']
    projection = {
        '_id': 1,
        'tweet_type': 1,
        'retweet_or_quote_id': 1,
        'in_reply_to_status_id': 1,
        'retweet_count': 1,
    }
    cursor = tweet_collection.find({'tweet_type': {'$in': referencing_types}}, projection)
    return list(cursor)

def get_original_tweet(original_tweet_id, instance_number):
    """Consults a Status object through Twitter API.

    Keyword arguments:
    original_tweet_id -- Tweets' ObjectID
    instance_number -- the instance of keys to use

    Returns the Status object returned by tweepy, or False when the request
    fails for any reason (the reason is printed).

    A wrong `instance_number` (KeyError) or a failure while building the API
    client is NOT reported as "tweet not found": it propagates to the caller,
    because treating a configuration problem as a missing tweet would later
    cause valid tweets to be discarded.
    """
    # tweepy is not imported by the notebook's imports cell, so it is imported
    # here; a missing installation fails loudly instead of being swallowed.
    import tweepy

    botometer_instance = keys[instance_number]
    auth = tweepy.AppAuthHandler(botometer_instance.consumer_key,
                                 botometer_instance.consumer_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    # Only the request itself is guarded
    try:
        result = api.get_status(id=original_tweet_id,
                                include_entities=False,
                                tweet_mode='extended',
                                include_card_uri=False,
                                include_ext_alt_text=False,
                                include_my_retweet=False
                               )
    except Exception as e:
        # Not every exception carries an HTTP response (and the attribute may
        # be None), so the response text is read defensively.
        response = getattr(e, 'response', None)
        response_text = getattr(response, 'text', None) or ''

        # Presumably Twitter error codes for "status not found" (144), "user
        # suspended" (63) and "page does not exist" (34) -- TODO confirm.
        # Note the pattern matches these digits anywhere in the response text.
        if re.match('.*(144|63|34).*', response_text):
            print(original_tweet_id,'original tweet not found',end="; ")
        else:
            # Unexpected failure: include the error for diagnosis
            print(original_tweet_id,'original tweet not found',str(e), end="; ")
        return False

    print(original_tweet_id,'original tweet found', end="; ")
    return result

def _objid_or_none(value):
    """Converts an optional identifier into an ObjectId, keeping None as None."""
    return None if value is None else make_objid(value)


def process_tweet(result):
    """Processes a Status object, the result extracted from Twitter API, to parse info for our database format.

    Keyword arguments:
    result -- Status object received from Twitter API (its `created_at`
              attribute and `_json` dictionary are used)

    Returns a tuple (tweet, user_metadata) of dictionaries when the tweet was
    created between START_DATE and END_DATE (both inclusive), or the
    one-element tuple (False,) otherwise.

    NOTE(review): START_DATE and END_DATE are not defined in the visible part
    of this notebook; they must exist before this function is called.
    """
    twitter_date_format = "%a %b %d %H:%M:%S %z %Y"
    created_at = pd.to_datetime(result.created_at, format=twitter_date_format)

    in_observation_period = (created_at >= START_DATE) and (created_at <= END_DATE)
    if not in_observation_period:
        print('original tweet not in date range:',result.created_at, end="; ")
        return (False,)

    status = result._json
    user_result = status['user']
    tweet_id = make_objid(status['id'])
    user_id = make_objid(user_result['id'])

    # Classify the tweet. Retweets and quotes keep precedence; a tweet that
    # answers another one is labelled 'reply' (it used to stay 'original', so
    # complete_references() never followed the reply chain of fetched tweets).
    tweet_type = 'original'
    referenced_id = None
    referenced_user_id = None
    if 'retweeted_status' in status:
        tweet_type = 'retweet'
        referenced_id = make_objid(status['retweeted_status']['id'])
        referenced_user_id = make_objid(status['retweeted_status']['user']['id'])
    elif 'quoted_status' in status:
        tweet_type = 'quote'
        referenced_id = make_objid(status['quoted_status']['id'])
        referenced_user_id = make_objid(status['quoted_status']['user']['id'])
    elif status['in_reply_to_status_id'] is not None:
        tweet_type = 'reply'

    new_tweet = {
        '_id': tweet_id,
        'created_at': created_at,
        'text': status['full_text'],
        'tweet_type': tweet_type,
        'favorite_count': status['favorite_count'],
        # These two are None for tweets that are not replies
        'in_reply_to_status_id': _objid_or_none(status['in_reply_to_status_id']),
        'in_reply_to_user_id': _objid_or_none(status['in_reply_to_user_id']),
        'lang': status['lang'],
        'place': status['place'],
        'retweet_count': status['retweet_count'],
        'retweet_or_quote_id': referenced_id,
        'retweet_or_quote_user_id': referenced_user_id,
        'user_id': user_id,
    }

    # The metadata document is keyed by the TWEET id: it is a snapshot of the
    # author taken at the time of this tweet.
    new_user_metadata = {
        '_id': tweet_id,
        'created_at': created_at,
        'user_created_at': pd.to_datetime(user_result['created_at'], format=twitter_date_format),
        'user_default_profile_image': user_result['default_profile_image'],
        'user_description': user_result['description'],
        'user_favourites_count': user_result['favourites_count'],
        'user_followers_count': user_result['followers_count'],
        'user_friends_count': user_result['friends_count'],
        'user_id': user_id,
        'user_listed_count': user_result['listed_count'],
        'user_location': user_result['location'],
        'user_name': user_result['name'],
        'user_screen_name': user_result['screen_name'],
        'user_statuses_count': user_result['statuses_count'],
        'user_time_zone': user_result['time_zone'],
        # NOTE(review): the CSV import stores this field as 'user_urls';
        # confirm which key downstream code expects before unifying them.
        'user_url': user_result['url'],
        'user_verified': user_result['verified'],
    }

    return new_tweet, new_user_metadata

def add_tweet_and_user_to_mongodb(processed_tweet, tweet_collection, user_metadata_collection):
    """
    Upserts a processed tweet and its author's metadata into MongoDB.

    Both documents are stored under the tweet's ObjectID, each one in its own
    collection, and a confirmation line is printed afterwards.

    Keyword arguments:
    processed_tweet -- a tuple with both tweet and user metadata info (result of applying process_tweet)
    tweet_collection -- MongoDB Tweets' Collection
    user_metadata_collection -- MongoDB Users' Metadata Collection (containing users metadata)
    """
    tweet_doc = processed_tweet[0]
    user_doc = processed_tweet[1]
    doc_id = tweet_doc['_id']

    targets = (
        (tweet_collection, tweet_doc),
        (user_metadata_collection, user_doc),
    )
    for collection, document in targets:
        collection.update_one({'_id': doc_id}, {'$set': document}, upsert=True)

    print('original tweet added (', tweet_doc['created_at'],')')

def get_tweet_info(tweet):
    """
    Returns the tuple (tweet type, tweet id, referenced tweet id).

    The referenced id comes from 'retweet_or_quote_id' for retweets and
    quotes, from 'in_reply_to_status_id' for replies, and is None for any
    other tweet type.

    Keyword arguments:
    tweet -- tweet extracted from MongoDB
    """
    kind = tweet['tweet_type']
    own_id = tweet['_id']

    referenced = None
    if kind in ('retweet', 'quote'):
        referenced = tweet['retweet_or_quote_id']
    elif kind == 'reply':
        referenced = tweet['in_reply_to_status_id']

    return kind, own_id, referenced

def get_referenced_id(tweet):
    """
    Gets the referenced id of the tweet.
    If it is retweet or quote, retweet_or_quote_id is returned.
    If it is reply, in_reply_to_status_id is returned.
    Otherwise, if original, return None.

    Only 'tweet_type' and the relevant reference field are read, so the
    document does not need to carry its own '_id'.

    Keyword arguments:
    tweet -- tweet extracted from MongoDB
    """
    tweet_type = tweet['tweet_type']

    if tweet_type in ('retweet', 'quote'):
        return tweet['retweet_or_quote_id']
    if tweet_type == 'reply':
        return tweet['in_reply_to_status_id']
    return None

### Complete database with missing references

<ol>
<li>If the referenced tweet is in our database --> OK</li>
<li>If the referenced tweet is not in our database --> collect it from Twitter:
    <ol type="a">
        <li>If it is not found (erased, private account, etc.) --> nothing</li>
        <li>If it is found, but out of our date range (10-04-2019 --- 10-11-2019) --> nothing</li>
        <li>If it is found and within our time window (10-04-2019 --- 10-11-2019):
            <ol type="i">
                <li>If it is an original tweet --> add to database</li>
                <li>If it is a reply or quote --> recursive call (1)</li>
            </ol>
        </li>
    </ol>
</li>
</ol>

In [None]:
# main function for completing database
def complete_references(tweet,key,original_ids_processed,original_ids):
    """
    Completes the references of a tweet in MongoDB by checking if retweets, quotes and replies referenced tweets are also in our database.

    1. If the referenced tweet is in our database --> OK
    2. If the referenced tweet is not in our database --> collect it from Twitter:
        2.a If it is not found (erased, private account, etc.) --> nothing
        2.b If it is found, but out of our date range (10-04-2019 --- 10-11-2019) --> nothing
        2.c If it is found and within our time window (10-04-2019 --- 10-11-2019):
            2.c.1 If it is an original tweet --> add to database
            2.c.2 If it is a reply or quote --> recursive call (1)

    Keyword arguments:
    tweet -- tweet extracted from MongoDB
    key -- index of the API credentials to use (forwarded to get_original_tweet)
    original_ids_processed -- dictionary {referenced ObjectID: bool} with the outcome of the references
                              already resolved (cache for speed and efficiency); it is updated in place
    original_ids -- collection (list or set) of Tweets' ObjectIDs existing in database

    Returns True when the referenced tweet is available in the database
    (already present or just added), False otherwise.
    """
    tweet_type, tweet_id, original_tweet_id = get_tweet_info(tweet)

    # original tweet resolved in a previous iteration
    if original_tweet_id in original_ids_processed:
        return original_ids_processed[original_tweet_id]

    # a tweet without a referenced id cannot be completed: do not spend an
    # API request (and rate limit) on a lookup that cannot succeed
    if original_tweet_id is None:
        original_ids_processed[original_tweet_id] = False
        return False

    # original tweet in database
    if original_tweet_id in original_ids:
        original_ids_processed[original_tweet_id] = True
        return True

    # original tweet should be collected
    original_tweet = get_original_tweet(original_tweet_id,key)

    # the original tweet could not be collected
    if original_tweet is False:
        original_ids_processed[original_tweet_id] = False
        return False

    processed_tweet = process_tweet(original_tweet)

    # the original tweet is outside our time window
    if processed_tweet[0] is False:
        original_ids_processed[original_tweet_id] = False
        return False

    # before adding the tweet, its references (in case of reply or quote) should be also completed
    referenced_tweet_type, _, _ = get_tweet_info(processed_tweet[0])
    if referenced_tweet_type in ['reply','quote']:
        references_completed = complete_references(processed_tweet[0],key,original_ids_processed,original_ids)   # recursive method
    else:
        references_completed = True

    # the tweet is stored only when its whole chain of references is available
    if references_completed:
        add_tweet_and_user_to_mongodb(processed_tweet, db.tweets, db.users_metadata)

    original_ids_processed[original_tweet_id] = references_completed

    return references_completed

In [None]:
# ObjectIDs of the tweets that can be referenced (originals, quotes, replies),
# kept as a plain list of ids
referenced_tweets = [rt['_id'] for rt in get_referenced_tweets(db.tweets)]
# Documents of the tweets that can reference other tweets (retweets, quotes, replies)
referencing_tweets = get_referencing_tweets(db.tweets)

In [None]:
import pickle                  # needed for the cache backups; not imported in the imports cell

original_ids_processed = {}    # cache {referenced ObjectID: bool} of the references already resolved
original_ids = set(referenced_tweets)   # ids already in database; a set gives constant-time membership tests
key=0                          # key to use in Twitter API calls
iterations=0                   # counter of iterations for intermediate backups of the cache
number_of_keys = len(keys)     # at least one; derived from `keys` so that the rotation never selects a missing key

# complete references of referencing tweets (retweets, quotes and replies)
for tweet in progress_bar(referencing_tweets):
    complete_references(tweet,key,original_ids_processed,original_ids)
    key = (key+1)%number_of_keys  # iterate over available keys to manage API limits

    # as this is a slow process, we save from time to time the cache of the tweets already processed
    if iterations%100000==0:
        with open('ids_processed.pickle', 'wb') as handle:
            pickle.dump(original_ids_processed, handle, protocol=pickle.HIGHEST_PROTOCOL)

    iterations+=1

# final backup, so that the saved cache also covers the iterations after the last periodic save
with open('ids_processed.pickle', 'wb') as handle:
    pickle.dump(original_ids_processed, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Remove tweets without references

Once the previous steps have finished, the tweets whose referenced tweet is still missing from the database are removed.


In [None]:
# get referenced tweets and build DataFrame with associated ObjectIDs
# (the collection must be passed explicitly: the helper has no default)
referenced_tweets = [rt['_id'] for rt in get_referenced_tweets(db.tweets)]
# building from a dictionary names the column directly and also works when the list is empty
df_referenced = pd.DataFrame({'_id': referenced_tweets})

# get referencing tweets and build DataFrame with associated ObjectIDs and referenced ObjectIDs
referencing_tweets = get_referencing_tweets(db.tweets)
df_referencing = pd.DataFrame({
    '_id': [rt['_id'] for rt in referencing_tweets],
    'referenced_id': [get_referenced_id(rt) for rt in referencing_tweets],
})

In [None]:
# working copies that shrink as tweets are removed
df_updated_referenced = df_referenced
df_updated_referencing = df_referencing

# referencing tweets whose referenced id is not among the referenced tweets must be removed
orphan_mask = ~df_updated_referencing['referenced_id'].isin(df_updated_referenced['_id'])
df_remove = df_updated_referencing[orphan_mask]
tweets_without_reference = len(df_remove)

iterations=0
# removing a tweet may leave other tweets without their reference,
# so the process is repeated until nothing else has to be removed
while tweets_without_reference > 0:

    # one delete per tweet of df_remove; the same operations are applied to
    # both collections because users_metadata is keyed by tweet id as well
    operations = [DeleteOne({'_id': tweet_to_delete}) for tweet_to_delete in df_remove['_id'].values]

    results = db.tweets.bulk_write(operations)
    print("tweets M:", str(results.matched_count).rjust(8, " "),
          " D:", str(results.deleted_count).rjust(8, " "), end='; ')
    results = db.users_metadata.bulk_write(operations)
    print("users metadata M:", str(results.matched_count).rjust(8, " "),
          " D:", str(results.deleted_count).rjust(8, " "))

    # drop the deleted tweets from both working DataFrames
    removed_ids = df_remove['_id']
    df_updated_referencing = df_updated_referencing[~df_updated_referencing['_id'].isin(removed_ids)]
    df_updated_referenced = df_updated_referenced[~df_updated_referenced['_id'].isin(removed_ids)]

    # look again for tweets without reference (their referenced tweet may have just been deleted)
    orphan_mask = ~df_updated_referencing['referenced_id'].isin(df_updated_referenced['_id'])
    df_remove = df_updated_referencing[orphan_mask]
    tweets_without_reference = len(df_remove)
    print(tweets_without_reference)

    # update control variables
    iterations+=1

print('DONE!',iterations,"were necessary for database to recursively converge!")

## Update users collection according to new database status

In [None]:
def get_user_ids(user_collection):
    """
    Returns every document of the users collection projected to its ObjectID.

    The result is a list of {'_id': ...} dictionaries; its length is printed.

    Keyword arguments:
    user_collection -- MongoDB Users' Collection
    """
    cursor = user_collection.find({}, {'_id': 1})
    ids = [*cursor]
    print(len(ids),'users extracted!')
    return ids

def get_tweet_user_ids(tweet_collection):
    """
    Returns the distinct authors found in the tweets collection.

    Tweets are grouped by 'user_id', so the result is a list of
    {'_id': <user ObjectID>} dictionaries; its length is printed.

    Keyword arguments:
    tweet_collection -- MongoDB Tweets' Collection
    """
    group_by_author = {'$group': {'_id': '$user_id'}}
    ids = [*tweet_collection.aggregate([group_by_author])]
    print(len(ids),'users with tweets extracted!')
    return ids

### Remove those users with no interactions
These users only had interactions that have since been deleted, so they can be safely discarded.

In [None]:
# get a list of users' ObjectIDs from user collection
# (`columns` guarantees that the '_id' column exists even when the collection is empty)
user_ids = get_user_ids(db.users)
df_user_ids = pd.DataFrame(user_ids, columns=['_id'])

# get a list of users' ObjectIDs from tweet collection (authorships)
tweet_user_ids = get_tweet_user_ids(db.tweets)
df_tweet_user_ids = pd.DataFrame(tweet_user_ids, columns=['_id'])

# we should remove those users from user collection that finally do not have interactions
df_users_remove = df_user_ids[~df_user_ids['_id'].isin(df_tweet_user_ids['_id'])]
df_users_remove.info()

In [None]:
# update MongoDB: delete the users that no longer author any tweet
operations = [DeleteOne({'_id': user_to_delete}) for user_to_delete in df_users_remove['_id'].values]
if operations:
    results = db.users.bulk_write(operations)
    print("users D:", str(results.deleted_count).rjust(8, " "))
else:
    # bulk_write() rejects an empty list of operations
    print("users D:", "0".rjust(8, " "))

### Create the new users

In [None]:
# get a list of users' ObjectIDs from user collection
# (`columns` guarantees that the '_id' column exists even when the collection is empty)
user_ids = get_user_ids(db.users)
df_user_ids = pd.DataFrame(user_ids, columns=['_id'])

# get a list of users' ObjectIDs from tweet collection (authorships)
tweet_user_ids = get_tweet_user_ids(db.tweets)
df_tweet_user_ids = pd.DataFrame(tweet_user_ids, columns=['_id'])

# in this case, we add those new users that have been added due to the process of completing references
df_users_add = df_tweet_user_ids[~df_tweet_user_ids['_id'].isin(df_user_ids['_id'])]
df_users_add.info()

In [None]:
# update MongoDB: create a document for every tweet author missing from the users collection.
# New users get 'scores' set to -1 (presumably a "not scored yet" marker -- TODO confirm).
operations = [UpdateOne({'_id': user_to_add},
                        {'$set': {'scores': -1}},
                        upsert=True)
              for user_to_add in df_users_add['_id'].values]
if operations:
    results = db.users.bulk_write(operations)
    print("users U:", str(results.upserted_count).rjust(8, " "))
else:
    # bulk_write() rejects an empty list of operations
    print("users U:", "0".rjust(8, " "))