## Configuration

### Imports

In [124]:
# Utilities
from IPython.display import display
from fastprogress import master_bar, progress_bar
import os
import ntpath
import numpy as np
import pandas as pd
from collections.abc import MutableMapping

# MongoDB functionality
from pymongo.errors import BulkWriteError
from pymongo import MongoClient, InsertOne
#from pymongo.bulk import BulkOperationBuilder# -> deprecated

# Indexes
import uuid
from bson import ObjectId

### Variables

In [125]:
# Directories where CSV data is stored
#ROOT_DIR = "/home/mattia/javier/botbusters-spanish-general-elections-network-analysis/"
ROOT_DIR = "/home/camilo/Documents/universidad/sem_8/forense/final/botbusters-spanish-general-elections-network-analysis/"
DATA_DIR = ROOT_DIR + "data/"

# Change path to root
os.chdir(ROOT_DIR)


# MongoDB parameters
mongoclient = MongoClient('localhost', 27017)
#db = mongoclient.influence
#db = mongoclient.test_spanish_election
db = mongoclient['test_spanish_election']

In [126]:
#read the collection myCollection of my mongodb
collection = db['myCollection']
x = collection.find_one()
print(x)



{'_id': ObjectId('6518b1d2700cf62b1e433e5e'), 'x': 1}


### Support Functions

In [127]:
def make_uuid(uuid_str):
    """Makes an UUID from string
    
    Keyword arguments:
    uuid_str -- uuid string to be converted into UUID
    """
    uuid_str = str(uuid_str)
    if not uuid_str.strip():
        return None
    try:
        return uuid.UUID(uuid_str)
    except Exception as ex:
        #print(uuid_str, ex)
        return None
    
def make_objid(text):
    """Makes an ObjectId of 4 bytes
    
    Keyword arguments:
    text -- string to be converted into Object ID
    """
    text = str(text)
    if not text.strip():
        return None
    try:
        return ObjectId(text.rjust(24,"0"))
    except Exception as ex:
        print(text, ex)
        return None
    
def df_to_mongodb(df, collection):
    """Saves the dataframe in a MongoDB collection

    Keyword arguments:
    df -- dataframe to dump
    collection -- MongoDB collection to fulfill
    """
    try:
 
        if df is None:
            return

        print("Preparing DB operations...", end=" ")
        records = df.to_dict('records')
        
        operations = []
        for record in progress_bar(records):
            operations.append(InsertOne(record))  

            if len(operations) > 20000:
                results = collection.bulk_write(operations)
                print("M:", str(results.matched_count).rjust(8, " "),
                      " I:", str(results.inserted_count).rjust(8, " "),
                      " U:", str(results.upserted_count).rjust(8, " "))
                operations = []

        if len(operations) > 0: 
            results = collection.bulk_write(operations)
            print("M:", str(results.matched_count).rjust(8, " "),
                  " I:", str(results.inserted_count).rjust(8, " "),
                  " U:", str(results.upserted_count).rjust(8, " "))

    except Exception as e:
        print("Exception. Message:", e)

## Load CSV in MongoDB

### Tweets collection

In [128]:
tweets_columns = {
    # tweets
    0: str,
    1: str,
    2: 'Int64',
    3: str,
    4: str,
    5: str
}

def read_tweets(filename):
    """Parses the tweets CSV returning a DataFrame.
    
    Keyword arguments:
    filename -- name of the CSV
    """
    print("Processing", ntpath.basename(filename), end="\t")

    df = pd.read_csv(filename, 
                     header=0,
                     sep=',',
                     low_memory=False, 
                     keep_default_na=True, 
                     dtype=tweets_columns, 
                     #usecols=[0,9,11,1,5,6,10],
                     usecols=[0,1,5,6,9,10],
                     #names=['_id', 'user_id', 'source', 'date', 'retweets', 'tweet_id', 'url'])
                     names=['_id', 'date', 'retweets', 'tweet_id', 'user_id', 'url'])   

    print("CSV", end=" ")

    display(df.head(5))
    print("#:", len(df), end=" entries; ")

    
    # # Discard URL entries
    # df = df[df.source=='twitter']
    # df.drop(columns=['source'],inplace=True)
    # print("#:", len(df), end=" entries; ")
    
    # Make index as UUID
    #df['_id'] = df['_id'].apply(make_uuid)
    print("#:", len(df), end=" entries; ")
    
    # # # Create ObjectIDs to avoid any potential issue
    # df['user_id'] = df['user_id'].apply(make_objid)
    # df['tweet_id'] = df['tweet_id'].apply(make_objid)
    # # print("#:", len(df), end=" entries; ")

    # df['user_id'] = df['user_id'].apply(make_uuid)
    # df['tweet_id'] = df['tweet_id'].apply(make_uuid)
    # print("#:", len(df), end=" entries; ")

    # Make datetime objects
    df['date'] = pd.to_datetime(df['date'], infer_datetime_format=False, format="%Y-%m-%d %H:%M:%S")
    print("#:", len(df), end=" entries; ")
    
    # Force integer
    df['retweets'] = df['retweets'].astype('int')
    print("#:", len(df), end=" entries; ")

    print("OK", end="; ")

    print("#:", len(df), end=" entries; ")

    return df

In [129]:
%%time
df_tweets = read_tweets(DATA_DIR+'dataset/quarter_tweets.csv')
display(df_tweets.head(5))
#display(df_tweets)
collection = db['tweets']
collection.delete_many({})
df_to_mongodb(df_tweets, collection)
#db.collection.insert_many(df_tweets.to_dict('records'))

Processing quarter_tweets.csv	

CSV 

Unnamed: 0,_id,date,retweets,tweet_id,user_id,url
0,0000003C572F4897B02ED634986A2FC0,2019-10-28T12:00:00.000Z,746,9FA5D424C04C4A1FAAE1F13CB48CEF3A,A145D4F25EF1453A82DA85E11660F875,0.078595
1,0000038959814291A01F22AD21B450E5,2019-10-08T12:00:00.000Z,974,94FFCF90FA3346FDB026D38C819499D8,637490CBEB4E4921BBE486F32A6ACC9D,0.049035
2,0000097D875B42A0B41993B7F4978057,2019-11-07T12:00:00.000Z,48,905885A62EE2469DB921591055E69076,C8A8AB4E56474B8C8904462CE93B9783,0.018845
3,000009C5DD7F4F67BD90F29AC8FAD12C,2019-10-14T12:00:00.000Z,1,,16BF765B2C51417780F3C9EB0F597EA4,0.018031
4,00000AE070FE4638A9D8F393A0B1A5B9,2019-11-10T12:00:00.000Z,1483,2817E3D23C0544309B9C142E710D3F56,986064C267C642C5B02A2DA0646E9D9B,0.121183


#: 582665 entries; #: 582665 entries; #: 582665 entries; #: 582665 entries; OK; #: 582665 entries; 

Unnamed: 0,_id,date,retweets,tweet_id,user_id,url
0,0000003C572F4897B02ED634986A2FC0,2019-10-28 12:00:00+00:00,746,9FA5D424C04C4A1FAAE1F13CB48CEF3A,A145D4F25EF1453A82DA85E11660F875,0.078595
1,0000038959814291A01F22AD21B450E5,2019-10-08 12:00:00+00:00,974,94FFCF90FA3346FDB026D38C819499D8,637490CBEB4E4921BBE486F32A6ACC9D,0.049035
2,0000097D875B42A0B41993B7F4978057,2019-11-07 12:00:00+00:00,48,905885A62EE2469DB921591055E69076,C8A8AB4E56474B8C8904462CE93B9783,0.018845
3,000009C5DD7F4F67BD90F29AC8FAD12C,2019-10-14 12:00:00+00:00,1,,16BF765B2C51417780F3C9EB0F597EA4,0.018031
4,00000AE070FE4638A9D8F393A0B1A5B9,2019-11-10 12:00:00+00:00,1483,2817E3D23C0544309B9C142E710D3F56,986064C267C642C5B02A2DA0646E9D9B,0.121183


Preparing DB operations... 

M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0 

### Users collection

In [130]:
users_columns = {
    # users
    0: str,
    1: str
}

def read_users(filename):
    """Parses the users CSV returning a DataFrame.
    
    Keyword arguments:
    filename -- name of the CSV
    """
    print("Processing", ntpath.basename(filename), end="\t")
    # df = pd.read_csv(filename, 
    #                  low_memory=False, 
    #                  keep_default_na=True, 
    #                  dtype=users_columns, 
    #                  names=['_id', 'username'])

    df = pd.read_csv(filename, 
                     header=0,
                     sep=',',
                     low_memory=False, 
                     keep_default_na=True, 
                     dtype=tweets_columns, 
                     usecols=[0,9],
                     names=['_id', 'username'])

    print("CSV", end=" ")

    # Make index as UUID
    #df['_id'] = df['_id'].apply(make_objid)
    print("OK", end="; ")
    print("#:", len(df), end=" entries; ")
    return df

In [133]:
%%time
df_users = read_users(DATA_DIR+'dataset/quarter_tweets.csv')
display(df_users.head(5))
collection = db['users']
collection.delete_many({})
df_to_mongodb(df_users, collection)

Processing quarter_tweets.csv	

CSV OK; #: 582665 entries; 

Unnamed: 0,_id,username
0,0000003C572F4897B02ED634986A2FC0,A145D4F25EF1453A82DA85E11660F875
1,0000038959814291A01F22AD21B450E5,637490CBEB4E4921BBE486F32A6ACC9D
2,0000097D875B42A0B41993B7F4978057,C8A8AB4E56474B8C8904462CE93B9783
3,000009C5DD7F4F67BD90F29AC8FAD12C,16BF765B2C51417780F3C9EB0F597EA4
4,00000AE070FE4638A9D8F393A0B1A5B9,986064C267C642C5B02A2DA0646E9D9B


Preparing DB operations... 

M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0 

### Hashtags collection

In [134]:
hashtags_columns = {
    # hashtags
    0: str,
    1: str
}

def read_hashtags(filename):
    """Parses the hashtags CSV returning a DataFrame.
    
    Keyword arguments:
    filename -- name of the CSV
    """
    print("Processing", ntpath.basename(filename), end="\t")
    # df = pd.read_csv(filename, 
    #                  low_memory=False, 
    #                  keep_default_na=True, 
    #                  dtype=hashtags_columns, 
    #                  names=['tweet_id', 'hashtag'])
    df = pd.read_csv(filename, 
                     header=0,
                     sep=',',
                     low_memory=False, 
                     keep_default_na=True, 
                     dtype=tweets_columns, 
                     usecols=[6,8],
                     names=['tweet_id', 'hashtag'])
    print("CSV", end=" ")

    # Make index
    #df['tweet_id'] = df['tweet_id'].apply(make_uuid)
    print("OK", end="; ")
    print("#:", len(df), end=" entries; ")
    return df

In [135]:
%%time
df_hashtags = read_hashtags(DATA_DIR+'dataset/quarter_tweets.csv')
display(df_hashtags.head(5))
collection = db['hashtags']
collection.delete_many({})
df_to_mongodb(df_hashtags, collection)

Processing quarter_tweets.csv	CSV OK; #: 582665 entries; 

Unnamed: 0,tweet_id,hashtag
0,9FA5D424C04C4A1FAAE1F13CB48CEF3A,retweet
1,94FFCF90FA3346FDB026D38C819499D8,retweet
2,905885A62EE2469DB921591055E69076,retweet
3,,original
4,2817E3D23C0544309B9C142E710D3F56,retweet


Preparing DB operations... 

M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0 

### Mentions collection

In [136]:
mentions_columns = {
    # hashtags
    0: str,
    1: str
}

def read_mentions(filename):
    """Parses the mentions CSV returning a DataFrame.
    
    Keyword arguments:
    filename -- name of the CSV
    """
    print("Processing", ntpath.basename(filename), end="\t")
    # df = pd.read_csv(filename, 
    #                  low_memory=False, 
    #                  keep_default_na=True, 
    #                  dtype=mentions_columns, 
    #                  names=['user_id','tweet_id'])
    df = pd.read_csv(filename, 
                     header=0,
                     sep=',',
                     low_memory=False, 
                     keep_default_na=True, 
                     dtype=tweets_columns, 
                     usecols=[9,7],
                     names=['user_id', 'tweet_id'])
    print("CSV", end=" ")

    # Make indexes
    #df['tweet_id'] = df['tweet_id'].apply(make_uuid)
    #df['user_id'] = df['user_id'].apply(make_objid)

    print("OK", end="; ")
    print("#:", len(df), end=" entries; ")
    return df

In [137]:
%%time
df_mentions = read_mentions(DATA_DIR+'dataset/quarter_tweets.csv')
display(df_mentions.head(5))
collection = db['mentions']
collection.delete_many({})
df_to_mongodb(df_mentions, collection)

Processing quarter_tweets.csv	CSV OK; #: 582665 entries; 

Unnamed: 0,user_id,tweet_id
0,81CF6091B67C4FED85DAD0602F27CE2C,A145D4F25EF1453A82DA85E11660F875
1,9D37C222FA614DCAB07D282769CFBAF2,637490CBEB4E4921BBE486F32A6ACC9D
2,120464FCFBD347C094721218B577062F,C8A8AB4E56474B8C8904462CE93B9783
3,,16BF765B2C51417780F3C9EB0F597EA4
4,AF38ED69FC164512B47720DF3C7A74CE,986064C267C642C5B02A2DA0646E9D9B


Preparing DB operations... 

M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0 

### Retweets collection

In [138]:
retweets_columns = {
    0: str,
    1: str,
    2: str,
    3: str
}

def read_retweets(filename):
    """Parses the retweets CSV returning a DataFrame.
    
    Keyword arguments:
    filename -- name of the CSV
    """
    print("Processing", ntpath.basename(filename), end="\t")
    # df = pd.read_csv(filename, 
    #                  low_memory=False, 
    #                  keep_default_na=True, 
    #                  dtype=retweets_columns, 
    #                  names=['_id', 'tweet_id', 'user_id', 'date']
    #                 )
    
    df = pd.read_csv(filename, 
                     header=0,
                     sep=',',
                     low_memory=False, 
                     keep_default_na=True, 
                     dtype=tweets_columns, 
                     usecols=[0,1,6,9],
                     names=['_id', 'date','tweet_id', 'user_id'])
    
    print("CSV", end=" ")

    # Make index
    #df['_id'] = df['_id'].apply(make_objid)
    #df['tweet_id'] = df['tweet_id'].apply(make_objid)
    #df['user_id'] = df['user_id'].apply(make_objid)
    
    df['date'] = pd.to_datetime(df['date'], infer_datetime_format=False, format="%Y-%m-%d %H:%M:%S")


    print("OK", end="; ")
    print("#:", len(df), end=" entries; ")
    return df

In [139]:
%%time
df_retweets = read_retweets(DATA_DIR+'dataset/quarter_tweets.csv')
display(df_retweets.head(5))
collection = db['retweets']
collection.delete_many({})
df_to_mongodb(df_retweets, collection)

Processing quarter_tweets.csv	CSV OK; #: 582665 entries; 

Unnamed: 0,_id,date,tweet_id,user_id
0,0000003C572F4897B02ED634986A2FC0,2019-10-28 12:00:00+00:00,9FA5D424C04C4A1FAAE1F13CB48CEF3A,A145D4F25EF1453A82DA85E11660F875
1,0000038959814291A01F22AD21B450E5,2019-10-08 12:00:00+00:00,94FFCF90FA3346FDB026D38C819499D8,637490CBEB4E4921BBE486F32A6ACC9D
2,0000097D875B42A0B41993B7F4978057,2019-11-07 12:00:00+00:00,905885A62EE2469DB921591055E69076,C8A8AB4E56474B8C8904462CE93B9783
3,000009C5DD7F4F67BD90F29AC8FAD12C,2019-10-14 12:00:00+00:00,,16BF765B2C51417780F3C9EB0F597EA4
4,00000AE070FE4638A9D8F393A0B1A5B9,2019-11-10 12:00:00+00:00,2817E3D23C0544309B9C142E710D3F56,986064C267C642C5B02A2DA0646E9D9B


Preparing DB operations... 

M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0  I:    20001  U:        0
M:        0 

## Check data coherence

In [140]:
def flatten(d, parent_key='', sep='_'):
    """Formats MongoDB results
    
    Keyword arguments:
    d -- dictionary with key and uncleaned values
    parent_key --
    sep --
    """
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

## Load CSV in MongoDB

### Tweets

In [141]:
def get_tweets(collection):
    """
    Gets tweets
    
    collection - Tweets MongoDB collection
    """
    tweets = list(collection.find({},
                                 {'_id' : True, 'user_id' : True, 'tweet_id' : True}))
    
    print("Number of tweets in DB:", len(tweets))
    tweets = [flatten(t) for t in tweets]
    df_tweets = pd.DataFrame(tweets)
    return df_tweets

In [142]:
%%time
df_tweets = get_tweets(db.tweets)
display(df_tweets.head(5))

Number of tweets in DB: 582665


Unnamed: 0,_id,tweet_id,user_id
0,0000003C572F4897B02ED634986A2FC0,9FA5D424C04C4A1FAAE1F13CB48CEF3A,A145D4F25EF1453A82DA85E11660F875
1,0000038959814291A01F22AD21B450E5,94FFCF90FA3346FDB026D38C819499D8,637490CBEB4E4921BBE486F32A6ACC9D
2,0000097D875B42A0B41993B7F4978057,905885A62EE2469DB921591055E69076,C8A8AB4E56474B8C8904462CE93B9783
3,000009C5DD7F4F67BD90F29AC8FAD12C,,16BF765B2C51417780F3C9EB0F597EA4
4,00000AE070FE4638A9D8F393A0B1A5B9,2817E3D23C0544309B9C142E710D3F56,986064C267C642C5B02A2DA0646E9D9B


CPU times: user 1.62 s, sys: 233 ms, total: 1.85 s
Wall time: 2.38 s


### Users

In [143]:
def get_users(collection):
    """
    Gets users
    
    collection - Users MongoDB collection
    """
    users = list(collection.find({},
                                 {'_id' : True}))
    
    print("Number of users in DB:", len(users))
    users = [flatten(u) for u in users]
    df_users = pd.DataFrame(users)
    return df_users

In [144]:
%%time
df_users = get_users(db.users)
display(df_users.head(5))

Number of users in DB: 582665


Unnamed: 0,_id
0,0000003C572F4897B02ED634986A2FC0
1,0000038959814291A01F22AD21B450E5
2,0000097D875B42A0B41993B7F4978057
3,000009C5DD7F4F67BD90F29AC8FAD12C
4,00000AE070FE4638A9D8F393A0B1A5B9


CPU times: user 979 ms, sys: 3.33 ms, total: 982 ms
Wall time: 1.33 s


#### Checking tweets without users

In [145]:
%%time
tweets_without_users =  df_tweets[~df_tweets.user_id.isin(df_users._id)]
print(len(tweets_without_users))

582665
CPU times: user 365 ms, sys: 0 ns, total: 365 ms
Wall time: 369 ms


#### Checking users without tweets

In [146]:
%%time
users_without_tweets =  df_users[~df_users._id.isin(df_tweets.user_id)]
print(len(users_without_tweets))

582665
CPU times: user 161 ms, sys: 0 ns, total: 161 ms
Wall time: 160 ms


### Retweets

In [147]:
def get_retweets(collection):
    """
    Gets retweets
    
    collection - Retweets MongoDB collection
    """
    retweets = list(collection.find({},
                                 {'_id': False, 'tweet_id' : True, 'user_id' : True}))
    
    print("Number of retweets in DB:", len(retweets))
    retweets = [flatten(r) for r in retweets]
    df_retweets = pd.DataFrame(retweets)
    return df_retweets

In [148]:
%%time
df_retweets = get_retweets(db.retweets)
display(df_retweets.head(5))

Number of retweets in DB: 582665


Unnamed: 0,tweet_id,user_id
0,9FA5D424C04C4A1FAAE1F13CB48CEF3A,A145D4F25EF1453A82DA85E11660F875
1,94FFCF90FA3346FDB026D38C819499D8,637490CBEB4E4921BBE486F32A6ACC9D
2,905885A62EE2469DB921591055E69076,C8A8AB4E56474B8C8904462CE93B9783
3,,16BF765B2C51417780F3C9EB0F597EA4
4,2817E3D23C0544309B9C142E710D3F56,986064C267C642C5B02A2DA0646E9D9B


CPU times: user 1.47 s, sys: 43.5 ms, total: 1.51 s
Wall time: 1.91 s


#### Checking retweets without referenced tweet

In [149]:
%%time
retweets_without_tweet =  df_retweets[~df_retweets.tweet_id.isin(df_tweets.tweet_id)]
print(len(retweets_without_tweet))

0
CPU times: user 226 ms, sys: 47 µs, total: 226 ms
Wall time: 223 ms


#### Checking retweets without user

In [150]:
%%time
retweets_without_user =  df_retweets[~df_retweets.user_id.isin(df_users._id)]
print(len(retweets_without_user))

582665
CPU times: user 281 ms, sys: 0 ns, total: 281 ms
Wall time: 278 ms


### Mentions

In [151]:
def get_mentions(collection):
    """
    Gets mentions
    
    collection - Mentions MongoDB collection
    """
    mentions = list(collection.find({},
                                 {'_id': False, 'tweet_id' : True, 'user_id' : True}))
    
    print("Number of mentions in DB:", len(mentions))
    mentions = [flatten(m) for m in mentions]
    df_mentions = pd.DataFrame(mentions)
    return df_mentions

In [152]:
%%time
df_mentions = get_mentions(db.mentions)
display(df_mentions.head(5))

Number of mentions in DB: 582665


Unnamed: 0,user_id,tweet_id
0,81CF6091B67C4FED85DAD0602F27CE2C,A145D4F25EF1453A82DA85E11660F875
1,9D37C222FA614DCAB07D282769CFBAF2,637490CBEB4E4921BBE486F32A6ACC9D
2,120464FCFBD347C094721218B577062F,C8A8AB4E56474B8C8904462CE93B9783
3,,16BF765B2C51417780F3C9EB0F597EA4
4,AF38ED69FC164512B47720DF3C7A74CE,986064C267C642C5B02A2DA0646E9D9B


CPU times: user 1.28 s, sys: 48 ms, total: 1.33 s
Wall time: 1.72 s


#### Checking mentions without referenced tweet

In [153]:
%%time
mentions_without_tweet =  df_mentions[~df_mentions.tweet_id.isin(df_tweets._id)]
print(len(mentions_without_tweet))

582665
CPU times: user 346 ms, sys: 196 µs, total: 346 ms
Wall time: 340 ms


#### Checking mentions without user

In [154]:
%%time
mentions_without_user =  df_mentions[~df_mentions.user_id.isin(df_users._id)]
print(len(mentions_without_user))

582665
CPU times: user 217 ms, sys: 0 ns, total: 217 ms
Wall time: 214 ms


### Mentions

In [155]:
def get_hashtags(collection):
    """
    Gets mentions
    
    collection - Hashtags MongoDB collection
    """
    hashtags = list(collection.find({},
                                 {'_id': False, 'tweet_id' : True}))
    
    print("Number of hashtags in DB:", len(hashtags))
    hashtags = [flatten(h) for h in hashtags]
    df_hashtags = pd.DataFrame(hashtags)
    return df_hashtags

In [156]:
%%time
df_hashtags = get_hashtags(db.hashtags)
display(df_hashtags.head(5))

Number of hashtags in DB: 582665


Unnamed: 0,tweet_id
0,9FA5D424C04C4A1FAAE1F13CB48CEF3A
1,94FFCF90FA3346FDB026D38C819499D8
2,905885A62EE2469DB921591055E69076
3,
4,2817E3D23C0544309B9C142E710D3F56


CPU times: user 901 ms, sys: 68.6 ms, total: 970 ms
Wall time: 1.37 s


#### Checking hashtags without tweet

In [157]:
%%time
hashtags_without_tweet =  df_hashtags[~df_hashtags.tweet_id.isin(df_tweets._id)]
print(len(hashtags_without_tweet))

536965
CPU times: user 265 ms, sys: 0 ns, total: 265 ms
Wall time: 263 ms


## Other checkings

#### Freq of each user

In [158]:
freq = df_tweets.user_id.value_counts()

In [159]:
freq2 = df_retweets.user_id.value_counts()

In [160]:
freq2.head(10)

F70EAAA8F23A48CDA4785F63FF7C4F71    353
7EEB0131C0A84616886627F3E082AD0C    305
4D350C3C7EAA429ABBA65E8F1AC80A91    285
8BE7408EAA85416C99B172A8C921C31A    253
E954A7EC8B9848CF8FE6B1FC58F46DF3    245
5F5AF98400EF49308E183E3367C64941    230
19BC12B3E3174FBBBC01141F537E4227    224
833163C348D9465F9139EFF896F3AEC4    205
1E6F94B0E9DE40878589FA322FF08BE5    202
9B8A6701440D4E0C87643F4AF48D0501    200
Name: user_id, dtype: int64

In [161]:
fsum = freq.add(freq2, fill_value=0)
display(fsum.head(10))

F70EAAA8F23A48CDA4785F63FF7C4F71    706
7EEB0131C0A84616886627F3E082AD0C    610
4D350C3C7EAA429ABBA65E8F1AC80A91    570
8BE7408EAA85416C99B172A8C921C31A    506
E954A7EC8B9848CF8FE6B1FC58F46DF3    490
5F5AF98400EF49308E183E3367C64941    460
19BC12B3E3174FBBBC01141F537E4227    448
833163C348D9465F9139EFF896F3AEC4    410
1E6F94B0E9DE40878589FA322FF08BE5    404
9B8A6701440D4E0C87643F4AF48D0501    400
Name: user_id, dtype: int64

In [162]:
fsum.to_pickle(path=DATA_DIR+'users_freq.pickle')