In [1]:
import numpy as np
import pandas as pd
import itertools
import json
import os
import re
import time
import pickle
from collections import Counter

# Network data pre-processing
## **SNLP team project**
In this notebook we construct the network representation.

In [2]:
def pre_process_dataframe(data):    
    # Restructure tweets
    data['id_str']                  = data['id_str'].astype(str)
    data['in_reply_to_user_id_str'] = data['in_reply_to_user_id_str'].astype(str)
    data['user_id_str']             = [row['id_str'] for row in data['user']]
    data['is_retweet']              = [1 if b else 0 for b in data['retweeted_status'].notnull()]

    parent_tweet_ids = []
    parent_user_ids  = []

    for index, row in data.iterrows():
        if row['is_retweet']:
            tweet = row['retweeted_status']
            parent_tweet_ids.append(tweet['id_str'])
            parent_user_ids.append(tweet['user']['id_str'])
        else:
            parent_tweet_ids.append(None)
            parent_user_ids.append(None)

    data['parent_tweet_id_str'] = parent_tweet_ids
    data['parent_user_id_str']  = parent_user_ids
    data['mentions'] = data['entities'].apply(lambda x: extract_mentions(x))

    selection = ['id_str',                                      # Id of the tweet for collecting replies / retweets
                 'user_id_str',                                 # Identify / define nodes in the network 
                 'mentions',                                    # Define interaction-edges for mentions
                 'parent_tweet_id_str',
                 'parent_user_id_str',                          # Retweets
                 'in_reply_to_user_id_str']                     # Define interaction-edges for replies

    data = data[selection]
    
    return data

In [3]:
def extract_mentions(x):
    return [str(n.replace("id': ","")) for n in re.findall("id': [0-9]*", str(x))]

def init_edge(src,trg,edges):
    if (str(src),str(trg)) not in edges: edges[(src,trg)] = Counter({'orig_mentions':0, 'retweets':0, 'replies':0})            

def extract_edges(data: pd.DataFrame, edges: dict) -> dict:
    errors = []
    
    for i,row in data.iterrows():
        src      = row['user_id_str']
        par_id   = row['parent_user_id_str']
        reply_id = row['in_reply_to_user_id_str'] 
        mentions = list(row['mentions'])
        
        # Note that mentions are superset of the union of retweets and replies, thus we 
        # first remove both of them, and then the authentic mentions are left.
        
        # Case 1. replies
        if reply_id in mentions:
            init_edge(src,reply_id,edges)
            edges[(src,reply_id)]['replies'] += 1
            mentions.remove(reply_id)
        
        # Case 2. retweets
        if par_id is not None:
            init_edge(src,par_id,edges)
            edges[(src,par_id)]['retweets'] += 1
            try:
                mentions.remove(par_id)
            except Exception as ex:
                errors.append((ex,row['id_str'],src,par_id))
                
        # Case 3. mentions
        for trg in mentions:
            init_edge(src,trg,edges)
            edges[(src,trg)]['orig_mentions'] += 1            
    
        with open('pre_processing_errors.log', 'w') as f:
            for er in errors:
                f.write('\n===\nEX: {0}\nTWID: {1}\nSRC_ID: {2}\nPAR_ID: {3}'.format(
                    er[0],er[1],er[2],er[3]))



### Loading the data
We will be going through all of the keyword streaming data files and preprocessing them one by one.

In [4]:
filepaths = !find ../../tweet_data/filtered/tweets_*

In [5]:
filepaths

['../../tweet_data/filtered/tweets_climate_en_20200812.json']

In [6]:
csize = 50000
debug = True
edge_path = '../../tweet_data/preprocessed/network/edges_{0}.json'
user_map_path = '../../tweet_data/id_mapping/user_mapping.csv'

In [7]:
data_file = None

In [8]:
global_count = 0
iter_times = []

In [9]:
import time
times = []

for i,fpath in enumerate(filepaths):
    print('Preprocessing file: {}'.format(fpath))
    stime = time.time()
    date = re.findall('[0-9]+', fpath)[0]
    data_iter = pd.read_json(fpath,  orient = "records", 
                        dtype = False, lines = True, 
                        encoding = "utf-8", chunksize = csize)
    data = iter(data_iter)
    edges = {}
    
    j = 0
    while True:
        s_time_iter = time.time()
        try:
            df_chunk = next(data)

            # Preprocess the dataframe
            df_chunk = pre_process_dataframe(df_chunk)
            global_count += df_chunk.shape[0]

            #Print iteration stats
            iter_time = time.time()-s_time_iter
            iter_times.append(iter_time)

            print(50*'==','\n* Number of tweets processed: {}'.format((j+1)*csize))
            print('* Time for iteration {}: {:.0f} s'.format(j+1,iter_time))
            print('* Avg time for iteration: {:.1f} s'.format(np.mean(iter_times)))
            print(50*'==','\n')

            print('* Time for iteration {}: {:.0f} s'.format(j+1,iter_time))
            print('* Avg time for iteration: {:.1f} s'.format(np.mean(iter_times)))

            extract_edges(df_chunk, edges)

        except StopIteration:
            break
        except Exception as e:
            print('In file: {} at {}th iteration, exception occurred: {}'.format(fpath,j,e))
            j += 1
            continue
        
        j += 1

    offset = time.time() - stime
    times.append(offset)
    m_time = np.mean(times)
    
    k = len(filepaths) - (i+1)
    t_left = m_time * k 
    h_left = t_left // 3600
    m_left = (t_left % 3600) // 60
    s_left = (t_left % 60) 
    print(50*'=')
    print('Time for iteration: {0} minutes, {1:.2f} seconds'.format(offset // 60, offset % 60 ))
    print('Estimated time left: {0} hours, {1} minutes, {2:.2f} seconds'.format(h_left, m_left, s_left))
    print('Ready with file: {}'.format(fpath.replace('../../data_collection/content-based/', '')))
    print('\t* # of edges: {0}\n'.format(len(edges)))

Preprocessing file: ../../tweet_data/filtered/tweets_climate_en_20200812.json
* Number of tweets processed: 50000
* Time for iteration 1: 14 s
* Avg time for iteration: 13.8 s

* Time for iteration 1: 14 s
* Avg time for iteration: 13.8 s
* Number of tweets processed: 100000
* Time for iteration 2: 13 s
* Avg time for iteration: 13.4 s

* Time for iteration 2: 13 s
* Avg time for iteration: 13.4 s
* Number of tweets processed: 150000
* Time for iteration 3: 12 s
* Avg time for iteration: 13.1 s

* Time for iteration 3: 12 s
* Avg time for iteration: 13.1 s
* Number of tweets processed: 200000
* Time for iteration 4: 12 s
* Avg time for iteration: 12.7 s

* Time for iteration 4: 12 s
* Avg time for iteration: 12.7 s
* Number of tweets processed: 250000
* Time for iteration 5: 12 s
* Avg time for iteration: 12.7 s

* Time for iteration 5: 12 s
* Avg time for iteration: 12.7 s
* Number of tweets processed: 300000
* Time for iteration 6: 14 s
* Avg time for iteration: 12.8 s

* Time for it

#### Pseudonymization

We will create a new randomized user mapping and save it (only locally).

In [10]:
user_map = {}
S = set()

for j,k in edges.items():
    for e in [j[0],j[1]]:
        if e not in user_map:
            while True:
                new_id = np.random.randint(0,2e8)
                if new_id not in S:
                    S.add(new_id)
                    break
            user_map[e] = new_id

with open(user_map_path, 'w') as file:
    file.write('new_id,orig_id')
    for orig_id, new_id in user_map.items():
        file.write('\n{},{}'.format(new_id,orig_id))

#### Save edge data with pseudonymized ids

In [11]:
edgelist = [(user_map[j[0]],user_map[j[1]],dict(k)) for j,k in edges.items()]
with open(edge_path.format(date), 'w') as file:
    json.dump(edgelist, file, allow_nan=False)