# Building the retweet network (part 1)

In [1]:
import pandas as pd
from tqdm import tqdm
import csv

## Data Exploration

In [2]:
# Peek at the first lines of the raw CSV file (the header row comes first)
!head ../data/tweets_dataset.csv

,id,created_at,user_id,user_screen_name,geo,place,lang,truncated,text,retweet_count,likes_count,retweeted_text,user_mentions,hashtags,urls,is_quote,is_reply,is_original,is_retweet,in_reply_to_status_id,in_reply_to_screen_name,quoted_text,retweeted_status_id,retweeted_user_screen_name,quoted_urls,quoted_hashtags,quoted_created_at,quoted_status_id,quoted_user_screen_name,media,positive,neutral,negative
0,1340465356964585474,2020-12-20 01:13:40+00:00,398032197,AnnaMariaCastel,,,it,,,14,0,"“Per ora” non prevedono di vaccinare i bambini.

“Per ora”.

“Per ora” non vi appendiamo a ganci da macellaio, maledetti emuli dei medici nazisti. https://t.co/t6kNEwKlvW",antonio_bordin;,,,0,0,0,1,,,,1.340235113280512e+18,antonio_bordin,,,,,,,,,
1,1340465632177995776,2020-12-20 01:14:46+00:00,1038231816,EdiGirolami,,,it,,,50,0,"Io sono immunodepresso e prego che la mia terapia sia compatibile col #vaccino anti #Covid_19. 
Voi non avete idea di quanto mi faccia incazzare leggere i #novax con le lo

In [3]:
# Parse only the header line (nrows=0) so that the column names can be listed
# without loading any tweet row
pd.read_csv(
    filepath_or_buffer="../data/tweets_dataset.csv",
    nrows=0,
).columns

Index(['Unnamed: 0', 'id', 'created_at', 'user_id', 'user_screen_name', 'geo',
       'place', 'lang', 'truncated', 'text', 'retweet_count', 'likes_count',
       'retweeted_text', 'user_mentions', 'hashtags', 'urls', 'is_quote',
       'is_reply', 'is_original', 'is_retweet', 'in_reply_to_status_id',
       'in_reply_to_screen_name', 'quoted_text', 'retweeted_status_id',
       'retweeted_user_screen_name', 'quoted_urls', 'quoted_hashtags',
       'quoted_created_at', 'quoted_status_id', 'quoted_user_screen_name',
       'media', 'positive', 'neutral', 'negative'],
      dtype='object')

## Selecting useful features

To build a retweet network we only need retweets and original tweets (the user id of the original author is still required).
Since reply tweets and quote tweets may be useful for further analysis, we keep those as well.

In [4]:
# Hand-picked columns to keep from the raw dataset
FSELECT = [
    'id',
    'created_at',
    'user_id',
    'retweeted_status_id',
    'in_reply_to_status_id',
    'quoted_status_id',
]

## NOTE: the official Tweet object reference documents a `retweeted_status`
## field, but no `retweeted_status_id`.
## `retweeted_status_id` could be a feature extracted manually in previous works.

# reference: https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet

## Dataset filtering and re-build

In [5]:
# Set this to limit the target number of CSV data rows to process.
# Set to None to process all rows (0 means "header only").
ROWS_LIMIT = None #100


# Initialize reading statistics
STATS = {"tweets_processed_count": 0,   # data rows read (corrupted ones included)
         "tweets_corrupted_count": 0,   # rows skipped for having too few fields
         "tweets_corrupted": []}        # the skipped rows, kept for inspection


def fix_id(id_as_number):
    """Normalize a numeric ID read from the CSV into a plain integer string.

    Some ID columns are stored in float notation (e.g. '1.34e+18'); this
    converts them back to an integer literal.

    Parameters
    ----------
    id_as_number : str
        Raw field text, either an integer literal, a float literal or empty.

    Returns
    -------
    str
        The ID as a decimal integer string, or '' when the field is empty or
        cannot be read as a finite number.

    Notes
    -----
    The field is parsed with int()/float() instead of eval(): the text comes
    straight from the dataset, so evaluating it would execute arbitrary
    expressions and would crash on values such as 'nan' (NameError).
    NOTE(review): a float can only hold ~53 significant bits, so IDs stored in
    float notation may already have been rounded upstream -- this conversion
    cannot restore the lost digits.
    """
    text = id_as_number.strip()
    if not text:
        return ''

    # Plain integer literals are converted exactly (no float round-trip)
    try:
        return str(int(text))
    except ValueError:
        pass

    # Float notation: ValueError -> not a number / nan, OverflowError -> inf
    try:
        return str(int(float(text)))
    except (ValueError, OverflowError):
        return ''


# Open two file streams at the same time: source and destination.
# newline='' is what the csv module requires to handle line endings itself.
with (open("../data/tweets_dataset.csv", 'r', encoding="utf-8", newline='') as csv_src,
      open("network_tweets.csv", 'w', encoding="utf-8", newline='') as csv_dst):

    csv_reader = csv.reader(csv_src, delimiter=',')
    csv_writer = csv.writer(csv_dst, delimiter=',', lineterminator='\n')

    for n, row in tqdm(enumerate(csv_reader)):

        if n == 0:
            # First line is the header, read and filter the selected features
            name2idx = {name: idx for idx, name in enumerate(row) if name in FSELECT}

            # Write the new header in the destination file
            csv_writer.writerow(name2idx.keys())
            continue

        # When ROWS_LIMIT is not None this will stop the reading loop
        if ROWS_LIMIT is not None and n > ROWS_LIMIT:
            break

        STATS["tweets_processed_count"] += 1

        # Values unpacking: a row with too few fields cannot provide every
        # selected column, so it is recorded as corrupted and skipped
        try:
            name2value = {k: row[idx] for k, idx in name2idx.items()}
        except IndexError:
            STATS["tweets_corrupted_count"] += 1
            STATS["tweets_corrupted"].append(row)
            continue

        # Fixing IDs (from float repr to string)
        for id_column in ("retweeted_status_id", "in_reply_to_status_id", "quoted_status_id"):
            name2value[id_column] = fix_id(name2value[id_column])

        # Write the new row in the destination file
        csv_writer.writerow(name2value.values())

12687225it [02:12, 95634.01it/s]


In [6]:
# Data rows read after the header; rows later discarded as corrupted are included
STATS["tweets_processed_count"]

12687224

In [7]:
# Rows skipped because they had too few fields for the selected columns (IndexError)
STATS["tweets_corrupted_count"]

195

In [8]:
# Corrupted rows have a different number of fields than expected (they raised
# IndexError). Sometimes even 'id' (a very important field) is missing, so
# these rows were skipped.
# Print each of them, followed by a blank line, for inspection.
for corrupted_row in STATS["tweets_corrupted"]:
    print(corrupted_row, '\n')

['10535', '1341025614191599616', '2020-12-21 14:19:56+00:00', '111260090', 'AsiaNewsIT', '', '', 'it', '', '‘Lecito’ vaccino anti Covid anche se usate linee cellulari provenienti da feti abortiti'] 

['https://t.co/iH61nUdeeH', '2', '3', '', '', '', 'https://tinyurl.com/yaz3svdh;', '0', '0', '1', '0', '', '', '', '', '', '', '', '', '', '', '', '', '', ''] 

['12158', '1341037286423732226', '2020-12-21 15:06:19+00:00', '993080715116937216', 'fordeborah5', '', '', 'it', '', '', '2', '0', '‘Lecito’ vaccino anti Covid anche se usate linee cellulari provenienti da feti abortiti'] 

['https://t.co/iH61nUdeeH', 'AsiaNewsIT;', '', 'https://tinyurl.com/yaz3svdh;', '0', '0', '0', '1', '', '', '', '1.3410256141915996e+18', 'AsiaNewsIT', '', '', '', '', '', '', '', '', ''] 

['47538', '1343925691511939073', '2020-12-29 14:23:48+00:00', '111260090', 'AsiaNewsIT', '', '', 'it', '0.0', 'C’è una responsabilità morale ad accettare il vaccino'] 

['https://t.co/2ltUs4Ukm2', '0', '0', '', '', '', 'https