# Clean, select features and label the samples

In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import csv
import re

In [2]:
# Read the NewsGuard ratings CSV into a DataFrame and display it
newsguard_df = pd.read_csv(filepath_or_buffer="data/newsguard_ratings.csv")
newsguard_df

Unnamed: 0,identifier,identifierAlt,rank,healthGuard,country,groupName,siteOnline,parent,score,topline
0,nanotechnology.news,naturalnews.com,N,True,US,naturalnews.com website network,True,naturalnews.com,5.0,A &nbsp;site that is part of a network of hund...
1,yorkshirepost.co.uk,,T,True,GB,,True,,95.0,"The website of the Yorkshire Post, a daily new..."
2,davvero.tv,,TK,False,ALL,,True,,0.0,
3,cdc.gov,cdc.gov,T,True,US,,True,cdc.gov,100.0,The website for the U.S. Centers for Disease C...
4,greenmedinfo.com,greenmedinfo.com,N,True,US,,True,greenmedinfo.com,30.0,A website that has promoted unproven cures for...
...,...,...,...,...,...,...,...,...,...,...
1689,washingtonpost.com,,T,True,US,,True,,100.0,"The website for The Washington Post, a leading..."
1690,ilpiacenza.it,milanotoday.it,T,True,ALL,citynews websites network,True,milanotoday.it,82.5,A network&nbsp;of approximately 50 local newsp...
1691,optimagazine.com,,N,True,ALL,,True,,12.5,A digital magazine about music and pop culture...
1692,reitschuster.de,,N,False,ALL,,True,,59.5,A news blog run by journalist Boris Reitschust...


In [3]:
# Build the NewsGuard lookup table: every primary identifier and every
# alternative identifier becomes an index entry pointing at the row's score.
primary_scores = newsguard_df[["identifier", "score"]].dropna()
alt_scores = (
    newsguard_df[["identifierAlt", "score"]]
    .rename(columns={"identifierAlt": "identifier"})
    .dropna()
)

# Only exact (identifier, score) duplicates are dropped here; an identifier
# listed with two different scores keeps both rows.
newsguard_map = (
    pd.concat([primary_scores, alt_scores])
    .drop_duplicates()
    .sort_values(by="identifier")
    .set_index("identifier")
)
newsguard_map

Unnamed: 0_level_0,score
identifier,Unnamed: 1_level_1
100giornidaleoni.it,30.0
10news.com,100.0
11alive.com,100.0
13wham.com,90.0
14news.com,100.0
...,...
zdf.de,100.0
zdnet.com,100.0
zeit.de,92.5
zenodo.org,0.0


In [4]:
# Save the lookup table (identifier index + score column) as a CSV file
newsguard_map.to_csv(path_or_buf="newsguard_map.csv")

In [5]:
# Ratings resolver function
def get_newsguard_ratings(urlist, index_df):
    """Look up the rating of every known domain in ``urlist``.

    Parameters
    ----------
    urlist : iterable of str
        Domain names to resolve. Each one is lower-cased before the lookup.
    index_df : pandas.DataFrame
        Lookup table indexed by domain name; the rating is read from its
        first column.

    Returns
    -------
    list
        One scalar rating per domain found in ``index_df``, in input order.
        Domains missing from the index are skipped, so the result may be
        shorter than ``urlist`` (and is empty when nothing matches).
    """
    ratings = []

    for url in urlist:

        try:
            match = index_df.loc[url.lower()]
        except KeyError:
            # Unknown domain: no rating available, skip it
            continue

        # A unique label yields a Series (one row), a repeated label yields a
        # DataFrame (several rows). Flattening first makes both cases return
        # the first column of the first matching row as a scalar, instead of
        # a 1-element array in the repeated-label case.
        ratings.append(match.values.ravel()[0])

    return ratings

## Data preprocessing

In [6]:
# Path of the merged tweets CSV that the following cells read
DATA_SOURCE = "data/tweets_processed_merged_users_2023_02_22.csv"

In [7]:
# List the source columns with their positional index
# (only the header and the first data row are read)
header_preview = pd.read_csv(DATA_SOURCE, nrows=1)
for position, column_name in enumerate(header_preview.columns):
    print(position, column_name)

0 Unnamed: 0
1 tweet_id
2 created_at
3 user_id
4 user_screen_name
5 geo
6 place
7 lang
8 truncated
9 text
10 retweet_count
11 likes_count
12 retweeted_text
13 user_mentions
14 hashtags
15 urls
16 is_quote
17 is_reply
18 is_original
19 is_retweet
20 in_reply_to_status_id
21 in_reply_to_user_id
22 in_reply_to_screen_name
23 quoted_text
24 retweeted_status_id
25 retweeted_user_id
26 retweeted_user_screen_name
27 quoted_urls
28 quoted_hashtags
29 quoted_created_at
30 quoted_status_id
31 quoted_user_id
32 quoted_user_screen_name
33 media
34 positive
35 neutral
36 negative


In [8]:
# Manually selected features: the source columns copied to the output
# dataset, in the order they are written
FSELECT = [
    "tweet_id",
    "created_at",
    "user_id",
    "retweeted_user_id",
    "retweeted_status_id",
    "retweet_count",
    "likes_count",
    "retweeted_text",
]

# Field reference: https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet

## Dataset filtering and re-build

In [9]:
# Set this to limit the target number of CSV rows to process.
# Set to None to process all rows
ROWS_LIMIT = None #10000


# Rows rejected while parsing are collected here so they can be inspected later
TWEETS = {"corrupted": []}


# Initialize statistics
STATS = {"tweets_processed_count": 0,
         "tweets_corrupted_count": 0,
         
         "tweets_accepted": 0,
         "tweets_accepted_originals": 0,
         "tweets_accepted_retweets": 0,
         "tweets_accepted_replies": 0,
         "tweets_accepted_quotations": 0,
         
         "tweets_with_detected_urls": 0,
         "originals_with_detected_urls": 0,
         "retweets_with_detected_urls": 0,
         "replies_with_detected_urls": 0,
         "quotes_with_detected_urls": 0,
         "quotes_with_detected_urls_strict": 0,
         
         "tweets_with_extracted_urls": 0,
         "originals_with_extracted_urls": 0,
         "retweets_with_extracted_urls": 0,
         "replies_with_extracted_urls": 0,
         "quotes_with_extracted_urls": 0,
         "quotes_with_extracted_urls_strict": 0,
        
         "tweets_with_newsguard_urls": 0,
         "originals_with_newsguard_urls": 0,
         "retweets_with_newsguard_urls": 0,
         "replies_with_newsguard_urls": 0,
         "quotes_with_newsguard_urls": 0,
         "quotes_with_newsguard_urls_strict": 0,
         "dataset_total_samples": 0}


# Domain extractor. A match must start with "http(s)://www.", "http(s)://" or
# "www."; the captured group is the host name that follows (word characters,
# "." and "-", ending with a dot plus at least two word characters).
# One trailing "/", " " or ";" is consumed when present.
url_pattern = r"(?:(?:https?://www\.)|(?:https?://)|(?:www\.))([\w.-]+\.\w{2,})(?:(?:/)|(?: )|(?:;))?"

url_regex = re.compile(url_pattern)


# Open two file streams at the same time: source and destination.
# The destination uses newline='' (as the csv module documentation requires
# for writers) so the explicit '\n' line terminator is written untranslated on
# every platform. The source keeps the default newline handling so that the
# parsing results are unchanged.
with (open(DATA_SOURCE, 'r', encoding="utf-8") as csv_src,
      open("data/preprocessed_tweets_dataset.csv", 'w', encoding="utf-8", newline='') as csv_dst):
    
    csv_reader = csv.reader(csv_src, delimiter=',')
    csv_writer = csv.writer(csv_dst, delimiter=',', lineterminator='\n')
    
    for n, row in tqdm(enumerate(csv_reader)):
        
        if n == 0:
            
            # First line is the header: map every column name to its position
            col2idx = {name: idx for idx, name in enumerate(row)}
            
            # A well-formed data row has as many fields as the header
            n_fields = len(row)
            
            # Selected features, sorted as in FSELECT
            name2idx = {f: col2idx[f] for f in FSELECT}
            
            # Positions of the columns that are read but not copied to the output
            idx_is_original = col2idx["is_original"]
            idx_is_retweet = col2idx["is_retweet"]
            idx_is_reply = col2idx["is_reply"]
            idx_is_quote = col2idx["is_quote"]
            idx_text = col2idx["text"]
            idx_retweeted_text = col2idx["retweeted_text"]
            idx_quoted_text = col2idx["quoted_text"]
            idx_urls = col2idx["urls"]
            idx_quoted_urls = col2idx["quoted_urls"]
            
            # Write the new header in the dest file
            newsguard_features = ["root_domains", "newsguard_rating"]
            csv_writer.writerow(list(name2idx.keys()) + newsguard_features)
    
        elif not ROWS_LIMIT or n <= ROWS_LIMIT:
            
            STATS["tweets_processed_count"] += 1
            
            # A row is corrupted when its field count differs from the header
            # or when one of its 0/1 flags cannot be parsed as an integer
            corrupted = len(row) != n_fields
            
            if not corrupted:
                try:
                    is_original = bool(int(row[idx_is_original]))
                    is_retweet = bool(int(row[idx_is_retweet]))
                    is_reply = bool(int(row[idx_is_reply]))
                    is_quote = bool(int(row[idx_is_quote]))
                except ValueError:
                    corrupted = True
            
            if corrupted:
                STATS["tweets_corrupted_count"] += 1
                TWEETS["corrupted"].append(row)
                continue # Skip current iteration
            
            # Text fields unpack
            text = row[idx_text]
            retweeted_text = row[idx_retweeted_text]
            quoted_text = row[idx_quoted_text]
            urls = row[idx_urls]
            quoted_urls = row[idx_quoted_urls]
            
            # Get URLs from text
            original_detected_urls = url_regex.findall(text)
            retweet_detected_urls = url_regex.findall(retweeted_text)
            detected_urls = original_detected_urls + retweet_detected_urls
            quote_detected_urls = url_regex.findall(quoted_text)
            
            # Get URLs from previous preprocess
            extracted_urls = url_regex.findall(urls)
            quote_extracted_urls = url_regex.findall(quoted_urls)
            
            # Get ratings from known urls
            ng_ratings = get_newsguard_ratings(extracted_urls, newsguard_map)
            quoted_ng_ratings = get_newsguard_ratings(quote_extracted_urls, newsguard_map)
                
            # General statistics
            STATS["tweets_accepted"] += 1   # Reaching this point means the tweet is accepted
            STATS["tweets_accepted_originals"] += 1 if is_original else 0
            STATS["tweets_accepted_retweets"] += 1 if is_retweet else 0
            STATS["tweets_accepted_replies"] += 1 if is_reply else 0
            STATS["tweets_accepted_quotations"] += 1 if is_quote else 0
            
            # Detected urls with regex statistics
            STATS["tweets_with_detected_urls"] += 1 if detected_urls else 0
            STATS["originals_with_detected_urls"] += 1 if detected_urls and is_original else 0
            STATS["retweets_with_detected_urls"] += 1 if detected_urls and is_retweet else 0
            STATS["replies_with_detected_urls"] += 1 if detected_urls and is_reply else 0
            STATS["quotes_with_detected_urls"] += 1 if detected_urls and is_quote else 0
            STATS["quotes_with_detected_urls_strict"] += 1 if detected_urls and quote_detected_urls else 0
            
            # Extracted urls with previous preprocessing (resolved urls)
            STATS["tweets_with_extracted_urls"] += 1 if extracted_urls else 0
            STATS["originals_with_extracted_urls"] += 1 if extracted_urls and is_original else 0
            STATS["retweets_with_extracted_urls"] += 1 if extracted_urls and is_retweet else 0
            STATS["replies_with_extracted_urls"] += 1 if extracted_urls and is_reply else 0
            STATS["quotes_with_extracted_urls"] += 1 if extracted_urls and is_quote else 0
            STATS["quotes_with_extracted_urls_strict"] += 1 if extracted_urls and quote_extracted_urls else 0
            
            # Newsguard resolved urls
            STATS["tweets_with_newsguard_urls"] += 1 if ng_ratings else 0
            STATS["originals_with_newsguard_urls"] += 1 if ng_ratings and is_original else 0
            STATS["retweets_with_newsguard_urls"] += 1 if ng_ratings and is_retweet else 0
            STATS["replies_with_newsguard_urls"] += 1 if ng_ratings and is_reply else 0
            STATS["quotes_with_newsguard_urls"] += 1 if ng_ratings and is_quote else 0
            STATS["quotes_with_newsguard_urls_strict"] += 1 if ng_ratings and quoted_ng_ratings else 0
            
            # Output file: keep only originals/retweets with at least one rated domain
            if (is_retweet or is_original) and ng_ratings:
                
                STATS["dataset_total_samples"] += 1
                
                # Selected features unpack (only needed for rows that are written)
                name2value = {k: row[idx] for k, idx in name2idx.items()}
                
                # Format and add Newsguard data to current row
                root_domains = " ".join(extracted_urls) # space separated, e.g. "root.domain.com foo.bar.it zenodo.org"
                newsguard_rating = np.mean(ng_ratings)  # average over all rated domains of the tweet
                ng_info = [root_domains, newsguard_rating]

                # Write the new row in the destination file
                csv_writer.writerow(list(name2value.values()) + ng_info)
           
        else:

            break # When ROWS_LIMIT is not None this will stop the reading loop

12686129it [03:47, 55854.66it/s]


In [10]:
# Display the counters collected by the preprocessing loop
STATS

{'tweets_processed_count': 12686128,
 'tweets_corrupted_count': 195,
 'tweets_accepted': 12685933,
 'tweets_accepted_originals': 1849092,
 'tweets_accepted_retweets': 6732008,
 'tweets_accepted_replies': 2066315,
 'tweets_accepted_quotations': 2038518,
 'tweets_with_detected_urls': 5113166,
 'originals_with_detected_urls': 1221037,
 'retweets_with_detected_urls': 3362249,
 'replies_with_detected_urls': 217739,
 'quotes_with_detected_urls': 312141,
 'quotes_with_detected_urls_strict': 213464,
 'tweets_with_extracted_urls': 2041006,
 'originals_with_extracted_urls': 1023646,
 'retweets_with_extracted_urls': 629788,
 'replies_with_extracted_urls': 144261,
 'quotes_with_extracted_urls': 243311,
 'quotes_with_extracted_urls_strict': 97586,
 'tweets_with_newsguard_urls': 909488,
 'originals_with_newsguard_urls': 449129,
 'retweets_with_newsguard_urls': 378101,
 'replies_with_newsguard_urls': 74101,
 'quotes_with_newsguard_urls': 8157,
 'quotes_with_newsguard_urls_strict': 2095,
 'dataset_tot

In [11]:
# Corrupted rows are those whose number of fields differs from the expected one.
# Some of them lack the tweet 'id' (a very important field), so they are skipped
# by the preprocessing loop. They can be inspected here, one row per paragraph.
for corrupted_row in TWEETS["corrupted"]:
    print(corrupted_row, '\n')

['32470', '1341025614191599616', '2020-12-21 14:19:56+00:00', '111260090', 'AsiaNewsIT', '', '', 'it', '', '‘Lecito’ vaccino anti Covid anche se usate linee cellulari provenienti da feti abortiti'] 

['https://t.co/iH61nUdeeH', '2', '3', '', '', '', 'https://tinyurl.com/yaz3svdh;', '0', '0', '1', '0', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''] 

['34093', '1341037286423732226', '2020-12-21 15:06:19+00:00', '993080715116937216', 'fordeborah5', '', '', 'it', '', '', '2', '0', '‘Lecito’ vaccino anti Covid anche se usate linee cellulari provenienti da feti abortiti'] 

['https://t.co/iH61nUdeeH', 'AsiaNewsIT;', '', 'https://tinyurl.com/yaz3svdh;', '0', '0', '0', '1', '', '', '', '', '1341025614191599616', '111260090', 'AsiaNewsIT', '', '', '', '', '', '', '', '', '', ''] 

['485344', '1343925691511939073', '2020-12-29 14:23:48+00:00', '111260090', 'AsiaNewsIT', '', '', 'it', '0.0', 'C’è una responsabilità morale ad accettare il vaccino'] 

['https://t.co/2ltUs4Ukm2