# Build the dataset from COVID data

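Run this notebook to build the labeled dataset from the raw source `tweets_processed_merged_users_2023_02_22.csv`: it repairs CSV rows that were broken across several lines, keeps only original tweets and retweets, and labels them with NewsGuard credibility scores.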

In [1]:
import csv
from tqdm import tqdm
import pandas as pd
import re

## Raw data pre-processing

In [2]:
# Raw CSV export read by this notebook, and the name of the CSV file it writes.
DATA_SOURCE = "tweets_processed_merged_users_2023_02_22.csv"
NEW_DATASET_NAME = "COVID19_2023_02_22_DATASET.csv"

# Maximum number of CSV rows to process (e.g. 500_000 or 300_000 for a quick run).
ROWS_LIMIT = None # Set to None to process all rows

In [3]:
# Print column names with their positional index.
# The header is parsed with csv.reader rather than a plain str.split(','), so
# the last column name does not keep the trailing newline and quoted column
# names containing commas would not be split apart.
with open(DATA_SOURCE, 'r', encoding="utf-8", newline='') as csv_src:
    header = next(csv.reader(csv_src))
    for n, col in enumerate(header):
        print(n, ":", col)

0 : 
1 : tweet_id
2 : created_at
3 : user_id
4 : user_screen_name
5 : geo
6 : place
7 : lang
8 : truncated
9 : text
10 : retweet_count
11 : likes_count
12 : retweeted_text
13 : user_mentions
14 : hashtags
15 : urls
16 : is_quote
17 : is_reply
18 : is_original
19 : is_retweet
20 : in_reply_to_status_id
21 : in_reply_to_user_id
22 : in_reply_to_screen_name
23 : quoted_text
24 : retweeted_status_id
25 : retweeted_user_id
26 : retweeted_user_screen_name
27 : quoted_urls
28 : quoted_hashtags
29 : quoted_created_at
30 : quoted_status_id
31 : quoted_user_id
32 : quoted_user_screen_name
33 : media
34 : positive
35 : neutral
36 : negative



In [4]:
# Initialize statistics
MONITOR = {"processed_tweets_count": 0,   # data rows read (row fragments are counted individually)
           "fixed_tweets_count": 0,       # rows successfully reassembled from fragments
           "broken_tweets": [],           # every fragment whose column count did not match the header
           "fixed_tweets": [],            # reassembled rows, joined with ','
           "discarded_fragments": []}     # merged fragments dropped because they exceeded the header width

# Open two file streams at the same time: source and destination.
# The destination is opened with newline='' (as the csv module requires for
# writers) so the '\n' line terminator is written verbatim on every platform.
with (open(DATA_SOURCE, 'r', encoding="utf-8") as csv_src,
      open(NEW_DATASET_NAME, 'w', encoding="utf-8", newline='') as csv_dst):

    csv_reader = csv.reader(csv_src, delimiter=',', lineterminator='\n')
    csv_writer = csv.writer(csv_dst, delimiter=',', lineterminator='\n')

    # Accumulates the pieces of a row that was split over several CSV records
    row_buffer = []

    for n, row in tqdm(enumerate(csv_reader)):

        if n == 0:
            # The header fixes the expected number of columns and is copied as is
            ncols = len(row)
            csv_writer.writerow(row)
            continue

        # Stop once the optional limit is reached. The check is against None
        # explicitly so that ROWS_LIMIT = 0 means "no data rows" rather than
        # being mistaken for "no limit".
        if ROWS_LIMIT is not None and n > ROWS_LIMIT:
            break

        # Counter for stats
        MONITOR["processed_tweets_count"] += 1

        # A column-count mismatch marks a fragment of a broken row
        if len(row) != ncols:

            MONITOR["broken_tweets"].append(','.join(row))

            # Glue the fragment to the pending one: the last token of the
            # buffer and the first token of this fragment are the two halves
            # of the field in which the row was broken.
            prev_last_token = row_buffer.pop() if row_buffer else ""
            next_first_token = row.pop(0) if row else ""
            row_buffer.append(prev_last_token + next_first_token)
            row_buffer += row

            if len(row_buffer) > ncols:
                # The buffer only ever grows, so once it is wider than the
                # header it can never become a valid row. Drop it (keeping a
                # trace) instead of letting it swallow every later broken row.
                MONITOR["discarded_fragments"].append(','.join(row_buffer))
                row_buffer = []
                continue

            if len(row_buffer) < ncols:
                # Row still incomplete: wait for the next fragment
                continue

            # The row is completely fixed
            MONITOR["fixed_tweets_count"] += 1
            MONITOR["fixed_tweets"].append(','.join(row_buffer))

            row = row_buffer
            row_buffer = []  # Reset the row buffer for next broken rows

        # Write the row in the destination file; embedded newlines are replaced
        # by a placeholder so that every record occupies exactly one line
        row = [token.replace('\n', "<return>") for token in row]
        csv_writer.writerow(row)

print(f"Processed {MONITOR['processed_tweets_count']} tweets, fixed: {MONITOR['fixed_tweets_count']}")

12686129it [02:15, 93891.04it/s]

Processed 12686128 tweets, fixed: 95





## Twitter data loading

In [5]:
# Columns to load, each mapped to the dtype it is parsed with.
# Identifier columns are read as strings; "created_at" is read as a plain
# object and converted to datetime right after loading.
dtypes = {
    "tweet_id": str,
    "created_at": object,
    "user_id": str,
    "urls": str,
    "is_original": int,
    "is_retweet": int,
    "retweeted_status_id": str,
    "retweeted_user_id": str,
}

# The columns to read are exactly the ones that have a declared dtype
use_cols = list(dtypes)

# Load the pre-processed CSV, parse the timestamps, and keep only the rows
# flagged as original tweets or as retweets
twitter_df = (
    pd.read_csv(NEW_DATASET_NAME, usecols=use_cols, dtype=dtypes)
    .assign(created_at=lambda df: pd.to_datetime(df["created_at"]))
    .loc[lambda df: df["is_original"].eq(1) | df["is_retweet"].eq(1)]
)

In [6]:
# Show the loaded tweets/retweets frame (rich display of the last expression)
twitter_df

Unnamed: 0,tweet_id,created_at,user_id,urls,is_original,is_retweet,retweeted_status_id,retweeted_user_id
0,1340465356964585474,2020-12-20 01:13:40+00:00,398032197,,0,1,1340235113280512001,438849246
1,1340465632177995776,2020-12-20 01:14:46+00:00,1038231816,,0,1,1340372051840950273,38032906
2,1340465924932063236,2020-12-20 01:15:55+00:00,457762156,,0,1,1340309122730893313,1683455144
3,1340465989436268547,2020-12-20 01:16:11+00:00,1312802310830125057,,0,1,1340391589135454209,1082930003166273543
4,1340466102380457984,2020-12-20 01:16:38+00:00,1030696837748060160,,0,1,1340408286730989568,1935745573
...,...,...,...,...,...,...,...,...
12686020,1451536603365756933,2021-10-22 13:11:09+00:00,3046618084,,0,1,1451536542271479821,210501383
12686023,1451536623410327554,2021-10-22 13:11:13+00:00,1683455144,,0,1,1445346610205036552,1683455144
12686024,1451536633338245120,2021-10-22 13:11:16+00:00,1325077209237905408,,0,1,1451536503348375560,1123567903297671170
12686025,1451536659049295885,2021-10-22 13:11:22+00:00,1435676887603458061,,0,1,1451427132173914124,1360232480


## Newsguard data

In [7]:
# Load the Newsguard ratings table from a CSV file in the working directory
# and display it as the cell output
newsguard_df = pd.read_csv("newsguard_ratings.csv")
newsguard_df

Unnamed: 0,identifier,identifierAlt,rank,healthGuard,country,groupName,siteOnline,parent,score,topline
0,nanotechnology.news,naturalnews.com,N,True,US,naturalnews.com website network,True,naturalnews.com,5.0,A &nbsp;site that is part of a network of hund...
1,yorkshirepost.co.uk,,T,True,GB,,True,,95.0,"The website of the Yorkshire Post, a daily new..."
2,davvero.tv,,TK,False,ALL,,True,,0.0,
3,cdc.gov,cdc.gov,T,True,US,,True,cdc.gov,100.0,The website for the U.S. Centers for Disease C...
4,greenmedinfo.com,greenmedinfo.com,N,True,US,,True,greenmedinfo.com,30.0,A website that has promoted unproven cures for...
...,...,...,...,...,...,...,...,...,...,...
1689,washingtonpost.com,,T,True,US,,True,,100.0,"The website for The Washington Post, a leading..."
1690,ilpiacenza.it,milanotoday.it,T,True,ALL,citynews websites network,True,milanotoday.it,82.5,A network&nbsp;of approximately 50 local newsp...
1691,optimagazine.com,,N,True,ALL,,True,,12.5,A digital magazine about music and pop culture...
1692,reitschuster.de,,N,False,ALL,,True,,59.5,A news blog run by journalist Boris Reitschust...


In [8]:
# Build the Newsguard lookup table: domain identifier -> score.
# Both the primary identifier and the alternative identifier of every rating
# are mapped to that rating's score; pairs with a missing identifier or score
# are dropped, exact duplicate (identifier, score) pairs are removed, and the
# result is sorted and indexed by identifier.
# NOTE(review): an identifier that appears with two different scores is not
# removed by drop_duplicates and would stay duplicated in the index - confirm
# that this does not happen in the ratings file.
newsguard_map = (
    pd.concat([
        newsguard_df[["identifier", "score"]].dropna(),
        newsguard_df[["identifierAlt", "score"]]
        .rename(columns={"identifierAlt": "identifier"})
        .dropna(),
    ])
    .drop_duplicates()
    .sort_values(by="identifier")
    .set_index("identifier")
)
newsguard_map

Unnamed: 0_level_0,score
identifier,Unnamed: 1_level_1
100giornidaleoni.it,30.0
10news.com,100.0
11alive.com,100.0
13wham.com,90.0
14news.com,100.0
...,...
zdf.de,100.0
zdnet.com,100.0
zeit.de,92.5
zenodo.org,0.0


## Labeling

Auxiliary functions for extracting domains from URLs and scoring them with NewsGuard.

In [9]:
# Extract domains from strings
def extract_domains(input_string):
    """Return the domains of all URLs found in ``input_string``.

    A URL is recognised by an ``http://`` / ``https://`` scheme and/or a
    leading ``www.``; both prefixes are stripped from the returned domain.
    Matching is case-insensitive and every domain is returned in lowercase,
    because host names are case-insensitive: without this, a URL written as
    ``HTTPS://WWW.Example.COM`` would yield ``WWW.Example.COM`` and would not
    match a lowercase identifier.

    Parameters
    ----------
    input_string : str
        Free text possibly containing one or more URLs.

    Returns
    -------
    list of str
        The lowercase domains in order of appearance (empty if none is found).
    """
    pattern = r"(?:(?:https?://www\.)|(?:https?://)|(?:www\.))([\w.-]+\.\w{2,})(?:(?:/)|(?: )|(?:;))?"
    matches = re.findall(pattern, input_string, flags=re.IGNORECASE)
    return [domain.lower() for domain in matches]

# Example usage: a sentence containing one "https://www." URL and one plain
# "http://" URL; the extracted domains are printed
input_string = "Check out https://www.repubblica.it/covid-vaccino-notizia; and http://another-example.org for more info."
result = extract_domains(input_string)
print(result)

# Resolve the score for a tweet
def newsguard_score(input_string):
    """Average the Newsguard scores of the domains found in ``input_string``.

    Every domain returned by ``extract_domains`` is looked up in
    ``newsguard_map``; domains missing from the map are ignored.

    Returns the mean of the scores that were found, or -1 when none of the
    domains could be scored (no Newsguard data).
    """
    found_scores = []
    for candidate in extract_domains(input_string):
        try:
            domain_score = newsguard_map.loc[candidate].values[0]
        except KeyError:
            # Domain not rated by Newsguard: it does not contribute to the mean
            continue
        found_scores.append(domain_score)

    if not found_scores:
        return -1 # Cannot score any of the urls (no newsguard data)

    return sum(found_scores) / len(found_scores)

# Try the scoring function on the example sentence (the value is shown as cell output)
newsguard_score(input_string)

['repubblica.it', 'another-example.org']


95.0

In [10]:
# Label the tweets (retweets will have the score of the original tweet), discard unusable data

# Keep only rows that carry at least one URL. notna() is used instead of
# negating isna() with a unary minus, which is not a supported way to invert
# a boolean mask in numpy and only works through a pandas special case.
labeled_df = twitter_df.loc[twitter_df["urls"].notna()].copy()

# Score every row from the domains found in its URLs (-1 means "not scorable")
labeled_df["credibility_score"] = labeled_df["urls"].apply(newsguard_score)

# Rows without retweet information get explicit placeholders instead of NaN
labeled_df["retweeted_status_id"] = labeled_df["retweeted_status_id"].fillna("ORIGIN")
labeled_df["retweeted_user_id"] = labeled_df["retweeted_user_id"].fillna("AUTHOR")

# Drop the columns that are no longer needed (new frame instead of inplace
# mutation, so re-running the cell is safe), then remove unscored rows
labeled_df = labeled_df.drop(columns=["urls", "is_original", "is_retweet"])
labeled_df = labeled_df[labeled_df["credibility_score"] != -1.0]

In [11]:
# Show the labeled frame
labeled_df

Unnamed: 0,tweet_id,created_at,user_id,retweeted_status_id,retweeted_user_id,credibility_score
28,1340468299025551360,2020-12-20 01:25:21+00:00,497188910,1340325850378592257,1017807360075665408,64.5
35,1340468728534884354,2020-12-20 01:27:04+00:00,924336025387913221,ORIGIN,AUTHOR,95.0
66,1340473042129080320,2020-12-20 01:44:12+00:00,47148805,ORIGIN,AUTHOR,100.0
74,1340474125656190978,2020-12-20 01:48:31+00:00,1022891525242593280,ORIGIN,AUTHOR,5.0
98,1340477947627581440,2020-12-20 02:03:42+00:00,908206586,ORIGIN,AUTHOR,95.0
...,...,...,...,...,...,...
12685947,1451536212381011970,2021-10-22 13:09:35+00:00,1329549977463508993,1450461737895989261,4758512368,39.5
12685948,1451536215858155528,2021-10-22 13:09:36+00:00,1446561632105205774,1451536139131801605,1446561632105205774,82.5
12685950,1451536226876526601,2021-10-22 13:09:39+00:00,712365073,1451310094461947909,1173682089637625856,12.5
12685971,1451536327388942348,2021-10-22 13:10:03+00:00,1036611673,1451536158098399236,1260895357548068865,20.0


In [12]:
# Save the full dataset (without the DataFrame index).
# NOTE(review): this writes to NEW_DATASET_NAME, the same file produced by the
# raw pre-processing cell and read by the data loading cell. After this cell
# runs, the file no longer has the "urls", "is_original" and "is_retweet"
# columns, so the loading cell can only be re-run after the pre-processing
# cell has regenerated the file.
labeled_df.to_csv(NEW_DATASET_NAME, index=False)