In [1]:
%%javascript
$('<div id="toc"></div>').css({position: 'fixed', top: '120px', left: 0}).appendTo(document.body);
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js');


<IPython.core.display.Javascript object>

# Imports

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1" # the GPU on robinson

In [4]:
from rdflib import Graph, Literal, URIRef, XSD
from rdflib.namespace import RDF, FOAF, SDO, Namespace
import json
import pandas as pd
from tqdm import tqdm, trange
import html
from datetime import datetime
import io
import requests
import hashlib
import re
import urllib.parse


from polyglot.detect import Detector
from polyglot.detect.base import logger as polyglot_logger
polyglot_logger.setLevel("ERROR")


import statistics
import textstat

import torch
import torch.nn as nn
from transformers import BertForPreTraining, AutoTokenizer

# Utils

In [5]:
def normalize_text(text):
    text = text.replace('&amp;', '&')
    text = text.replace('\xa0', '')
    text = re.sub(r'http\S+', '', text)
    text = html.unescape(text)
    text = " ".join(text.split())
    return text

def uri_generator(identifier):
    h = hashlib.sha224(str.encode(identifier)).hexdigest()
    
    return str(h)

CONSPIRACIES = ['Suppressed Cures', 'Behaviour and mind Control', 'Antivax', 'Fake virus', 'Intentional Pandemic', 'Harmful Radiation', 'Population Reduction', 'New World Order', 'Satanism']


In [6]:
start_date = datetime.strptime('2020-01-01', '%Y-%m-%d')
end_date = datetime.strptime('2021-06-30', '%Y-%m-%d')

In [7]:
SCHEMA = Namespace("http://schema.org/")
CIMPLE = Namespace("http://data.cimple.eu/ontology#")
NIF = Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")

WIKI_prefix = "http://www.wikidata.org/entity/"
DB_prefix = "http://dbpedia.org/ontology/"

prefix = "http://data.cimple.eu/"


# Data

## BirdWatch

In [213]:
bw_path = '/data/peskine/birdwatch/'


In [214]:
BW_CR = pd.read_csv(bw_path+'BW_CR_clean_merged.csv')
BW_CR

Unnamed: 0,tweetId,Tweet,noteId,summary,classification,CR Fact,credibility,full_text,CR Doc
0,1380985371794796545,Trump won the election by a landslide,1382005896901894144,"According to numerous independent sources, Tru...",MISINFORMED_OR_POTENTIALLY_MISLEADING,"['Donald Trump won the 2020 election, by a lot']",not_credible,Trump won the election by a landslide,
1,1388990782015692810,DONALD TRUMP WON THE 2020 ELECTION BY A LOT!\n...,1389613071895711746,"According to numerous independent sources, Tru...",MISINFORMED_OR_POTENTIALLY_MISLEADING,"['Donald Trump won the 2020 election, by a lot']",not_credible,DONALD TRUMP WON THE 2020 ELECTION BY A LOT!\n...,
2,1435755006867673092,President Trump won https://t.co/GnG8zTlSbj,1436788728794656774,Trump lost and Biden won the 2020 election. ...,MISINFORMED_OR_POTENTIALLY_MISLEADING,['US President Donald Trump has won the 2020 e...,not_credible,President Trump won https://t.co/GnG8zTlSbj,"[{""claim_text"": [""US President Donald Trump ha..."
3,1402642394483462149,Daily reminder:\n\nDonald J. Trump won the ele...,1403409669570252800,"According to numerous independent sources, Tru...",MISINFORMED_OR_POTENTIALLY_MISLEADING,['US President Donald Trump has won the 2020 e...,not_credible,Daily reminder:\n\nDonald J. Trump won the ele...,"[{""claim_text"": [""US President Donald Trump ha..."
4,1399207201382731777,Donald J. Trump won BIGLY.,1400161353084510212,"According to numerous independent sources, Tru...",MISINFORMED_OR_POTENTIALLY_MISLEADING,['Donald Trump has won the re-election against...,not_credible,Donald J. Trump won BIGLY.,"[{""claim_text"": [""Donald Trump has won the re-..."
...,...,...,...,...,...,...,...,...,...
3038,1379458652323639300,246 fully vaccinated Michiganders got COVID-19...,1380365994611335169,As of 4/7/21 1.8 million Michigan residents ha...,MISINFORMED_OR_POTENTIALLY_MISLEADING,"""['Four states have stopped administering the ...",uncertain,246 fully vaccinated Michiganders got COVID-19...,"[{""claim_text"": [""Four states have stopped adm..."
3039,1375128381696700419,Weve got our first deportation of the day http...,1375470560810049546,The Toronto Raptors are playing every home gam...,MISINFORMED_OR_POTENTIALLY_MISLEADING,"""['the image shows one of nine parents who reu...",not_credible,We’ve got our first deportation of the day htt...,
3040,1399934281774899200,Early #FauciEmails show awareness of potential...,1399984720599994371,It is true that there were concerns about enha...,MISINFORMED_OR_POTENTIALLY_MISLEADING,"""['COVID-19 no more lethal than flu there is n...",not_credible,Early #FauciEmails show awareness of potential...,"[{""claim_text"": [""COVID-19 no more lethal than..."
3041,1391101810073538563,Palestinians chant bomb bomb Tel Aviv from Jud...,1391879804291538948,Claim is clearly backed up with a video showin...,NOT_MISLEADING,"""['Video shows Israeli forces attacking mosque...",not_credible,Palestinians chant “bomb bomb Tel Aviv” from J...,"[{""claim_text"": [""Video shows Israeli forces a..."


In [215]:
with open(bw_path+'idtweetVALUES.jsonl', 'r') as j:
    tweets_list = list(j)

tweets_notes = pd.read_csv(bw_path+'notes-00000.tsv', sep='\t')

len(tweets_list), len(tweets_notes)

(10211, 15445)

In [216]:
tweets_list_timerange = []

for j in tweets_list:
    file = json.loads(j)
    date_str= datetime.strptime(file['created_at'], '%a %b %d %H:%M:%S %z %Y').replace(tzinfo=None) #.strftime('%Y-%m-%d')
    
    if date_str >= start_date and date_str <= end_date:
        tweets_list_timerange.append(file)

len(tweets_list_timerange)

6563

In [217]:
df = pd.read_csv(bw_path+'factors.csv')
emotions = df['emotions'].tolist()
biases = df['biases'].tolist()
sentiments = df['sentiments'].tolist()
cons = df['conspiracies'].tolist()
with open(bw_path+"entities.json", "r") as outfile:
    ents = json.load(outfile)
len(df), len(ents)

(6563, 6563)

### BW - Graph

In [218]:
g = Graph()


In [219]:
for i in trange(0, len(tweets_list_timerange)):
    t = tweets_list_timerange[i]
    tid = t['id']
    
    identifier = 'tweet'+str(tid)
    uri = 'tweet/'+uri_generator(identifier)
    
    g.add((URIRef(prefix+uri), RDF.type, SCHEMA.SocialMediaPosting))
    
    d = datetime.strptime(t['created_at'], '%a %b %d %H:%M:%S %z %Y').replace(tzinfo=None)
    g.add((URIRef(prefix+uri), SCHEMA.dateCreated, Literal(d.strftime('%Y-%m-%d'), datatype=XSD.date)))
    
    text = t['full_text']
    text = normalize_text(text)
    g.add((URIRef(prefix+uri), SCHEMA.text, Literal(text)))
    
    r = measure_readability(text)
    g.add((URIRef(prefix+uri), CIMPLE.readability_score, Literal(r)))
    
    e = emotions[i]
    if e != 'None':
        g.add((URIRef(prefix+uri), CIMPLE.hasEmotion, URIRef(prefix+'emotion/'+str(e.lower()))))
    s = sentiments[i]
    g.add((URIRef(prefix+uri), CIMPLE.hasSentiment, URIRef(prefix+'sentiment/'+str(s.lower()))))
    b = biases[i]
    g.add((URIRef(prefix+uri), CIMPLE.hasPoliticalLeaning, URIRef(prefix+'political-leaning/'+str(b.lower()))))
    cons_i = json.loads(cons[i])
    for k in range(0, len(cons_i)):
        if cons_i[k] == 1:
            c = CONSPIRACIES[k]
            g.add((URIRef(prefix+uri), CIMPLE.mentionsConspiracy, URIRef(prefix+'conspiracy/'+str(c.replace(' ', '_').lower()))))
        elif cons_i[k] == 2:
            c = CONSPIRACIES[k]
            g.add((URIRef(prefix+uri), CIMPLE.promotesConspiracy, URIRef(prefix+'conspiracy/'+str(c.replace(' ', '_').lower()))))
    
    entities = ents[i]
    if 'Resources' in entities:
        for ent in entities['Resources']:
            dbpedia_url = ent['@URI']
#             uri_mention = 'entity/'+uri_generator(dbpedia_url)
#             dbpedia_name = dbpedia_url.split('/')[-1].replace('_', ' ')                
            
#             g.add((URIRef(prefix+uri_mention), RDF.type, SCHEMA.Thing))
#             e_types = e['@types'].split(',')
#             for t in e_types:
#                 if "Wikidata" in t:
#                     g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(WIKI_prefix+t.split(':')[1])))
#                 if "DBpedia" in t:
#                     g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(DB_prefix+t.split(':')[1])))

#             g.add((URIRef(prefix+uri_mention), SCHEMA.url, URIRef(dbpedia_url)))
#             g.add((URIRef(prefix+uri_mention), SCHEMA.name, Literal(dbpedia_name)))

            g.add((URIRef(prefix+uri), SCHEMA.mentions, URIRef(dbpedia_url)))
    
    row = BW_CR[BW_CR['tweetId']==tid]
    
    #BW tweets can have multiple notes (reviews)
    for review in range(0, len(row['noteId'].tolist())):
        
        identifier_review = 'review'+str(row['noteId'].tolist()[review])
        uri_review = 'review/'+uri_generator(identifier_review)

        g.add((URIRef(prefix+uri_review), RDF.type, SCHEMA.Review))
        g.add((URIRef(prefix+uri_review), SCHEMA.itemReviewed, URIRef(prefix+uri)))
        
        rating = row['classification'].tolist()[review]
        uri_rating = 'rating/'+rating.replace(' ', '_').lower()

        g.add((URIRef(prefix+uri_rating), RDF.type, SCHEMA.Rating))
        
        summary = row['summary'].tolist()[review]
        g.add((URIRef(prefix+uri_rating), SCHEMA.ratingExplanation, Literal(summary)))
        g.add((URIRef(prefix+uri_rating), SCHEMA.ratingValue, Literal(rating)))
        
        r = measure_readability(summary)
        g.add((URIRef(prefix+uri_review), CIMPLE.readability_score, Literal(r)))
        
        ents_summary = extract_dbpedia_entities(summary)
        if 'Resources' in ents_summary:
            for ent in ents_summary['Resources']:
                dbpedia_url = ent['@URI']
#                 uri_mention = 'entity/'+uri_generator(dbpedia_url)
#                 dbpedia_name = dbpedia_url.split('/')[-1].replace('_', ' ')                

#                 g.add((URIRef(prefix+uri_mention), RDF.type, SCHEMA.Thing))
#                 e_types = e['@types'].split(',')
#                 for t in e_types:
#                     if "Wikidata" in t:
#                         g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(WIKI_prefix+t.split(':')[1])))
#                     if "DBpedia" in t:
#                         g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(DB_prefix+t.split(':')[1])))

#                 g.add((URIRef(prefix+uri_mention), SCHEMA.url, URIRef(dbpedia_url)))
#                 g.add((URIRef(prefix+uri_mention), SCHEMA.name, Literal(dbpedia_name)))

                g.add((URIRef(prefix+uri_review), SCHEMA.mentions, URIRef(dbpedia_url)))

        g.add((URIRef(prefix+uri_review), SCHEMA.reviewRating, URIRef(prefix+uri_rating)))
        
        #matched claim review
        if row['CR Doc'].tolist()[0]==row['CR Doc'].tolist()[0]:
            cr_doc = json.loads(row['CR Doc'].tolist()[review])[0]

            identifier_cr = 'claim-review'+cr_doc['claim_text'][0]+cr_doc['label']+cr_doc['review_url']
            uri_cr = 'claim-review/'+uri_generator(identifier_cr)
            
            g.add((URIRef(prefix+uri), CIMPLE.related, URIRef(prefix+uri_cr)))
            
#             g.add((URIRef(prefix+uri_cr), RDF.type, SCHEMA.ClaimReview))

#             author = cr_doc['fact_checker']['name']
#             website = cr_doc['fact_checker']['website']
#             identifier_author = 'organization'+str(author)
            
#             uri_author = 'organization/'+uri_generator(identifier_author)

#             g.add((URIRef(prefix+uri_author), RDF.type, SCHEMA.Organization))
#             g.add((URIRef(prefix+uri_author), SCHEMA.name, Literal(author)))
#             g.add((URIRef(prefix+uri_author), SCHEMA.url, URIRef(website)))

#             g.add((URIRef(prefix+uri_cr), SCHEMA.author, URIRef(prefix+uri_author)))

#             date = cr_doc['reviews'][0]['date_published']
#             date = datetime.strptime(date, '%Y-%m-%d')
#             g.add((URIRef(prefix+uri_cr), SCHEMA.datePublished, Literal(date, datatype=XSD.date)))

#             url = cr_doc['review_url']
#             url = url.replace(' ', '')
#             g.add((URIRef(prefix+uri_cr), SCHEMA.url, URIRef(url)))

#             language = cr_doc['fact_checker']['language']
#             g.add((URIRef(prefix+uri_cr), SCHEMA.inLanguage, Literal(language)))

#             uri_normalized_rating = 'rating/'+cr_doc['reviews'][0]['label']

#             g.add((URIRef(prefix+uri_normalized_rating), RDF.type, SCHEMA.Rating))
#             g.add((URIRef(prefix+uri_normalized_rating), SCHEMA.ratingValue, Literal(cr_doc['reviews'][0]['label'])))

#             g.add((URIRef(prefix+uri_cr), CIMPLE.normalizedReviewRating, URIRef(prefix+uri_normalized_rating)))

#             uri_original_rating = 'rating/'+uri_generator(cr_doc['reviews'][0]['original_label'])

#             g.add((URIRef(prefix+uri_original_rating), RDF.type, SCHEMA.Rating))
#             g.add((URIRef(prefix+uri_original_rating), SCHEMA.ratingValue, Literal(cr['reviews'][0]['original_label'].replace('_', ' '))))

#             g.add((URIRef(prefix+uri_cr), SCHEMA.reviewRating, URIRef(prefix+uri_original_rating)))


#             claim = cr_doc['claim_text'][0]
#             identifier_claim = 'claim'+claim
#             uri_claim = 'claim/'+uri_generator(identifier_claim)

#             #SCHEMA.Claim has not yet been integrated
#             #This term is proposed for full integration into Schema.org, pending implementation feedback and adoption from applications and websites. 
#             g.add((URIRef(prefix+uri_claim),RDF.type, SCHEMA.Claim))

#             g.add((URIRef(prefix+uri_cr), SCHEMA.itemReviewed, URIRef(prefix+uri_claim)))

#             text_claim = claim
#             text_claim = normalize_text(text_claim)
#             g.add((URIRef(prefix+uri_claim),SCHEMA.text, Literal(text_claim)))
            
#             appearances = cr_doc['appearances']
#             for a in appearances:
#                 identifier_appearance = 'appearance'+str(a)
#                 uri_appearance = 'appearance/'+uri_generator(identifier_appearance)
#                 g.add((URIRef(prefix+uri_appearance), RDF.type, SCHEMA.CreativeWork))
#                 g.add((URIRef(prefix+uri_appearance), SCHEMA.url, URIRef(a)))
#                 g.add((URIRef(prefix+uri_claim), SCHEMA.appearance, URIRef(prefix+uri_appearance)))
                
#             if prefix+uri_cr in d_factors_cr:
#                 factors_cr = d_factors_cr[prefix+uri_cr]

#                 e = factors_cr[1]
#                 if e != 'None':
#                     g.add((URIRef(prefix+uri_claim), CIMPLE.hasEmotion, URIRef(prefix+'emotion/'+str(e.lower()))))
#                 b = factors_cr[2]
#                 g.add((URIRef(prefix+uri_claim), CIMPLE.hasPoliticalLeaning, URIRef(prefix+'political-leaning/'+str(b.lower()))))
#                 s = factors_cr[3]
#                 g.add((URIRef(prefix+uri_claim), CIMPLE.hasSentiment, URIRef(prefix+'sentiment/'+str(s.lower()))))
#                 cons_i = json.loads(factors_cr[4])
#                 for k in range(0, len(cons_i)):
#                     if cons_i[k] == 1:
#                         c = CONSPIRACIES[k]
#                         g.add((URIRef(prefix+uri_claim), CIMPLE.mentionsConspiracy, URIRef(prefix+'conspiracy/'+str(c.replace(' ', '_').lower()))))
#                     elif cons_i[k] == 2:
#                         c = CONSPIRACIES[k]
#                         g.add((URIRef(prefix+uri_claim), CIMPLE.promotesConspiracy, URIRef(prefix+'conspiracy/'+str(c.replace(' ', '_').lower()))))

#                 entities = factors_cr[0]
#                 if 'Resources' in entities:
#                     for e in entities['Resources']:
#                         dbpedia_url = e['@URI']
# #                         uri_mention = 'entity/'+uri_generator(dbpedia_url)
# #                         dbpedia_name = dbpedia_url.split('/')[-1].replace('_', ' ')                

# #                         g.add((URIRef(prefix+uri_mention), RDF.type, SCHEMA.Thing))
# #                         e_types = e['@types'].split(',')
# #                         for t in e_types:
# #                             if "Wikidata" in t:
# #                                 g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(WIKI_prefix+t.split(':')[1])))
# #                             if "DBpedia" in t:
# #                                 g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(DB_prefix+t.split(':')[1])))

# #                         g.add((URIRef(prefix+uri_mention), SCHEMA.url, URIRef(dbpedia_url)))
# #                         g.add((URIRef(prefix+uri_mention), SCHEMA.name, Literal(dbpedia_name)))

#                         g.add((URIRef(prefix+uri_claim), SCHEMA.mentions, URIRef(dbpedia_url)))

            
            

        



100%|███████████████████████████████████████████| 6563/6563 [03:56<00:00, 27.75it/s]


In [220]:
len(g)

72932

In [221]:
g.serialize(destination="/data/peskine/KGs/birdwatch-v3.ttl")

<Graph identifier=N92d07c3e633c437ba3f772d80e1384b8 (<class 'rdflib.graph.Graph'>)>

## MediaEval

In [189]:
me_path = '/data/peskine/mediaeval22/'

df_me = pd.read_csv(me_path+'task-1-train-and-test-merged.csv')
df_me

Unnamed: 0,tweet_id,class_label_for_Suppressed_cures_category,class_label_for_Behaviour_and_Mind_Control_category,class_label_for_Antivax_category,class_label_for_Fake_virus_category,class_label_for_Intentional_Pandemic_category,class_label_for_Harmful_Radiation_Influence_category,class_label_for_Population_reduction_Control_category,class_label_for_New_World_Order_category,class_label_for_Satanism_category,tweet_text
0,1,1,1,1,3,1,3,1,1,1,None! But I just saw a very frightening video ...
1,2,1,1,1,1,1,1,1,1,1,How many CV Deaths were 80 yrs or older? Stats...
2,3,1,1,1,3,1,1,1,1,1,Remember this is all based on flawed data! Flu...
3,4,1,1,1,1,1,1,1,1,1,Why did trump believe China over our own Intel...
4,5,1,1,1,1,1,1,1,1,1,There are no more kids in cages there's a push...
...,...,...,...,...,...,...,...,...,...,...,...
2730,30826,1,1,1,1,3,1,3,1,1,If Hillary were president we would have lost o...
2731,30827,1,1,1,1,2,1,1,1,1,Can’t believe after all this time people are s...
2732,30828,1,1,1,1,3,1,3,1,1,If you believe that all of this chaos and deat...
2733,30829,1,1,1,1,1,1,1,1,1,"...what? Qanon isn't ""against covid"" or ""again..."


In [190]:
df = pd.read_csv(me_path+'factors.csv')
emotions = df['emotions'].tolist()
biases = df['biases'].tolist()
sentiments = df['sentiments'].tolist()
cons = df['conspiracies'].tolist()
with open(me_path+"entities.json", "r") as outfile:
    ents = json.load(outfile)
len(df), len(ents)

(2735, 2735)

### ME - Graph

In [191]:
g = Graph()

In [192]:
for i in trange(0, len(df_me)):
    text = df_me["tweet_text"].tolist()[i]
    identifier = 'tweet'+str(text)
    uri = 'tweet/'+uri_generator(identifier)
    
    g.add((URIRef(prefix+uri), RDF.type, SCHEMA.SocialMediaPosting))
    
    text = normalize_text(text)
    g.add((URIRef(prefix+uri), SCHEMA.text, Literal(text)))
    
    r = measure_readability(text)
    g.add((URIRef(prefix+uri), CIMPLE.readability_score, Literal(r)))

    
    e = emotions[i]
    if e != 'None':
        g.add((URIRef(prefix+uri), CIMPLE.hasEmotion, URIRef(prefix+'emotion/'+str(e.lower()))))
    s = sentiments[i]
    g.add((URIRef(prefix+uri), CIMPLE.hasSentiment, URIRef(prefix+'sentiment/'+str(s.lower()))))
    b = biases[i]
    g.add((URIRef(prefix+uri), CIMPLE.hasPoliticalLeaning, URIRef(prefix+'political-leaning/'+str(b.lower()))))
    gt = df_me.iloc[i,1:10].tolist()
    for c in range(0, 9):
        if gt[c]==2:
            g.add((URIRef(prefix+uri), CIMPLE.mentionsConspiracy, URIRef(prefix+'conspiracy/'+CONSPIRACIES[c].replace(' ', '_'))))
        elif gt[c]==3:
            g.add((URIRef(prefix+uri), CIMPLE.promotesConspiracy, URIRef(prefix+'conspiracy/'+CONSPIRACIES[c].replace(' ', '_'))))
    
    entities = ents[i]
    if 'Resources' in entities:
        for ent in entities['Resources']:
            dbpedia_url = ent['@URI']
#             uri_mention = 'entity/'+uri_generator(dbpedia_url)
#             dbpedia_name = dbpedia_url.split('/')[-1].replace('_', ' ')                
            
#             g.add((URIRef(prefix+uri_mention), RDF.type, SCHEMA.Thing))
#             e_types = e['@types'].split(',')
#             for t in e_types:
#                 if "Wikidata" in t:
#                     g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(WIKI_prefix+t.split(':')[1])))
#                 if "DBpedia" in t:
#                     g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(DB_prefix+t.split(':')[1])))

#             g.add((URIRef(prefix+uri_mention), SCHEMA.url, URIRef(dbpedia_url)))
#             g.add((URIRef(prefix+uri_mention), SCHEMA.name, Literal(dbpedia_name)))

            g.add((URIRef(prefix+uri), SCHEMA.mentions, URIRef(dbpedia_url)))     
    
        

            

100%|██████████████████████████████████████████| 2735/2735 [00:04<00:00, 666.39it/s]


In [193]:
len(g)

28745

In [194]:
g.serialize(destination="/data/peskine/KGs/mediaeval-v3.ttl")

<Graph identifier=Ne3140d6c7cde404e810eeeacac17c20f (<class 'rdflib.graph.Graph'>)>

## Check-That 2022

In [195]:
ct_path = '/data/peskine/clef2022-checkthat-lab/task2/'

df_tweets = pd.read_csv(ct_path+'data/subtask-2a--english/'+'tweets-train-dev.tsv', sep='\t',  names=['tweets', 'text'])
df_articles = pd.read_csv(ct_path+'data/subtask-2a--english/'+'filtered_dates_CR_doc.csv')

In [196]:
df_match_train = pd.read_csv(ct_path+'data/subtask-2a--english/'+'qrels-train.tsv', sep='\t', names=['tweets', '0', 'vclaims', '1'])
df_match_dev = pd.read_csv(ct_path+'data/subtask-2a--english/'+'qrels-dev.tsv', sep='\t', names=['tweets', '0', 'vclaims', '1'])
df_match_full = df_match_train.append(df_match_dev)

In [197]:
factors_articles = pd.read_csv(ct_path+'articles_factors.csv')
emotions = factors_articles['emotions'].tolist()
biases = factors_articles['biases'].tolist()
sentiments = factors_articles['sentiments'].tolist()
cons = factors_articles['conspiracies'].tolist()
with open(ct_path+"entities_articles.json", "r") as outfile:
    ents_articles = json.load(outfile)
len(factors_articles), len(ents_articles)

(2543, 2543)

### CT - Graph

In [198]:
g = Graph()

In [199]:
count = 0
for i in trange(0, len(df_articles)):
    row = df_articles.iloc[i, :]
    if row['CR Doc']==row['CR Doc']:
        count+=1

100%|████████████████████████████████████████| 2543/2543 [00:00<00:00, 10153.20it/s]


In [200]:
errors = []
for i in trange(0, len(df_articles)):
    row = df_articles.iloc[i, :]
    if row['CR Doc']==row['CR Doc']:
        cr_doc = json.loads(row['CR Doc'])
        identifier = 'claim-review'+cr_doc['claim_text'][0]+cr_doc['label']+cr_doc['review_url']

    else:
        identifier = 'claim-review'+row['vclaim_id']
        uri = 'claim-review/'+uri_generator(identifier)
        g.add((URIRef(prefix+uri), RDF.type, SCHEMA.ClaimReview))

    
        identifier_sno = 'organization/'+'Snopes.com'
        uri_sno = 'organization/'+uri_generator(identifier_sno)
        g.add((URIRef(prefix+uri), SCHEMA.author, URIRef(prefix+uri_sno)))
    
        dateline = row['date']
        if dateline == dateline:
            date = datetime.strptime(dateline, "%Y-%m-%d") 
            g.add((URIRef(prefix+uri), SCHEMA.dateline, Literal(date.strftime('%Y-%m-%d'), datatype=XSD.date)))
        
        vclaim = row['vclaim']
        if vclaim==vclaim:        
            identifier_vclaim = 'claim'+str(vclaim)
            uri_vclaim = 'claim/'+uri_generator(identifier_vclaim)
            vclaim = normalize_text(vclaim)
            g.add((URIRef(prefix+uri_vclaim), RDF.type, SCHEMA.Claim))
            g.add((URIRef(prefix+uri_vclaim), SCHEMA.text, Literal(vclaim)))

            g.add((URIRef(prefix+uri), SCHEMA.itemReviewed, URIRef(prefix+uri_vclaim)))

            r = measure_readability(vclaim)
            g.add((URIRef(prefix+uri_vclaim), CIMPLE.readability_score, Literal(r)))
    
    
    headline = row['title']
    if headline==headline:
        headline = normalize_text(headline)
        g.add((URIRef(prefix+uri), SCHEMA.headline, Literal(headline)))
            
    body = row['subtitle']
    if body==body:
        body = normalize_text(body)
        g.add((URIRef(prefix+uri), SCHEMA.alternativeHeadline, Literal(body)))
    
    r = measure_readability(headline+'\n'+body)
    g.add((URIRef(prefix+uri), CIMPLE.readability_score, Literal(r)))

    e = emotions[i]
    if e != 'None':
        g.add((URIRef(prefix+uri), CIMPLE.hasEmotion, URIRef(prefix+'emotion/'+str(e.lower()))))
    s = sentiments[i]
    g.add((URIRef(prefix+uri), CIMPLE.hasSentiment, URIRef(prefix+'sentiment/'+str(s.lower()))))
    b = biases[i]
    g.add((URIRef(prefix+uri), CIMPLE.hasPoliticalLeaning, URIRef(prefix+'political-leaning/'+str(b.lower()))))
    cons_i = json.loads(cons[i])
    for k in range(0, len(cons_i)):
        if cons_i[k] == 1:
            c = CONSPIRACIES[k]
            g.add((URIRef(prefix+uri), CIMPLE.mentionsConspiracy, URIRef(prefix+'conspiracy/'+str(c.replace(' ', '_').lower()))))
        elif cons_i[k] == 2:
            c = CONSPIRACIES[k]
            g.add((URIRef(prefix+uri), CIMPLE.promotesConspiracy, URIRef(prefix+'conspiracy/'+str(c.replace(' ', '_').lower()))))
    
    entities = ents[i]
    if 'Resources' in entities:
        for ent in entities['Resources']:
            dbpedia_url = ent['@URI']
#             uri_mention = 'entity/'+uri_generator(dbpedia_url)
#             dbpedia_name = dbpedia_url.split('/')[-1].replace('_', ' ')                
            
#             g.add((URIRef(prefix+uri_mention), RDF.type, SCHEMA.Thing))
#             e_types = e['@types'].split(',')
#             for t in e_types:
#                 if "Wikidata" in t:
#                     g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(WIKI_prefix+t.split(':')[1])))
#                 if "DBpedia" in t:
#                     g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(DB_prefix+t.split(':')[1])))

#             g.add((URIRef(prefix+uri_mention), SCHEMA.url, URIRef(dbpedia_url)))
#             g.add((URIRef(prefix+uri_mention), SCHEMA.name, Literal(dbpedia_name)))

            g.add((URIRef(prefix+uri), SCHEMA.mentions, URIRef(dbpedia_url)))

#     if row['CR Doc']==row['CR Doc']:
#         cr_doc = json.loads(row['CR Doc'])
        
#         author = cr_doc['fact_checker']['name']
#         website = cr_doc['fact_checker']['website']
#         identifier_author = 'organization'+str(author)
#         uri_author = 'organization/'+uri_generator(identifier_author)

#         g.add((URIRef(prefix+uri_author), RDF.type, SCHEMA.Organization))
#         g.add((URIRef(prefix+uri_author), SCHEMA.name, Literal(author)))
#         g.add((URIRef(prefix+uri_author), SCHEMA.url, URIRef(website)))

#         g.add((URIRef(prefix+uri), SCHEMA.author, URIRef(prefix+uri_author)))

#         date = cr_doc['reviews'][0]['date_published']
#         date = datetime.strptime(date, '%Y-%m-%d')
#         g.add((URIRef(prefix+uri), SCHEMA.datePublished, Literal(date, datatype=XSD.date)))

#         url = cr_doc['review_url']
#         url = url.replace(' ', '')
#         g.add((URIRef(prefix+uri), SCHEMA.url, URIRef(url)))

#         language = cr_doc['fact_checker']['language']
#         g.add((URIRef(prefix+uri), SCHEMA.inLanguage, Literal(language)))

#         uri_normalized_rating = 'rating/'+cr_doc['reviews'][0]['label']

#         g.add((URIRef(prefix+uri_normalized_rating), RDF.type, SCHEMA.Rating))
#         g.add((URIRef(prefix+uri_normalized_rating), SCHEMA.ratingValue, Literal(cr_doc['reviews'][0]['label'])))

#         g.add((URIRef(prefix+uri), CIMPLE.normalizedReviewRating, URIRef(prefix+uri_normalized_rating)))

#         uri_original_rating = 'rating/'+uri_generator(cr_doc['reviews'][0]['original_label'])

#         g.add((URIRef(prefix+uri_original_rating), RDF.type, SCHEMA.Rating))
#         g.add((URIRef(prefix+uri_original_rating), SCHEMA.ratingValue, Literal(cr['reviews'][0]['original_label'].replace('_', ' '))))

#         g.add((URIRef(prefix+uri), SCHEMA.reviewRating, URIRef(prefix+uri_original_rating)))


#         claim = cr_doc['claim_text'][0]
#         identifier_claim = 'claim'+claim
#         uri_claim = 'claim/'+uri_generator(identifier_claim)

#         #SCHEMA.Claim has not yet been integrated
#         #This term is proposed for full integration into Schema.org, pending implementation feedback and adoption from applications and websites. 
#         g.add((URIRef(prefix+uri_claim),RDF.type, SCHEMA.Claim))
        
#         text_claim = claim
#         text_claim = normalize_text(text_claim)
#         g.add((URIRef(prefix+uri_claim),SCHEMA.text, Literal(text_claim)))

#         g.add((URIRef(prefix+uri), SCHEMA.itemReviewed, URIRef(prefix+uri_claim)))

#         appearances = cr_doc['appearances']
#         for a in appearances:
#             identifier_appearance = 'appearance'+str(a)
#             uri_appearance = 'appearance/'+uri_generator(identifier_appearance)
#             g.add((URIRef(prefix+uri_appearance), RDF.type, SCHEMA.CreativeWork))
#             g.add((URIRef(prefix+uri_appearance), SCHEMA.url, URIRef(a)))
#             g.add((URIRef(prefix+uri_claim), SCHEMA.appearance, URIRef(prefix+uri_appearance)))

#         if prefix+uri in d_factors_cr:
#             factors_cr = d_factors_cr[prefix+uri_cr]

#             e = factors_cr[1]
#             if e != 'None':
#                 g.add((URIRef(prefix+uri_claim), CIMPLE.hasEmotion, URIRef(prefix+'emotion/'+str(e.lower()))))
#             b = factors_cr[2]
#             g.add((URIRef(prefix+uri_claim), CIMPLE.hasPoliticalLeaning, URIRef(prefix+'political-leaning/'+str(b.lower()))))
#             s = factors_cr[3]
#             g.add((URIRef(prefix+uri_claim), CIMPLE.hasSentiment, URIRef(prefix+'sentiment/'+str(s.lower()))))
#             cons_i = json.loads(factors_cr[4])
#             for k in range(0, len(cons_i)):
#                 if cons_i[k] == 1:
#                     c = CONSPIRACIES[k]
#                     g.add((URIRef(prefix+uri_claim), CIMPLE.mentionsConspiracy, URIRef(prefix+'conspiracy/'+str(c.replace(' ', '_').lower()))))
#                 elif cons_i[k] == 2:
#                     c = CONSPIRACIES[k]
#                     g.add((URIRef(prefix+uri_claim), CIMPLE.promotesConspiracy, URIRef(prefix+'conspiracy/'+str(c.replace(' ', '_').lower()))))

#             entities = factors_cr[0]
#             if 'Resources' in entities:
#                 for e in entities['Resources']:
#                     dbpedia_url = e['@URI']
# #                     uri_mention = 'entity/'+uri_generator(dbpedia_url)
# #                     dbpedia_name = dbpedia_url.split('/')[-1].replace('_', ' ')                

# #                     g.add((URIRef(prefix+uri_mention), RDF.type, SCHEMA.Thing))
# #                     e_types = e['@types'].split(',')
# #                     for t in e_types:
# #                         if "Wikidata" in t:
# #                             g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(WIKI_prefix+t.split(':')[1])))
# #                         if "DBpedia" in t:
# #                             g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(DB_prefix+t.split(':')[1])))

# #                     g.add((URIRef(prefix+uri_mention), SCHEMA.url, URIRef(dbpedia_url)))
# #                     g.add((URIRef(prefix+uri_mention), SCHEMA.name, Literal(dbpedia_name)))

#                     g.add((URIRef(prefix+uri_claim), SCHEMA.mentions, URIRef(dbpedia_url)))


100%|█████████████████████████████████████████| 2543/2543 [00:02<00:00, 1213.90it/s]


In [201]:
factors_tweets = pd.read_csv(ct_path+'tweets_factors.csv')
emotions = factors_tweets['emotions'].tolist()
biases = factors_tweets['biases'].tolist()
sentiments = factors_tweets['sentiments'].tolist()
cons = factors_tweets['conspiracies'].tolist()
with open(ct_path+"entities_tweets.json", "r") as outfile:
    ents_tweets = json.load(outfile)
len(factors_tweets), len(ents_tweets)

(1199, 1199)

In [202]:
errors = []
for i in trange(0, len(df_tweets)):
        
    text = df_tweets['text'].tolist()[i]
    identifier = 'tweet'+text
    uri = 'tweet/'+uri_generator(identifier)
    g.add((URIRef(prefix+uri), RDF.type, SCHEMA.SocialMediaPosting))
    
    text=normalize_text(text)
    g.add((URIRef(prefix+uri), SCHEMA.text, Literal(text)))
    
    r = measure_readability(text)
    g.add((URIRef(prefix+uri), CIMPLE.readability_score, Literal(r)))
        
    e = emotions[i]
    if e != 'None':
        g.add((URIRef(prefix+uri), CIMPLE.hasEmotion, URIRef(prefix+'emotion/'+str(e.lower()))))
    s = sentiments[i]
    g.add((URIRef(prefix+uri), CIMPLE.hasSentiment, URIRef(prefix+'sentiment/'+str(s.lower()))))
    b = biases[i]
    g.add((URIRef(prefix+uri), CIMPLE.hasBias, URIRef(prefix+'bias/'+str(b.lower()))))
    cons_i = json.loads(cons[i])
    for k in range(0, len(cons_i)):
        if cons_i[k] == 1:
            c = CONSPIRACIES[k]
            g.add((URIRef(prefix+uri), CIMPLE.mentionsConspiracy, URIRef(prefix+'conspiracy/'+str(c.replace(' ', '_').lower()))))
        elif cons_i[k] == 2:
            c = CONSPIRACIES[k]
            g.add((URIRef(prefix+uri), CIMPLE.promotesConspiracy, URIRef(prefix+'conspiracy/'+str(c.replace(' ', '_').lower()))))
    
    entities = ents[i]
    if 'Resources' in entities:
        for ent in entities['Resources']:
            dbpedia_url = ent['@URI']
#             uri_mention = 'entity/'+uri_generator(dbpedia_url)
#             dbpedia_name = dbpedia_url.split('/')[-1].replace('_', ' ')                
            
#             g.add((URIRef(prefix+uri_mention), RDF.type, SCHEMA.Thing))
#             e_types = e['@types'].split(',')
#             for t in e_types:
#                 if "Wikidata" in t:
#                     g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(WIKI_prefix+t.split(':')[1])))
#                 if "DBpedia" in t:
#                     g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(DB_prefix+t.split(':')[1])))

#             g.add((URIRef(prefix+uri_mention), SCHEMA.url, URIRef(dbpedia_url)))
#             g.add((URIRef(prefix+uri_mention), SCHEMA.name, Literal(dbpedia_name)))

            g.add((URIRef(prefix+uri), SCHEMA.mentions, URIRef(dbpedia_url)))
    
    
    tweet_id = df_tweets['tweets'].tolist()[i]
    matches = df_match_full[df_match_full['tweets']==tweet_id]['vclaims'].tolist()
    for m in matches:
        row = df_articles[df_articles['vclaim_id']==m]
        if len(row)>0:
            if row['CR Doc'].tolist()[0]==row['CR Doc'].tolist()[0]:
                cr_doc = json.loads(row['CR Doc'].tolist()[0])
                identifier_article = 'claim-review'+cr_doc['claim_text'][0]+cr_doc['label']+cr_doc['review_url']
            else:
                identifier_article = 'claim-review'+row['vclaim_id']

            uri_article = 'claim-review/'+uri_generator(identifier_article)
            
            g.add((URIRef(prefix+uri), CIMPLE.related, URIRef(prefix+uri_article)))
            
            
    

100%|██████████████████████████████████████████| 1196/1196 [00:02<00:00, 562.86it/s]


In [203]:
len(g)

21859

In [204]:
g.serialize(destination="/data/peskine/KGs/check-that-v3.ttl")

<Graph identifier=N10e0bb02946a4685b6351255cb911d6d (<class 'rdflib.graph.Graph'>)>

## AFP

In [429]:
afp_path = '/data/peskine/AFP/'
df_afp = pd.read_csv(afp_path+'/AFP.csv')
df_afp

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,ProviderId,DateId,NewsItemId,RevisionId,PublicIdentifier,NewsItemType,FirstCreated,ThisRevisionCreated,Status,Urgency,AssociatedWith,DateLine,HeadLine,Creator,Language,SubjectCodes,Keywords,FirstParagraph,Body,Location
0,afp.com,20200505T190606Z,TX-PAR-KZM80,1,urn:newsml:afp.com:20200505T190606Z:TX-PAR-KZM...,,20200505T190606+0000,20200505T190606+0000,,,,"Monrovia, May 5, 2020 (AFP) -",Several dead in Liberian mine collapse,,,"['04012005', '03013000', '04012000', '04000000...","['Liberia', 'mining', 'accident']",Up to 50 people have died in a mine collapse i...,\nUp to 50 people have died in a mine collapse...,"[['Country', 'LBR'], ['City', 'Monrovia']]"
1,afp.com,20200505T205404Z,TX-PAR-KZO17,1,urn:newsml:afp.com:20200505T205404Z:TX-PAR-KZO...,,20200505T205404+0000,20200505T205404+0000,,,,"Paris, May 5, 2020 (AFP) -","Tennis establishes virus player fund, raises $...",,,"['15065000', '15000000', '15000000']","['tennis', 'health', 'virus', 'ATP', 'WTA', 'I...",A fund for tennis players hardest hit by the c...,\nA fund for tennis players hardest hit by the...,"[['Country', 'FRA'], ['City', 'Paris']]"
2,afp.com,20200505T115036Z,TX-PAR-KZE46,1,urn:newsml:afp.com:20200505T115036Z:TX-PAR-KZE...,,20200505T115036+0000,20200505T115036+0000,,,,"Brussels, May 5, 2020 (AFP) -",Brussels asserts primacy of EU law after Germa...,,,"['04008001', '11002000', '04008020', '11013000...","['ECB', 'eurozone', 'economy', 'Germany', 'cou...",The EU insisted that European law trumps that ...,\nThe EU insisted that European law trumps tha...,"[['Country', 'BEL'], ['City', 'Brussels']]"
3,afp.com,20200505T140217Z,TX-PAR-KZG96,1,urn:newsml:afp.com:20200505T140217Z:TX-PAR-KZG...,,20200505T140217+0000,20200505T140217+0000,,,,"Beijing, May 5, 2020 (AFP) -",China says launch of key new space rocket 'suc...,,,"['13008000', '13007000', '13006000', '13000000']","['China', 'space', 'launch']",China on Tuesday successfully launched a new r...,\nChina on Tuesday successfully launched a new...,"[['Country', 'CHN'], ['City', 'Beijing']]"
4,afp.com,20200505T085721Z,TX-PAR-KZB30,1,urn:newsml:afp.com:20200505T085721Z:TX-PAR-KZB...,,20200505T085721+0000,20200505T085721+0000,,,,"Hong Kong, May 5, 2020 (AFP) -",Hong Kong to lift major social restrictions as...,,,"['07000000', '07001002', '07001001', '07001000']","['Health', 'virus', 'HongKong', 'Carrie Lam']",Hong Kong on Tuesday announced plans to ease m...,\nHong Kong on Tuesday announced plans to ease...,"[['Country', 'CHN'], ['City', 'Hong Kong']]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252456,afp.com,20210127T093921Z,TX-PAR-SRO59,1,urn:newsml:afp.com:20210127T093921Z:TX-PAR-SRO...,,20210127T093921+0000,20210127T093921+0000,,,,"Sydney, Jan 27, 2021 (AFP) -",Australia ordered to pay migrants for privacy ...,,,"['02007001', '11007000', '02006000', '11000000...","['Australia', 'law', 'rights']",An Australian watchdog on Wednesday ordered th...,\nAn Australian watchdog on Wednesday ordered ...,"[['Country', 'AUS'], ['City', 'Sydney']]"
252457,afp.com,20210127T095533Z,TX-PAR-SRO83,1,urn:newsml:afp.com:20210127T095533Z:TX-PAR-SRO...,,20210127T095533+0000,20210127T095533+0000,,,,"Paris, Jan 27, 2021 (AFP) -",Napoleon's account of legendary Battle of Aust...,,,"['10000000', '13003002', '16009000', '01000000...","['France', 'history', 'Napoleon', 'Napoleon Bo...",Napoleon Bonaparte's account of his victory at...,\nNapoleon Bonaparte's account of his victory ...,"[['Country', 'FRA'], ['City', 'Paris']]"
252458,afp.com,20210127T183628Z,TX-PAR-SSD41,1,urn:newsml:afp.com:20210127T183628Z:TX-PAR-SSD...,,20210127T183628+0000,20210127T183628+0000,,,,"New York, Jan 27, 2021 (AFP) -",US Open revives qualifying for showdown at Tor...,,,"['15027000', '15000000', '15000000']","['Golf', 'USPGA', 'Open', 'USA']",The US Open will return to a qualifying format...,\nThe US Open will return to a qualifying form...,"[['Country', 'USA'], ['Area', 'State of New Yo..."
252459,afp.com,20210127T175934Z,TX-PAR-SSC35,1,urn:newsml:afp.com:20210127T175934Z:TX-PAR-SSC...,,20210127T175934+0000,20210127T175934+0000,,,,"Yaoundé, Jan 27, 2021 (AFP) -",53 killed in Cameroon bus blaze,,,"['03013000', '04015000', '04000000', '04000000...","['Cameroon', 'accident', 'transport', 'bus']",Fifty-three people died Wednesday when a bus c...,\nFifty-three people died Wednesday when a bus...,"[['Country', 'CMR'], ['Area', 'Région du Centr..."


In [430]:
dates = df_afp['DateId']
dates = [datetime.strptime(d[:8], "%Y%m%d") for d in dates]
df_afp['dates'] = dates
df_afp['dates']

0        2020-05-05
1        2020-05-05
2        2020-05-05
3        2020-05-05
4        2020-05-05
            ...    
252456   2021-01-27
252457   2021-01-27
252458   2021-01-27
252459   2021-01-27
252460   2021-01-27
Name: dates, Length: 252461, dtype: datetime64[ns]

In [431]:
df_afp = df_afp.loc[(df_afp['dates'] >= start_date) & (df_afp['dates'] <= end_date)]
df_afp

Unnamed: 0,ProviderId,DateId,NewsItemId,RevisionId,PublicIdentifier,NewsItemType,FirstCreated,ThisRevisionCreated,Status,Urgency,...,DateLine,HeadLine,Creator,Language,SubjectCodes,Keywords,FirstParagraph,Body,Location,dates
0,afp.com,20200505T190606Z,TX-PAR-KZM80,1,urn:newsml:afp.com:20200505T190606Z:TX-PAR-KZM...,,20200505T190606+0000,20200505T190606+0000,,,...,"Monrovia, May 5, 2020 (AFP) -",Several dead in Liberian mine collapse,,,"['04012005', '03013000', '04012000', '04000000...","['Liberia', 'mining', 'accident']",Up to 50 people have died in a mine collapse i...,\nUp to 50 people have died in a mine collapse...,"[['Country', 'LBR'], ['City', 'Monrovia']]",2020-05-05
1,afp.com,20200505T205404Z,TX-PAR-KZO17,1,urn:newsml:afp.com:20200505T205404Z:TX-PAR-KZO...,,20200505T205404+0000,20200505T205404+0000,,,...,"Paris, May 5, 2020 (AFP) -","Tennis establishes virus player fund, raises $...",,,"['15065000', '15000000', '15000000']","['tennis', 'health', 'virus', 'ATP', 'WTA', 'I...",A fund for tennis players hardest hit by the c...,\nA fund for tennis players hardest hit by the...,"[['Country', 'FRA'], ['City', 'Paris']]",2020-05-05
2,afp.com,20200505T115036Z,TX-PAR-KZE46,1,urn:newsml:afp.com:20200505T115036Z:TX-PAR-KZE...,,20200505T115036+0000,20200505T115036+0000,,,...,"Brussels, May 5, 2020 (AFP) -",Brussels asserts primacy of EU law after Germa...,,,"['04008001', '11002000', '04008020', '11013000...","['ECB', 'eurozone', 'economy', 'Germany', 'cou...",The EU insisted that European law trumps that ...,\nThe EU insisted that European law trumps tha...,"[['Country', 'BEL'], ['City', 'Brussels']]",2020-05-05
3,afp.com,20200505T140217Z,TX-PAR-KZG96,1,urn:newsml:afp.com:20200505T140217Z:TX-PAR-KZG...,,20200505T140217+0000,20200505T140217+0000,,,...,"Beijing, May 5, 2020 (AFP) -",China says launch of key new space rocket 'suc...,,,"['13008000', '13007000', '13006000', '13000000']","['China', 'space', 'launch']",China on Tuesday successfully launched a new r...,\nChina on Tuesday successfully launched a new...,"[['Country', 'CHN'], ['City', 'Beijing']]",2020-05-05
4,afp.com,20200505T085721Z,TX-PAR-KZB30,1,urn:newsml:afp.com:20200505T085721Z:TX-PAR-KZB...,,20200505T085721+0000,20200505T085721+0000,,,...,"Hong Kong, May 5, 2020 (AFP) -",Hong Kong to lift major social restrictions as...,,,"['07000000', '07001002', '07001001', '07001000']","['Health', 'virus', 'HongKong', 'Carrie Lam']",Hong Kong on Tuesday announced plans to ease m...,\nHong Kong on Tuesday announced plans to ease...,"[['Country', 'CHN'], ['City', 'Hong Kong']]",2020-05-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252456,afp.com,20210127T093921Z,TX-PAR-SRO59,1,urn:newsml:afp.com:20210127T093921Z:TX-PAR-SRO...,,20210127T093921+0000,20210127T093921+0000,,,...,"Sydney, Jan 27, 2021 (AFP) -",Australia ordered to pay migrants for privacy ...,,,"['02007001', '11007000', '02006000', '11000000...","['Australia', 'law', 'rights']",An Australian watchdog on Wednesday ordered th...,\nAn Australian watchdog on Wednesday ordered ...,"[['Country', 'AUS'], ['City', 'Sydney']]",2021-01-27
252457,afp.com,20210127T095533Z,TX-PAR-SRO83,1,urn:newsml:afp.com:20210127T095533Z:TX-PAR-SRO...,,20210127T095533+0000,20210127T095533+0000,,,...,"Paris, Jan 27, 2021 (AFP) -",Napoleon's account of legendary Battle of Aust...,,,"['10000000', '13003002', '16009000', '01000000...","['France', 'history', 'Napoleon', 'Napoleon Bo...",Napoleon Bonaparte's account of his victory at...,\nNapoleon Bonaparte's account of his victory ...,"[['Country', 'FRA'], ['City', 'Paris']]",2021-01-27
252458,afp.com,20210127T183628Z,TX-PAR-SSD41,1,urn:newsml:afp.com:20210127T183628Z:TX-PAR-SSD...,,20210127T183628+0000,20210127T183628+0000,,,...,"New York, Jan 27, 2021 (AFP) -",US Open revives qualifying for showdown at Tor...,,,"['15027000', '15000000', '15000000']","['Golf', 'USPGA', 'Open', 'USA']",The US Open will return to a qualifying format...,\nThe US Open will return to a qualifying form...,"[['Country', 'USA'], ['Area', 'State of New Yo...",2021-01-27
252459,afp.com,20210127T175934Z,TX-PAR-SSC35,1,urn:newsml:afp.com:20210127T175934Z:TX-PAR-SSC...,,20210127T175934+0000,20210127T175934+0000,,,...,"Yaoundé, Jan 27, 2021 (AFP) -",53 killed in Cameroon bus blaze,,,"['03013000', '04015000', '04000000', '04000000...","['Cameroon', 'accident', 'transport', 'bus']",Fifty-three people died Wednesday when a bus c...,\nFifty-three people died Wednesday when a bus...,"[['Country', 'CMR'], ['Area', 'Région du Centr...",2021-01-27


In [432]:
df = pd.read_csv(afp_path+'factors.csv')
emotions = df['emotions'].tolist()
biases = df['biases'].tolist()
sentiments = df['sentiments'].tolist()
cons = df['conspiracies'].tolist()
with open(afp_path+"entities.json", "r") as outfile:
    ents = json.load(outfile)
len(df), len(ents), len(df_afp)

(193933, 193933, 193933)

### AFP - Graph

In [439]:
g = Graph()

In [149]:
for i in trange(0, len(df_afp)):
    
    row = df_afp.iloc[i, :]
    
    identifier = 'news-article'+str(row['PublicIdentifier'])
    uri = 'news-article'+uri_generator(identifier)
    g.add((URIRef(prefix+uri), RDF.type, SCHEMA.NewsArticle))
    
    identifier_afp = 'organization'+'AFP'
    uri_afp = 'organization/'+uri_generator(identifier_afp)
    g.add((URIRef(prefix+uri), SCHEMA.author, URIRef(prefix+uri_afp)))
    
    date = datetime.strptime(row['DateId'][:8], "%Y%m%d")
    if date == date:
        g.add((URIRef(prefix+uri), SCHEMA.dateline, Literal(date.strftime('%Y-%m-%d'), datatype=XSD.date)))
    
    headline = row['HeadLine']
    if headline==headline:
        if "News Advisory" not in headline:
            headline=normalize_text(headline)
            g.add((URIRef(prefix+uri), SCHEMA.headline, Literal(headline)))
    else:
        headline=""
        
    body = row['Body']
    if body==body:
        body = normalize_text(body)
        g.add((URIRef(prefix+uri), SCHEMA.articleBody, Literal(body)))
    else:
        body = ""
        
    r = measure_readability(headline+'\n'+body)
    g.add((URIRef(prefix+uri), CIMPLE.readability_score, Literal(r)))
        
    e = emotions[i]
    if e != 'None':
        g.add((URIRef(prefix+uri), CIMPLE.hasEmotion, URIRef(prefix+'emotion/'+str(e.lower()))))
    s = sentiments[i]
    g.add((URIRef(prefix+uri), CIMPLE.hasSentiment, URIRef(prefix+'sentiment/'+str(s.lower()))))
    b = biases[i]
    g.add((URIRef(prefix+uri), CIMPLE.hasPoliticalLeaning, URIRef(prefix+'political-leaning/'+str(b.lower()))))
    cons_i = json.loads(cons[i])
    for k in range(0, len(cons_i)):
        if cons_i[k] == 1:
            c = CONSPIRACIES[k]
            g.add((URIRef(prefix+uri), CIMPLE.mentionsConspiracy, URIRef(prefix+'conspiracy/'+str(c.replace(' ', '_').lower()))))
        elif cons_i[k] == 2:
            c = CONSPIRACIES[k]
            g.add((URIRef(prefix+uri), CIMPLE.promotesConspiracy, URIRef(prefix+'conspiracy/'+str(c.replace(' ', '_').lower()))))
    
    entities = ents[i]
    if 'Resources' in entities:
        for ent in entities['Resources']:
            dbpedia_url = ent['@URI']
#             uri_mention = 'entity/'+uri_generator(dbpedia_url)
#             dbpedia_name = dbpedia_url.split('/')[-1].replace('_', ' ')                
            
#             g.add((URIRef(prefix+uri_mention), RDF.type, SCHEMA.Thing))
#             e_types = e['@types'].split(',')
#             for t in e_types:
#                 if "Wikidata" in t:
#                     g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(WIKI_prefix+t.split(':')[1])))
#                 if "DBpedia" in t:
#                     g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(DB_prefix+t.split(':')[1])))

#             g.add((URIRef(prefix+uri_mention), SCHEMA.url, URIRef(dbpedia_url)))
#             g.add((URIRef(prefix+uri_mention), SCHEMA.name, Literal(dbpedia_name)))

            g.add((URIRef(prefix+uri), SCHEMA.mentions, URIRef(dbpedia_url)))


NameError: name 'df_afp' is not defined

In [441]:
len(g)

7409318

In [442]:
g.serialize(destination="/data/peskine/KGs/AFP-v2.ttl")

<Graph identifier=N87888a9667e449d2a002efe4604f10a4 (<class 'rdflib.graph.Graph'>)>

## Claim Reviews

In [205]:
cr_path = '/data/peskine/ClaimReviews20230510/'
claim_reviews_full = json.load(io.open(cr_path+'claim_reviews.json'))

claim_reviews_full

[{'claim_text': ['Misleading claim circulates in South Korea that military mail-in ballots for US presidential election found in a dumpster in PA were all cast for Trump'],
  'label': 'not_verifiable',
  'review_url': 'https://factcheck.afp.com/korean-social-media-posts-share-misleading-claim-about-ballots-found-trash-pennsylvania',
  'fact_checker': {'name': 'AFP fact checking',
   'country': 'France',
   'language': 'French',
   'website': 'https://www.afp.com/en',
   'ifcn_url': 'https://ifcncodeofprinciples.poynter.org/profile/afp-fact-checking',
   'avatar': 'https://ifcncodeofprinciples.poynter.org/storage/logos/afp-fact-checking_logo.png?v=1612201098',
   'domain': 'afp.com'},
  'appearances': ['https://www.facebook.com/permalink.php?id=100001539747977&story_fbid=3636842506377038'],
  'reviews': [{'label': 'not_verifiable',
    'original_label': 'Missing context',
    'review_rating': {'@type': 'Rating',
     'ratingValue': '2.5',
     'bestRating': '5',
     'worstRating': '1',

In [206]:
claim_reviews_time_range = []

for cr in claim_reviews_full:
    d = cr['reviews'][0]['date_published']
    
    if d and datetime.strptime(d, '%Y-%m-%d') >start_date and datetime.strptime(d, '%Y-%m-%d') < end_date:
        claim_reviews_time_range.append(cr)


claim_reviews = claim_reviews_time_range
len(claim_reviews)


43161

In [207]:
cr_en = []
for i in trange(0, len(claim_reviews)):
    text = claim_reviews[i]['claim_text'][0]
    try:
        code = Detector(text).language.code
        if code =='en':
            cr_en.append(claim_reviews[i])
    except:
        pass
claim_reviews = cr_en

100%|██████████████████████████████████████| 43161/43161 [00:00<00:00, 56594.54it/s]


In [208]:
df = pd.read_csv(cr_path+'factors.csv')
emotions = df['emotions'].tolist()
biases = df['biases'].tolist()
sentiments = df['sentiments'].tolist()
cons = df['conspiracies'].tolist()
with open(cr_path+"entities.json", "r") as outfile:
    ents = json.load(outfile)
len(df), len(ents)

(17996, 17996)

In [209]:
d_factors_cr = {}
for i in trange(0, len(claim_reviews)):
    cr = claim_reviews[i]
    identifier_cr = 'claim-review'+cr['claim_text'][0]+cr['label']+cr['review_url']
    uri = prefix+uri_generator(identifier_cr)
    
    d_factors_cr[uri] = [ents[i], emotions[i], biases[i], sentiments[i], cons[i]]

100%|█████████████████████████████████████| 17996/17996 [00:00<00:00, 216584.87it/s]


### CR - Graph

In [210]:
g = Graph()

In [211]:
URL_AVAILABLE_CHARS = """ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;="""

for i in trange(0, len(claim_reviews)):
    cr_doc = claim_reviews[i]
    
    identifier = 'claim-review'+cr_doc['claim_text'][0]+cr_doc['label']+cr_doc['review_url']
    uri = 'claim-review'+uri_generator(identifier)
    g.add((URIRef(prefix+uri), RDF.type, SCHEMA.ClaimReview))
    
    author = cr_doc['fact_checker']['name']
    website = cr_doc['fact_checker']['website']
    identifier_author = 'organization'+str(author)
    uri_author = 'organization/'+uri_generator(identifier_author)
    
    g.add((URIRef(prefix+uri_author), RDF.type, SCHEMA.Organization))
    g.add((URIRef(prefix+uri_author), SCHEMA.name, Literal(author)))
    g.add((URIRef(prefix+uri_author), SCHEMA.url, URIRef(website)))

    g.add((URIRef(prefix+uri), SCHEMA.author, URIRef(prefix+uri_author)))

    date = cr_doc['reviews'][0]['date_published']
    g.add((URIRef(prefix+uri), SCHEMA.datePublished, Literal(date, datatype=XSD.date)))
    
    url = cr_doc['review_url']
    url = url.replace(' ', '')
    g.add((URIRef(prefix+uri), SCHEMA.url, URIRef(url)))
    
    language = cr_doc['fact_checker']['language']
    g.add((URIRef(prefix+uri), SCHEMA.inLanguage, Literal(language)))
    
    uri_normalized_rating = 'rating/'+cr_doc['reviews'][0]['label']
    g.add((URIRef(prefix+uri), CIMPLE.normalizedReviewRating, URIRef(prefix+uri_normalized_rating)))
    
    uri_original_rating = 'rating/'+uri_generator('rating'+cr_doc['reviews'][0]['original_label'])
    g.add((URIRef(prefix+uri), SCHEMA.reviewRating, URIRef(prefix+uri_original_rating)))

    
    claim = cr_doc['claim_text'][0]
    identifier_claim = 'claim'+claim
    uri_claim = 'claim/'+uri_generator(identifier_claim)
    
    #SCHEMA.Claim has not yet been integrated
    #This term is proposed for full integration into Schema.org, pending implementation feedback and adoption from applications and websites. 
    g.add((URIRef(prefix+uri_claim),RDF.type, SCHEMA.Claim))
    
    g.add((URIRef(prefix+uri), SCHEMA.itemReviewed, URIRef(prefix+uri_claim)))

    text = claim
    text = normalize_text(text)
    g.add((URIRef(prefix+uri_claim),SCHEMA.text, Literal(text)))
    
    appearances = cr_doc['appearances']
    for a in appearances:
        if a != None:
#             identifier_appearance = 'appearance'+str(a)
#             uri_appearance = 'appearance/'+uri_generator(identifier_appearance)
#             g.add((URIRef(prefix+uri_appearance), RDF.type, SCHEMA.CreativeWork))
#             g.add((URIRef(prefix+uri_appearance), SCHEMA.url, URIRef(a)))
            b = ''.join([i for i in a if i in URL_AVAILABLE_CHARS])
            g.add((URIRef(prefix+uri_claim), SCHEMA.appearance, URIRef(b)))

    
    
    r = measure_readability(text)
    g.add((URIRef(prefix+uri_claim), CIMPLE.readability_score, Literal(r)))

    e = emotions[i]
    if e != 'None':
        g.add((URIRef(prefix+uri_claim), CIMPLE.hasEmotion, URIRef(prefix+'emotion/'+str(e.lower()))))
    s = sentiments[i]
    g.add((URIRef(prefix+uri_claim), CIMPLE.hasSentiment, URIRef(prefix+'sentiment/'+str(s.lower()))))
    b = biases[i]
    g.add((URIRef(prefix+uri_claim), CIMPLE.hasPoliticalLeaning, URIRef(prefix+'political-leaning/'+str(b.lower()))))
    cons_i = json.loads(cons[i])
    for k in range(0, len(cons_i)):
        if cons_i[k] == 1:
            c = CONSPIRACIES[k]
            g.add((URIRef(prefix+uri_claim), CIMPLE.mentionsConspiracy, URIRef(prefix+'conspiracy/'+str(c.replace(' ', '_').lower()))))
        elif cons_i[k] == 2:
            c = CONSPIRACIES[k]
            g.add((URIRef(prefix+uri_claim), CIMPLE.promotesConspiracy, URIRef(prefix+'conspiracy/'+str(c.replace(' ', '_').lower()))))
    
    entities = ents[i]
    if 'Resources' in entities:
        for ent in entities['Resources']:
            dbpedia_url = ent['@URI']
#             uri_mention = 'entity/'+uri_generator(dbpedia_url)
#             dbpedia_name = dbpedia_url.split('/')[-1].replace('_', ' ')                
            
#             g.add((URIRef(prefix+uri_mention), RDF.type, SCHEMA.Thing))
#             e_types = e['@types'].split(',')
#             for t in e_types:
#                 if "Wikidata" in t:
#                     g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(WIKI_prefix+t.split(':')[1])))
#                 if "DBpedia" in t:
#                     g.add((URIRef(prefix+uri_mention), RDF.type, URIRef(DB_prefix+t.split(':')[1])))

#             g.add((URIRef(prefix+uri_mention), SCHEMA.url, URIRef(dbpedia_url)))
#             g.add((URIRef(prefix+uri_mention), SCHEMA.name, Literal(dbpedia_name)))

            g.add((URIRef(prefix+uri_claim), SCHEMA.mentions, URIRef(dbpedia_url)))
    

100%|███████████████████████████████████████| 17996/17996 [00:14<00:00, 1247.62it/s]


In [212]:
g.serialize(destination="/data/peskine/KGs/claim-review-v3.ttl")

<Graph identifier=Ne031d902ea4f4f6ca7f38d68ad7592f3 (<class 'rdflib.graph.Graph'>)>

# Factors

## Readability

In [16]:
def measure_readability(text):
    # The min and max values are defined based on our observation of the data.
    # they could  change in accordance with the dataset circumstances

    # Calculating the readability scores
    value_FRES = measure_FRES(text)

    value_FKGL = measure_FKGL(text)
    value_GFI = measure_GFI(text)
    value_ARI = measure_ARI(text)
    value_DCRS = measure_DCRS(text)
    value_SRF = measure_SRF(text)
    scores = [value_FKGL, value_FKGL, value_GFI, value_ARI, value_DCRS, value_SRF]

    # Averaging the scores
    avg_score = statistics.mean(scores)

    # Normalising the scores to the range 0-100
    min_score = 1
    max_score = 15
    normalised_avg_score = round((avg_score - min_score) / (max_score - min_score) * 100)

    return normalised_avg_score


def measure_FRES(text):
    # Calculate the value of FRES for the given text
    value = round(textstat.flesch_reading_ease(text))

    # Convert the obtained value to the standard score scale (US educational level)
    if value >= 90:
        score = 5
    elif 80 <= value < 90:
        score = 6
    elif 70 <= value < 80:
        score = 7
    elif 65 <= value < 70:
        score = 8
    elif 60 <= value < 65:
        score = 9
    elif 57 <= value < 60:
        score = 10
    elif 54 <= value < 57:
        score = 11
    elif 50 <= value < 54:
        score = 12
    elif 30 <= value < 50:
        score = 13
    elif 10 <= value < 30:
        score = 14
    elif value < 10:
        score = 15

    return score


def measure_FKGL(text):
    # Calculate the value of FRES for the given text
    value = round(textstat.flesch_kincaid_grade(text))
    score = normalise_value_FKGL_ARI_SRF(value)

    return score


def measure_GFI(text):
    # Calculate the value of FRES for the given text
    value = round(textstat.gunning_fog(text))
    # Convert the obtained value to the standard score scale (US educational level)
    if value < 7:
        score = 6
    elif 13 <= value < 17:
        score = 13
    elif value > 17:
        score = 15
    else:
        score = value

    return score


def measure_ARI(text):
    # Calculate the value of FRES for the given text
    value = round(textstat.automated_readability_index(text))
    score = normalise_value_FKGL_ARI_SRF(value)

    return score


def measure_DCRS(text):
    # Calculate the value of FRES for the given text
    value = round(textstat.dale_chall_readability_score(text), 1)
    if value < 5:
        score = 4
    elif 5 <= value < 5.5:
        score = 5
    elif 5.5 <= value < 6:
        score = 6
    elif 6 <= value < 6.5:
        score = 7
    elif 6.5 <= value < 7:
        score = 8
    elif 7 <= value < 7.5:
        score = 9
    elif 7.5 <= value < 8:
        score = 10
    elif 8 <= value < 8.5:
        score = 11
    elif 8.5 <= value < 9:
        score = 12
    elif 9 <= value < 9.5:
        score = 13
    elif 9.5 <= value < 10:
        score = 14
    elif value >= 10:
        score = 15
    return score


def measure_SRF(text):
    # Calculate the value of FRES for the given text
    value = round(textstat.spache_readability(text))
    score = normalise_value_FKGL_ARI_SRF(value)

    return score


def normalise_value_FKGL_ARI_SRF(value):
    # Convert the obtained value to the standard score scale (US educational level)
    if value < 2:
        score = 1
    elif value > 14:
        score = 15
    else:
        score = value

    return score


## Cuda

In [12]:
class CovidTwitterBertClassifier(nn.Module):
    
    def __init__(self, n_classes):
        super().__init__()
        self.n_classes = n_classes
        self.bert = BertForPreTraining.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')    
        self.bert.cls.seq_relationship = nn.Linear(1024, n_classes)

        self.sigmoid = nn.Sigmoid()
        
    def forward(self, input_ids, token_type_ids, input_mask):
        outputs = self.bert(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = input_mask)

        logits = outputs[1]
        
        return logits  

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla K80'

In [14]:
tokenizer = AutoTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

## Emotion

In [15]:
model_em = CovidTwitterBertClassifier(5)
model_em.load_state_dict(torch.load('/data/peskine/covid-latent/models/emotion_undersampling_CV0_e2_0.622.pth'))
model_em.eval()
model_em.to(device)


CovidTwitterBertClassifier(
  (bert): BertForPreTraining(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=1024, out_features=1

## Political leaning

In [16]:
model_pol = CovidTwitterBertClassifier(3)
model_pol.load_state_dict(torch.load('/data/peskine/russian-troll-tweets/models/politics_undersampling_CV0_e24_0.636.pth'))
model_pol.eval()
model_pol.to(device)


CovidTwitterBertClassifier(
  (bert): BertForPreTraining(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=1024, out_features=1

## Sentiment

In [17]:
model_sent = CovidTwitterBertClassifier(3)
model_sent.load_state_dict(torch.load('/data/peskine/COVIDSenti/models/senti_CV0_e21_0.769.pth'))
model_sent.eval()
model_sent.to(device)


CovidTwitterBertClassifier(
  (bert): BertForPreTraining(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=1024, out_features=1

## Conspiracies

In [18]:
model_con = CovidTwitterBertClassifier(27)
model_con.load_state_dict(torch.load('/data/peskine/mediaeval22/models/task1_twitter_CV3_e21_0.731.pth'))
model_con.eval()
model_con.to(device)


CovidTwitterBertClassifier(
  (bert): BertForPreTraining(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=1024, out_features=1

## Compute factors

In [24]:
def compute_factors(text, compute_emotion=True, compute_political_leaning=True, compute_sentiment=True, compute_conspiracy_theories=True, compute_readability=True):
    emotion, political_bias, sentiment, conspiracy, readability = None, None, None, None, None
    
    MAX_LEN = 128 # < m some tweets will be truncated
    EMOTIONS = ['None', 'Happiness', 'Anger', 'Sadness', 'Fear']
    POLITICAL_BIAS = ['Left', 'Other', 'Right']
    SENTIMENTS = ['Negative', 'Neutral', 'Positive']
    CONSPIRACIES = ['Suppressed Cures', 'Behaviour and mind Control', 'Antivax', 'Fake virus', 'Intentional Pandemic', 'Harmful Radiation', 'Population Reduction', 'New World Order', 'Satanism']
    CONSPIRACY_LEVELS = ['No ', 'Mentioning ', 'Supporting ']
    
    a = tokenizer([text], max_length=MAX_LEN, padding='max_length', truncation=True)
    input_ids = torch.tensor(a['input_ids']).to(device)
    token_type_ids = torch.tensor(a['token_type_ids']).to(device)
    attention_mask = torch.tensor(a['attention_mask']).to(device)
    with torch.no_grad():
        if compute_emotion:
            logits_em = model_em(input_ids, token_type_ids, attention_mask)
        if compute_political_leaning:
            logits_pol = model_pol(input_ids, token_type_ids, attention_mask)
        if compute_sentiment:
            logits_sent = model_sent(input_ids, token_type_ids, attention_mask)
        if compute_conspiracy_theories:
            logits_con = model_con(input_ids, token_type_ids, attention_mask)
        
    emotion = EMOTIONS[logits_em.detach().cpu().numpy()[0].argmax()]
    political_bias = POLITICAL_BIAS[logits_pol.detach().cpu().numpy()[0].argmax()]
    sentiment = SENTIMENTS[logits_sent.detach().cpu().numpy()[0].argmax()]
    conspiracy = [logits_con.detach().cpu().numpy()[0][3*i:3*i+3].argmax() for i in range(0, 9)]
    conspiracies = []
    for i in range(0, 9):
        conspiracies.append(CONSPIRACY_LEVELS[conspiracy[i]]+CONSPIRACIES[i])
        
    if compute_readability:
        readability = measure_readability(text)
    return emotion, political_bias, sentiment, conspiracy, readability


# Entities

In [18]:
API_URL = "https://api.dbpedia-spotlight.org/en/annotate"

def extract_dbpedia_entities(s):
    if not s in ['', ' ', '   ']:
        payload = {'text': s}
        a = requests.post(API_URL, 
                     headers={'accept': 'application/json'},
                     data=payload).json()
        return a
    else:
        return []
    
