In [1]:
import warnings
warnings.filterwarnings('ignore')
from pyemd import emd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,  precision_recall_curve
from gensim.models.keyedvectors import KeyedVectors
from nltk.stem.snowball import SpanishStemmer
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from datetime import datetime, timedelta
import lxml.etree as ET
import seaborn as sns
import pandas as pd
import numpy as np
import regex as re
import itertools
import unidecode
import spacy
import html
import os
import networkx as nx

In [2]:
NEWS_PATH = 'data/news/'

In [3]:
def listdir_checked(path, unwanted = ['.DS_Store']):
    '''
    Discard unwanted files or directories when listing the elements in a given path
    '''
    return (f for f in os.listdir(path) if f not in unwanted)


def normalize_string(to_normalize, encoded = False):
    '''
    Normalize text given a string
    '''
    text = str(to_normalize).lower()  # lowering text
    if encoded: 
        text = ' '.join([html.unescape(term) for term in text.split()])
    text = unidecode.unidecode(text)

    text = re.sub(r'[^\w\s]', '', text)  # removing all the punctuations
    last_text = text.split()  # tokenize the text

    # remove stopwords
    stopwords_set = set(stopwords.words("spanish"))
    stopwords_set = stopwords_set.union(set(["name"]))
    
    last_text = ' '.join([x for x in last_text if (x not in stopwords_set)])
    return last_text


def classify_entities(doc):
    '''
    Given an nlp doc, returns:
    * A dictonary with the entities grouped by type: 'PER', 'LOC', 'ORG' and 'MISC'
    * A list with all the entities
    '''
    classif_dict = {}
    classif_list = []
    for ent in doc.ents:
        try:
            classif_list.append(ent)
            classif_dict[ent.label_].append(ent)
        except:
            classif_dict[ent.label_] = [ent]
    # Check that the dict has all the keys
    for key in ['PER', 'LOC', 'ORG', 'MISC']:
        if key not in classif_dict.keys():
            classif_dict[key] = None

    return classif_dict, classif_list


def create_articles_dictionary(NEWS_PATH):
    '''
    Import articles information.
    Articles are stored in directories in the NEWS_PATH.
    '''
    data = {}               # keys: media, value: list of dictionaries with info about the news articles of the given media
    unique_urls = []        # list to store unique urls to discard repeated ones
    repeated_data = {}      # store repeated articles following the same format as 'data' dictionary


    for year in listdir_checked(NEWS_PATH):
            for month in listdir_checked(NEWS_PATH + '/' + year):
                    for file in listdir_checked(NEWS_PATH + '/' + year + '/' + month):
                        try:
                            full_path = NEWS_PATH + '/' + year + '/' + month + '/' + file
                            # Read xml file - info stored following NewsML-G2 format
                            root = ET.parse(full_path).getroot()
                            # Parse news
                            media = file.rsplit('_', 1)[0]
                            # Check repeated urls
                            url = root.findall(".//infoSource")[0].get("uri")
                            str_date = root.findall('.//contentMeta')[0].find('contentCreated').text[:10]
                            info = {
                                'id': file.split(':')[-1].replace('.xml', ''),
                                'media': media,
                                'publication_date': datetime.strptime(str_date, '%Y-%m-%d'),
                                'title': normalize_string(root.findall('.//itemRef')[0].find('title').text, encoded = True),
                                'headline': normalize_string(root.findall(".//itemRef")[0].find('description').text.strip(), encoded = True),
                                'article': normalize_string(root.findall('.//itemRef')[1].find('description').text.strip(), encoded = True),                'url': url
                            }

                            if url not in unique_urls:
                                unique_urls.append(url)
                                try:
                                    data[media].append(info)
                                except:
                                    data[media] = [info]

                            else:
                                try:
                                    repeated_data[media].append(info)
                                except:
                                    repeated_data[media] = [info]
                        except:
                            pass
                        
    return data, repeated_data


def load_elements(data):
    '''
    Load auxiliary variables:
    * mapping_keys - dict with key: tweet id -> value: absolute position to manage matrices
    * mapping_tweets - dict with key: value -> tweet_is: absolute position to manage matrices
    '''
    mapping_keys = {}  # key: tweet id -> value: absolute position in all_entities
    mapping_tweets = {}  # key: value -> tweet_id: absolute position in all_entities
    counter = 0

    for media, new in data.items():
        print('------ MEDIA:', media)
        for element in new:
            mapping_keys[element['id']] = counter
            mapping_tweets[counter] = element['id']

            counter += 1
            
    return mapping_keys, mapping_tweets


def cosine_similarity_matrix(data, mapping_keys, pairs_df, mapping_tweets):
    '''
    Compute cosine similarity of f-idf vectors
    '''
    
    # GET NEWS ARTICLES
    print('---GETTING NEWS')
    N = len(mapping_keys)
    articles = [None] * N
    for media, new in data.items():
        for element in new:
            try:
                pos = mapping_keys[element['id']]
                
                articles[pos] = normalize_string(element['title'] + ' ' + 
                                                 element['headline'] + ' ' + 
                                                 element['article'])
            except:
                pass

    # COMPUTE TF-IDF
    # Stem
    print('---STEM')
    stemmer = SpanishStemmer(ignore_stopwords=False)
    for i, article in enumerate(articles):
        try:
            articles[i] = str([stemmer.stem(word) for word in article.split()])
        except:
            articles[i] = 'null'
            pass
        
    # Compute tf-idf
    print('---COMPUTE TF-IDF')  
    stopwords_spanish = [word.encode().decode('utf-8') for word in stopwords.words('spanish')] # Remove stopwords
    vectorizer = TfidfVectorizer(stop_words=stopwords_spanish)
    X = vectorizer.fit_transform(articles)

    # COMPUTE COSINE SIMILARITY MATRIX
    print('---COMPUTE COSINE SIMILARITY MATRIX')
    cosine_sim_matrix = np.zeros(shape=(N, N))
    for row in pairs_df.iterrows():
        
        i = mapping_keys[str(id_A)]
        j = mapping_keys[str(id_B)]

        similarity = cosine_similarity(X[i], X[j])[0][0]
        cosine_sim_matrix[i, j] = 1-similarity
        cosine_sim_matrix[j, i] = 1-similarity

    return cosine_sim_matrix


def add_similarity_column(pairs_df, mapping_keys, similarity_matrix, column_name):
    '''
    Arguments:
     * pairs_df
         - pd.DataFrame
         - contains pairs of tweet_ids --> column names: [tweet_id_A, tweet_id_B]
     * mapping_keys
         - dictionary
         - key: tweet_id, value: position
     * similarity_matrix
         - np.matrix
         - symmetrical matrix with the similarity between tweets
     * column_name
         - string
         - name of the new column to be added in pairs_df
    '''
    similarity_pairs = []               # Create list with the same order as pairs_df
    for i, row in pairs_df.iterrows():
        tid_A = row['tweet_id_A']       # Obtain tweet id
        tid_B = row['tweet_id_B']       # Get the position of each article in the matrix
        pos_A = mapping_keys[tid_A]
        pos_B = mapping_keys[tid_B]
        similarity_pairs.append(similarity_matrix[pos_A, pos_B])             # Order similarity following pairs_df order
    pairs_df.insert(len(pairs_df.columns), column_name, similarity_pairs)   # Add new column
    return pairs_df

def componentes_conexas(G):
    '''
    Given a graph return a dictionary with the connected components
    '''
    
    def visita(v): 
        cc[v] = componente_actual 
        for u in G[v]:
            if u not in cc: 
                visita(u)
    cc = dict() 
    componente_actual = 0
    for v in G:
        if v not in cc:
            visita(v)
            componente_actual += 1
    return cc

# 1. Load Data

In [4]:
data, repeated_data = create_articles_dictionary(NEWS_PATH)

In [5]:
#CONSTRUCT A DATAFRAME WITH ALL THE ARTICLES AND THE INFORMATION NEEDED
articles_df = pd.DataFrame()

for media in data.keys():
    for new in data[media]:
        tweet_id = new['id']
        title = normalize_string(new['title'])
        headline = normalize_string(new['headline'])
        url = new['url']
        publication_date = new['publication_date']
        article = normalize_string(new['article'])
        
        articles_df = articles_df.append({'tweet_id': tweet_id, 'media': media, 'title': title, 
                                          'headline': headline, 'url': url, 'publication_date': publication_date,
                                          'article': article},
                                       ignore_index=True)

In [6]:
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14961 entries, 0 to 14960
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   tweet_id          14961 non-null  object        
 1   media             14961 non-null  object        
 2   title             14961 non-null  object        
 3   headline          14961 non-null  object        
 4   url               14961 non-null  object        
 5   publication_date  14961 non-null  datetime64[ns]
 6   article           14961 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 818.3+ KB


In [8]:
mapping_keys, mapping_tweets = load_elements(data)

------ MEDIA: 20m
------ MEDIA: okdiario
------ MEDIA: LaVanguardia
------ MEDIA: europapress
------ MEDIA: larazon_es
------ MEDIA: abc_es
------ MEDIA: elperiodico
------ MEDIA: elmundoes
------ MEDIA: publico_es
------ MEDIA: rtve
------ MEDIA: el_pais
------ MEDIA: La_SER
------ MEDIA: informativost5
------ MEDIA: ElHuffPost
------ MEDIA: voz_populi
------ MEDIA: COPE
------ MEDIA: elconfidencial
------ MEDIA: elespanolcom
------ MEDIA: eldiarioes
------ MEDIA: laSextaTV
------ MEDIA: OndaCero_es
------ MEDIA: noticias_cuatro
------ MEDIA: libertaddigital
------ MEDIA: EFEnoticias_ES


# 2. Determine parameters epsilon and T

In [9]:
#Select 40 random articles
k1_articles = articles_df.sample(n=40)
k1_articles

Unnamed: 0,tweet_id,media,title,headline,url,publication_date,article
2548,1455856028008943617,LaVanguardia,cree joven denuncio agresion parte menas invento,denunciante candidata 2015 declaro television ...,https://www.lavanguardia.com/local/paisvasco/2...,2021-11-03,caso joven 30 anos supuestamente agredida barr...
5341,1435569864974950400,abc_es,joven trans denuncia agresion parte hombre,lamenta escalada violencia colectivo lgtbi ins...,https://www.abc.es/espana/comunidad-valenciana...,2021-09-08,joven trans veintiun anos denunciado haber suf...
850,1455837894640390149,20m,cree falsa denuncia supuesta agresion cuatro j...,vicelehendakari consejero senalado miercoles,https://www.20minutos.es/noticia/4877189/0/la-...,2021-11-03,vicelehendakari consejero senalado miercoles e...
6843,1407043237240573954,elperiodico,nino ahora triunfa,personaje serie chico sexual avanza actor cant...,https://www.elperiodico.com/es/tele/20210619/m...,2021-06-19,netflix series programas television instituto ...
6632,1375820879649767424,elperiodico,frente drama 3 violacion multiple,cosas revolvian bastante estomago comenta actr...,https://www.elperiodico.com/es/tele/20220308/a...,2022-03-08,serie atresmedia series programas television a...
4330,1369400106227806217,larazon_es,traficante esclavos borracho columnista desata...,articulista tambien censurado normalizar cultu...,https://www.larazon.es/cultura/20210309/5hoqsw...,2021-03-09,revisionismo hizo arrancar prohibir menores 7 ...
3590,1489625954997927936,europapress,prision provisional presuntamente violar mujer...,not specified,https://www.europapress.es/catalunya/noticia-p...,2022-02-04,girona 4 europa press 6 enviado prision provis...
9431,1422675995312771080,el_pais,pide renuncia gobernador democrata tras inform...,concluye raiz investigacion independiente acci...,https://elpais.com/internacional/2021-08-03/el...,2021-08-03,gobernador peso pesado cuyo nombre sonado cons...
12194,1219616140919480322,COPE,hermanos evitaron violacion pensamos dos veces,contado cope reaccionaron ver joven trataba ag...,https://www.cope.es/emisoras/asturias/noticias...,2020-01-21,hermanos pensaron dos veces pasado domingo die...
1726,1509866552128155655,20m,juez pide imputar existir indicios solidos enc...,numero 15 pedido tsjcv impute vicepresidenta,https://www.20minutos.es/noticia/4979918/0/el-...,2022-04-01,numero 15 pedido viernes tsjcv impute vicepres...


In [10]:
#Construct pairs of the articles
T = [10, 15, 30, 60]
cases_df = pd.DataFrame(columns=['tweet_id_A', 'article_A', 'url_A', 'tweet_id_B', 'article_B', 'url_B', 'vmdistance'])
epsilon = 0.50

pairs_df = pd.DataFrame()

for row in k1_articles.iterrows():
    for timeframe in T:
        tweet_id = row[1]['tweet_id']
        title = row[1]['title']
        headline = row[1]['headline']
        media = row[1]['media']
        date = row[1]['publication_date']
        url = row[1]['url']
        article = row[1]['article']

        candidates_df = articles_df[articles_df['media'] != media]
        candidates_df = candidates_df[candidates_df['publication_date'] > date]
    
        candidates_df = candidates_df[candidates_df['publication_date'] < (date + timedelta(days=timeframe))]
        
        tpairs_df = pd.DataFrame({'tweet_id_A':tweet_id, 'media_A':media, 'article_A':(title+' '+headline), 
                                 'url_A':url, 'tweet_id_B':candidates_df['tweet_id'], 
                                 'media_B':candidates_df['media'], 
                                 'article_B':(candidates_df['title']+' '+candidates_df['headline']), 
                                 'url_B':candidates_df['url'], 't':abs((date - candidates_df['publication_date']).dt.days)})
        
        pairs_df = pd.concat([pairs_df, tpairs_df])

In [11]:
pairs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83536 entries, 773 to 14926
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_id_A  83536 non-null  object
 1   media_A     83536 non-null  object
 2   article_A   83536 non-null  object
 3   url_A       83536 non-null  object
 4   tweet_id_B  83536 non-null  object
 5   media_B     83536 non-null  object
 6   article_B   83536 non-null  object
 7   url_B       83536 non-null  object
 8   t           83536 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 6.4+ MB


In [12]:
# COMPUTE COSINE SIMILARITY OF TF-IDF FOR EACH PAIR OF ARTICLES
cosine_matrix = cosine_similarity_matrix(data, mapping_keys, pairs_df, mapping_tweets) 
pairs_df = add_similarity_column(pairs_df, mapping_keys, cosine_matrix, 'tf-idf')

---GETTING NEWS
---STEM
---COMPUTE TF-IDF
---COMPUTE COSINE SIMILARITY MATRIX


In [16]:
T = [10, 15, 30, 60]
epsilon_pairs_df = pd.DataFrame()

epsilon = 0.35
for row in k1_articles.iterrows():
    tweet_pairs = pairs_df[pairs_df['tweet_id_A']==row[1]['tweet_id']]
    for timeframe in T:
        tweet_pairs_t = tweet_pairs[tweet_pairs['tf-idf']<=epsilon]
        #tweet_pairs_t = tweet_pairs_t[tweet_pairs_t['tf-idf']<=epsilon]
        try:
            epsilon_pairs_df = pd.concat([epsilon_pairs_df,tweet_pairs_t.sample(n=2)])
        except:
            try:
                epsilon_pairs_df = pd.concat([epsilon_pairs_df,tweet_pairs_t.sample(n=1)])
            except:
                pass
        

In [18]:
epsilon_pairs_df['t'] = abs(epsilon_pairs_df['t'])

In [20]:
epsilon_pairs_df.head()

Unnamed: 0,tweet_id_A,media_A,article_A,url_A,tweet_id_B,media_B,article_B,url_B,t,tf-idf
8348,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313852460839165952,publico_es,nueve detenidos cuatro prision violacion menor...,https://www.publico.es/sociedad/pais-valencia-...,1,0.276703
8348,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313852460839165952,publico_es,nueve detenidos cuatro prision violacion menor...,https://www.publico.es/sociedad/pais-valencia-...,1,0.276703
14142,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313850324902346754,laSextaTV,noveno hombre violacion grupo nina lname arres...,https://www.lasexta.com/noticias/sociedad/dete...,1,0.29173
14142,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313850324902346754,laSextaTV,noveno hombre violacion grupo nina lname arres...,https://www.lasexta.com/noticias/sociedad/dete...,1,0.29173
4118,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313846788865044480,larazon_es,nueve detenidos violacion menor namename trata...,https://www.larazon.es/comunidad-valenciana/20...,1,0.296127


In [21]:
epsilon_pairs_df.to_csv('data/pairs_epsilon.csv')

# 3. Compute similarity

In [13]:
articles_df = articles_df.sort_values(by='publication_date')

In [14]:
t = 30
pairs_df = pd.DataFrame()
id_A = []
id_B = []
T = []
count = 0

for row in articles_df.iterrows():
    tweet_id_A = row[1]['tweet_id']
    date = row[1]['publication_date']
    date_df = articles_df[articles_df["publication_date"] >= (date)] 
    date_df = date_df[date_df["publication_date"] < (date + timedelta(days=t))] 
    for row_date in date_df.iterrows():
        tweet_id_B = row_date[1]['tweet_id']
        id_A.append(tweet_id_A)
        id_B.append(tweet_id_B)
        T.append(abs((date - row_date[1]['publication_date']).days))
        
    if count in [1000, 5000, 10000, 14000]:
        print(count)
        
    count += 1

1000
5000
10000
14000


In [15]:
pairs_df = pd.DataFrame({'tweet_id_A':id_A, 'tweet_id_B':id_B, 'days':T})

In [None]:
# COMPUTE COSINE SIMILARITY OF TF-IDF FOR EACH PAIR OF ARTICLES
cosine_matrix = cosine_similarity_matrix(data, mapping_keys, pairs_df, mapping_tweets) 
pairs_df = add_similarity_column(pairs_df, mapping_keys, cosine_matrix, 'tf-idf')

---GETTING NEWS
---STEM
---COMPUTE TF-IDF
---COMPUTE COSINE SIMILARITY MATRIX


In [None]:
pairs_df.head()

In [None]:
pairs_df.info()

In [None]:
cases_df = pairs_df[pairs_df['tf-idf']<0.35]

In [None]:
cases_df.info()

In [None]:
#Construct graph
sources = list(cases_df['tweet_id_A'].values)
targets = list(cases_df['tweet_id_B'].values)
G = nx.Graph()
G.add_nodes_from(sources)
G.add_nodes_from(targets)
for i in range(len(sources)):
    G.add_edge(sources[i], targets[i])

In [None]:
#Numerate connected components
cc_list = list(nx.connected_components(G))
cont = 0
cc_elements = []
cc_id = []
for cc in cc_list:
    for elem in cc:
        cc_elements.append(elem)
        cc_id.append(cont)
    cont += 1

In [None]:
#Save connected components
connected_components_df = pd.DataFrame({'cluster_id':cc_id, 'tweet_id':cc_elements})

In [None]:
connected_components_df.info()

In [None]:
connected_components_df.head()

In [None]:
connected_components_df.to_csv('data/cases_df.csv', index=False)

In [None]:
len(list(nx.connected_components(G)))