In [1]:
import warnings
warnings.filterwarnings('ignore')
from pyemd import emd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,  precision_recall_curve
from gensim.models.keyedvectors import KeyedVectors
from nltk.stem.snowball import SpanishStemmer
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from datetime import datetime, timedelta
import lxml.etree as ET
import seaborn as sns
import pandas as pd
import numpy as np
import regex as re
import itertools
import unidecode
import spacy
import html
import os
import networkx as nx

In [2]:
NEWS_PATH = 'data/news/'
FASTTEXT_W2V_PATH = 'utilities/embeddings-l-model.vec'

In [3]:
def listdir_checked(path, unwanted = ['.DS_Store']):
    '''
    Discard unwanted files or directories when listing the elements in a given path
    '''
    return (f for f in os.listdir(path) if f not in unwanted)


def normalize_string(to_normalize, encoded = False):
    '''
    Normalize text given a string
    '''
    text = str(to_normalize).lower()  # lowering text
    if encoded: 
        text = ' '.join([html.unescape(term) for term in text.split()])
    text = unidecode.unidecode(text)

    text = re.sub(r'[^\w\s]', '', text)  # removing all the punctuations
    last_text = text.split()  # tokenize the text

    # remove stopwords
    stopwords_set = set(stopwords.words("spanish"))
    stopwords_set = stopwords_set.union(set(["name"]))
    
    last_text = ' '.join([x for x in last_text if (x not in stopwords_set)])
    return last_text


def classify_entities(doc):
    '''
    Given an nlp doc, returns:
    * A dictonary with the entities grouped by type: 'PER', 'LOC', 'ORG' and 'MISC'
    * A list with all the entities
    '''
    classif_dict = {}
    classif_list = []
    for ent in doc.ents:
        try:
            classif_list.append(ent)
            classif_dict[ent.label_].append(ent)
        except:
            classif_dict[ent.label_] = [ent]
    # Check that the dict has all the keys
    for key in ['PER', 'LOC', 'ORG', 'MISC']:
        if key not in classif_dict.keys():
            classif_dict[key] = None

    return classif_dict, classif_list


def create_articles_dictionary(NEWS_PATH):
    '''
    Import articles information.
    Articles are stored in directories in the NEWS_PATH.
    '''
    data = {}               # keys: media, value: list of dictionaries with info about the news articles of the given media
    unique_urls = []        # list to store unique urls to discard repeated ones
    repeated_data = {}      # store repeated articles following the same format as 'data' dictionary


    for year in listdir_checked(NEWS_PATH):
            for month in listdir_checked(NEWS_PATH + '/' + year):
                    for file in listdir_checked(NEWS_PATH + '/' + year + '/' + month):
                        try:
                            full_path = NEWS_PATH + '/' + year + '/' + month + '/' + file
                            # Read xml file - info stored following NewsML-G2 format
                            root = ET.parse(full_path).getroot()
                            # Parse news
                            media = file.rsplit('_', 1)[0]
                            # Check repeated urls
                            url = root.findall(".//infoSource")[0].get("uri")
                            str_date = root.findall('.//contentMeta')[0].find('contentCreated').text[:10]
                            info = {
                                'id': file.split(':')[-1].replace('.xml', ''),
                                'media': media,
                                'publication_date': datetime.strptime(str_date, '%Y-%m-%d'),
                                'title': normalize_string(root.findall('.//itemRef')[0].find('title').text, encoded = True),
                                'headline': normalize_string(root.findall(".//itemRef")[0].find('description').text.strip(), encoded = True),
                                'article': normalize_string(root.findall('.//itemRef')[1].find('description').text.strip(), encoded = True),                'url': url
                            }

                            if url not in unique_urls:
                                unique_urls.append(url)
                                try:
                                    data[media].append(info)
                                except:
                                    data[media] = [info]

                            else:
                                try:
                                    repeated_data[media].append(info)
                                except:
                                    repeated_data[media] = [info]
                        except:
                            print(file)

    return data, repeated_data


def load_elements(data):
    # EXTRACT ENTITIES
    '''
    Entities will be stored two arrays:
    * entities - all entities
    * summary_entities - entities from the title and head (summary)
    Auxiliary variables:
    * mapping_keys - dict with key: tweet id -> value: absolute position in entities and summary_entities
    '''
    mapping_keys = {}  # key: tweet id -> value: absolute position in all_entities
    mapping_tweets = {}  # key: value -> tweet_id: absolute position in all_entities
    counter = 0

    summary_entities = []
    article_entities = []

    nlp = spacy.load("es_core_news_sm")

    for media, new in data.items():
        print('------ MEDIA:', media)
        for element in new:
            mapping_keys[element['id']] = counter
            mapping_tweets[counter] = element['id']
            # Get entities from each part of the new
            try:
                title_doc = nlp(element['title'])
                headline_doc = nlp(element['headline'])
                article_doc = nlp(element['article'])
            except:
                print(f"Problem extracting entities from article identified with tweet_id = {element['id']}")
                pass

            # Classify entities
            title_dict, title_list = classify_entities(title_doc)
            headline_dict, headline_list = classify_entities(headline_doc)
            article_dict, article_list = classify_entities(article_doc)

            # Add entities into data
            element['title_entities'] = title_dict
            element['headline_entities'] = headline_dict
            element['article_entities'] = article_dict

            # Store into entities and summary_entities array
            summary_entities.append(title_list + headline_list)
            article_entities.append(article_list)

            counter += 1
            
    return summary_entities, article_entities, mapping_keys, mapping_tweets


def wmdistance(data, mapping_keys, pairs_df, mapping_tweets):
    N = len(mapping_keys)
    articles = [None] * N
    
    for media, new in data.items():
        for element in new:
            try:
                pos = mapping_keys[element['id']]
                articles[pos] = normalize_string(element['title'])+' '+normalize_string(element['headline'].split('.')[0])
            except:
                pass
    
    wordvectors_fasttext_file = FASTTEXT_W2V_PATH
    wordvectors = KeyedVectors.load_word2vec_format(wordvectors_fasttext_file)
    wmd_matrix = np.zeros(shape=(N,N))
    
    for row in pairs_df.iterrows():
        
        id_A = row[1]['tweet_id_A']
        id_B = row[1]['tweet_id_B']
        
        i = mapping_keys[str(id_A)]
        j = mapping_keys[str(id_B)]

        distance = wordvectors.wmdistance(articles[i], articles[j])
        wmd_matrix[i,j] = distance
        wmd_matrix[j,i] = distance
    
    return wmd_matrix


def cosine_similarity_matrix(data, mapping_keys, pairs_df, mapping_tweets):
    # GET NEWS ARTICLES
    print('---GETTING NEWS')
    N = len(mapping_keys)
    articles = [None] * N
    for media, new in data.items():
        for element in new:
            try:
                pos = mapping_keys[element['id']]
                
                articles[pos] = normalize_string(element['title'] + ' ' + 
                                                 element['headline'] + ' ' + 
                                                 element['article'])
            except:
                pass

    # COMPUTE TF-IDF
    # Stem
    print('---STEM')
    stemmer = SpanishStemmer(ignore_stopwords=False)
    for i, article in enumerate(articles):
        try:
            articles[i] = str([stemmer.stem(word) for word in article.split()])
        except:
            articles[i] = 'null'
            pass
        
    # Compute tf-idf
    print('---COMPUTE TF-IDF')  
    stopwords_spanish = [word.encode().decode('utf-8') for word in stopwords.words('spanish')] # Remove stopwords
    vectorizer = TfidfVectorizer(stop_words=stopwords_spanish)
    X = vectorizer.fit_transform(articles)

    # COMPUTE COSINE SIMILARITY MATRIX
    print('---COMPUTE COSINE SIMILARITY MATRIX')
    cosine_sim_matrix = np.zeros(shape=(N, N))
    for row in pairs_df.iterrows():
        
        id_A = row[1]['tweet_id_A']
        id_B = row[1]['tweet_id_B']
        
        try:
            i = mapping_keys[str(id_A)]
            j = mapping_keys[str(id_B)]
            
            similarity = cosine_similarity(X[i], X[j])[0][0]
            cosine_sim_matrix[i, j] = 1-similarity
            cosine_sim_matrix[j, i] = 1-similarity
        
        except:
            print('ERROR WITH: ', id_A, id_B)
            
    return cosine_sim_matrix


def add_similarity_column(pairs_df, mapping_keys, similarity_matrix, column_name):
    '''
    Arguments:
     * pairs_df
         - pd.DataFrame
         - contains pairs of tweet_ids --> column names: [tweet_id_A, tweet_id_B]
     * mapping_keys
         - dictionary
         - key: tweet_id, value: position
     * similarity_matrix
         - np.matrix
         - symmetrical matrix with the similarity between tweets
     * column_name
         - string
         - name of the new column to be added in pairs_df
    '''
    similarity_pairs = []               # Create list with the same order as pairs_df
    for i, row in pairs_df.iterrows():
        tid_A = row['tweet_id_A']       # Obtain tweet id
        tid_B = row['tweet_id_B']       # Get the position of each article in the matrix
        pos_A = mapping_keys[tid_A]
        pos_B = mapping_keys[tid_B]
        similarity_pairs.append(similarity_matrix[pos_A, pos_B])             # Order similarity following pairs_df order
    pairs_df.insert(len(pairs_df.columns), column_name, similarity_pairs)   # Add new column
    return pairs_df

def componentes_conexas(G):
    def visita(v): #subrutina
        cc[v] = componente_actual 
        for u in G[v]:
            if u not in cc: #u no visitado <=> u no tiene componente
                visita(u)
    cc = dict() #Etiqueta de componente conexa
    componente_actual = 0
    for v in G:
        if v not in cc:
            visita(v)
            componente_actual += 1
    return cc

# 1. Load Data

In [4]:
data, repeated_data = create_articles_dictionary(NEWS_PATH)

el_pais_urn:newsml:Ediciones EL PAÍS S.L.:20220606:1466063287485878280.xml
el_pais_urn:newsml:Ediciones EL PAÍS S.L.:20220606:1475242375102480387.xml
el_pais_urn:newsml:Ediciones EL PAÍS S.L.:20220606:1470254067817627656.xml
el_pais_urn:newsml:Ediciones EL PAÍS S.L.:20220606:1471080345189638144.xml
el_pais_urn:newsml:Ediciones EL PAÍS S.L.:20220606:1471395805697302530.xml
el_pais_urn:newsml:Ediciones EL PAÍS S.L.:20220606:1468135693566238723.xml
el_pais_urn:newsml:Ediciones EL PAÍS S.L.:20220606:1466586222609022988.xml
el_pais_urn:newsml:Ediciones EL PAÍS S.L.:20220606:1476569001337761798.xml
el_pais_urn:newsml:Ediciones EL PAÍS S.L.:20220606:1469945355534049284.xml
el_pais_urn:newsml:Ediciones EL PAÍS S.L.:20220606:1469394348651466752.xml
el_pais_urn:newsml:Ediciones EL PAÍS S.L.:20220606:1466147794167730184.xml
el_pais_urn:newsml:Ediciones EL PAÍS S.L.:20220606:1472879722539454466.xml
el_pais_urn:newsml:Ediciones EL PAÍS S.L.:20220606:1470381993296683015.xml
el_pais_urn:newsml:Edicio

In [5]:
articles_df = pd.DataFrame()

for media in data.keys():
    for new in data[media]:
        tweet_id = new['id']
        title = normalize_string(new['title'])
        headline = normalize_string(new['headline'])
        url = new['url']
        publication_date = new['publication_date']
        article = normalize_string(new['article'])
        
        articles_df = articles_df.append({'tweet_id': tweet_id, 'media': media, 'title': title, 
                                          'headline': headline, 'url': url, 'publication_date': publication_date,
                                          'article': article},
                                       ignore_index=True)

In [6]:
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14961 entries, 0 to 14960
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   tweet_id          14961 non-null  object        
 1   media             14961 non-null  object        
 2   title             14961 non-null  object        
 3   headline          14961 non-null  object        
 4   url               14961 non-null  object        
 5   publication_date  14961 non-null  datetime64[ns]
 6   article           14961 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 818.3+ KB


In [7]:
summary_entities, article_entities, mapping_keys, mapping_tweets = load_elements(data)

------ MEDIA: 20m
------ MEDIA: okdiario
------ MEDIA: LaVanguardia
------ MEDIA: europapress
------ MEDIA: larazon_es
------ MEDIA: abc_es
------ MEDIA: elperiodico
------ MEDIA: elmundoes
------ MEDIA: publico_es
------ MEDIA: rtve
------ MEDIA: el_pais
------ MEDIA: La_SER
------ MEDIA: informativost5
------ MEDIA: ElHuffPost
------ MEDIA: voz_populi
------ MEDIA: COPE
------ MEDIA: elconfidencial
------ MEDIA: elespanolcom
------ MEDIA: eldiarioes
------ MEDIA: laSextaTV
------ MEDIA: OndaCero_es
------ MEDIA: noticias_cuatro
------ MEDIA: libertaddigital
------ MEDIA: EFEnoticias_ES


# 2. Determine parameters epsilon and T

In [8]:
#Select 40 random articles
k1_articles = articles_df.sample(n=40)
k1_articles

Unnamed: 0,tweet_id,media,title,headline,url,publication_date,article
11968,1313566194964799490,voz_populi,septimo implicado violacion grupal menor,fuentes instituto armado hombre sido detenido ...,https://www.vozpopuli.com/actualidad/detenido-...,2020-10-06,detenido hombre 32 anos presunta implicacion v...
9084,1217143087912443906,el_pais,recurre archivo causa culpar falsamente magreb...,juez sobreseyo delito odio agresion mujer play...,https://elpais.com/ccaa/2020/01/14/valencia/15...,2020-01-14,seccion delitos recurrido archivo investigacio...
993,1367886408946487297,20m,anos carcel militar pederasta abusar grabar ex...,militar detenido marco sido condenado siete an...,https://www.20minutos.es/noticia/4609407/0/sie...,2021-03-05,militar detenido marco sido condenado siete an...
8899,1501915717909635072,rtve,justicia francesa mantiene acusacion violacion...,sala instruccion considera existen indicios gr...,https://www.rtve.es/noticias/20220310/justicia...,2022-03-10,confirmo jueves acusacion violacion agresion s...
10650,1288776599458869248,informativost5,mujer forzada prostituirse 11 anos,saldar deuda grupo criminal trajo personas sid...,https://www.telecinco.es/informativos/sociedad...,2020-07-30,liberado municipio mujer forzada prostituirse ...
4639,1493533049199415296,larazon_es,denuncia acoso sufrio redes sociales tras,actor director mostro mensajes odio homofobo s...,https://www.larazon.es/gente/famosos/20220214/...,2022-02-14,encargados abrir ansiada alfombra roja pasado ...
13742,1431149484264890370,eldiarioes,alumnas colegio concertado denuncian redes soc...,centro anuncia investigacion interna tras narr...,https://www.eldiario.es/galicia/antiguas-alumn...,2021-08-26,vez ser premiado vigues distinguido deberia se...
224,1235086320059334658,20m,cada 20 jovenes 15 19 anos sido violada denuncia,millones cada veinte jovenes 15 19 anos vive m...,https://www.20minutos.es/noticia/4174351/0/una...,2020-03-04,millones cada veinte jovenes 15 19 anos vive m...
14073,1287630047675908097,laSextaTV,menor acusado causar cortes brazo intentar apu...,agredido presentaba rostro ensangrentado vario...,https://www.lasexta.com/noticias/sociedad/dete...,2020-07-26,local detenido menor edad presunto autor asest...
11583,1442701583247101961,ElHuffPost,cantante r declarado culpable trafico sexual c...,personas testificado artista conocido exitos i...,https://www.huffingtonpost.es/entry/el-cantant...,2021-09-28,cantante estadounidense r sido declarado lunes...


In [9]:
wordvectors_fasttext_file = FASTTEXT_W2V_PATH
wordvectors = KeyedVectors.load_word2vec_format(wordvectors_fasttext_file)

In [10]:
#Construct pairs of the articles
T = [10, 15, 30, 60]
cases_df = pd.DataFrame(columns=['tweet_id_A', 'article_A', 'url_A', 'tweet_id_B', 'article_B', 'url_B', 'vmdistance'])
epsilon = 0.50

pairs_df = pd.DataFrame()

for row in k1_articles.iterrows():
    for timeframe in T:
        tweet_id = row[1]['tweet_id']
        title = row[1]['title']
        headline = row[1]['headline']
        media = row[1]['media']
        date = row[1]['publication_date']
        url = row[1]['url']
        article = row[1]['article']

        candidates_df = articles_df[articles_df['media'] != media]
        candidates_df = candidates_df[candidates_df['publication_date'] > date]
    
        candidates_df = candidates_df[candidates_df['publication_date'] < (date + timedelta(days=timeframe))]
        
        tpairs_df = pd.DataFrame({'tweet_id_A':tweet_id, 'media_A':media, 'article_A':(title+' '+headline), 
                                 'url_A':url, 'tweet_id_B':candidates_df['tweet_id'], 
                                 'media_B':candidates_df['media'], 
                                 'article_B':(candidates_df['title']+' '+candidates_df['headline']), 
                                 'url_B':candidates_df['url'], 't':abs((date - candidates_df['publication_date']).dt.days)})
        
        pairs_df = pd.concat([pairs_df, tpairs_df])

In [11]:
pairs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67564 entries, 581 to 14878
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_id_A  67564 non-null  object
 1   media_A     67564 non-null  object
 2   article_A   67564 non-null  object
 3   url_A       67564 non-null  object
 4   tweet_id_B  67564 non-null  object
 5   media_B     67564 non-null  object
 6   article_B   67564 non-null  object
 7   url_B       67564 non-null  object
 8   t           67564 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 5.2+ MB


In [13]:
# COMPUTE COSINE SIMILARITY OF TF-IDF FOR EACH PAIR OF ARTICLES
cosine_matrix = cosine_similarity_matrix(data, mapping_keys, pairs_df, mapping_tweets) 
pairs_df = add_similarity_column(pairs_df, mapping_keys, cosine_matrix, 'tf-idf')

---GETTING NEWS
---STEM
---COMPUTE TF-IDF
---COMPUTE COSINE SIMILARITY MATRIX


In [14]:
pairs_df.to_csv('data/pairs.csv', index=False)

In [15]:
pairs_df.head()

Unnamed: 0,tweet_id_A,media_A,article_A,url_A,tweet_id_B,media_B,article_B,url_B,t,tf-idf
581,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1314285692734836739,20m,santero abusar sexualmente menor santero 61 an...,https://www.20minutos.es/noticia/4411174/0/det...,2,0.889681
584,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1314459911636627456,20m,acusado nueva violacion exproductor cine acusa...,https://www.20minutos.es/noticia/4411260/0/har...,3,0.936409
594,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1316141161946517509,20m,campana busca extorsionar falsos videos conten...,https://www.20minutos.es/noticia/4415697/0/ale...,7,0.989851
599,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1316323661813604352,20m,dos jovenes agredir sexualmente companera trab...,https://www.20minutos.es/noticia/4416191/0/det...,8,0.922552
603,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1314473937544372225,20m,definitivamente denuncia abusos padre vinculad...,https://www.20minutos.es/noticia/4411362/0/arc...,3,0.925586


In [16]:
T = [10, 15, 30, 60]
epsilon_pairs_df = pd.DataFrame()

epsilon = 0.35
for row in k1_articles.iterrows():
    tweet_pairs = pairs_df[pairs_df['tweet_id_A']==row[1]['tweet_id']]
    for timeframe in T:
        tweet_pairs_t = tweet_pairs[tweet_pairs['tf-idf']<=epsilon]
        #tweet_pairs_t = tweet_pairs_t[tweet_pairs_t['tf-idf']<=epsilon]
        try:
            epsilon_pairs_df = pd.concat([epsilon_pairs_df,tweet_pairs_t.sample(n=2)])
        except:
            try:
                epsilon_pairs_df = pd.concat([epsilon_pairs_df,tweet_pairs_t.sample(n=1)])
            except:
                pass
        

In [17]:
epsilon_pairs_df.head()

Unnamed: 0,tweet_id_A,media_A,article_A,url_A,tweet_id_B,media_B,article_B,url_B,t,tf-idf
8348,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313852460839165952,publico_es,nueve detenidos cuatro prision violacion menor...,https://www.publico.es/sociedad/pais-valencia-...,1,0.276703
8348,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313852460839165952,publico_es,nueve detenidos cuatro prision violacion menor...,https://www.publico.es/sociedad/pais-valencia-...,1,0.276703
14142,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313850324902346754,laSextaTV,noveno hombre violacion grupo nina lname arres...,https://www.lasexta.com/noticias/sociedad/dete...,1,0.29173
14142,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313850324902346754,laSextaTV,noveno hombre violacion grupo nina lname arres...,https://www.lasexta.com/noticias/sociedad/dete...,1,0.29173
4118,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313846788865044480,larazon_es,nueve detenidos violacion menor namename trata...,https://www.larazon.es/comunidad-valenciana/20...,1,0.296127


In [18]:
epsilon_pairs_df['t'] = abs(epsilon_pairs_df['t'])

In [19]:
epsilon_pairs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24 entries, 8348 to 11149
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   tweet_id_A  24 non-null     object 
 1   media_A     24 non-null     object 
 2   article_A   24 non-null     object 
 3   url_A       24 non-null     object 
 4   tweet_id_B  24 non-null     object 
 5   media_B     24 non-null     object 
 6   article_B   24 non-null     object 
 7   url_B       24 non-null     object 
 8   t           24 non-null     int64  
 9   tf-idf      24 non-null     float64
dtypes: float64(1), int64(1), object(8)
memory usage: 2.1+ KB


In [20]:
epsilon_pairs_df.head()

Unnamed: 0,tweet_id_A,media_A,article_A,url_A,tweet_id_B,media_B,article_B,url_B,t,tf-idf
8348,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313852460839165952,publico_es,nueve detenidos cuatro prision violacion menor...,https://www.publico.es/sociedad/pais-valencia-...,1,0.276703
8348,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313852460839165952,publico_es,nueve detenidos cuatro prision violacion menor...,https://www.publico.es/sociedad/pais-valencia-...,1,0.276703
14142,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313850324902346754,laSextaTV,noveno hombre violacion grupo nina lname arres...,https://www.lasexta.com/noticias/sociedad/dete...,1,0.29173
14142,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313850324902346754,laSextaTV,noveno hombre violacion grupo nina lname arres...,https://www.lasexta.com/noticias/sociedad/dete...,1,0.29173
4118,1313566194964799490,voz_populi,septimo implicado violacion grupal menor fuent...,https://www.vozpopuli.com/actualidad/detenido-...,1313846788865044480,larazon_es,nueve detenidos violacion menor namename trata...,https://www.larazon.es/comunidad-valenciana/20...,1,0.296127


In [21]:
epsilon_pairs_df.to_csv('data/pairs_epsilon.csv')

# 3. Compute similarity

In [22]:
articles_df = articles_df.sort_values(by='publication_date')

In [23]:
t = 30
pairs_df = pd.DataFrame()
id_A = []
id_B = []
T = []
count = 0

for row in articles_df.iterrows():
    tweet_id_A = row[1]['tweet_id']
    date = row[1]['publication_date']
    date_df = articles_df[articles_df["publication_date"] >= (date)] 
    date_df = date_df[date_df["publication_date"] < (date + timedelta(days=t))] 
    for row_date in date_df.iterrows():
        tweet_id_B = row_date[1]['tweet_id']
        id_A.append(tweet_id_A)
        id_B.append(tweet_id_B)
        T.append(abs((date - row_date[1]['publication_date']).days))
        
    if count in [1000, 5000, 10000, 14000]:
        print(count)
        
    count += 1

1000
5000
10000
14000


In [24]:
pairs_df = pd.DataFrame({'tweet_id_A':id_A, 'tweet_id_B':id_B, 'days':T})

In [None]:
# COMPUTE COSINE SIMILARITY OF TF-IDF FOR EACH PAIR OF ARTICLES
cosine_matrix = cosine_similarity_matrix(data, mapping_keys, pairs_df, mapping_tweets) 
pairs_df = add_similarity_column(pairs_df, mapping_keys, cosine_matrix, 'tf-idf')

---GETTING NEWS
---STEM
---COMPUTE TF-IDF
---COMPUTE COSINE SIMILARITY MATRIX
ERROR WITH:  1231420920851505153 1231578557568208896


In [None]:
pairs_df.head()

In [None]:
pairs_df.info()

In [None]:
cases_df = pairs_df[pairs_df['tf-idf']<0.35]

In [None]:
cases_df.info()

In [None]:
sources = list(cases_df['tweet_id_A'].values)
targets = list(cases_df['tweet_id_B'].values)
G = nx.Graph()
G.add_nodes_from(sources)
G.add_nodes_from(targets)
for i in range(len(sources)):
    G.add_edge(sources[i], targets[i])

In [None]:
cc_list = list(nx.connected_components(G))
cont = 0
cc_elements = []
cc_id = []
for cc in cc_list:
    for elem in cc:
        cc_elements.append(elem)
        cc_id.append(cont)
    cont += 1

In [None]:
connected_components_df = pd.DataFrame({'cluster_id':cc_id, 'tweet_id':cc_elements})

In [None]:
connected_components_df.info()

In [None]:
connected_components_df.head()

In [None]:
connected_components_df.to_csv('data/cases_df.csv', index=False)

In [None]:
len(list(nx.connected_components(G)))