# Packages

In [52]:
import pandas as pd
import numpy as np
import itertools
import networkx as nx
import pyvis
import igraph as i
import matplotlib.pyplot as plt
from collections import Counter
import string
from tqdm import tqdm
import re
from cdlib import algorithms
from concurrent.futures import ThreadPoolExecutor

In [53]:
df_articles = pd.read_csv('_raw/articles_scraped.csv')

In [54]:
def prepare_df_connections(iterable):
    value_combinations = list(itertools.combinations(iterable, 2))
    df_connections = pd.DataFrame(value_combinations, columns=['ind_1','ind_2'])
    df_connections['weight'] = 0
    df_connections.reset_index(drop=True, inplace=True)

    return df_connections

In [55]:
def network_preparation(df_connections,dict_sizes): # df_connections is a dataframe of weighted edges
    elist = list(zip(df_connections['ind_1'],df_connections['ind_2'],df_connections['weight']))
    g = nx.Graph()
    g.add_weighted_edges_from([(el[0],el[1],el[2]) for el in elist if el[2]!=0])

    nt = pyvis.network.Network('1500px','1500px', bgcolor="#222222", font_color="white") #,select_menu=True,filter_menu=True)
    nt.barnes_hut()
    nt.from_nx(g) 
    nt.toggle_physics(True)

    for node in nt.nodes:
        node['font']['size'] = 100
        node['shape'] = 'circularImage'
        node['image'] = f"/Users/federico.bindi/Desktop/GitHub/organized-crime-articles/_aux/pictures/{node['id']}.png"
        node['size'] = dict_sizes[node['id']]

    return g,nt

# Individuals

In [56]:
df_individuals = pd.read_csv('_raw/individuals.csv')

In [57]:
dict_individuals = dict(zip(df_individuals.name,df_individuals.search_term))

In [58]:
dict_sizes = {}
for name in df_individuals.name.values:
    dict_sizes[name] = 0
    for article in df_articles.text.values:
        if dict_individuals[name] in article:
            dict_sizes[name] += 1

In [59]:
sorted_dict = dict(sorted(dict_sizes.items(), key=lambda item: item[1], reverse=True))

sorted_dict

{'Pablo Escobar': 308,
 'Hector Manuel Beltran Leyva': 267,
 'Joaquín Guzmán Loera': 117,
 'Heriberto Lazcano': 87,
 'Nemesio Oseguera Cervantes': 68,
 'Victor Navarro': 62,
 'Vicente Carrillo Fuentes': 54,
 'Walid Makled': 47,
 'Dairo Antonio Úsuga': 46,
 'Diego Perez Henao': 45,
 'José Adán Salazar Umaña': 41,
 'Miguel Botache Santillana': 41,
 'Nazario Moreno Gonzalez': 40,
 'Horst Walther Overdick': 38,
 'Luciano Marín Arango': 38,
 'Álex Saab': 34,
 'Luis Enrique Calle Serna': 34,
 'Juan Alberto Ortiz Lopez': 33,
 'Ismael Zambada García': 33,
 'Antonio Cardenas Guillen': 31,
 'Servando Gomez Martinez': 30,
 'Maximiliano Bonilla Orozco': 30,
 'Ariel Máximo Cantero': 27,
 'César Emilio Peralta': 24,
 'Fausto Isidro Meza Flores': 22,
 'Rodrigo Londoño Echeverri': 21,
 'Javier Antonio Calle Serna': 21,
 'Erickson Vargas Cardenas': 19,
 'Walter Patricio Arizala': 18,
 'Guillermo Leon Saenz Vargas': 18,
 'Genaro García Luna': 17,
 'Gustavo Aníbal Giraldo': 17,
 'Géner García Molina': 17

### Preparation

In [60]:
df_connections = prepare_df_connections(df_individuals.name.values)

Theoretically, we can analyse articles from 4 points of view:
1. Keyword extraction
2. Co-occurrence analysis (how many times two bosses' names appear close in an article)
3. Sentiment analysis on the articles

Points 2 and 3 are probably not useful: two bosses will appear closely in an article also if they are rival, and the articles are very likely to receive a negative sentiment classification anyway.

In [61]:
# Alliance Keywords with Regex Variations
alliance_keywords = [
    r"alliance", r"partner(s|ship|ing)?", r"cooperation", r"collaboration",
    r"joint venture", r"mutual support", r"united front", r"solidarity",
    r"work(ing)? together", r"coalition", r"team(ing)? up", r"combine(d)? forces",
    r"coordinated effort", r"combined operations", r"coordinated strategy",
    r"harmonious relationship", r"allied forces", r"mutual assistance"
]

# Rivalry Keywords with Regex Variations
rivalry_keywords = [
    r"rivalry", r"conflict", r"competition", r"feud", r"hostility", r"animosity",
    r"contention", r"dispute", r"antagonism", r"clash", r"enmity",
    r"competition for territory", r"power struggle", r"bitter rivalry",
    r"ongoing feud", r"turf war", r"opposing forces",
    r"confrontational relationship", r"longstanding dispute",
    r"battle for supremacy"
]

In [62]:
def count_words_in_text(keyword_list, text):
    
    counter = 0
    target_words = text.split()

    for keyword in keyword_list:
        matches = [1 if re.search(keyword,word) is not None else 0 for word in target_words]
        counter += sum(matches)
    
    return counter

In [63]:
def clean_text(text):
    # Create a translation table that maps punctuation characters to None
    translator = str.maketrans(string.punctuation, " " * len(string.punctuation))
    
    # Use the translate method to remove punctuation
    cleaned_text = text.lower().translate(translator)
    
    return cleaned_text

In [64]:
def detect_relationship(text, entity_1, alias_1, entity_2, alias_2):

    if entity_1 in text and alias_1 in text and entity_2 in text and alias_2 in text:

        alliance_score = count_words_in_text(alliance_keywords,clean_text(text))
        rivalry_score = count_words_in_text(rivalry_keywords,clean_text(text))

        if alliance_score > rivalry_score:
            return 1
        else:
            return 0
        
    else:
        return 0

In [65]:
for article in tqdm(df_articles.text.values):
    for row in range(len(df_connections)):

        entity_1 = dict_individuals[df_connections.loc[row,'ind_1']]
        alias_1 = df_individuals.loc[df_individuals.name == df_connections.loc[row,'ind_1'],'alias_1'].iloc[0]
        if alias_1 is np.nan: # in case the individual does not have a nickname
            alias_1 = ''

        entity_2 = dict_individuals[df_connections.loc[row,'ind_2']]
        alias_2 = df_individuals.loc[df_individuals.name == df_connections.loc[row,'ind_2'],'alias_1'].iloc[0]
        if alias_2 is np.nan:
            alias_2 = ''

        df_connections.loc[row,'weight'] += detect_relationship(article, entity_1, alias_1, entity_2, alias_2)

df_connections.to_csv('_aux/Connections between individuals - alliances.csv')

100%|██████████| 10798/10798 [2:30:56<00:00,  1.19it/s] 


In [66]:
g, nt = network_preparation(df_connections,dict_sizes)
nt.show('_out/networks/individuals_alliances.html',notebook=False)

_out/networks/individuals_alliances.html


### Community detection (Louvain algorithm)

In [67]:
partition = algorithms.louvain(g, resolution=1., randomize=False)
print(partition.communities) # identified clustering

for community,color_code in zip(self.partition.communities):            
    self.add_node_attr(data=self.color_dict,attr_name='color')

    for node in self.nt.nodes:
                node_key = node[key]
                try:
                    node[attr_name] = data[node_key]
                except:
                    pass 

[['Pablo Escobar', 'Ariel Máximo Cantero', 'Diego Perez Henao', 'José Bayron Piedrahita Ceballos', 'Óscar Mauricio Pachón', 'Martin Farfan Diaz Gonzalez', 'Victor Navarro', 'Walid Makled', 'Diego Fernando Murillo Bejarano', 'Reynerio Flores Lazo', 'Pedro Oliverio Guerrero Castillo', 'Maximiliano Bonilla Orozco', 'Antonio Cardenas Guillen', 'Daniel Barrera Barrera', 'Luis Enrique Calle Serna', 'Javier Antonio Calle Serna'], ['Genaro García Luna', 'Dámaso López Núñez', 'Joaquín Guzmán Loera', 'Nemesio Oseguera Cervantes', 'Ismael Zambada García', 'José Adán Salazar Umaña', "Juan Antonio 'Tony' Hernández Alvarado", 'Luis Agustin Caicedo Velandia', 'Gustavo Aníbal Giraldo', 'Nicolás Rodríguez Bautista', 'Vicente Carrillo Fuentes', 'Hector Manuel Beltran Leyva'], ['Luciano Marín Arango', 'Ricardo Abel Ayala Orrego', 'Henry Castellanos Garzón', 'Seuxis Pausías Hernández', 'Hernán Darío Velásquez Saldarriaga', 'Miguel Botache Santillana', 'Néstor Gregorio Vera Fernández', 'José Vicente Lesmes

NameError: name 'self' is not defined

### Preferential attachment mechanism

It would make sense to think that criminals prefer to form connections with other criminals that already have lots of connections (because they can offer more power and connections). This would be reflected in the degree distribution following a power law.

### Validation

Pedro Oliverio Guerreiro and Henry de Jesús Lopez were both affiliated with the Centauros, an arm of the United Self-Defense Forces of Colombia (Autodefensas Unidas de Colombia - AUC).

There is a strong connection between Hector Manuel Beltran Leyva and Pedro Oliverio Guerreiro, which is not corroborated by my web search. The same happens for Mi Sangre and El Chapo.

In [68]:
df_articles[(df_articles.text.str.contains('Guerrero')) & (df_articles.text.str.contains('Beltran Leyva'))].url

1737     https://insightcrime.org/news/analysis/morelos...
1965     https://insightcrime.org/news/analysis/gamecha...
2094     https://insightcrime.org/news/analysis/ayotzin...
3005     https://insightcrime.org/news/analysis/narco-m...
3342     https://insightcrime.org/news/brief/mexico-vig...
                               ...                        
9850     https://insightcrime.org/news/analysis/poverty...
10237    https://insightcrime.org/news/analysis/insight...
10460    https://insightcrime.org/news/analysis/insight...
10570    https://insightcrime.org/news/analysis/2010-tr...
10778    https://insightcrime.org/news/analysis/last-mo...
Name: url, Length: 74, dtype: object

In [69]:
df_connections.sort_values('weight',ascending=False).head(30)

Unnamed: 0,ind_1,ind_2,weight
2626,Nemesio Oseguera Cervantes,Joaquín Guzmán Loera,7
3910,Diego Perez Henao,Luis Enrique Calle Serna,6
3811,Joaquín Guzmán Loera,Ismael Zambada García,6
658,Pablo Escobar,Maximiliano Bonilla Orozco,5
2265,Miguel Botache Santillana,Néstor Gregorio Vera Fernández,4
3777,Maximiliano Bonilla Orozco,Luis Enrique Calle Serna,4
3776,Maximiliano Bonilla Orozco,Diego Perez Henao,4
649,Pablo Escobar,Dairo Antonio Úsuga,3
661,Pablo Escobar,Joaquín Guzmán Loera,3
2303,Miguel Botache Santillana,Luciano Marín Arango,3


# Groups