# Packages

In [12]:
import pandas as pd
import numpy as np
import itertools
import networkx as nx
import pyvis
import igraph as i
import matplotlib.pyplot as plt
from collections import Counter
import string
from tqdm import tqdm
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

In [13]:
# Load the scraped articles; the cells below read the `text` column of this frame.
df_articles = pd.read_csv('_raw/articles_scraped.csv')

In [14]:
def prepare_df_connections(iterable):
    """Build an edge table with one row per unordered pair of items.

    Parameters
    ----------
    iterable : iterable
        Items to pair up (for example node names).

    Returns
    -------
    pandas.DataFrame
        Columns ``ind_1`` and ``ind_2`` hold the two members of each pair, in
        the order produced by ``itertools.combinations``; ``weight`` is set to
        0 on every row. The index is a fresh 0..n-1 range.
    """
    pairs = itertools.combinations(iterable, 2)
    edge_table = pd.DataFrame(list(pairs), columns=['ind_1', 'ind_2'])
    edge_table = edge_table.assign(weight=0)
    return edge_table.reset_index(drop=True)

In [15]:
def network_preparation(df_connections, dict_sizes,
                        pictures_dir="/Users/federico.bindi/Desktop/GitHub/organized-crime-articles/_aux/pictures"):
    """Build a weighted graph and its pyvis visualisation from an edge table.

    Parameters
    ----------
    df_connections : pandas.DataFrame
        Weighted edges with columns ``ind_1``, ``ind_2`` and ``weight``.
        Rows whose weight is 0 are not added to the graph.
    dict_sizes : dict
        Maps a node id to the size used when drawing that node. Every node
        that ends up in the graph must be a key, otherwise ``KeyError`` is
        raised.
    pictures_dir : str, optional
        Directory holding one ``<node id>.png`` picture per node. The default
        is the original author's local checkout; pass your own directory when
        running elsewhere.

    Returns
    -------
    tuple
        ``(g, nt)``: the ``networkx.Graph`` and the ``pyvis`` network built
        from it.
    """
    weighted_edges = zip(
        df_connections['ind_1'], df_connections['ind_2'], df_connections['weight']
    )
    g = nx.Graph()
    # Zero-weight pairs never co-occurred, so they are left out of the graph.
    g.add_weighted_edges_from(
        (source, target, weight)
        for source, target, weight in weighted_edges
        if weight != 0
    )

    # Optional extras that were tried here: select_menu=True, filter_menu=True
    nt = pyvis.network.Network('1500px', '1500px', bgcolor="#222222", font_color="white")
    nt.barnes_hut()
    nt.from_nx(g)
    nt.toggle_physics(True)

    for node in nt.nodes:
        node['font']['size'] = 100
        node['shape'] = 'circularImage'
        node['image'] = f"{pictures_dir}/{node['id']}.png"
        node['size'] = dict_sizes[node['id']]

    return g, nt

# Individuals

In [16]:
# Load the table of individuals; its `name` and `search_term` columns are used below.
df_individuals = pd.read_csv('_raw/individuals.csv')

In [17]:
# Map each individual's name to the search term that is looked up in the article texts.
dict_individuals = dict(zip(df_individuals.name,df_individuals.search_term))

In [18]:
# Node size of an individual = number of articles whose text contains
# that individual's search term (plain substring test).
dict_sizes = {
    name: sum(
        dict_individuals[name] in article
        for article in df_articles.text.values
    )
    for name in df_individuals.name.values
}

## Network of mentions

### Preparation

In [19]:
# One row per pair of individuals, with every weight starting at 0.
df_connections = prepare_df_connections(df_individuals.name.values)

In [20]:
# Count, for every pair of individuals, the articles that mention both search terms.
# The search terms are resolved once up front and the counts are accumulated in a
# plain list, instead of doing scalar `.loc` lookups for every pair in every article.
search_term_pairs = [
    (dict_individuals[ind_1], dict_individuals[ind_2])
    for ind_1, ind_2 in zip(df_connections['ind_1'], df_connections['ind_2'])
]
co_mention_counts = [0] * len(search_term_pairs)

for article in tqdm(df_articles.text.values):
    for row, (term_1, term_2) in enumerate(search_term_pairs):
        if term_1 in article and term_2 in article:
            co_mention_counts[row] += 1

# Assign rather than increment, so re-running this cell does not double the weights.
df_connections['weight'] = co_mention_counts

100%|██████████| 10786/10786 [03:09<00:00, 56.77it/s]


### Visualisation

In [21]:
# Build the co-mention graph and its pyvis view, then render it to an HTML file.
g,nt = network_preparation(df_connections,dict_sizes)
nt.show('_out/networks/individuals.html',notebook=False)

_out/networks/individuals.html


### Validation

Pedro Oliverio Guerreiro and Henry de Jesús Lopez were both affiliated with the Centauros, an arm of the United Self-Defense Forces of Colombia (Autodefensas Unidas de Colombia - AUC).

The network shows a strong connection between Hector Manuel Beltran Leyva and Pedro Oliverio Guerreiro, but my web search did not corroborate it. The same is true of the connection between Mi Sangre and El Chapo.

## Network of alliances

### Preparation

In [22]:
# Separate edge table (all weights 0) for the alliance network, so the
# co-mention weights stored in df_connections are left untouched.
df_connections2 = prepare_df_connections(df_individuals.name.values)

Theoretically, we can analyse articles from 3 points of view:
1. Keyword extraction
2. Co-occurrence analysis (how many times two bosses' names appear close in an article)
3. Sentiment analysis on the articles

Points 2 and 3 are probably not useful: two bosses will also appear close together in an article if they are rivals, and the articles are very likely to receive a negative sentiment classification anyway.

In [23]:
# Earlier plain-string version of the keyword lists, kept for reference only.
# It is a bare string literal, so it defines nothing.
'''alliance_keywords = [
    "alliance","allies", "partnership", "cooperation", "collaboration", "joint venture",
    "joint operation", "mutual support", "united front", "solidarity", "working together",
    "comradeship", "coalition", "teaming up", "combine forces", "coordinated effort",
    "combined operations", "coordinated strategy", "harmonious relationship", "allied forces",
    "mutual assistance"
]

rivalry_keywords = [
    "rivalry", "conflict", "competition", "feud", "hostility", "animosity", "enemies",
    "contention", "dispute", "antagonism", "clash", "enmity", "competition for territory",
    "power struggle", "bitter rivalry", "ongoing feud", "turf war", "opposing forces",
    "confrontational relationship", "longstanding dispute", "battle for supremacy"
]'''

# Regular-expression patterns whose matches count as evidence of an alliance.
alliance_keywords = [
    r"alliance",
    r"partner(s|ship|ing)?",
    r"cooperation",
    r"collaboration",
    r"joint venture",
    r"mutual support",
    r"united front",
    r"solidarity",
    r"work(ing)? together",
    r"coalition",
    r"team(ing)? up",
    r"combine(d)? forces",
    r"coordinated effort",
    r"combined operations",
    r"coordinated strategy",
    r"harmonious relationship",
    r"allied forces",
    r"mutual assistance",
]

# Regular-expression patterns whose matches count as evidence of a rivalry.
rivalry_keywords = [
    r"rivalry",
    r"conflict",
    r"competition",
    r"feud",
    r"hostility",
    r"animosity",
    r"contention",
    r"dispute",
    r"antagonism",
    r"clash",
    r"enmity",
    r"competition for territory",
    r"power struggle",
    r"bitter rivalry",
    r"ongoing feud",
    r"turf war",
    r"opposing forces",
    r"confrontational relationship",
    r"longstanding dispute",
    r"battle for supremacy",
]

In [24]:
# Sanity check: re.search returns None (type NoneType) when the pattern does not match.
type(re.search(r"partner(s|ship|ing)?", "unrelated"))

NoneType

In [25]:
def clean_text(text):
    """Lower-case ``text`` and replace each punctuation character with a space.

    Every character in ``string.punctuation`` is substituted by a single
    space rather than deleted, so words that were separated only by
    punctuation stay separated.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The lower-cased text with punctuation turned into spaces.
    """
    # Translation table: code point of each punctuation character -> " ".
    punctuation_to_space = dict.fromkeys(map(ord, string.punctuation), " ")
    lowered = text.lower()
    return lowered.translate(punctuation_to_space)

In [26]:
def count_words_in_text(keyword_list, text):
    """Count how often the keyword patterns occur in ``text``.

    Each pattern is searched in the whole text rather than in one
    whitespace-separated token at a time, so multi-word patterns such as
    ``r"turf war"`` or ``r"work(ing)? together"`` can match. Matching stays a
    plain ``re`` search without word boundaries, so ``r"conflict"`` also
    matches inside "conflicts".

    Parameters
    ----------
    keyword_list : iterable of str
        Regular-expression patterns.
    text : str
        Text to search; typically already lower-cased and stripped of
        punctuation.

    Returns
    -------
    int
        Total number of non-overlapping matches, summed over all patterns.
    """
    # Collapse every run of whitespace into one space, so a phrase pattern
    # written with single spaces still matches across newlines or the double
    # spaces left behind when punctuation is replaced.
    normalised_text = " ".join(text.split())

    return sum(
        sum(1 for _ in re.finditer(keyword, normalised_text))
        for keyword in keyword_list
    )

In [27]:
def detect_relationship(text, entity_1, entity_2):
    """Return 1 if ``text`` mentions both entities and reads as an alliance, else 0.

    The text counts as an alliance when its alliance-keyword count is strictly
    greater than its rivalry-keyword count; both counts are taken on the
    cleaned text. A tie, or a text that lacks either entity, gives 0.

    Parameters
    ----------
    text : str
        Raw article text. The entity test is a case-sensitive substring
        check on this raw text.
    entity_1, entity_2 : str
        Search terms of the two individuals.

    Returns
    -------
    int
        1 or 0.
    """
    both_mentioned = entity_1 in text and entity_2 in text
    if not both_mentioned:
        return 0

    cleaned = clean_text(text)
    alliance_score = count_words_in_text(alliance_keywords, cleaned)
    rivalry_score = count_words_in_text(rivalry_keywords, cleaned)

    return 1 if alliance_score > rivalry_score else 0

In [28]:
# Count, for every pair of individuals, the articles in which detect_relationship
# reports an alliance. The search terms are resolved once up front and the counts are
# accumulated in a plain list, instead of doing scalar `.loc` reads and writes for
# every pair in every article.
alliance_term_pairs = [
    (dict_individuals[ind_1], dict_individuals[ind_2])
    for ind_1, ind_2 in zip(df_connections2['ind_1'], df_connections2['ind_2'])
]
alliance_counts = [0] * len(alliance_term_pairs)

for article in tqdm(df_articles.text.values):
    for row, (term_1, term_2) in enumerate(alliance_term_pairs):
        alliance_counts[row] += detect_relationship(article, term_1, term_2)

# Assign rather than increment, so re-running this cell does not double the weights.
df_connections2['weight'] = alliance_counts

100%|██████████| 10786/10786 [35:40<00:00,  5.04it/s]


In [29]:
# Disabled experiment: a ThreadPoolExecutor variant of the alliance-weight loop.
# It is wrapped in a string literal, so nothing here runs; because the string is
# the cell's last expression, the notebook echoes it as the cell output.
'''def update_weight(row, article, dict_individuals):
    ind_1 = df_connections2.loc[row, 'ind_1']
    ind_2 = df_connections2.loc[row, 'ind_2']
    weight_increment = detect_relationship(article, dict_individuals[ind_1], dict_individuals[ind_2])
    return weight_increment

# Create a ThreadPoolExecutor with a specified number of threads
max_threads = 8  # Adjust the number of threads as needed
with ThreadPoolExecutor(max_threads) as executor:
    for article in tqdm(df_articles.text.values):
        futures = []
        for row in range(len(df_connections2)):
            future = executor.submit(update_weight, row, article, dict_individuals)
            futures.append(future)
        for row, future in enumerate(futures):
            weight_increment = future.result()
            df_connections2.loc[row, 'weight'] += weight_increment'''


"def update_weight(row, article, dict_individuals):\n    ind_1 = df_connections2.loc[row, 'ind_1']\n    ind_2 = df_connections2.loc[row, 'ind_2']\n    weight_increment = detect_relationship(article, dict_individuals[ind_1], dict_individuals[ind_2])\n    return weight_increment\n\n# Create a ThreadPoolExecutor with a specified number of threads\nmax_threads = 8  # Adjust the number of threads as needed\nwith ThreadPoolExecutor(max_threads) as executor:\n    for article in tqdm(df_articles.text.values):\n        futures = []\n        for row in range(len(df_connections2)):\n            future = executor.submit(update_weight, row, article, dict_individuals)\n            futures.append(future)\n        for row, future in enumerate(futures):\n            weight_increment = future.result()\n            df_connections2.loc[row, 'weight'] += weight_increment"

In [30]:
# Build the alliance graph and its pyvis view (node sizes reuse dict_sizes),
# then render it to an HTML file.
g2, nt2 = network_preparation(df_connections2,dict_sizes)
nt2.show('_out/networks/individuals_alliances.html',notebook=False)

_out/networks/individuals_alliances.html


# Groups