# Packages

In [1]:
import pandas as pd
import numpy as np
import requests
import bs4
import urllib
import time
import re
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Articles

## Scraping

In [25]:
# URL template of the news listing pages; '{}' receives the page number.
base_url_articles = 'https://insightcrime.org/news/page/{}/'

# Number of listing pages to walk through (pages 1..pages_articles).
pages_articles = 900

# Column-oriented accumulator: one independent list per scraped field.
articles_data = {field: [] for field in ('title', 'url', 'tag', 'date_author', 'text')}

In [26]:
# Sentinel stored whenever a field cannot be extracted from an article page.
_NOT_FOUND = 'NOT FOUND'

# (tag, CSS class) pairs removed from the article body before its text is read,
# in the order in which they are stripped.
_BOILERPLATE_SELECTORS = (
    ('blockquote', 'wp-block-quote'),
    ('h2', 'wp-block-heading'),
    ('figcaption', 'wp-element-caption'),
    ('h4', 'wp-block-heading'),
)


def _extract_article_fields(article_soup):
    """Return the 'text', 'date_author' and 'tag' fields of a parsed article page.

    Any field whose HTML element is missing is set to the 'NOT FOUND' sentinel.
    """
    fields = {}

    article_content = article_soup.find('div', class_='single-content')
    if article_content is None:
        fields['text'] = _NOT_FOUND
    else:
        # Strip quotes, sub-headings and captions so only the running text remains.
        for tag_name, css_class in _BOILERPLATE_SELECTORS:
            for element in article_content.find_all(tag_name, class_=css_class):
                element.extract()
        fields['text'] = article_content.get_text()

    author_span = article_soup.find('span', class_='autor')
    fields['date_author'] = _NOT_FOUND if author_span is None else author_span.get_text()

    date_div = article_soup.find('div', class_='date')
    tag_link = date_div.find('a') if date_div is not None else None
    fields['tag'] = _NOT_FOUND if tag_link is None else tag_link.get_text()

    return fields


def process_page(page, timeout=30):
    """Scrape one news listing page and every article it links to.

    Parameters
    ----------
    page : int
        Page number substituted into the module-level ``base_url_articles``.
    timeout : float, optional
        Seconds to wait for each HTTP response. Without a timeout a stalled
        connection would block its worker thread forever.

    Returns
    -------
    list of dict
        One dict per article with the keys 'title', 'url', 'text',
        'date_author' and 'tag'. Fields that cannot be extracted hold
        'NOT FOUND'.

    Raises
    ------
    requests.RequestException
        If the listing page itself cannot be fetched. Failures on individual
        articles are not raised: that article's fields are set to 'NOT FOUND'
        so one bad link does not abort the whole run.
    """
    url = base_url_articles.format(page)
    res = requests.get(url, timeout=timeout)
    soup = bs4.BeautifulSoup(res.text, 'lxml')

    results = []
    for box in soup.select('.page-general__list-news .card-news h2 a'):
        article_link = box['href']
        article_data = {'title': box.get_text(strip=True), 'url': article_link}

        try:
            article_response = requests.get(article_link, timeout=timeout)
        except requests.RequestException:
            # Keep the row with sentinels instead of losing the whole page.
            article_data.update(text=_NOT_FOUND, date_author=_NOT_FOUND, tag=_NOT_FOUND)
        else:
            article_soup = bs4.BeautifulSoup(article_response.content, 'html.parser')
            article_data.update(_extract_article_fields(article_soup))

        results.append(article_data)

    return results

# Scrape all listing pages concurrently; executor.map keeps the results in page order.
with ThreadPoolExecutor() as pool:
    page_numbers = range(1, pages_articles + 1)
    progress = tqdm(pool.map(process_page, page_numbers), total=pages_articles)
    all_results = list(progress)

# Flatten the per-page lists of article dicts into the column-oriented accumulator.
for page_articles in all_results:
    for article in page_articles:
        for field in ('title', 'url', 'text', 'date_author', 'tag'):
            articles_data[field].append(article[field])

  0%|          | 0/899 [00:00<?, ?it/s]

100%|██████████| 899/899 [27:57<00:00,  1.87s/it]  


In [28]:
# One row per scraped article, one column per key of the accumulator dict.
df_articles = pd.DataFrame(data=articles_data)

## Cleaning and export

In [29]:
# Drop rows whose body could not be scraped (an article on Knights Templar was
# mistakenly inserted in the 'News' section). The explicit copy makes the column
# assignments below act on an independent frame, not on a filtered view
# (avoids SettingWithCopyWarning / chained assignment).
df_articles = df_articles[df_articles.text != 'NOT FOUND'].copy()

# Split 'date_author' on the first 'BY' only: the left part becomes 'date',
# the remainder becomes 'author'.
df_articles[['date', 'author']] = df_articles['date_author'].str.split('BY', n=1, expand=True)
df_articles = df_articles.drop(columns=['date_author'])

# Replace newlines and non-breaking spaces with plain spaces (literal match).
for col in ['text', 'date', 'author']:
    df_articles[col] = (
        df_articles[col]
        .str.replace("\n", " ", regex=False)
        .str.replace("\xa0", " ", regex=False)
    )

# Dates are parsed with the fixed format day / abbreviated month / 4-digit year.
df_articles['date'] = pd.to_datetime(df_articles['date'].str.strip(), format='%d %b %Y')

df_articles = df_articles.drop_duplicates(ignore_index=True)

In [31]:
# Persist the cleaned articles; the positional index is not written.
df_articles.to_csv(path_or_buf='_raw/articles_scraped.csv', index=False)

# Groups

## Scraping

In [66]:
# Accumulates the scraped group names across all listing pages.
groups_list = list()

# Number of listing pages to request (pages 1..pages_groups).
pages_groups = 9

# URL template of the 'armed_groups' listing; '{}' receives the page number.
base_url_groups = (
    'https://insightcrime.org/criminal-actors/page/{}/'
    '?filter=armed_groups&country=0&orderby'
)

In [67]:
# Walk the listing pages and collect the group names shown on each one.
for page in tqdm(range(1,pages_groups+1)):

    url = base_url_groups.format(page)
    # A timeout keeps a stalled connection from blocking the cell forever.
    res = requests.get(url, timeout=30)
    soup = bs4.BeautifulSoup(res.text,'lxml')

    # Each actor card exposes its name as the link text inside an <h2>.
    boxes = soup.select('.page-general__news-list-criminals .box h2 a')
    groups_list.extend(box.get_text(strip=True) for box in boxes)

100%|██████████| 9/9 [00:14<00:00,  1.62s/it]


In [68]:
# Single-column frame of scraped group names; the default column 0 becomes 'name'.
df_groups = pd.DataFrame(data=groups_list)
df_groups = df_groups.rename(columns={0: 'name'})

## Cleaning and export

In [69]:
df_groups = df_groups.drop_duplicates(ignore_index=True)

# Short aliases for the groups whose scraped name embeds an acronym.
alias_by_name = {
    'Jalisco Cartel New Generation (CJNG)': 'CJNG',
    'Family of the North - FDN': 'FDN',
    'First Capital Command - PCC': 'PCC',
    'Gaitanistas - Gulf Clan': 'AGC',
    'FBL/FPLN': 'FPLN',
}
df_groups['alias'] = ''
for group_name, alias in alias_by_name.items():
    df_groups.loc[df_groups['name'] == group_name, 'alias'] = alias

excluded_names = [
    # We have different ex-FARC groups, we don't want the supergroup to be a separate entity
    'Ex-FARC Mafia',
    'Ex-FARC Mafia in Venezuela',
    # 'BACRIM' is a general term to indicate criminal organisations in Colombia
    'BACRIM in Venezuela',
    # We already have ELN
    'ELN in Venezuela',
    # We already have FARC
    'FARC 57th Front in Panama',
]
# Copy after filtering so the column assignments below act on an independent
# frame, not on a filtered view (avoids SettingWithCopyWarning).
df_groups = df_groups[~df_groups['name'].isin(excluded_names)].copy()

# Flag the remaining groups whose name mentions the Ex-FARC Mafia supergroup;
# this must run before the marker is removed from the names below.
df_groups['super_group'] = df_groups['name'].apply(
    lambda text: 'Ex-FARC Mafia' if 'Ex-FARC Mafia' in text else ''
)

# Remove the supergroup marker from the names (literal match in both cases).
# The final strip drops any whitespace left around the name once the
# '(Ex-FARC Mafia)' marker is gone.
df_groups['name'] = (
    df_groups['name']
    .str.replace(' - Ex-FARC Mafia', '', regex=False)
    .str.replace('(Ex-FARC Mafia)', '', regex=False)
    .str.strip()
)

df_groups = df_groups.reset_index(drop=True)

In [70]:
# Scraped group name -> canonical name used from here on.
dict_corr_names = dict([
    ('Jalisco Cartel New Generation (CJNG)', 'Jalisco Cartel New Generation'),
    ('Family of the North - FDN', 'Family of the North'),
    ('First Capital Command - PCC', 'PCC'),
    ('Gaitanistas - Gulf Clan', 'Gulf Clan'),
    ('FBL/FPLN', 'FBL'),
])

# Overwrite every exact match of a scraped name with its canonical form.
for name, correction in dict_corr_names.items():
    needs_fix = df_groups['name'] == name
    df_groups.loc[needs_fix, 'name'] = correction

In [71]:
# Visual check of the cleaned groups (first 60 rows).
df_groups.head(n=60)

Unnamed: 0,name,alias,super_group
0,Central General Staff,,Ex-FARC Mafia
1,Acacio Medina Front,,Ex-FARC Mafia
2,Las Claritas Sindicato,,
3,Carlos Capa Gang,,
4,The R Organization,,
5,Tren de Guayana,,
6,Lobos,,
7,10th Front,,Ex-FARC Mafia
8,Pachenca,,
9,Bala na Cara,,


In [72]:
# Persist the cleaned groups.
# NOTE(review): unlike the articles and individuals exports, this call does not
# pass index=False, so the positional index is written as an unnamed first
# column — confirm whether downstream readers rely on it before changing.
df_groups.to_csv(path_or_buf='_raw/groups.csv')

# Individuals

## Scraping

In [34]:
# Accumulates the scraped individual entries across all listing pages.
individuals_list = list()

# Number of listing pages to request (pages 1..pages_individuals).
pages_individuals = 8

# URL template of the 'personalities' listing; '{}' receives the page number.
base_url_individuals = (
    'https://insightcrime.org/criminal-actors/page/{}/'
    '?filter=personalities&country=0&orderby'
)

In [35]:
# Walk the listing pages and collect the entry shown on each actor card.
for page in tqdm(range(1,pages_individuals+1)):

    url = base_url_individuals.format(page)
    # A timeout keeps a stalled connection from blocking the cell forever.
    res = requests.get(url, timeout=30)
    soup = bs4.BeautifulSoup(res.text,'lxml')

    # Each actor card exposes its title as the link text inside an <h2>.
    boxes = soup.select('.page-general__news-list-criminals .box h2 a')
    individuals_list.extend(box.get_text(strip=True) for box in boxes)

## Cleaning and export

In [36]:
def _clean_primary_alias(fragment):
    """Remove the ' alias ' marker and straight/curly single quotes from the first alias."""
    for junk in (' alias ', "'", '‘', '’'):
        fragment = fragment.replace(junk, '')
    return fragment


def _clean_secondary_alias(fragment):
    """Remove the ' or the ' marker and straight single quotes from the second alias."""
    for junk in (' or the ', "'"):
        fragment = fragment.replace(junk, '')
    return fragment


def _split_individual(entry):
    """Turn one scraped entry into [name, alias_1, alias_2].

    The entry is split on commas. Three parts give a name and two aliases,
    two parts give a name and one alias, and any other number of parts keeps
    only the first part as the name with both aliases empty.
    """
    parts = entry.split(',')
    if len(parts) == 3:
        return [parts[0], _clean_primary_alias(parts[1]), _clean_secondary_alias(parts[2])]
    if len(parts) == 2:
        return [parts[0], _clean_primary_alias(parts[1]), '']
    return [parts[0], '', '']


# Replace the raw strings with their [name, alias_1, alias_2] triples.
individuals_list = [_split_individual(entry) for entry in individuals_list]

In [37]:
# One row per individual; the positional columns 0/1/2 get descriptive names.
df_individuals = pd.DataFrame(data=individuals_list)
df_individuals = df_individuals.rename(columns={0: 'name', 1: 'alias_1', 2: 'alias_2'})

In [38]:
# Scraped name -> fuller name used from here on.
dict_corr_names = dict([
    ('Miguel Angel Treviño', 'Miguel Angel Treviño Morales'),
    ('Diego Fernando Murillo', 'Diego Fernando Murillo Bejarano'),
    ('Luis E. Calle Serna', 'Luis Enrique Calle Serna'),
    ('Hector Beltran Leyva', 'Hector Manuel Beltran Leyva'),
])

# Overwrite every exact match of a scraped name with its corrected form.
for name, correction in dict_corr_names.items():
    needs_fix = df_individuals['name'] == name
    df_individuals.loc[needs_fix, 'name'] = correction

In [39]:
def get_first_name(name):
    """Return the first-name part of a whitespace-separated full name.

    Names made of exactly four tokens are treated as two first names followed
    by two family names; for any other token count only the first token is
    returned.
    """
    tokens = name.split()
    if len(tokens) != 4:
        return tokens[0]
    return f'{tokens[0]} {tokens[1]}'

def get_last_name(name):
    """Return the family-name part of a whitespace-separated full name.

    Names with more than two tokens are assumed to end with two family names;
    shorter names contribute only their last token.
    """
    tokens = name.split()
    if len(tokens) > 2:
        return f'{tokens[-2]} {tokens[-1]}'
    return tokens[-1]

In [40]:
# Derive both name components from the full name with the heuristics defined above.
for column, extractor in (('first_name', get_first_name), ('family_name', get_last_name)):
    df_individuals[column] = df_individuals['name'].apply(extractor)

The heuristic split above cannot handle every name, so we complement it with visual inspection and apply some manual corrections:

In [41]:
# Manual overrides found by visual inspection: (full name, first name, family name).
_manual_name_splits = (
    ('César Emilio Peralta', 'César Emilio', 'Peralta'),
    ('Jobanis de Jesús Ávila Villadiego', 'Jobanis de Jesús', 'Ávila Villadiego'),
    ('Leider Johani Noscue', 'Leider Johani', 'Noscue'),
    ('Walter Patricio Arizala', 'Walter Patricio', 'Arizala'),
    ('Henry de Jesús López', 'Henry de Jesús', 'López'),
    ('Horst Walther Overdick', 'Horst Walther', 'Overdick'),
    ('Dairo Antonio Úsuga', 'Dairo Antonio', 'Úsuga'),
    ('Juan de Dios Úsuga', 'Juan de Dios', 'Úsuga'),
    ('Juan Orlando Hernández', 'Juan Orlando', 'Hernández'),
    ('Ariel Máximo Cantero', 'Ariel Máximo', 'Cantero'),
    ('Pedro Oliverio Guerrero', 'Pedro Oliverio', 'Guerrero'),
)

# Full name -> [first_name, family_name].
dict_corr_first_family_names = {
    full_name: [first, family] for full_name, first, family in _manual_name_splits
}

In [42]:
# Overwrite the heuristic split with the manual one for every listed full name.
for full_name, (first, family) in dict_corr_first_family_names.items():
    is_match = df_individuals['name'] == full_name
    df_individuals.loc[is_match, 'first_name'] = first
    df_individuals.loc[is_match, 'family_name'] = family

"Rastrojos" is actually a group, and Juan Orlando Hernández was the president of Honduras, so we remove both entries from the individuals:

In [43]:
# Keep every row except the two entries that are not treated as individuals.
is_rastrojos = df_individuals['name'] == 'Rastrojos'
is_hernandez = df_individuals['name'] == 'Juan Orlando Hernández'
df_individuals = df_individuals[~(is_rastrojos | is_hernandez)]

Let's create search terms for individuals. "Úsuga" and "Calle Serna" are duplicated family names (two individuals each): the two "Úsuga" are brothers, and so are the two "Calle Serna". For them the search term is therefore the first name plus the family name, while for all other individuals it is just the family name.

In [None]:
# df_individuals may be a boolean-filtered selection of an earlier frame; take an
# explicit copy so the new column is added to an independent frame and no
# SettingWithCopyWarning is raised.
df_individuals = df_individuals.copy()

# Family names shared by more than one individual need the first name as well
# to be unambiguous; everyone else is searched by family name alone.
shares_family_name = (df_individuals.family_name=="Úsuga") | (df_individuals.family_name=="Calle Serna")
df_individuals['search_term'] = np.where(
    shares_family_name,
    df_individuals.first_name + ' ' + df_individuals.family_name,
    df_individuals.family_name,
)

In [44]:
# Persist the cleaned individuals; the positional index is not written.
df_individuals.to_csv(path_or_buf='_raw/Individuals.csv', index=False)