# Packages

In [1]:
import pandas as pd
import requests
import bs4
import urllib
import time
import re
from tqdm import tqdm

# Articles

## Scraping

In [2]:
# Listing-page URL of the 'News' section; '{}' is filled in with the page number.
base_url_articles = 'https://insightcrime.org/news/page/{}/'

# Number of listing pages to walk (pages 1..pages_articles are requested).
pages_articles = 899

# Column-oriented store for the scrape: one independent list per field.
articles_data = {
    field: []
    for field in ('title', 'url', 'tag', 'date_author', 'text')
}

In [3]:
# Seconds to wait for the server; without a timeout a stalled connection
# would hang this long-running scrape indefinitely.
REQUEST_TIMEOUT = 30


def _text_or_not_found(node):
    """Return the text of a BeautifulSoup node, or 'NOT FOUND' when the node is None."""
    return node.get_text() if node is not None else 'NOT FOUND'


# Walk every listing page, follow each article link found on it and store
# title, url, body text, date/author line and tag in `articles_data`.
for page in tqdm(range(1,pages_articles+1)):

    url = base_url_articles.format(page)
    # A failure on a listing page is left to raise: silently skipping it would
    # drop every article on that page. Data collected so far stays in articles_data.
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    soup = bs4.BeautifulSoup(res.text,'lxml')

    boxes = soup.select('.page-general__list-news .card-news h2 a')

    for box in boxes:
        article_link = box['href']

        # Best effort per article: a network error is recorded as 'NOT FOUND'
        # fields instead of aborting the whole run.
        try:
            article_response = requests.get(article_link, timeout=REQUEST_TIMEOUT)
            article_soup = bs4.BeautifulSoup(article_response.content, 'html.parser')
        except requests.RequestException:
            article_soup = None

        if article_soup is None:
            content_node = author_node = tag_node = None
        else:
            content_node = article_soup.find('div', class_='single-content')
            author_node = article_soup.find('span', class_='autor')
            date_box = article_soup.find('div', class_='date')
            tag_node = date_box.find('a') if date_box is not None else None

        # Append all five fields together so the lists always keep the same
        # length (required by pd.DataFrame(articles_data)).
        articles_data['title'].append(box.get_text(strip=True))
        articles_data['url'].append(article_link)
        articles_data['text'].append(_text_or_not_found(content_node))
        articles_data['date_author'].append(_text_or_not_found(author_node))
        articles_data['tag'].append(_text_or_not_found(tag_node))

  0%|          | 0/899 [00:00<?, ?it/s]

100%|██████████| 899/899 [2:53:11<00:00, 11.56s/it]  


In [4]:
# Build the raw articles frame: each key of articles_data becomes a column,
# each scraped article a row.
df_articles = pd.DataFrame(articles_data)

## Cleaning and export

In [11]:
# NOTE: this cell consumes the 'date_author' column, so it can only be run once
# on a freshly built df_articles; re-running it raises a KeyError.

# Drop articles whose body could not be scraped (an article on Knights Templar
# was mistakenly inserted in the 'News' section). .copy() gives an independent
# frame so the column assignments below do not write into a slice of the
# original (avoids SettingWithCopyWarning).
df_articles = df_articles[df_articles['text'] != 'NOT FOUND'].copy()

# Split the combined field on the first 'BY' only: left part -> date, right part -> author.
df_articles[['date', 'author']] = df_articles['date_author'].str.split('BY', n=1, expand=True)
df_articles = df_articles.drop(columns=['date_author'])

# Remove the inline "SEE ALSO: ..." lines from the article body.
pattern = r"\nSEE ALSO: .*?\n"
df_articles['text'] = df_articles['text'].str.replace(pattern, "\n", regex=True)

# Strip the remaining newlines (literal match, hence regex=False).
for col in ['text', 'date', 'author']:
    df_articles[col] = df_articles[col].str.replace("\n", "", regex=False)

df_articles['date'] = pd.to_datetime(df_articles['date'].str.strip(), format='%d %b %Y')

# Fully identical rows are dropped and the index is renumbered from 0.
df_articles = df_articles.drop_duplicates(ignore_index=True)

In [12]:
# Show the cleaned frame as the cell output.
df_articles

Unnamed: 0,title,url,tag,text,date,author
0,"Firearms, Disappearances, Prison Overcrowding:...",https://insightcrime.org/news/firearms-disappe...,ARMS TRAFFICKING,"From overpopulated, crumbling prisons to tens ...",2023-08-01,CHRIS DALBY
1,Uruguay's Top Trafficker Disappears Yet Again ...,https://insightcrime.org/news/uruguay-traffick...,URUGUAY,Uruguayan drug trafficker Sebastián Marset has...,2023-07-31,CHRISTOPHER NEWTON AND CHRIS DALBY
2,Latin America’s Criminal Bankers: Explaining C...,https://insightcrime.org/news/latin-america-cr...,COLOMBIA,Three cases in recent weeks highlight how Colo...,2023-07-28,ANASTASIA AUSTIN
3,Criminal Groups Ally With State Forces for Cat...,https://insightcrime.org/news/criminal-groups-...,VENEZUELA,An increase in cattle rustling across Venezuel...,2023-07-27,VENEZUELA INVESTIGATIVE UNIT
4,"Shark Fin Trade Thriving in Latin America, Des...",https://insightcrime.org/news/shark-fin-trade-...,ILLEGAL FISHING,With record shark fin seizures continuing acro...,2023-07-26,MARÍA FERNANDA RAMÍREZ AND CHRIS DALBY
...,...,...,...,...,...,...
10777,"20,000 Migrants Kidnapped per year in Mexico",https://insightcrime.org/news/analysis/20000-m...,HUMAN TRAFFICKING,Twenty thousand migrants per year are kidnappe...,2010-10-31,INSIGHT CRIME
10778,Colombia Government: 152 Gangs in Medellin,https://insightcrime.org/news/analysis/colombi...,COLOMBIA,Colombian authorities have indentified 152 gan...,2010-10-31,INSIGHT CRIME
10779,"Uruguay's Security Minister: ""Feudalization"" o...",https://insightcrime.org/news/analysis/uruguay...,BRAZIL,"In an academic forum in Montevideo, Uruguay’s ...",2010-10-31,INSIGHT CRIME
10780,17 Dead in Honduras Gang Massacre,https://insightcrime.org/news/analysis/17-dead...,HONDURAS,Honduras is seeing the fallout from Tuesday’s ...,2010-09-09,INSIGHT CRIME


In [13]:
# Write the cleaned articles to CSV; index=False leaves the row index out of the file.
df_articles.to_csv('_raw/articles_scraped.csv',index=False)

# Groups

In [None]:
# Listing-page URL for armed groups; '{}' is filled in with the page number.
base_url_groups = (
    'https://insightcrime.org/criminal-actors/page/{}/'
    '?filter=armed_groups&country=0&orderby'
)

# Number of listing pages to walk (pages 1..pages_groups are requested).
pages_groups = 9

# Accumulates the scraped group names.
groups_list = list()

In [None]:
# Request every armed-groups listing page and collect the text of each link
# matched by the selector into groups_list.
for page_number in range(1, pages_groups + 1):
    listing = requests.get(base_url_groups.format(page_number))
    listing_soup = bs4.BeautifulSoup(listing.text, 'lxml')

    for anchor in listing_soup.select('.page-general__news-list-criminals .box h2 a'):
        groups_list.append(anchor.get_text(strip=True))

# Individuals

## Scraping

In [None]:
# Listing-page URL for personalities; '{}' is filled in with the page number.
base_url_individuals = (
    'https://insightcrime.org/criminal-actors/page/{}/'
    '?filter=personalities&country=0&orderby'
)

# Number of listing pages to walk (pages 1..pages_individuals are requested).
pages_individuals = 8

# Accumulates the scraped entries (one string per individual).
individuals_list = list()

In [None]:
# Request every personalities listing page and collect the text of each link
# matched by the selector into individuals_list.
for page_number in range(1, pages_individuals + 1):
    listing = requests.get(base_url_individuals.format(page_number))
    listing_soup = bs4.BeautifulSoup(listing.text, 'lxml')

    for anchor in listing_soup.select('.page-general__news-list-criminals .box h2 a'):
        individuals_list.append(anchor.get_text(strip=True))

## Cleaning and export

In [None]:
def _remove_all(text, fragments):
    """Delete every occurrence of each fragment from text, one fragment after the other."""
    for fragment in fragments:
        text = text.replace(fragment, '')
    return text


def _split_name_and_aliases(entry):
    """Turn one scraped entry into [name, alias_1, alias_2].

    The entry is split on commas. With exactly three parts both aliases are
    kept, with exactly two only the first alias; any other number of parts
    yields the first part as the name and two empty aliases.
    """
    alias_1_noise = (' alias ', "'", '‘', '’')
    alias_2_noise = (' or the ', "'")

    parts = entry.split(',')
    full_name = parts[0]
    if len(parts) == 3:
        return [full_name, _remove_all(parts[1], alias_1_noise), _remove_all(parts[2], alias_2_noise)]
    if len(parts) == 2:
        return [full_name, _remove_all(parts[1], alias_1_noise), '']
    return [full_name, '', '']


individuals_list = [_split_name_and_aliases(entry) for entry in individuals_list]

In [None]:
# One row per individual. Every entry of individuals_list is a 3-item list, so
# the column names can be given directly; unlike a rename after construction,
# this also yields the three named columns when the scrape returned nothing.
df_individuals = pd.DataFrame(individuals_list, columns=['name', 'alias_1', 'alias_2'])

In [None]:
# Names to replace (key) and the longer variant to use instead (value).
dict_corr_names = {
    'Miguel Angel Treviño': 'Miguel Angel Treviño Morales',
    'Diego Fernando Murillo': 'Diego Fernando Murillo Bejarano',
    'Luis E. Calle Serna': 'Luis Enrique Calle Serna',
    'Hector Beltran Leyva': 'Hector Manuel Beltran Leyva',
}

# Overwrite the 'name' cell of every row that matches a key exactly.
for scraped_name, corrected_name in dict_corr_names.items():
    needs_fix = df_individuals['name'] == scraped_name
    df_individuals.loc[needs_fix, 'name'] = corrected_name

In [None]:
def get_first_name(name):
    """Guess the given name(s) from a full name.

    Heuristic based on the number of whitespace-separated words: for a
    four-word name the first two words are returned; for any other length
    only the first word is returned.

    Parameters
    ----------
    name : str
        Full name, words separated by whitespace.

    Returns
    -------
    str
        The guessed given name(s), or '' when ``name`` contains no words.
    """
    words = name.split()
    if not words:
        # Empty or whitespace-only input used to raise IndexError.
        return ''
    if len(words) == 4:
        return ' '.join(words[:2])
    return words[0]

def get_last_name(name):
    """Guess the family name(s) from a full name.

    Heuristic based on the number of whitespace-separated words: names of one
    or two words return the last word; longer names return the last two words.

    Parameters
    ----------
    name : str
        Full name, words separated by whitespace.

    Returns
    -------
    str
        The guessed family name(s), or '' when ``name`` contains no words.
    """
    words = name.split()
    if not words:
        # Empty or whitespace-only input used to raise IndexError.
        return ''
    if len(words) <= 2:
        return words[-1]
    return ' '.join(words[-2:])

In [None]:
# Derive the heuristic first/family name for every row from its full name.
full_names = df_individuals['name']
df_individuals['first_name'] = full_names.apply(get_first_name)
df_individuals['family_name'] = full_names.apply(get_last_name)

The word-count heuristic does not get every name right, so we complement it with a visual inspection and correct the remaining names by hand:

In [None]:
# Manual overrides keyed by full name; each value is [first_name, family_name].
# Every key is exactly "<first_name> <family_name>", so the mapping is built
# from the (first_name, family_name) pairs.
dict_corr_first_family_names = {
    first + ' ' + family: [first, family]
    for first, family in (
        ('César Emilio', 'Peralta'),
        ('Jobanis de Jesús', 'Ávila Villadiego'),
        ('Leider Johani', 'Noscue'),
        ('Walter Patricio', 'Arizala'),
        ('Henry de Jesús', 'López'),
        ('Horst Walther', 'Overdick'),
        ('Dairo Antonio', 'Úsuga'),
        ('Juan de Dios', 'Úsuga'),
        ('Juan Orlando', 'Hernández'),
        ('Ariel Máximo', 'Cantero'),
        ('Pedro Oliverio', 'Guerrero'),
    )
}

In [None]:
# Overwrite the heuristic split with the manual one for every matching full name.
for full_name, (first, family) in dict_corr_first_family_names.items():
    is_match = df_individuals['name'] == full_name
    df_individuals.loc[is_match, 'first_name'] = first
    df_individuals.loc[is_match, 'family_name'] = family

"Rastrojos" is actually a group and Juan Orlando Hernández was the president of Honduras

In [None]:
# Drop the 'Rastrojos' and 'Juan Orlando Hernández' rows; every other row is kept.
is_excluded = (df_individuals['name'] == 'Rastrojos') | (df_individuals['name'] == 'Juan Orlando Hernández')
df_individuals = df_individuals[~is_excluded]

In [None]:
# Write the cleaned individuals to CSV; index=False leaves the row index out of the file.
df_individuals.to_csv('_raw/Individuals.csv',index=False)