# Packages

In [1]:
import pandas as pd
import requests
import bs4
import urllib
import time
import re
from tqdm import tqdm

# Articles

In [2]:
base_url_articles = 'https://insightcrime.org/news/page/{}/'
pages_articles = 899
articles_data = {'title':[],'tag':[],'date_author':[],'text':[]}

In [3]:
for page in tqdm(range(1,pages_articles+1)):
    
    url = base_url_articles.format(page)
    res = requests.get(url)
    soup = bs4.BeautifulSoup(res.text,'lxml')
    
    boxes = soup.select('.page-general__list-news .card-news h2 a')

    for box in boxes:
        articles_data['title'].append(box.get_text(strip=True))

        article_link = box['href']  
        article_response = requests.get(article_link)
        article_soup = bs4.BeautifulSoup(article_response.content, 'html.parser')
        article_content = article_soup.find('div', class_='single-content')
        articles_data['text'].append(article_content.get_text())

        articles_data['date_author'].append(article_soup.find('span',class_='autor').get_text())

        articles_data['tag'].append(article_soup.find('div',class_='date').find('a').get_text())

  3%|▎         | 29/899 [04:56<2:56:14, 12.15s/it]

In [None]:
df_articles = pd.DataFrame(articles_data)

In [None]:
df_articles[['date','author']] = df_articles['date_author'].str.split('BY',n=1,expand=True)
df_articles.drop(columns=['date_author'],inplace=True)

pattern = r"\nSEE ALSO: .*?\n"
df_articles['text'] = df_articles['text'].apply(lambda x: re.sub(pattern, "\n", x))

for col in ['text','date','author']:
    df_articles[col] = df_articles[col].str.replace("\n", "")

df_articles['date'] = pd.to_datetime(df_articles['date'].str.strip(), format='%d %b %Y')

In [None]:
print(df_articles.loc[0,'text'])

Uruguayan drug trafficker Sebastián Marset has gone on the run after narrowly evading capture in Santa Cruz, Bolivia, showcasing yet again his ability to hide behind official protection and fake identities. Nearly 2,500 police officers were dispatched in raids at Marset's luxury mansion in Santa Cruz and seven other properties connected to him. Marset and his wife and children escaped in a white Land Cruiser after he was likely forewarned that a raid was imminent.  Authorities found a small arsenal of assault weapons and ammunition, luxury vehicles, and exotic animals, including monkeys and ocelots.Marset appears to have obtained a Bolivian passport under the name Gabriel de Souza Beumer, but it is unknown if this was an official or fake document. He had even bought a second-division football team in Santa Cruz. Marset's Bolivian passport. Source: Bolivian Attorney-General's OfficeMarset has links to political elites in several countries and has used such connections to avoid trouble i

In [None]:
df_articles

Unnamed: 0,title,tag,text,date,author
0,Uruguay's Top Trafficker Disappears Yet Again ...,URUGUAY,Uruguayan drug trafficker Sebastián Marset has...,2023-07-31,CHRISTOPHER NEWTON AND CHRIS DALBY
1,Latin America’s Criminal Bankers: Explaining C...,COLOMBIA,Three cases in recent weeks highlight how Colo...,2023-07-28,ANASTASIA AUSTIN
2,Criminal Groups Ally With State Forces for Cat...,VENEZUELA,An increase in cattle rustling across Venezuel...,2023-07-27,VENEZUELA INVESTIGATIVE UNIT
3,"Shark Fin Trade Thriving in Latin America, Des...",ILLEGAL FISHING,With record shark fin seizures continuing acro...,2023-07-26,MARÍA FERNANDA RAMÍREZ AND CHRIS DALBY
4,Q&A: Voices Opposing Mano Dura Policies in Lat...,EL SALVADOR,"The long-term consequences of mano dura, or ir...",2023-07-25,GAVIN VOSS
5,Monagas: The Missing Link for Venezuela Drug T...,VENEZUELA,Venezuela’s northeastern state of Monagas has ...,2023-07-24,VENEZUELA INVESTIGATIVE UNIT
6,Fraud Groups Use Deepfakes to Enhance Imitatio...,CYBERCRIME,Fraudsters are targeting Peruvians using deepf...,2023-07-21,GAVIN VOSS
7,"The Rise and Fall of the Choneros, Ecuador’s D...",CHONEROS,Jorge Bismarck Véliz España was an ambitious m...,2023-07-20,CHRIS DALBY
8,Ex-President Sentenced as Panama Makes Progres...,PANAMA,Former Panamanian president Ricardo Martinelli...,2023-07-19,GAVIN VOSS
9,Kidnappings Surge in Colombia Amid ELN Peace N...,COLOMBIA,"Kidnappings are rising across Colombia, despit...",2023-07-18,HENRY SHULDINER


In [None]:
df_articles.to_csv('_raw/articles_scraped.csv',index=False)

# Groups

In [None]:
base_url_groups = 'https://insightcrime.org/criminal-actors/page/{}/?filter=armed_groups&country=0&orderby'
pages_groups = 9
groups_list = []

In [None]:
for page in range(1,pages_groups+1):
    
    url = base_url_groups.format(page)
    res = requests.get(url)
    soup = bs4.BeautifulSoup(res.text,'lxml')
    
    boxes = soup.select('.page-general__news-list-criminals .box h2 a')
    page_list = [box.get_text(strip=True) for box in boxes]
    groups_list += page_list

# Individuals

## Scraping

In [None]:
base_url_individuals = 'https://insightcrime.org/criminal-actors/page/{}/?filter=personalities&country=0&orderby'
pages_individuals = 8
individuals_list = []

In [None]:
for page in range(1,pages_individuals+1):
    
    url = base_url_individuals.format(page)
    res = requests.get(url)
    soup = bs4.BeautifulSoup(res.text,'lxml')
    
    boxes = soup.select('.page-general__news-list-criminals .box h2 a')
    page_list = [box.get_text(strip=True) for box in boxes]
    individuals_list += page_list

## Cleaning and export

In [None]:
individuals_list = [name.split(',') for name in individuals_list]
individuals_list = [[sublist[0], sublist[1].replace(' alias ', '').replace("'",'').replace('‘', '').replace('’', ''), sublist[2].replace(' or the ','').replace("'",'')] if len(sublist) == 3 
              else [sublist[0], sublist[1].replace(' alias ', '').replace("'",'').replace('‘', '').replace('’', ''), ''] if len(sublist) == 2
              else [sublist[0], '', '']
              for sublist in individuals_list]

In [None]:
df_individuals = pd.DataFrame(individuals_list).rename(columns={0:'name',1:'alias_1',2:'alias_2'})

In [None]:
dict_corr_names = {'Miguel Angel Treviño':'Miguel Angel Treviño Morales',
                  'Diego Fernando Murillo':'Diego Fernando Murillo Bejarano',
                  'Luis E. Calle Serna':'Luis Enrique Calle Serna',
                  'Hector Beltran Leyva':'Hector Manuel Beltran Leyva'}

for name, correction in dict_corr_names.items():
    df_individuals.loc[df_individuals.name==name,'name'] = correction

In [None]:
def get_first_name(name):
    if len(name.split())==4:
        first_name = name.split()[0] + ' ' + name.split()[1]
    else:
        first_name = name.split()[0]
    return first_name

def get_last_name(name):
    if len(name.split())<=2:
        last_name = name.split()[-1]
    else:
        last_name = name.split()[-2] + ' ' + name.split()[-1]
    return last_name

In [None]:
df_individuals['first_name'] = df_individuals['name'].apply(get_first_name)
df_individuals['family_name'] = df_individuals['name'].apply(get_last_name)

We necessarily need to complement with visual inspection and make some corrections:

In [None]:
dict_corr_first_family_names = {'César Emilio Peralta':['César Emilio','Peralta'],
             'Jobanis de Jesús Ávila Villadiego':['Jobanis de Jesús','Ávila Villadiego'],
             'Leider Johani Noscue':['Leider Johani','Noscue'],
             'Walter Patricio Arizala':['Walter Patricio','Arizala'],
             'Henry de Jesús López':['Henry de Jesús','López'],
             'Horst Walther Overdick':['Horst Walther','Overdick'],
             'Dairo Antonio Úsuga':['Dairo Antonio','Úsuga'],
             'Juan de Dios Úsuga':['Juan de Dios','Úsuga'],
             'Juan Orlando Hernández':['Juan Orlando','Hernández'],
             'Ariel Máximo Cantero':['Ariel Máximo','Cantero'],
             'Pedro Oliverio Guerrero':['Pedro Oliverio','Guerrero']}

In [None]:
for name, corrections in dict_corr_first_family_names.items():
    df_individuals.loc[df_individuals.name==name,'first_name'] = corrections[0]
    df_individuals.loc[df_individuals.name==name,'family_name'] = corrections[1]

"Rastrojos" is actually a group and Juan Orlando Hernández was the president of Honduras

In [None]:
df_individuals = df_individuals[(df_individuals.name != 'Rastrojos') & (df_individuals.name != 'Juan Orlando Hernández')]

In [None]:
df_individuals.to_csv('_raw/Individuals.csv',index=False)