In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from time import sleep
import math
import os

In [4]:
df = pd.read_csv('Final-data/final_final.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762 entries, 0 to 761
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              762 non-null    object 
 1   Nicknames         427 non-null    object 
 2   Country           762 non-null    object 
 3   Proven_victims    762 non-null    float64
 4   Possible_victims  762 non-null    float64
 5   Years_active      760 non-null    float64
 6   Start_year        762 non-null    float64
 7   End_year          760 non-null    float64
 8   Conviction        439 non-null    object 
 9   Criminal_Penalty  511 non-null    object 
 10  Notes             759 non-null    object 
 11  Weapon            762 non-null    object 
 12  Gender            762 non-null    object 
 13  Weapon_Category   762 non-null    object 
dtypes: float64(5), object(9)
memory usage: 83.5+ KB


In [5]:
#Creamos la lista de names
names = df['Name'].dropna().unique().tolist()


In [None]:
#hacemos un web scraping de la api de wikipedia con el listado de los nombres de los asesinos y una serie de palabras clave
crime_keywords = [
    "abduction", "arson", "beheading", "bestiality", "burglary", "cannibalism", "castration", "child abuse", "dismemberment",
    "domestic violence", "hate crime", "incest", "kidnapping", "molestation", "murder", "necrophilia", "pedophilia", "rape",
    "robbery", "sexual abuse", "sodomy", "serial killer", "stalking", "strangulation", "terrorism", "torture"]

def get_crimes_from_wikipedia_api(name):
    try:
        search_url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "list": "search",
            "srsearch": name,
            "format": "json"}
        response = requests.get(search_url, params=params)
        data = response.json()

        if not data['query']['search']:
            return "Not Found"

        title = data['query']['search'][0]['title']
        page_url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
        page = requests.get(page_url)
        if page.status_code != 200:
            return "Page Error"

        soup = BeautifulSoup(page.content, "html.parser")
        paragraphs = soup.find_all("p")
        content = " ".join([p.get_text() for p in paragraphs[:25]])

        found_crimes = [kw for kw in crime_keywords if re.search(rf"\b{kw}\b", content.lower())]
        return ', '.join(found_crimes) if found_crimes else "murder"
    
    except Exception:
        return "Error"
#Para evitar que se colapse wikipedia o que nos bloquee, hacemos las busquedas por lotes que uniremos cuando los tengamos todos
batch_size = 100
num_batches = math.ceil(len(names) / batch_size)
batch_folder = "Primary-data/batches_crimes"
os.makedirs(batch_folder, exist_ok=True)

for i in range(num_batches):
    start = i * batch_size
    end = start + batch_size
    batch_names = names[start:end]
    results = []

    for name in batch_names:
        crimes = get_crimes_from_wikipedia_api(name)
        results.append({'Name': name, 'Crimes': crimes})
        sleep(0.5)

    batch_df = pd.DataFrame(results)
    batch_df.to_csv(f"{batch_folder}/crimes_batch_{i+1}.csv", index=False)

all_crime_batches = []
for file in sorted(os.listdir(batch_folder)):
    if file.endswith(".csv"):
        batch_path = os.path.join(batch_folder, file)
        all_crime_batches.append(pd.read_csv(batch_path))

df_crimes = pd.concat(all_crime_batches, ignore_index=True)

In [23]:
sentence_keywords_map = {
    "death penalty": "Death penalty", "execution": "Execution", "life imprisonment": "Life imprisonment",
    "life sentence": "Life imprisonment", "multiple life sentences": "Multiple life sentences",
    "capital punishment": "Death penalty", "parole": "Parole", "released": "Released", "commuted": "Sentence commuted",
    "incarcerated": "Incarcerated", "prison": "Incarcerated", "sentenced to": "Sentenced",

    "unidentified": "Not identified", "not identified": "Not identified", "never caught": "Not identified",
    "unknown killer": "Not identified", "unknown offender": "Not identified",

    "committed suicide": "Died before sentence", "killed himself": "Died before sentence",
    "died before trial": "Died before sentence", "died in custody": "Died before sentence",
    "found dead": "Died before sentence", "shot himself": "Died before sentence"
}

def get_sentence_from_wikipedia_api(name):
    try:
        search_url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "list": "search",
            "srsearch": name,
            "format": "json"}
        response = requests.get(search_url, params=params)
        data = response.json()

        if not data['query']['search']:
            return "Not Found"

        title = data['query']['search'][0]['title']
        page_url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
        page = requests.get(page_url)
        if page.status_code != 200:
            return "Page Error"

        soup = BeautifulSoup(page.content, "html.parser")
        paragraphs = soup.find_all("p")
        content = " ".join([p.get_text() for p in paragraphs[:25]]).lower()

        found = set()
        for keyword, label in sentence_keywords_map.items():
            if re.search(rf"\b{re.escape(keyword)}\b", content):
                found.add(label)

        return ', '.join(found) if found else "Unknown"
    
    except Exception as e:
        print(f"❌ Error con {name}: {e}")
        return "Error"

#Procesamos y guardamos por lotes para evitar que la api se enfade y nos bloquee
batch_size = 100
num_batches = math.ceil(len(names) / batch_size)
batch_folder = "Primary-data/batches_sentence"
os.makedirs(batch_folder, exist_ok=True)

for i in range(num_batches):
    start = i * batch_size
    end = start + batch_size
    batch_names = names[start:end]
    results = []

    for name in batch_names:
        penalty = get_sentence_from_wikipedia_api(name)
        results.append({'Name': name, 'Sentence': penalty})  # usamos "Sentence" directamente

        sleep(0.5)

    batch_df = pd.DataFrame(results)
    batch_df.to_csv(f"{batch_folder}/sentence_batch_{i+1}.csv", index=False)

#unimos todos los batches


In [24]:
# Unir todos los batches de sentencias
all_sentence_batches = []
for file in sorted(os.listdir("Primary-data/batches_sentence")):
    if file.endswith(".csv"):
        batch_path = os.path.join("Primary-data/batches_sentence", file)
        all_sentence_batches.append(pd.read_csv(batch_path))

df_sentence = pd.concat(all_sentence_batches, ignore_index=True)


In [25]:
#unimos los dos df que hemos sacado de las busqueda
df_trial = pd.merge(df_crimes, df_sentence, on="Name", how="outer")
df_trial.info()
#y revisamos nulos tras el merge
print(df_trial.isna().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762 entries, 0 to 761
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      762 non-null    object
 1   Crimes    762 non-null    object
 2   Sentence  762 non-null    object
dtypes: object(3)
memory usage: 18.0+ KB
Name        0
Crimes      0
Sentence    0
dtype: int64


In [26]:
print("Valores únicos en 'Crimes':")
print(df_trial['Crimes'].unique())


Valores únicos en 'Crimes':
['murder, robbery, serial killer' 'serial killer'
 'murder, rape, serial killer' 'murder' 'murder, serial killer'
 'murder, serial killer, strangulation' 'rape, serial killer'
 'murder, serial killer, torture'
 'kidnapping, murder, rape, robbery, serial killer'
 'robbery, serial killer' 'murder, rape, robbery, serial killer'
 'kidnapping, murder, serial killer, torture'
 'cannibalism, murder, rape, serial killer'
 'burglary, murder, serial killer' 'serial killer, torture' 'murder, rape'
 'murder, rape, serial killer, strangulation' 'kidnapping, serial killer'
 'murder, robbery, serial killer, torture'
 'arson, molestation, murder, serial killer' 'robbery'
 'murder, rape, sexual abuse, serial killer'
 'molestation, rape, serial killer' 'murder, rape, serial killer, torture'
 'murder, rape, robbery, serial killer, strangulation'
 'kidnapping, murder, rape, serial killer, strangulation' 'sexual abuse'
 'abduction, dismemberment, murder, rape, serial killer, str

In [27]:
print("\nValores únicos en 'Sentence':")
print(df_trial['Sentence'].unique())


Valores únicos en 'Sentence':
['Life imprisonment, Released, Incarcerated, Sentenced'
 'Died before sentence' 'Life imprisonment, Incarcerated, Sentenced'
 'Incarcerated, Released' 'Unknown' 'Sentenced'
 'Incarcerated, Released, Death penalty, Life imprisonment, Sentence commuted'
 'Execution, Sentenced' 'Execution, Incarcerated, Released, Sentenced'
 'Incarcerated, Sentenced'
 'Incarcerated, Released, Died before sentence, Life imprisonment, Execution, Sentenced'
 'Incarcerated, Sentenced, Parole' 'Incarcerated, Released, Sentenced'
 'Released, Died before sentence' 'Incarcerated' 'Sentenced, Released'
 'Life imprisonment, Died before sentence, Sentenced'
 'Life imprisonment, Released, Death penalty, Sentenced'
 'Incarcerated, Released, Death penalty, Life imprisonment, Sentenced, Parole'
 'Incarcerated, Released, Life imprisonment, Sentenced, Parole'
 'Life imprisonment, Sentenced, Incarcerated, Parole'
 'Incarcerated, Sentence commuted'
 'Sentenced, Died before sentence, Not identi

In [28]:
#Definimos las conciciones para considerar que una info no es relevante
#En 'Crimes': si es "Unknown" o simplemente "murder", pues es demasiado genericodemasiado genérico
no_info_crimes = df_trial['Crimes'].isin(["Unknown", "murder"])

# En 'Sentence': si el scraping falló o no encontró nada útil
no_info_penalties = df_trial['Sentence'].isin([
    "Unknown", "Not Found", "Page Error", "Error"])

#Teniendo eto, pasamos a filtrar los registros que cumplen ambas condiciones
df_no_info = df_trial[no_info_crimes & no_info_penalties]

display(df_no_info)


Unnamed: 0,Name,Crimes,Sentence
4,Abdullah Shah,murder,Unknown
30,Ali Kaya,murder,Unknown
38,Anatoly Sedykh,murder,Unknown
41,Anders Hansson,murder,Unknown
46,Andrew Dawson,murder,Unknown
52,Ann Arbor Hospital Killer,murder,Unknown
96,Bradley Edwards,murder,Unknown
105,Carl Gromann,murder,Unknown
129,Christopher Peterson,murder,Unknown
156,Darbara Singh,murder,Unknown


In [43]:
def categorize_crimes_priority(crimes_text):
    if pd.isnull(crimes_text):
        return "Other / Unknown"

    crimes_text = crimes_text.lower()

    if any(k in crimes_text for k in ["rape", "sexual abuse", "molestation", "sodomy", "pedophilia", "incest"]):
        return "Sexual Crime"

    if any(k in crimes_text for k in ["child abuse", "pedophilia"]):
        return "Child Abuse"

    if any(k in crimes_text for k in ["stalking", "bestiality", "necrophilia", "cannibalism"]):
        return "Psychological Crime"

    if any(k in crimes_text for k in ["murder", "strangulation", "beheading", "dismemberment", "torture", "castration"]):
        return "Violent Crime"

    if any(k in crimes_text for k in ["kidnapping", "abduction"]):
        return "Kidnapping"

    if any(k in crimes_text for k in ["robbery", "burglary", "arson"]):
        return "Property Crime"

    return "Other / Unknown"
df_trial["Crimes"] = df_trial["Crimes"].apply(categorize_crimes_priority)


In [40]:
def categorize_sentence_priority(sentence_text):
    if pd.isnull(sentence_text):
        return "Unknown / Error"

    sentence_text = sentence_text.lower()

    if any(k in sentence_text for k in ["death penalty", "execution"]):
        return "Death penalty"

    if any(k in sentence_text for k in ["life imprisonment", "multiple life sentences"]):
        return "Life imprisonment"

    if any(k in sentence_text for k in [
        "incarcerated", "sentenced", "parole", "released", "sentence commuted", "prison"
    ]):
        return "Other incarceration"

    if "psychiatric imprisonment" in sentence_text:
        return "Psychiatric sentence"

    if any(k in sentence_text for k in [
        "died before sentence", "committed suicide", "killed himself", "died in custody",
        "found dead", "shot himself"
    ]):
        return "Died before sentence"

    if any(k in sentence_text for k in ["not identified", "unidentified", "never caught", "unknown offender"]):
        return "Not identified"

    if any(k in sentence_text for k in ["unknown", "not found", "page error", "error"]):
        return "Unknown"

    return "Unknown / Error"
df_trial["Sentence"] = df_trial["Sentence"].apply(categorize_sentence_priority)


In [44]:
print("\nValores únicos en 'Crimes':")
print(df_trial['Crimes'].unique())
print("\nValores únicos en 'Sentence':")
print(df_trial['Sentence'].unique())


Valores únicos en 'Crimes':
['Other / Unknown' 'Kidnapping' 'Child Abuse']

Valores únicos en 'Sentence':
['Life imprisonment' 'Died before sentence' 'Unknown' 'Death penalty'
 'Not identified']


In [None]:
df_trial.info()

In [None]:
# Eliminar columnas originales
df_trial = df_trial.drop(columns=["Crimes", "Penalty"])

# Renombrar la columna final de penalización
df_trial = df_trial.rename(columns={"Penalty_Clean": "Penalty_Category"})


In [None]:
df_trial.info()

In [None]:
print("Valores únicos en 'Penalty_Clean':")
print(df_trial["Penalty_Category"].unique())

print("\nValores únicos en 'Crime_Category':")
print(df_trial["Crime_Category"].unique())


In [None]:
#guardamos el csv:
output_path = "Trial_data"
df_trial.to_csv(output_path, index=False)
print(f"Archivo guardado correctamente en: {output_path}")