In [16]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from time import sleep
import math
import os

In [None]:

# Load the source CSV file into `df` and print a summary of the frame
# (the scraping cells below read the 'Name' column from it).
df = pd.read_csv('final_final.csv')
df.info()




In [None]:
# Keywords searched for (as whole words, lower-case) in the scraped article text.
# The order here is also the order in which matches are joined in the 'Crimes' output.
crime_keywords = [
    "cannibalism",
    "rape",
    "sexual abuse",
    "child abuse",
    "pedophilia",
    "necrophilia",
    "torture",
    "kidnapping",
    "arson",
    "robbery",
    "burglary",
    "incest",
    "animal cruelty",
    "stalking",
    "hate crime",
    "terrorism",
    "domestic violence",
    "human trafficking",
]


In [None]:
# Build the list of distinct, non-null names to look up.
names = (
    df['Name']
    .dropna()
    .unique()
    .tolist()
)
print(f"Total de nombres únicos: {len(names)}")


In [None]:
def get_crimes_from_wikipedia_api(name, timeout=10):
    """Look up `name` on English Wikipedia and list the crime keywords found.

    The first hit of a Wikipedia search for the name is fetched as HTML and the
    text of its first 25 <p> elements is scanned for every entry of
    `crime_keywords` (whole-word, case-insensitive).

    Parameters
    ----------
    name : str
        Name to search for; double quotes and surrounding whitespace are removed.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 10).

    Returns
    -------
    str
        - comma-separated matched keywords, in `crime_keywords` order;
        - "murder" when the page was read but no keyword matched;
        - "Not Found" when the search returned no results;
        - "Page Error" when the article request did not return HTTP 200;
        - "Error" on any exception (network failure, timeout, unexpected
          response shape, non-string `name`, ...).
    """
    # Local import so this cell does not depend on edits to the imports cell.
    from urllib.parse import quote

    try:
        clean_name = name.replace('"', '').strip()
        search_url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "list": "search",
            "srsearch": clean_name,
            "format": "json"}
        # A timeout keeps one stalled connection from hanging the whole batch run.
        response = requests.get(search_url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        if not data['query']['search']:
            return "Not Found"

        title = data['query']['search'][0]['title']
        # Percent-encode the title: characters such as '?' or '#' would otherwise
        # be interpreted as the start of a query string / fragment.
        page_url = f"https://en.wikipedia.org/wiki/{quote(title.replace(' ', '_'))}"
        page = requests.get(page_url, timeout=timeout)
        if page.status_code != 200:
            return "Page Error"

        soup = BeautifulSoup(page.content, "html.parser")
        paragraphs = soup.find_all("p")
        # Lower-case once instead of once per keyword.
        content = " ".join(p.get_text() for p in paragraphs[:25]).lower()

        found_crimes = [
            kw for kw in crime_keywords
            if re.search(rf"\b{re.escape(kw)}\b", content)
        ]
        return ', '.join(found_crimes) if found_crimes else "murder"

    except Exception:
        # Deliberate best-effort: a single failing name must not abort the batch.
        return "Error"

# Assumes `names` (the list of unique names) was already built in an earlier cell.
batch_size = 100
num_batches = math.ceil(len(names) / batch_size)

os.makedirs("batches_crimes", exist_ok=True)

# Paths of the batch files written during THIS run, in batch order.
# Merging from this list (instead of listing the directory) avoids picking up
# stale files left by a previous run and avoids lexicographic ordering
# (where "crimes_batch_10.csv" sorts before "crimes_batch_2.csv").
crime_batch_paths = []

for i in range(num_batches):
    start = i * batch_size
    end = start + batch_size
    batch_names = names[start:end]
    results = []

    for name in batch_names:
        crimes = get_crimes_from_wikipedia_api(name)
        results.append({'Name': name, 'Crimes': crimes})
        sleep(0.5)  # pause between lookups

    # Each batch is written to disk as soon as it is finished.
    batch_df = pd.DataFrame(results)
    batch_path = f"batches_crimes/crimes_batch_{i+1}.csv"
    batch_df.to_csv(batch_path, index=False)
    crime_batch_paths.append(batch_path)

all_crime_batches = [pd.read_csv(batch_path) for batch_path in crime_batch_paths]

if all_crime_batches:
    df_crimes = pd.concat(all_crime_batches, ignore_index=True)
else:
    # No names to process: write an empty file with the expected columns
    # instead of failing inside pd.concat.
    df_crimes = pd.DataFrame(columns=['Name', 'Crimes'])
df_crimes.to_csv("crimes.csv", index=False)

print("Scraping de crímenes completo. Archivo final guardado como 'crimes.csv'")



In [None]:
# Maps a lower-case phrase searched for in the article text to the penalty
# label recorded when that phrase is found. Several phrases may share a label.
penalty_keywords_map = {
    # Sentences and custody status
    "death penalty": "Death penalty",
    "execution": "Execution",
    "life imprisonment": "Life imprisonment",
    "life sentence": "Life imprisonment",
    "multiple life sentences": "Multiple life sentences",
    "capital punishment": "Death penalty",
    "parole": "Parole",
    "released": "Released",
    "commuted": "Sentence commuted",
    "incarcerated": "Incarcerated",
    "prison": "Incarcerated",
    "sentenced to": "Sentenced",
    # Offender never identified
    "unidentified": "Not identified",
    "not identified": "Not identified",
    "never caught": "Not identified",
    "unknown killer": "Not identified",
    "unknown offender": "Not identified",
    # Offender died before being sentenced
    "committed suicide": "Died before sentence",
    "killed himself": "Died before sentence",
    "died before trial": "Died before sentence",
    "died in custody": "Died before sentence",
    "found dead": "Died before sentence",
    "shot himself": "Died before sentence",
}


In [None]:
def get_penalty_from_wikipedia_api(name, timeout=10):
    """Look up `name` on English Wikipedia and list the penalty labels found.

    The first hit of a Wikipedia search for the name is fetched as HTML and the
    lower-cased text of its first 25 <p> elements is scanned for every phrase in
    `penalty_keywords_map` (whole-word match); the labels of matching phrases
    are collected without duplicates.

    Parameters
    ----------
    name : str
        Name to search for; double quotes and surrounding whitespace are removed.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 10).

    Returns
    -------
    str
        - comma-separated labels, ordered by first appearance in
          `penalty_keywords_map` (deterministic between runs);
        - "Unknown" when the page was read but no phrase matched;
        - "Not Found" when the search returned no results;
        - "Page Error" when the article request did not return HTTP 200;
        - "Error" on any exception (network failure, timeout, unexpected
          response shape, non-string `name`, ...).
    """
    # Local import so this cell does not depend on edits to the imports cell.
    from urllib.parse import quote

    try:
        clean_name = name.replace('"', '').strip()
        search_url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "list": "search",
            "srsearch": clean_name,
            "format": "json"
        }
        # A timeout keeps one stalled connection from hanging the whole batch run.
        response = requests.get(search_url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        if not data['query']['search']:
            return "Not Found"

        title = data['query']['search'][0]['title']
        # Percent-encode the title: characters such as '?' or '#' would otherwise
        # be interpreted as the start of a query string / fragment.
        page_url = f"https://en.wikipedia.org/wiki/{quote(title.replace(' ', '_'))}"
        page = requests.get(page_url, timeout=timeout)
        if page.status_code != 200:
            return "Page Error"

        soup = BeautifulSoup(page.content, "html.parser")
        paragraphs = soup.find_all("p")
        content = " ".join(p.get_text() for p in paragraphs[:25]).lower()

        # A list (not a set) keeps the label order stable: joining a set of
        # strings yields a different order from one interpreter run to the next.
        found = []
        for keyword, label in penalty_keywords_map.items():
            if label not in found and re.search(rf"\b{re.escape(keyword)}\b", content):
                found.append(label)

        return ', '.join(found) if found else "Unknown"

    except Exception:
        # Deliberate best-effort: a single failing name must not abort the batch.
        return "Error"

# Scrape penalties for every entry of `names` in batches of 100, saving each
# batch to disk, then merge the batches into 'penalties.csv'.
batch_size = 100
num_batches = math.ceil(len(names) / batch_size)

os.makedirs("batches_penalties", exist_ok=True)

# Paths of the batch files written during THIS run, in batch order.
# Merging from this list (instead of listing the directory) avoids picking up
# stale files left by a previous run and avoids lexicographic ordering
# (where "penalty_batch_10.csv" sorts before "penalty_batch_2.csv").
penalty_batch_paths = []

for i in range(num_batches):
    start = i * batch_size
    end = start + batch_size
    batch_names = names[start:end]
    results = []

    for name in batch_names:
        penalty = get_penalty_from_wikipedia_api(name)
        results.append({'Name': name, 'Penalty': penalty})
        sleep(0.5)  # pause between lookups

    # Each batch is written to disk as soon as it is finished.
    batch_df = pd.DataFrame(results)
    batch_path = f"batches_penalties/penalty_batch_{i+1}.csv"
    batch_df.to_csv(batch_path, index=False)
    penalty_batch_paths.append(batch_path)

all_penalty_batches = [pd.read_csv(batch_path) for batch_path in penalty_batch_paths]

if all_penalty_batches:
    df_penalties = pd.concat(all_penalty_batches, ignore_index=True)
else:
    # No names to process: write an empty file with the expected columns
    # instead of failing inside pd.concat.
    df_penalties = pd.DataFrame(columns=['Name', 'Penalty'])
df_penalties.to_csv("penalties.csv", index=False)

print("Scraping de penas completado. Archivo final guardado como 'penalties.csv'")



In [17]:
# Load the two files produced by the scraping cells and join them by name.
df_crimes = pd.read_csv("crimes.csv")
df_penalties = pd.read_csv("penalties.csv")
df_trial = pd.merge(df_crimes, df_penalties, on="Name", how="outer")

# Normalize missing values (names present in only one file) so filtering works.
df_trial['Crimes'] = df_trial['Crimes'].fillna("Unknown")
df_trial['Penalty'] = df_trial['Penalty'].fillna("Unknown")

# Sentinel strings both scraping functions return when a lookup fails.
scrape_failures = ["Not Found", "Page Error", "Error"]

# Define what counts as "no useful information".
# "murder" is the scraper's fallback when no crime keyword matched; the failure
# sentinels can appear in 'Crimes' as well as in 'Penalty', so both masks
# include them.
no_info_crimes = df_trial['Crimes'].isin(["Unknown", "murder"] + scrape_failures)
no_info_penalties = df_trial['Penalty'].isin(["Unknown"] + scrape_failures)

# Keep the rows where both columns carry no useful information.
df_no_info = df_trial[no_info_crimes & no_info_penalties]
display(df_no_info)


Unnamed: 0,Name,Crimes,Penalty
4,Abdullah Aksoy,murder,Unknown
5,Abdullah Shah,murder,Unknown
31,Ali Kaya,murder,Unknown
39,Anatoly Sedykh,murder,Unknown
42,Anders Hansson,murder,Unknown
...,...,...,...
745,William Macdonald,murder,Unknown
748,William Unek,murder,Unknown
751,Wolfgang Schmidt,murder,Unknown
756,Yevgeny Petrov,murder,Unknown


In [18]:
# Print the distinct raw values of both scraped columns, one heading per column.
for header, column in (
    ("Valores únicos en 'Crimes':", 'Crimes'),
    ("\nValores únicos en 'Penalty':", 'Penalty'),
):
    print(header)
    print(df_trial[column].unique())


Valores únicos en 'Crimes':
['robbery' 'cannibalism' 'murder' 'rape' 'torture'
 'rape, kidnapping, robbery' 'rape, robbery' 'torture, kidnapping'
 'cannibalism, rape' 'burglary' 'kidnapping' 'torture, robbery' 'arson'
 'rape, sexual abuse' 'rape, torture' 'rape, kidnapping' 'sexual abuse'
 'cannibalism, arson, burglary' 'pedophilia, necrophilia'
 'rape, sexual abuse, kidnapping' 'robbery, burglary, terrorism'
 'rape, arson, burglary' 'stalking' 'rape, burglary'
 'kidnapping, stalking' 'rape, arson, robbery, burglary'
 'rape, arson, robbery' 'sexual abuse, kidnapping'
 'pedophilia, kidnapping, burglary' 'robbery, burglary'
 'rape, torture, stalking' 'arson, animal cruelty' 'child abuse'
 'rape, incest' 'rape, kidnapping, stalking'
 'rape, torture, burglary, human trafficking' 'rape, stalking'
 'rape, robbery, burglary' 'kidnapping, burglary'
 'rape, pedophilia, necrophilia' 'rape, necrophilia'
 'rape, burglary, stalking' 'kidnapping, robbery'
 'cannibalism, rape, necrophilia' 'sexual ab

In [19]:
# Penalty labels ordered from highest to lowest priority; the first one found
# in a row's penalty string wins.
penalty_priority = [
    "Death penalty",
    "Execution",
    "Multiple life sentences",
    "Life imprisonment",
    "Psychiatric imprisonment",
    "Sentence commuted",
    "Parole",
    "Incarcerated",
    "Released",
    # Generic label emitted for the "sentenced to" phrase; ranked below every
    # specific sentence so it only wins when nothing more precise was found.
    "Sentenced",
    "Died before sentence",
    "Not identified",
    "Unknown",
    "Error"
]

# Extract the single highest-priority label from a penalty string.
def extract_main_penalty(penalty_str):
    """Return the highest-priority label contained in `penalty_str`.

    Parameters
    ----------
    penalty_str : str or missing value
        Comma-separated penalty labels (or a scraper sentinel such as
        "Not Found" / "Page Error").

    Returns
    -------
    str
        The first entry of `penalty_priority` that occurs (case-insensitive
        substring match) in `penalty_str`; "Unknown" when nothing matches or
        when the input is not a string (None, NaN, numbers, ...).
    """
    # Covers None/NaN as well as any other non-text value, which has no
    # .lower() and previously raised AttributeError.
    if not isinstance(penalty_str, str):
        return "Unknown"

    lowered = penalty_str.lower()  # lower-case once, not once per label
    for keyword in penalty_priority:
        if keyword.lower() in lowered:
            return keyword
    return "Unknown"

# Apply to the DataFrame: store, for every row, the label returned by
# extract_main_penalty for its 'Penalty' string in a new 'Penalty_Clean' column.
df_trial["Penalty_Clean"] = df_trial["Penalty"].apply(extract_main_penalty)


In [20]:
def simplify_penalty_clean(penalty_clean):
    """Collapse a detailed penalty label into one of the broad categories.

    Returns "Death penalty", "Life imprisonment", "Psychiatric/Other",
    "Incarceration" or "No sentence / Unknown" according to the group the
    label belongs to, and "Other" for any value not listed in a group.
    """
    grouped_labels = (
        ("Death penalty", ("Death penalty", "Execution")),
        ("Life imprisonment", ("Life imprisonment", "Multiple life sentences")),
        ("Psychiatric/Other", ("Psychiatric imprisonment", "Sentence commuted")),
        ("Incarceration", ("Parole", "Incarcerated", "Released")),
        ("No sentence / Unknown", ("Died before sentence", "Not identified", "Unknown", "Error")),
    )
    for category, members in grouped_labels:
        if penalty_clean in members:
            return category
    return "Other"


In [21]:
# Collapse the detailed penalty label into a broad category.
# The value is recomputed from the raw 'Penalty' column instead of from
# 'Penalty_Clean' itself, so re-running this cell gives the same result;
# feeding already-simplified values (e.g. "Incarceration") back into
# simplify_penalty_clean would turn them into "Other".
df_trial["Penalty_Clean"] = (
    df_trial["Penalty"]
    .apply(extract_main_penalty)
    .apply(simplify_penalty_clean)
)


In [22]:
def categorize_crime(crimes):
    """Map a crimes string to a single broad crime category.

    The categories are tried in a fixed order and the first one with any of
    its keywords contained (case-insensitive substring) in `crimes` is
    returned. Non-string input, or a string matching no keyword, gives
    "Unknown".
    """
    text = crimes.lower() if isinstance(crimes, str) else ""

    # Order matters: earlier categories take precedence over later ones.
    ordered_categories = (
        ("Sexual crimes", ("rape", "sexual abuse", "pedophilia", "incest")),
        ("Violent crimes", ("murder", "torture", "kidnapping", "child abuse", "domestic violence")),
        ("Cannibalism", ("cannibalism", "necrophilia")),
        ("Property crimes", ("robbery", "burglary", "arson")),
        ("Other crimes", ("stalking", "terrorism", "hate crime", "animal cruelty", "human trafficking")),
    )

    for category, keywords in ordered_categories:
        for keyword in keywords:
            if keyword in text:
                return category
    return "Unknown"


In [23]:
# Store, for every row, the category returned by categorize_crime for its
# 'Crimes' string in a new 'Crime_Category' column.
df_trial['Crime_Category'] = df_trial['Crimes'].apply(categorize_crime)


In [24]:
# Inspect the frame (columns, non-null counts, dtypes) after adding the derived columns.
df_trial.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Name            768 non-null    object
 1   Crimes          768 non-null    object
 2   Penalty         768 non-null    object
 3   Penalty_Clean   768 non-null    object
 4   Crime_Category  768 non-null    object
dtypes: object(5)
memory usage: 30.1+ KB


In [25]:
# Remove the raw scraped columns, keeping only the derived categories.
# errors="ignore" makes the cell safe to re-run: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
df_trial = df_trial.drop(columns=["Crimes", "Penalty"], errors="ignore")

# Rename the final penalty column (rename already ignores a missing key).
df_trial = df_trial.rename(columns={"Penalty_Clean": "Penalty_Category"})


In [26]:
# Inspect the frame again after the drop/rename step.
df_trial.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Name              768 non-null    object
 1   Penalty_Category  768 non-null    object
 2   Crime_Category    768 non-null    object
dtypes: object(3)
memory usage: 18.1+ KB


In [27]:
# Show the distinct values of the two final category columns.
# The heading names the column actually printed ('Penalty_Category'); it
# previously still referred to the pre-rename name 'Penalty_Clean'.
print("Valores únicos en 'Penalty_Category':")
print(df_trial["Penalty_Category"].unique())

print("\nValores únicos en 'Crime_Category':")
print(df_trial["Crime_Category"].unique())


Valores únicos en 'Penalty_Clean':
['Life imprisonment' 'Death penalty' 'No sentence / Unknown'
 'Incarceration' 'Psychiatric/Other']

Valores únicos en 'Crime_Category':
['Property crimes' 'Cannibalism' 'Violent crimes' 'Sexual crimes'
 'Other crimes' 'Unknown']


In [28]:
# Save the final table as CSV (without the index column).
# NOTE(review): the output file name has no ".csv" extension — confirm this is
# intended before changing it, since other code may read "Trial_data" by name.
output_path = "Trial_data"
df_trial.to_csv(output_path, index=False)
print(f"Archivo guardado correctamente en: {output_path}")

Archivo guardado correctamente en: Trial_data
