In [None]:
import math
import os
import re
from time import sleep
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Load the consolidated dataset and print a summary of its columns.
df = pd.read_csv('Final-data/final_final.csv')
df.info()

In [None]:
# Build the list of distinct, non-null values of the 'Name' column
names = df['Name'].dropna().unique().tolist()


In [None]:
# We scrape Wikipedia using the list of killers' names together with this set
# of keywords; list order is the order in which matches are reported.
crime_keywords = [
    "abduction",
    "arson",
    "beheading",
    "bestiality",
    "burglary",
    "cannibalism",
    "castration",
    "child abuse",
    "dismemberment",
    "domestic violence",
    "hate crime",
    "incest",
    "kidnapping",
    "molestation",
    "murder",
    "necrophilia",
    "pedophilia",
    "rape",
    "robbery",
    "sexual abuse",
    "sodomy",
    "serial killer",
    "stalking",
    "strangulation",
    "terrorism",
    "torture",
]

def get_crimes_from_wikipedia_api(name, timeout=10):
    """Look up ``name`` on English Wikipedia and list the crime keywords found.

    The first hit returned by the MediaWiki search API is taken as the
    article for ``name``. The text of the article's first 25 ``<p>`` elements
    is lower-cased and scanned for every entry of ``crime_keywords`` as a
    whole word.

    Args:
        name: Search string sent to the Wikipedia search API.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A ``", "``-joined string of the matched keywords in
        ``crime_keywords`` order; ``"murder"`` when no keyword matched;
        ``"Not Found"`` when the search returned no results;
        ``"Page Error"`` when the article request did not return HTTP 200;
        ``"Error"`` when any exception was raised (the exception is printed).
    """
    try:
        search_url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "list": "search",
            "srsearch": name,
            "format": "json"}
        # Without a timeout a stalled connection would block the whole batch run.
        response = requests.get(search_url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        if not data['query']['search']:
            return "Not Found"

        title = data['query']['search'][0]['title']
        # Percent-encode the title so characters such as '?', '#' or '%' are
        # not read as a query string, fragment or escape sequence.
        page_url = f"https://en.wikipedia.org/wiki/{quote(title.replace(' ', '_'))}"
        page = requests.get(page_url, timeout=timeout)
        if page.status_code != 200:
            return "Page Error"

        soup = BeautifulSoup(page.content, "html.parser")
        paragraphs = soup.find_all("p")
        # Lower-case once instead of once per keyword.
        content = " ".join(p.get_text() for p in paragraphs[:25]).lower()

        found_crimes = [
            kw for kw in crime_keywords
            if re.search(rf"\b{re.escape(kw)}\b", content)]
        # Every name looked up is a killer, so "murder" is the fallback.
        return ', '.join(found_crimes) if found_crimes else "murder"

    except Exception as e:
        # Report the failure instead of hiding it, as the sentence scraper does.
        print(f"❌ Error con {name}: {e}")
        return "Error"
# To avoid overloading Wikipedia or getting blocked, the lookups run in
# batches; the batch files are merged once all of them have been written.
batch_size = 100
num_batches = math.ceil(len(names) / batch_size)
batch_folder = "Primary-data/batches_crimes"
os.makedirs(batch_folder, exist_ok=True)

for batch_number, offset in enumerate(range(0, len(names), batch_size), start=1):
    rows = []
    for person in names[offset:offset + batch_size]:
        rows.append({'Name': person, 'Crimes': get_crimes_from_wikipedia_api(person)})
        sleep(0.5)  # pause between consecutive lookups

    # One CSV per batch, numbered from 1.
    pd.DataFrame(rows).to_csv(f"{batch_folder}/crimes_batch_{batch_number}.csv", index=False)


In [None]:
# Merge every crimes batch file into a single DataFrame
crimes_dir = "Primary-data/batches_crimes"
all_crimes_batches = [
    pd.read_csv(os.path.join(crimes_dir, filename))
    for filename in sorted(os.listdir(crimes_dir))
    if filename.endswith(".csv")
]

df_crimes = pd.concat(all_crimes_batches, ignore_index=True)


In [None]:
# Phrase -> label reported when that phrase is found in the article text.
sentence_keywords_map = {
    # Sentence / custody status
    "death penalty": "Death penalty",
    "execution": "Execution",
    "life imprisonment": "Life imprisonment",
    "life sentence": "Life imprisonment",
    "multiple life sentences": "Multiple life sentences",
    "capital punishment": "Death penalty",
    "parole": "Parole",
    "released": "Released",
    "commuted": "Sentence commuted",
    "incarcerated": "Incarcerated",
    "prison": "Incarcerated",
    "sentenced to": "Sentenced",

    # Offender not identified
    "unidentified": "Not identified",
    "not identified": "Not identified",
    "never caught": "Not identified",
    "unknown killer": "Not identified",
    "unknown offender": "Not identified",

    # Offender died before being sentenced
    "committed suicide": "Died before sentence",
    "killed himself": "Died before sentence",
    "died before trial": "Died before sentence",
    "died in custody": "Died before sentence",
    "found dead": "Died before sentence",
    "shot himself": "Died before sentence",
}

def get_sentence_from_wikipedia_api(name, timeout=10):
    """Look up ``name`` on English Wikipedia and list the sentence labels found.

    The first hit returned by the MediaWiki search API is taken as the
    article for ``name``. The lower-cased text of the article's first 25
    ``<p>`` elements is scanned for every phrase in ``sentence_keywords_map``
    as a whole word, and the label of each matching phrase is collected.

    Args:
        name: Search string sent to the Wikipedia search API.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A ``", "``-joined string of the distinct labels, ordered by the
        first matching phrase in ``sentence_keywords_map``; ``"Unknown"``
        when nothing matched; ``"Not Found"`` when the search returned no
        results; ``"Page Error"`` when the article request did not return
        HTTP 200; ``"Error"`` when any exception was raised (the exception
        is printed).
    """
    try:
        search_url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "list": "search",
            "srsearch": name,
            "format": "json"}
        # Without a timeout a stalled connection would block the whole batch run.
        response = requests.get(search_url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        if not data['query']['search']:
            return "Not Found"

        title = data['query']['search'][0]['title']
        # Percent-encode the title so characters such as '?', '#' or '%' are
        # not read as a query string, fragment or escape sequence.
        page_url = f"https://en.wikipedia.org/wiki/{quote(title.replace(' ', '_'))}"
        page = requests.get(page_url, timeout=timeout)
        if page.status_code != 200:
            return "Page Error"

        soup = BeautifulSoup(page.content, "html.parser")
        paragraphs = soup.find_all("p")
        content = " ".join(p.get_text() for p in paragraphs[:25]).lower()

        matched_labels = [
            label for keyword, label in sentence_keywords_map.items()
            if re.search(rf"\b{re.escape(keyword)}\b", content)]
        # dict.fromkeys removes duplicates while keeping a stable order; a set
        # would make the joined string vary from run to run.
        found = list(dict.fromkeys(matched_labels))

        return ', '.join(found) if found else "Unknown"

    except Exception as e:
        print(f"❌ Error con {name}: {e}")
        return "Error"

# Process and save in batches so the API does not throttle or block us
batch_size = 100
num_batches = math.ceil(len(names) / batch_size)
batch_folder = "Primary-data/batches_sentence"
os.makedirs(batch_folder, exist_ok=True)

for batch_number, offset in enumerate(range(0, len(names), batch_size), start=1):
    rows = []
    for person in names[offset:offset + batch_size]:
        # the result is stored under the "Sentence" column directly
        rows.append({'Name': person, 'Sentence': get_sentence_from_wikipedia_api(person)})
        sleep(0.5)  # pause between consecutive lookups

    # One CSV per batch, numbered from 1.
    pd.DataFrame(rows).to_csv(f"{batch_folder}/sentence_batch_{batch_number}.csv", index=False)

# All batches are merged afterwards


In [None]:
# Merge every sentence batch file into a single DataFrame
sentence_dir = "Primary-data/batches_sentence"
all_sentence_batches = [
    pd.read_csv(os.path.join(sentence_dir, filename))
    for filename in sorted(os.listdir(sentence_dir))
    if filename.endswith(".csv")
]

df_sentence = pd.concat(all_sentence_batches, ignore_index=True)


In [None]:
# Join the two DataFrames obtained from the lookups, keeping every name
df_trial = df_crimes.merge(df_sentence, on="Name", how="outer")
df_trial.info()
# and check the null count per column after the merge
null_counts = df_trial.isna().sum()
print(null_counts)


In [None]:
# Show the distinct values of the 'Crimes' column.
print("Valores únicos en 'Crimes':")
print(df_trial['Crimes'].unique())


In [None]:
# Show the distinct values of the 'Sentence' column.
print("\nValores únicos en 'Sentence':")
print(df_trial['Sentence'].unique())

In [None]:
# Function that normalizes the values of the scraped columns
def clean_crimes_column(text):
    """Normalize a comma-separated label string.

    The value is lower-cased, every character other than ``a-z``, commas and
    whitespace is removed, whitespace inside each item is collapsed, and
    empty or repeated items are dropped while keeping first-seen order.

    Args:
        text: Raw cell value; non-string values are converted with ``str``.

    Returns:
        The items joined with ``","`` (no spaces around the commas), or
        ``"unknown"`` when the value is null, blank, ``"nan"``,
        ``"not found"`` (any case), or contains no usable item.
    """
    if pd.isnull(text):
        return "unknown"
    normalized = str(text).strip().lower()
    if normalized in ("not found", "", "nan"):
        return "unknown"
    normalized = re.sub(r'[^a-z,\s]', '', normalized)
    items = (re.sub(r'\s+', ' ', item).strip() for item in normalized.split(","))
    # Skip empty items left by stray or doubled commas; dict.fromkeys
    # de-duplicates while preserving order.
    unique_items = list(dict.fromkeys(item for item in items if item))
    return ",".join(unique_items) if unique_items else "unknown"



In [None]:
# Apply the normalizer to both scraped columns
for column in ("Crimes", "Sentence"):
    df_trial[column] = df_trial[column].apply(clean_crimes_column)



In [None]:
# Check the distinct values of both columns after cleaning
print(df_trial["Crimes"].unique())
print(df_trial["Sentence"].unique())


In [None]:
# Conditions under which a record is considered to carry no relevant info.
# clean_crimes_column lower-cases both columns, so the comparison is done on
# lower-cased text; capitalized labels such as "Unknown" would never match.
# 'Crimes': "unknown", or just "murder", which is too generic
no_info_crimes = df_trial['Crimes'].astype(str).str.lower().isin(["unknown", "murder"])

# 'Sentence': the scraping failed or found nothing useful
no_info_penalties = df_trial['Sentence'].astype(str).str.lower().isin([
    "unknown", "not found", "page error", "error"])

# With that in place, keep the records that meet both conditions
df_no_info = df_trial[no_info_crimes & no_info_penalties]

display(df_no_info)


In [None]:
# Print a summary of the merged and cleaned DataFrame before saving it.
df_trial.info()

In [None]:
# Save the result as CSV:
# NOTE(review): the output path has no ".csv" extension — confirm this is
# intended before relying on the file name downstream.
output_path = "Final-data/Trial_data"
df_trial.to_csv(output_path, index=False)
print(f"Archivo guardado correctamente en: {output_path}")