In [None]:
import math
import os
import re
from time import sleep
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [None]:

# Load the cleaned killers CSV and collect every distinct, non-missing name to look up
df = pd.read_csv('Primary-data/killers-limpio.csv')
names = pd.unique(df['Name'].dropna())

In [None]:

# Infer each subject's gender from the text of their Wikipedia article
def infer_gender_from_wikipedia(name, timeout=10):
    """Guess a person's gender from the pronouns in their Wikipedia article.

    The top hit of a Wikipedia search for ``name`` is fetched and the first
    three ``<p>`` elements are scanned for gendered pronouns. The gender whose
    pronouns occur most often wins.

    Args:
        name: Name to search for. Double quotes are removed and surrounding
            whitespace is stripped before searching.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        ``'female'``, ``'male'`` or ``'Unknown'``. ``'Unknown'`` is returned
        when the search has no hits, the page does not answer with HTTP 200,
        the pronoun counts are tied (including zero of each), or any error
        occurs along the way (best-effort lookup).
    """
    try:
        clean_name = name.replace('"', '').strip()
        if not clean_name:
            return "Unknown"

        # Find the most relevant article through the search API
        search_url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "list": "search",
            "srsearch": clean_name,
            "format": "json"
        }
        response = requests.get(search_url, params=params, timeout=timeout)
        hits = response.json().get('query', {}).get('search', [])
        if not hits:
            return "Unknown"

        # Percent-encode the title so characters such as '?', '#' or '%'
        # stay part of the path instead of being parsed as URL syntax.
        title = hits[0]['title']
        page_url = f"https://en.wikipedia.org/wiki/{quote(title.replace(' ', '_'))}"
        page = requests.get(page_url, timeout=timeout)
        if page.status_code != 200:
            return "Unknown"

        # Only the first paragraphs are inspected
        soup = BeautifulSoup(page.content, "html.parser")
        content = " ".join(p.get_text() for p in soup.find_all("p")[:3]).lower()

        # Count whole-word pronouns (word boundaries also match next to
        # punctuation) and take the majority, so a single "her" referring to
        # someone else does not outweigh several "he"/"his".
        # "her" covers both object and possessive forms, hence "him" + "his".
        female_hits = len(re.findall(r"\b(?:she|her|hers|herself)\b", content))
        male_hits = len(re.findall(r"\b(?:he|him|his|himself)\b", content))

        if female_hits > male_hits:
            return 'female'
        if male_hits > female_hits:
            return 'male'
        return 'Unknown'
    except Exception:
        # Best-effort: any failure maps to 'Unknown', but KeyboardInterrupt /
        # SystemExit are allowed to propagate so the run can be stopped.
        return 'Unknown'

# Look up the gender of every name, one request sequence at a time.
results = []
for name in names:
    gender = infer_gender_from_wikipedia(name)
    results.append({'Name': name, 'Gender': gender})
    # Pause between lookups, matching the 0.5 s delay used by the weapon
    # scraping loop, so Wikipedia is not hit in a tight loop.
    sleep(0.5)


In [None]:
# Build a DataFrame from the gender results.
# The columns are fixed explicitly so the frame still has 'Name' and 'Gender'
# when `results` is empty, which keeps the later merge on "Name" working.
df_gender = pd.DataFrame(results, columns=['Name', 'Gender'])

In [None]:
# Keywords used to detect the weapons/methods mentioned for each killer.
# Order matters: matches are reported in the order the keywords appear here.
weapon_keywords = [
    # firearms, blades
    'gun', 'firearm', 'revolver', 'pistol', 'shot', 'shooting',
    'knife', 'stabbed', 'stabbing', 'machete', 'axe',
    # blunt force, asphyxiation
    'blunt object', 'hammer', 'bat', 'club',
    'strangled', 'strangulation', 'garrote', 'choked', 'suffocated',
    # poison, more blades
    'poison', 'poisoned', 'arsenic', 'cyanide', 'chloroform', 'injected',
    'hatchet', 'razor',
    # fire, drowning
    'fire', 'burned', 'burnt', 'set on fire', 'incinerated',
    'drowned', 'drowning',
    # explosives, dismemberment, hanging
    'explosives', 'bomb', 'grenade', 'decapitated', 'beheaded',
    'hanged', 'hanging', 'sawed', 'cut into pieces',
    # miscellaneous
    'smothered', 'beat to death', 'run over', 'car', 'vehicle', 'truck',
    'tortured', 'disemboweled', 'acid', 'chemical',
]

In [None]:
# Scraping through the Wikipedia API
def get_weapon_from_wikipedia_api(name, keywords=None, timeout=10):
    """List the weapon keywords mentioned in a person's Wikipedia article.

    The top hit of a Wikipedia search for ``name`` is fetched and the text of
    its first 25 ``<p>`` elements is searched, case-insensitively and on whole
    words, for each keyword.

    Args:
        name: Name to search for. Double quotes are removed and surrounding
            whitespace is stripped before searching.
        keywords: Iterable of keywords to look for. Defaults to the
            module-level ``weapon_keywords`` list.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The matched keywords joined with ``', '`` in the order they appear in
        ``keywords``; ``"Unknown"`` when none match; ``"Not Found"`` when the
        search has no hits; ``"Page Error"`` when the article does not answer
        with HTTP 200; ``"Error"`` when anything else fails.
    """
    try:
        if keywords is None:
            keywords = weapon_keywords

        clean_name = name.replace('"', '').strip()
        search_url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "list": "search",
            "srsearch": clean_name,
            "format": "json"}
        response = requests.get(search_url, params=params, timeout=timeout)
        hits = response.json().get('query', {}).get('search', [])
        if not hits:
            return "Not Found"

        # Percent-encode the title so characters such as '?', '#' or '%'
        # stay part of the path instead of being parsed as URL syntax.
        title = hits[0]['title']
        page_url = f"https://en.wikipedia.org/wiki/{quote(title.replace(' ', '_'))}"
        page = requests.get(page_url, timeout=timeout)
        if page.status_code != 200:
            return "Page Error"

        soup = BeautifulSoup(page.content, "html.parser")
        # Lower-case once instead of once per keyword
        content = " ".join(p.get_text() for p in soup.find_all("p")[:25]).lower()

        # re.escape keeps keywords literal even if one contains regex characters
        found_weapons = [
            kw for kw in keywords
            if re.search(rf"\b{re.escape(kw)}\b", content)
        ]
        return ', '.join(found_weapons) if found_weapons else "Unknown"

    except Exception:
        # Best-effort: a failed lookup is recorded as "Error" instead of
        # aborting the whole scraping run.
        return "Error"

# Batch parameters
batch_size = 100
num_batches = math.ceil(len(names) / batch_size)

# Create the folder for the per-batch results if it does not exist yet
os.makedirs("Primary-data/batches_weapon", exist_ok=True)

# Run the scraping batch by batch, writing one CSV per batch.
for i in range(num_batches):
    batch_path = f"Primary-data/batches_weapon/weapons_batch_{i+1}.csv"

    # A batch file is only written once its batch has finished, so an
    # existing file means that batch is done: skip it so an interrupted run
    # can resume. Delete the folder to force a full re-scrape (for example
    # after the list of names changes).
    if os.path.exists(batch_path):
        continue

    start = i * batch_size
    end = start + batch_size
    batch_names = names[start:end]

    # Named `batch_results` (not `results`) so this loop does not overwrite
    # the gender results list that the df_gender cell reads.
    batch_results = []
    for name in batch_names:
        weapon = get_weapon_from_wikipedia_api(name)
        batch_results.append({'Name': name, 'Weapon': weapon})
        sleep(0.5)  # pause between lookups

    batch_df = pd.DataFrame(batch_results)
    batch_df.to_csv(batch_path, index=False)


In [None]:
# Combine every batch CSV into a single final DataFrame
batch_dir = "Primary-data/batches_weapon"

# Natural sort: digit runs compare as numbers, so weapons_batch_2.csv comes
# before weapons_batch_10.csv (plain string sorting would reverse them).
batch_files = sorted(
    (f for f in os.listdir(batch_dir) if f.endswith(".csv")),
    key=lambda f: [int(part) if part.isdigit() else part for part in re.split(r"(\d+)", f)],
)
if not batch_files:
    # pd.concat([]) would fail with a less helpful message
    raise FileNotFoundError(f"No batch CSV files found in {batch_dir}")

all_batches = [pd.read_csv(os.path.join(batch_dir, f)) for f in batch_files]

df_weapons = pd.concat(all_batches, ignore_index=True)

In [None]:
# Helper that normalises the text values produced by the scraping steps
def clean_text_columns_custom(df):
    """Normalise the text columns of ``df`` in place and return it.

    For every object/string column: double and single quotes are removed,
    runs of whitespace collapse to one space, and leading/trailing whitespace
    is stripped. The ``Name`` column is then title-cased; ``Gender`` and
    ``Weapon`` are lower-cased. Missing values stay missing instead of
    becoming the literal text ``"nan"``.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same ``df`` object, to allow ``df = clean_text_columns_custom(df)``.
    """
    for col in df.select_dtypes(include=["object", "string"]).columns:
        original = df[col]
        cleaned = (
            original
            .astype(str)
            .str.replace('"', '', regex=False)
            .str.replace("'", '', regex=False)
            .str.replace(r"\s+", " ", regex=True)
            # Strip last: removing quotes can expose padding that was inside them
            .str.strip()
        )

        if col == "Name":
            cleaned = cleaned.str.title()
        elif col in ("Gender", "Weapon"):
            cleaned = cleaned.str.lower()

        # astype(str) stringified the missing values; blank them out again
        df[col] = cleaned.mask(original.isna())

    return df

# Apply the cleaning function to both DataFrames (gender first, then weapons)
df_gender, df_weapons = map(clean_text_columns_custom, (df_gender, df_weapons))


In [None]:
# Join the weapon results onto the gender results, keeping every gender row
df_scrapeo1 = df_gender.merge(df_weapons, how="left", on="Name")


In [None]:
# After merging, check the result for irregular values, duplicates or nulls.
print("Valores nulos por columna:")
print(df_scrapeo1.isnull().sum())
# Compute the duplicate mask once and reuse it for the count and the listing
duplicate_mask = df_scrapeo1.duplicated()
print(f"duplicados completos: {duplicate_mask.sum()}")
# Show the fully duplicated rows
print("Ejemplos de duplicados:")
print(df_scrapeo1[duplicate_mask])




In [None]:
# Drop duplicated names, keeping one row per name.
# Rows are ranked by how many of Gender/Weapon were actually resolved, so the
# row that is kept is the most complete one available for that name.
# Every failure marker the scrapers can produce (after lower-casing) counts as
# unresolved, as do missing values and their "nan" string form.
failed_lookups = {"unknown", "not found", "page error", "error", "nan", ""}
gender_known = df_scrapeo1["Gender"].notna() & ~df_scrapeo1["Gender"].astype(str).str.lower().isin(failed_lookups)
weapon_known = df_scrapeo1["Weapon"].notna() & ~df_scrapeo1["Weapon"].astype(str).str.lower().isin(failed_lookups)
completeness = gender_known.astype(int) + weapon_known.astype(int)

# Stable, positional sort (most complete first): ties keep their original
# relative order, so re-running gives the same surviving rows.
order = (-completeness).to_numpy().argsort(kind="stable")
df_scrapeo1 = df_scrapeo1.iloc[order].drop_duplicates(subset="Name", keep="first")

# Verify the result
print(f"Shape después de eliminar duplicados: {df_scrapeo1.shape}")


In [None]:
# Manually fill in a few records whose gender we know for certain
known_male_names = [
    'Rainbow Maniac',
    'The Family',
    'Bian Kuang, Fu Xinyuan And Luo Lianshun',
]
df_scrapeo1.loc[df_scrapeo1['Name'].isin(known_male_names), 'Gender'] = 'male'

In [None]:
# Rows whose Gender is still 'unknown'
gender_is_unknown = df_scrapeo1["Gender"].eq("unknown")
unknown_gender = df_scrapeo1.loc[gender_is_unknown]
print(f"Filas con Gender = 'unknown': {len(unknown_gender)}")

# Rows whose Weapon is still 'unknown'
weapon_is_unknown = df_scrapeo1["Weapon"].eq("unknown")
unknown_weapon = df_scrapeo1.loc[weapon_is_unknown]
print(f"Filas con Weapon = 'unknown': {len(unknown_weapon)}")


In [None]:
# Write the final scraped dataset to CSV, without the index column
output_path = "Primary-data/scrapeo1.csv"
df_scrapeo1.to_csv(path_or_buf=output_path, index=False)
print(f"Archivo guardado correctamente en: {output_path}")