In [250]:
import random
import re

import numpy as np
import pandas as pd
import spacy
from geopy.exc import GeocoderServiceError, GeocoderTimedOut
from geopy.geocoders import Nominatim
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.training.example import Example
from spacy.util import compounding, minibatch

In [251]:
# Load the data: the labelled training set and the news articles to process.
train_csv_path = 'train_data.csv'
test_csv_path = 'dataset_agosto2024.csv'
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Nominatim geocoding client, identified by the "Sophia" user agent.
geolocator = Nominatim(user_agent="Sophia")

In [252]:
# Load the spaCy pipeline for Spanish ("es_core_news_sm"); the model package
# has to be installed in the environment for this call to succeed.
nlp = spacy.load("es_core_news_sm")

In [253]:
# Distinct values of the "clase" column, in order of first appearance; they
# are used below as the label set of the text classifier.
unique_classes = train_df["clase"].unique()

In [254]:
# Shuffle the training set. The sample size equals the number of rows, so
# every row is kept; only the order changes (reproducibly, via the fixed seed).
n_datos = train_df.shape[0]
random_rows = train_df.sample(n=n_datos, random_state=42)

In [255]:
text_to_use = "title"  # column fed to the classifier: "title" or "text"


def _one_hot_cats(label):
    """Map every known class to True/False depending on whether it equals `label`."""
    return {cls: (cls == label) for cls in unique_classes}


# Training data in the (text, annotations) form expected by
# `Example.from_dict`: one entry per row, with one boolean per class.
train_data = [
    (row[text_to_use], {"cats": _one_hot_cats(row['clase'])})
    for _, row in random_rows.iterrows()
]


In [256]:
# Tokenize each text (no other pipeline component is run by `make_doc`) and
# pair it with its gold annotations.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in train_data
]


def get_examples():
    """Return the pre-built list of training examples.

    Serves as the callback handed to the component's `initialize` method.
    """
    return train_examples

In [257]:
# Model configuration for the "textcat" component: spaCy's registered
# CNN text-classifier architecture on top of the default tok2vec layer.
model = {
            "@architectures": "spacy.TextCatCNN.v2",
            # Labels are mutually exclusive: exactly one class per text.
            "exclusive_classes": True,
            "tok2vec": DEFAULT_TOK2VEC_MODEL,
        }

In [258]:
# Add a "textcat" component, configured with the `model` dict, to the loaded
# pipeline and keep a handle to it for initialization.
textcat = nlp.add_pipe("textcat", config={"model": model})

In [259]:
# Initialize the new component from the training examples, which are supplied
# through the `get_examples` callback.
textcat.initialize(get_examples)

In [260]:
iteraciones = 10  # number of passes (epochs) over the training data

# Train the model. Only "textcat" is enabled inside this block, so the other
# components of the pretrained pipeline are neither initialized nor updated.
with nlp.select_pipes(enable="textcat"):
    # `Language.begin_training` is a deprecated alias that only forwards to
    # `Language.initialize`; call the current API directly.
    optimizer = nlp.initialize()
    for epoch in range(iteraciones):
        losses = {}
        random.shuffle(train_data)
        # Split the data into batches whose size compounds from 4 up to 32
        # (factor 1.001 per batch) and update the model on each one.
        for batch in minibatch(train_data, size=compounding(4.0, 32.0, 1.001)):
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            # Pass the optimizer explicitly instead of relying on the one
            # `nlp` stores internally; it is the same object.
            nlp.update(examples, sgd=optimizer, drop=0.5, losses=losses)
        # Accumulated textcat loss for this epoch.
        print(losses)

{'textcat': 154.31192181259394}
{'textcat': 145.2422415278852}
{'textcat': 139.06601426377892}
{'textcat': 134.36419315449893}
{'textcat': 130.19400797970593}
{'textcat': 127.0931301843375}
{'textcat': 124.85290009621531}
{'textcat': 121.09695628285408}
{'textcat': 119.61362112534698}
{'textcat': 117.42926041875035}


In [261]:
# Save the trained pipeline to disk, then load it back from the same
# directory into a separate object.
model_dir = "model_1000_CNNv2"
nlp.to_disk(model_dir)
nlp_loaded = spacy.load(model_dir)

In [268]:
# Draw a reproducible sample of up to 100 news articles to process.
# `DataFrame.sample` raises ValueError when asked for more rows than the
# frame holds, so the sample size is capped at the number of available rows.
random_test_data = test_df.sample(n=min(100, len(test_df)), random_state=42)


In [263]:
# Gazetteers used to tell communes and countries apart from other locations.
comunas = pd.read_csv("comunas.csv", sep=";")
paises = pd.read_csv("paises.csv")

comunaslist = comunas["Nombre"].values
paisesList = paises["nombre"].values

# Country names with surrounding spaces and quote characters removed.
strings_limpios = []
for nombre_pais in paisesList:
    strings_limpios.append(nombre_pais.strip(' "\''))

In [264]:
def classify_text(text):
    """Return the category with the highest classifier score.

    Args:
        text: Either a raw string, which is run through the module-level
            ``nlp`` pipeline, or an already processed document (any object
            with a non-empty ``cats`` mapping), whose scores are reused so
            the pipeline is not executed a second time.

    Returns:
        The label whose score is the highest.

    Raises:
        ValueError: If no category scores are available.
    """
    # Reuse existing scores when the caller already processed the text.
    scores = getattr(text, "cats", None)
    if not scores:
        scores = nlp(text).cats
    if not scores:
        raise ValueError(
            "No category scores available: is the 'textcat' component in the pipeline?"
        )
    return max(scores, key=scores.get)

def extract_event(doc):
    """Extract a short "subject verb object" phrase for the main event.

    The first token tagged as VERB is taken as the event. Its direct children
    labelled as nominal subject are placed before the verb and those labelled
    as direct object after it, each group in document order.

    Args:
        doc: Iterable of parsed tokens exposing ``pos_``, ``dep_``, ``text``
            and ``children`` (a processed spaCy document).

    Returns:
        The event phrase, or "No se pudo extraer el evento" when the document
        contains no verb.
    """
    subject_deps = ("nsubj",)
    # Universal Dependencies pipelines (such as the Spanish one) label direct
    # objects "obj"; "dobj" is kept for pipelines that use the older scheme.
    object_deps = ("obj", "dobj")

    for token in doc:
        if token.pos_ != "VERB":
            continue
        children = list(token.children)
        subjects = [child.text for child in children if child.dep_ in subject_deps]
        objects = [child.text for child in children if child.dep_ in object_deps]
        return " ".join(subjects + [token.text] + objects)
    return "No se pudo extraer el evento"

def extract_address(doc):
    """Build a geocodable address string from the location entities of `doc`.

    Entities labelled LOC or GPE are sorted into three groups: communes
    (text found in ``comunaslist``), countries (text found in
    ``strings_limpios``) and everything else, treated as a free-form
    location. The first hit of each group is kept and the parts are joined
    as "location, commune, country". When no country is mentioned, "Chile"
    is used as the country.

    Args:
        doc: Processed document exposing ``ents``; each entity provides
            ``label_`` and ``text``.

    Returns:
        The address string. Parts that were not found are omitted, so the
        result never starts with a separator; with no location entities at
        all it is just "Chile".
    """
    direcciones = []
    comunas_encontradas = []
    paises_encontrados = []

    for ent in doc.ents:
        if ent.label_ not in ("LOC", "GPE"):
            continue
        # Commune takes precedence over country, country over free-form.
        if ent.text in comunaslist:
            comunas_encontradas.append(ent.text)
        elif ent.text in strings_limpios:
            paises_encontrados.append(ent.text)
        else:
            direcciones.append(ent.text)

    partes = []
    if direcciones:
        partes.append(direcciones[0])
    if comunas_encontradas:
        partes.append(comunas_encontradas[0])
    partes.append(paises_encontrados[0] if paises_encontrados else "Chile")
    # Joining only the parts that exist avoids a leading ", ".
    return ", ".join(partes)

# Configure the geocoder. This replaces the geocoder created next to the data
# loading, so use the same "Sophia" user agent instead of the placeholder
# "my_agent": Nominatim expects a User-Agent that identifies the application.
geolocator = Nominatim(user_agent="Sophia")

def geocode_address(address):
    """Geocode `address` with the module-level ``geolocator``.

    This is best-effort: any failure yields ``(None, None)`` so that one bad
    lookup does not abort a batch of lookups.

    Args:
        address: Query passed to ``geolocator.geocode``. ``None`` and blank
            strings are not sent to the service.

    Returns:
        ``(latitude, longitude)`` of the match, or ``(None, None)`` when the
        address is blank, nothing matches, or the geocoding service fails.
    """
    # Nothing to look up: skip the network round trip.
    if address is None or (isinstance(address, str) and not address.strip()):
        return None, None
    try:
        location = geolocator.geocode(address)
    except GeocoderServiceError:
        # Base class of GeocoderTimedOut, GeocoderUnavailable and the other
        # service-side errors (e.g. rate limiting); none should stop the batch.
        return None, None
    if not location:
        return None, None
    return location.latitude, location.longitude

In [269]:
# Process every sampled article: event phrase, category, address, coordinates.
# Note that each iteration sends one request to the Nominatim geocoder.
results = []
for _, row in random_test_data.iterrows():
    # Run the pipeline once per article and reuse the document for all steps.
    doc = nlp(row['text'])
    event = extract_event(doc)
    # Highest-scoring label, i.e. what classify_text(row['text']) returns,
    # taken from the document already processed above instead of running the
    # whole pipeline on the same text a second time.
    category = max(doc.cats, key=doc.cats.get)
    address = extract_address(doc)
    lat, lon = geocode_address(address)

    results.append({
        'id_news': row['id_news'],
        'event': event,
        'category': category,
        'address': address,
        'latitud': lat,
        'longitud': lon
    })

In [270]:
# Turn the list of per-article result dicts into a DataFrame, one row each.
results_df = pd.DataFrame(results)

In [271]:
# Write the results to CSV, leaving out the DataFrame index column.
results_df.to_csv('output_results.csv', index=False)