In [38]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

from methods_scripts.context_methods import *
from methods_scripts.get_data import *
from methods_scripts.preprocesamiento import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import numpy as np

### Obteniendo los datos para trabajar

In [2]:
texto_completo, articles_map = get_texto_completo()
df_entities = get_df_entities()
df_context = get_entity_contexts_with_offsets(df_entities,articles_map,window=3)


Size: (414, 6)


In [3]:
# Vectorizando el conjunto de entrenamiento
vectorizer = get_vectorizer('countvectorizer')
# Vectorizando el texto de todos las news
vectorizer.fit(texto_completo)

x_train, y_train  = vectorizar_cada_contexto(df_context, vectorizer)

### Iniciando el entrenamiento con ML supervisado

In [45]:
def test_algorithms(x_train, y_train):
  
  X = np.array(x_train)
  y = np.array(y_train)

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

  models = {
      "Random Forest": {
          "model": RandomForestClassifier(random_state=42),
          "params": {
              "n_estimators": [50, 100, 200],
              "max_depth": [None, 10, 20, 30],
              "min_samples_split": [2, 5, 10]
          }
      },
      "Logistic Regression": {
          "model": LogisticRegression(max_iter=1000, random_state=42),
          "params": {
              "C": [0.01, 0.1, 1, 10],
              "solver": ["lbfgs", "liblinear"]
          }
      },
      "KNN": {
          "model": KNeighborsClassifier(),
          "params": {
              "n_neighbors": [3],
              "weights": ["uniform", "distance"],
              "metric": ["euclidean", "manhattan"]
          }
      },
      "Multinomial Naive Bayes": {
          "model": MultinomialNB(),
          "params": {
              "alpha": [0.1, 0.5, 1, 5, 10]
          }
      }
  }

  # Entrena y evalúa cada modelo con GridSearchCV
  for name, config in models.items():
      print(f"\nEntrenando y ajustando: {name}")

      # Configura la búsqueda de hiperparámetros
      grid = GridSearchCV(estimator=config["model"], param_grid=config["params"], cv=5, scoring="roc_auc_ovr", n_jobs=-1)
      grid.fit(X_train, y_train)

      # Mejores parámetros
      print(f"Mejores parámetros: {grid.best_params_}")

      # Predicción en el conjunto de prueba
      y_pred = grid.best_estimator_.predict(X_test)

      # Evaluación
      print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
      print("Classification Report:")
      print(classification_report(y_test, y_pred, zero_division=0))
      print(f"- Mejor puntaje ROC AUC OVR: {grid.best_score_:.2f}\n\n")


**Evaluacion**

In [46]:
for ventana in range(1,11):
  print(f"############ Ventana usada: {ventana} ############")
  
  df_context = get_entity_contexts_with_offsets(df_entities,articles_map,window=ventana)
  x_train, y_train  = vectorizar_cada_contexto(df_context, vectorizer)
  test_algorithms(x_train,y_train)

  print("######################################################\n")

############ Ventana usada: 1 ############

Entrenando y ajustando: Random Forest
Mejores parámetros: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy: 0.62
Classification Report:
              precision    recall  f1-score   support

  Antagonist       0.62      1.00      0.76        24
    Innocent       0.00      0.00      0.00         3
 Protagonist       1.00      0.08      0.14        13

    accuracy                           0.62        40
   macro avg       0.54      0.36      0.30        40
weighted avg       0.69      0.62      0.50        40

- Mejor puntaje ROC AUC OVR: 0.80



Entrenando y ajustando: Logistic Regression
Mejores parámetros: {'C': 1, 'solver': 'liblinear'}
Accuracy: 0.68
Classification Report:
              precision    recall  f1-score   support

  Antagonist       0.74      0.83      0.78        24
    Innocent       0.00      0.00      0.00         3
 Protagonist       0.54      0.54      0.54        13

    accuracy                

**Conclusiones**:

Se concluye que aproximadamente se tiene mejor comportamiento con un tamaño de ventana de 3 o 5 palabras.

Así mismo, Logistic Regression y RandomForest son buenos candidas. No obstante, podemos seguir considerando usar knn como potencial candidato debido a que parece no tender a hacer un sobreajuste.

In [47]:
# Logistic Regression
Logistic_Regression_parameters = {'C': 1, 'solver': 'lbfgs'}

# Random Forest
Random_Forest_parameters = {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}

### Usando OneVsRestClasffier

In [21]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import numpy as np


# Dividir datos
df_context = get_entity_contexts_with_offsets(df_entities,articles_map,window=5)
x_train, y_train  = vectorizar_cada_contexto(df_context, vectorizer)

X_train, X_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)

# Diccionario de modelos
models = {
    "LogisticRegression": LogisticRegression(),
    "LinearSVC": LinearSVC(),
    "RandomForest": RandomForestClassifier(),
    "MultinomialNB": MultinomialNB()
}

# Hiperparámetros para cada modelo
param_grids = {
    "LogisticRegression": {"estimator__C": [0.1, 1, 10]},
    "LinearSVC": {"estimator__C": [0.1, 1, 10]},
    "RandomForest": {"estimator__n_estimators": [50, 100, 200]},
    "MultinomialNB": {"estimator__alpha": [0.1, 1, 10]}
}

# Entrenar y evaluar
for name, model in models.items():
    print(f"Training {name}...")
    
    # Crear OneVsRestClassifier
    classifier = OneVsRestClassifier(model)
    
    # Búsqueda de hiperparámetros
    grid_search = GridSearchCV(
        classifier, 
        param_grid=param_grids[name], 
        scoring='roc_auc_ovr', 
        cv=5
    )
    grid_search.fit(X_train, y_train)
    
    # Mejor modelo
    best_model = grid_search.best_estimator_
    print(f"Best params for {name}: {grid_search.best_params_}")
    
    # Predicciones y evaluación
    y_pred = best_model.predict(X_test)
    print(f"Classification Report for {name}:\n")
    print(classification_report(y_test, y_pred, target_names=['antagonista', 'protagonista', 'inocente']))
    print("\n")


Training LogisticRegression...
Best params for LogisticRegression: {'estimator__C': 0.1}
Classification Report for LogisticRegression:

              precision    recall  f1-score   support

 antagonista       0.63      1.00      0.77        24
protagonista       0.00      0.00      0.00         3
    inocente       1.00      0.15      0.27        13

    accuracy                           0.65        40
   macro avg       0.54      0.38      0.35        40
weighted avg       0.70      0.65      0.55        40



Training LinearSVC...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Traceback (most recent call last):
  File "C:\Users\Usuario\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 977, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Usuario\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\metrics\_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Usuario\AppData\Local\P

Best params for LinearSVC: {'estimator__C': 0.1}
Classification Report for LinearSVC:

              precision    recall  f1-score   support

 antagonista       0.65      0.83      0.73        24
protagonista       0.00      0.00      0.00         3
    inocente       0.50      0.31      0.38        13

    accuracy                           0.60        40
   macro avg       0.38      0.38      0.37        40
weighted avg       0.55      0.60      0.56        40



Training RandomForest...
Best params for RandomForest: {'estimator__n_estimators': 100}
Classification Report for RandomForest:

              precision    recall  f1-score   support

 antagonista       0.65      1.00      0.79        24
protagonista       0.00      0.00      0.00         3
    inocente       1.00      0.23      0.38        13

    accuracy                           0.68        40
   macro avg       0.55      0.41      0.39        40
weighted avg       0.71      0.68      0.59        40



Training Multinomi

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Usando XGBOOST

In [25]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelBinarizer

# Dividir datos
df_context = get_entity_contexts_with_offsets(df_entities,articles_map,window=5)
x_train, y_train  = vectorizar_cada_contexto(df_context, vectorizer)

X_train, X_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)

# Modelo base: XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Crear el clasificador OneVsRest
ovr_classifier = OneVsRestClassifier(xgb_model)

# Parámetros para GridSearch
param_grid = {
    "estimator__learning_rate": [0.01, 0.1, 0.2],
    "estimator__max_depth": [3, 5, 7],
    "estimator__n_estimators": [50, 100, 200],
}

# Realizar búsqueda de hiperparámetros
grid_search = GridSearchCV(
    ovr_classifier,
    param_grid=param_grid,
    scoring='roc_auc_ovr',
    cv=3
)

# Entrenamiento del modelo
grid_search.fit(X_train, y_train)

# Mejor modelo
best_model = grid_search.best_estimator_
print(f"Mejores parámetros: {grid_search.best_params_}")

# Predicciones
y_pred = best_model.predict(X_test)

# Evaluación
print("Reporte de Clasificación:\n")
print(classification_report(y_test, y_pred, target_names=['antagonista', 'protagonista', 'inocente']))


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Mejores parámetros: {'estimator__learning_rate': 0.1, 'estimator__max_depth': 5, 'estimator__n_estimators': 50}
Reporte de Clasificación:

              precision    recall  f1-score   support

 antagonista       0.59      0.79      0.68        24
protagonista       0.00      0.00      0.00         3
    inocente       0.33      0.15      0.21        13

    accuracy                           0.53        40
   macro avg       0.31      0.32      0.30        40
weighted avg       0.46      0.53      0.48        40



### Lidiando con datos no balanceados

El problema es que existen un gran desbalance entre los datos. Práctica la mayor parte es antagonista lo cual es un gran sesgo de información.

La propuesta para intentar lidiar con esto es usando un ensamble basado en bagging.

In [62]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

df_context = get_entity_contexts_with_offsets(df_entities,articles_map,window=3)
x_train, y_train  = vectorizar_cada_contexto(df_context, vectorizer)
X_train, X_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)

for n_estimador in range(3,10):
    print(f"################ Using {n_estimador} estimadors: ################")
    base_model = DecisionTreeClassifier(random_state=42)
    bagging_model = BaggingClassifier(base_model, n_estimators=n_estimador, random_state=42)
    bagging_model.fit(X_train, y_train)

    y_pred = bagging_model.predict(X_test)

    print(f"Classification Report for {'DecisionTreeClassifier'}:\n")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("####################################################################\n")


################ Using 3 estimadors: ################
Classification Report for DecisionTreeClassifier:

              precision    recall  f1-score   support

  Antagonist       0.74      0.58      0.65        24
    Innocent       0.00      0.00      0.00         3
 Protagonist       0.43      0.69      0.53        13

    accuracy                           0.57        40
   macro avg       0.39      0.43      0.39        40
weighted avg       0.58      0.57      0.56        40

####################################################################

################ Using 4 estimadors: ################
Classification Report for DecisionTreeClassifier:

              precision    recall  f1-score   support

  Antagonist       0.68      0.88      0.76        24
    Innocent       0.00      0.00      0.00         3
 Protagonist       0.44      0.31      0.36        13

    accuracy                           0.62        40
   macro avg       0.37      0.39      0.38        40
weighted avg 

Podemos observar que se tiene mejor comportamiento usar 6. Vamos a explorar otros algoritmos

In [69]:
df_context = get_entity_contexts_with_offsets(df_entities,articles_map,window=3)
x_train, y_train  = vectorizar_cada_contexto(df_context, vectorizer)
X_train, X_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)

for n_estimador in range(3,10):
    print(f"################ Using {n_estimador} estimadors: ################")
    base_model = LogisticRegression(C = 1, solver ='lbfgs')
    bagging_model = BaggingClassifier(base_model, n_estimators=n_estimador, random_state=42)
    bagging_model.fit(X_train, y_train)

    y_pred = bagging_model.predict(X_test)

    print(f"Classification Report for {'LogisticRegression'}:\n")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("####################################################################\n")

################ Using 3 estimadors: ################
Classification Report for LogisticRegression:

              precision    recall  f1-score   support

  Antagonist       0.62      0.75      0.68        24
    Innocent       0.00      0.00      0.00         3
 Protagonist       0.36      0.31      0.33        13

    accuracy                           0.55        40
   macro avg       0.33      0.35      0.34        40
weighted avg       0.49      0.55      0.52        40

####################################################################

################ Using 4 estimadors: ################
Classification Report for LogisticRegression:

              precision    recall  f1-score   support

  Antagonist       0.60      0.75      0.67        24
    Innocent       0.00      0.00      0.00         3
 Protagonist       0.30      0.23      0.26        13

    accuracy                           0.53        40
   macro avg       0.30      0.33      0.31        40
weighted avg       0.

In [111]:

df_context = get_entity_contexts_with_offsets(df_entities,articles_map,window=2)
x_train, y_train  = vectorizar_cada_contexto(df_context, vectorizer)
X_train, X_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)

for n_estimador in range(3,10):
    print(f"################ Using {n_estimador} estimadors: ################")
    base_model = KNeighborsClassifier(metric= 'manhattan', n_neighbors= 3, weights='distance')
    bagging_model = BaggingClassifier(base_model, n_estimators=n_estimador, random_state=42)
    bagging_model.fit(X_train, y_train)

    y_pred = bagging_model.predict(X_test)

    print(f"Classification Report for {'KNeighborsClassifier'}:\n")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("####################################################################\n")

################ Using 3 estimadors: ################
Classification Report for KNeighborsClassifier:

              precision    recall  f1-score   support

  Antagonist       0.72      0.88      0.79        24
    Innocent       1.00      0.33      0.50         3
 Protagonist       0.60      0.46      0.52        13

    accuracy                           0.70        40
   macro avg       0.77      0.56      0.60        40
weighted avg       0.70      0.70      0.68        40

####################################################################

################ Using 4 estimadors: ################
Classification Report for KNeighborsClassifier:

              precision    recall  f1-score   support

  Antagonist       0.70      0.88      0.78        24
    Innocent       1.00      0.33      0.50         3
 Protagonist       0.67      0.46      0.55        13

    accuracy                           0.70        40
   macro avg       0.79      0.56      0.61        40
weighted avg     

### Usando Balanced Bagging Classifier

In [97]:
from imblearn.ensemble import BalancedBaggingClassifier


In [112]:
df_context = get_entity_contexts_with_offsets(df_entities,articles_map,window=2)
x_train, y_train  = vectorizar_cada_contexto(df_context, vectorizer)
X_train, X_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)


for n_estimador in range(3,20):
    print(f"################ Using {n_estimador} estimadors: ################")
    base_model = KNeighborsClassifier(metric= 'manhattan', n_neighbors= 3, weights='distance')
    bagging_model = BalancedBaggingClassifier(base_model, n_estimators=n_estimador, sampling_strategy='auto', replacement=False, random_state=42)
    bagging_model.fit(X_train, y_train)

    y_pred = bagging_model.predict(X_test)

    print(f"Classification Report for {'LogisticRegression'}:\n")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("####################################################################\n")

################ Using 3 estimadors: ################
Classification Report for LogisticRegression:

              precision    recall  f1-score   support

  Antagonist       0.67      0.58      0.62        24
    Innocent       0.14      0.67      0.24         3
 Protagonist       0.20      0.08      0.11        13

    accuracy                           0.42        40
   macro avg       0.34      0.44      0.32        40
weighted avg       0.48      0.42      0.43        40

####################################################################

################ Using 4 estimadors: ################
Classification Report for LogisticRegression:

              precision    recall  f1-score   support

  Antagonist       0.70      0.58      0.64        24
    Innocent       0.13      0.67      0.22         3
 Protagonist       0.20      0.08      0.11        13

    accuracy                           0.42        40
   macro avg       0.34      0.44      0.32        40
weighted avg       0.

In [114]:
df_context = get_entity_contexts_with_offsets(df_entities,articles_map,window=5)
x_train, y_train  = vectorizar_cada_contexto(df_context, vectorizer)
X_train, X_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)

for n_estimador in range(3,20):
    print(f"################ Using {n_estimador} estimadors: ################")
    base_model = RandomForestClassifier(max_depth= 10, min_samples_split= 2, random_state=42)
    bagging_model = BalancedBaggingClassifier(base_model, n_estimators=n_estimador, sampling_strategy='auto', replacement=False, random_state=42)
    bagging_model.fit(X_train, y_train)

    y_pred = bagging_model.predict(X_test)

    print(f"Classification Report for {'LogisticRegression'}:\n")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("####################################################################\n")

################ Using 3 estimadors: ################
Classification Report for LogisticRegression:

              precision    recall  f1-score   support

  Antagonist       0.74      0.83      0.78        24
    Innocent       0.12      0.33      0.18         3
 Protagonist       0.60      0.23      0.33        13

    accuracy                           0.60        40
   macro avg       0.49      0.47      0.43        40
weighted avg       0.65      0.60      0.59        40

####################################################################

################ Using 4 estimadors: ################
Classification Report for LogisticRegression:

              precision    recall  f1-score   support

  Antagonist       0.69      0.83      0.75        24
    Innocent       0.00      0.00      0.00         3
 Protagonist       0.60      0.23      0.33        13

    accuracy                           0.57        40
   macro avg       0.43      0.35      0.36        40
weighted avg       0.