In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import re

from langdetect import detect
from scipy.stats import f_oneway
from collections import Counter

nltk.download("stopwords", quiet=True)

from nltk.stem.snowball import SpanishStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
from sklearn.linear_model import LogisticRegression

In [28]:
df_data = pd.read_excel(r"datos/Datos_proyecto.xlsx")
df_data.head()
df = df_data.dropna(subset=["textos", "labels"])
df = df.drop_duplicates(subset=["textos", "labels"])

wpt = nltk.WordPunctTokenizer()
stop_words = set(nltk.corpus.stopwords.words("spanish"))

spanish_stemmer = SpanishStemmer()


def tokenize_text(text, stem=True):
    # Elimina caracteres especiales y espacios en blanco
    doc = re.sub(r"[^A-Za-zÁÉÍÓÚáéíóúÜüÑñ\s]", "", text, flags=re.U)  # Unicode aware
    doc = doc.lower()
    doc = doc.strip()
    # Tokenizar documento
    tokens = wpt.tokenize(doc)
    # Filtrar palabras
    filtered_tokens = [
        spanish_stemmer.stem(token) if stem else token
        for token in tokens
        if token not in stop_words
    ]
    # Recrear documento de texto
    doc = " ".join(filtered_tokens)
    return doc, filtered_tokens


normalized_df = df.copy()
normalized_df[["tokenized_text", "tokens"]] = normalized_df["textos"].apply(
    lambda x: pd.Series(tokenize_text(x, stem=False))
)


normalized_df.head()


Unnamed: 0,textos,labels,tokenized_text,tokens
0,"""Aprendizaje"" y ""educación"" se consideran sinó...",4,aprendizaje educación consideran sinónimos esc...,"[aprendizaje, educación, consideran, sinónimos..."
1,Para los niños más pequeños (bebés y niños peq...,4,niños pequeños bebés niños pequeños capacitaci...,"[niños, pequeños, bebés, niños, pequeños, capa..."
2,"Además, la formación de especialistas en medic...",3,además formación especialistas medicina genera...,"[además, formación, especialistas, medicina, g..."
3,En los países de la OCDE se tiende a pasar de ...,4,países ocde tiende pasar cursos obligatorios o...,"[países, ocde, tiende, pasar, cursos, obligato..."
4,Este grupo se centró en las personas que padec...,3,grupo centró personas padecen trastornos menta...,"[grupo, centró, personas, padecen, trastornos,..."


In [29]:
X = normalized_df["tokenized_text"]
y = normalized_df["labels"]
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)


In [30]:
def logistic_n_gram(n_grama_param, X_train, X_test, y_train, y_test):
    vectorizer_ngram = CountVectorizer(ngram_range=n_grama_param)
    X_train_ngram = vectorizer_ngram.fit_transform(X_train.fillna(""))
    X_test_ngram = vectorizer_ngram.transform(X_test.fillna(""))

    # Modelo
    log_n = LogisticRegression(max_iter=1000, random_state=42)
    log_n.fit(X_train_ngram, y_train)
    y_pred = log_n.predict(X_test_ngram)

    # Métricas
    print(f"N-grama {n_grama_param}:")
    print("F1 score:", f1_score(y_test, y_pred, average="macro"))
    print("Precision:", precision_score(y_test, y_pred, average="macro"))
    print("Recall:", recall_score(y_test, y_pred, average="macro"))
    print(classification_report(y_test, y_pred))

logistic_n_gram((1, 1), X_train_text, X_test_text, y_train, y_test)

N-grama (1, 1):
F1 score: 0.9712130330842402
Precision: 0.974286810311103
Recall: 0.9686968049298327
              precision    recall  f1-score   support

           1       0.97      0.93      0.95        82
           3       0.96      0.99      0.98       125
           4       0.99      0.99      0.99       157

    accuracy                           0.98       364
   macro avg       0.97      0.97      0.97       364
weighted avg       0.98      0.98      0.98       364



In [31]:
df2 = pd.read_excel(r"datos/Datos_aumentados_train.xlsx")
df2.shape

(2214, 2)

In [32]:
df22 = df2.drop_duplicates(subset=["textos", "labels"])

df22.shape

(2214, 2)

In [33]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.pipeline import FunctionTransformer


In [4]:
df = pd.read_excel(r"datos/Datos_proyecto.xlsx")

In [35]:
"""
class WordTokenizerTransformer:
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        # No necesita entrenamiento pero permite compatibilidad scikit-learn
        self.stem = False
        self.wpt = nltk.WordPunctTokenizer()
        self.stop_words = set(nltk.corpus.stopwords.words("spanish"))
        self.spanish_stemmer = SpanishStemmer()
        return self

    def transform(self, X):

        df = X.copy()
        
        for index, row in df.iterrows():
            text = row['textos']
            doc = re.sub(r"[^A-Za-zÁÉÍÓÚáéíóúÜüÑñ\s]", "", text, flags=re.U)  # Unicode aware
            doc = doc.lower()
            doc = doc.strip()
            tokens = self.wpt.tokenize(doc)
            # Filtrar palabras
            filtered_tokens = [
                self.spanish_stemmer.stem(token) if self.stem else token
                for token in tokens
                if token not in self.stop_words
            ]
            # Recrear documento de texto
            doc = " ".join(filtered_tokens)
            df.loc[index, 'tokenized_text'] = doc
        return df

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

"""


  """


'\nclass WordTokenizerTransformer:\n    def __init__(self):\n        pass\n    \n    def fit(self, X, y=None):\n        # No necesita entrenamiento pero permite compatibilidad scikit-learn\n        self.stem = False\n        self.wpt = nltk.WordPunctTokenizer()\n        self.stop_words = set(nltk.corpus.stopwords.words("spanish"))\n        self.spanish_stemmer = SpanishStemmer()\n        return self\n\n    def transform(self, X):\n\n        df = X.copy()\n        \n        for index, row in df.iterrows():\n            text = row[\'textos\']\n            doc = re.sub(r"[^A-Za-zÁÉÍÓÚáéíóúÜüÑñ\\s]", "", text, flags=re.U)  # Unicode aware\n            doc = doc.lower()\n            doc = doc.strip()\n            tokens = self.wpt.tokenize(doc)\n            # Filtrar palabras\n            filtered_tokens = [\n                self.spanish_stemmer.stem(token) if self.stem else token\n                for token in tokens\n                if token not in self.stop_words\n            ]\n         

In [9]:
pd.set_option('display.max_colwidth', None)

In [10]:
X = df["textos"]
y = df["labels"]
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_test_text.head

<bound method NDFrame.head of 1490                                                                                                                                                            Por lo general, se entiende que los trastornos de leves a moderados no requieren tratamientos altamente especializados administrados por psiquiatras o en entornos hospitalarios en la gran mayoría de los casos. En cambio, en la mayoría de los países de la OCDE, los médicos de atención primaria asumen un papel de liderazgo en el tratamiento de trastornos leves a moderados (ver Tabla 4.2). Cuando la provisión a nivel de atención primaria para los trastornos leves a moderados está respaldada por una buena capacitación (tanto durante la capacitación médica como como parte de la educación médica continua), por el apoyo de profesionales especialistas en atención de la salud mental y redes de apoyo, y por buenas opciones de derivación si un paciente necesita acceder a un nivel de atención más especializado,

In [None]:
import cloudpickle
from sklearn.pipeline import Pipeline
import WordTokenizer
from WordTokenizer import WordTokenizerTransformer

X = df["textos"]
y = df["labels"]
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

tokenizer = WordTokenizerTransformer()
#df_token = tokenizer.fit_transform(X_train_text)

pipeline = Pipeline([
    ('tokenize', tokenizer),
    ('vectorizer', CountVectorizer(ngram_range=(1,1))),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

pipeline.fit(X_train_text, y_train)

# Evaluar
y_pred = pipeline.predict(X_test_text)

cloudpickle.register_pickle_by_value(WordTokenizer)
# Serialización del pipeline
with open('pipeline.cloudpkl', mode='wb') as file:

    cloudpickle.dump(pipeline, file)

NameError: name 'X_test' is not defined

In [6]:
pipeline.fit(X_train_text, y_train)
y_pred = pipeline.predict(X_test_text)
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score (macro):", f1_score(y_test, y_pred, average="macro"))
print("\nReporte de clasificación:\n", classification_report(y_test, y_pred))
print("\nMatriz de confusión:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9752747252747253
F1 Score (macro): 0.9721977831629459

Reporte de clasificación:
               precision    recall  f1-score   support

           1       0.97      0.94      0.96       157
           3       0.96      0.99      0.97       255
           4       0.99      0.98      0.99       316

    accuracy                           0.98       728
   macro avg       0.97      0.97      0.97       728
weighted avg       0.98      0.98      0.98       728


Matriz de confusión:
 [[148   7   2]
 [  2 252   1]
 [  2   4 310]]
