In [2]:
import joblib
import re
import unicodedata
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import FunctionTransformer
import nltk

# Fetch the NLTK resources this script relies on
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# Load the dataset from the 'Datos' sheet of the Excel workbook
df_unfpa = pd.read_excel('./ODScat_345.xlsx', sheet_name='Datos')

# Build the lemmatizer and the Spanish stop-word set used by the preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('spanish'))

# Accent-stripping helper
def remove_accents(input_str):
    """Return input_str with its combining marks (accents, tildes, ...) removed.

    The string is first decomposed with Unicode NFKD normalization, so each
    accented character is split into a base character followed by combining
    marks; the combining marks are then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    # unicodedata.combining() returns 0 for characters that are not combining marks.
    base_chars = filter(lambda ch: unicodedata.combining(ch) == 0, decomposed)
    return ''.join(base_chars)

# Stop words with their accents stripped. Tokens are compared against this set
# only after remove_accents() has run on the text, so accented stop words
# (e.g. "más", "también") must be normalized the same way or they never match.
_accentless_stop_words = {remove_accents(word) for word in stop_words}


# Text preprocessing function
def preprocess_text(text):
    """Normalize a single document for vectorization.

    Steps, in order: lowercase, strip accents, replace any remaining non-ASCII
    and non-word characters with spaces, delete digits, tokenize, drop stop
    words, and lemmatize the surviving tokens.

    Args:
        text: The document to clean. ``None`` and NaN (the value pandas uses
            for empty cells) are treated as an empty document; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    # NaN is the only value that is not equal to itself; handle missing cells
    # here instead of failing with AttributeError on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = remove_accents(text)
    text = re.sub(r'[^\x00-\x7f]',r' ', text)  # Drop non-ASCII characters
    text = re.sub(r'\W', ' ', text)  # Punctuation and symbols become spaces
    text = re.sub(r'\d', '', text)  # Digits are deleted, not replaced
    words = word_tokenize(text)
    # NOTE(review): WordNetLemmatizer is built on the English WordNet, so most
    # Spanish tokens presumably pass through unchanged - consider a Spanish
    # stemmer/lemmatizer if real lemmatization is required.
    words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in _accentless_stop_words
    ]
    return ' '.join(words)

# Batch wrapper so the preprocessing can be used as a pipeline step
def preprocess_text_data(text_data):
    """Apply preprocess_text to every item of text_data.

    Args:
        text_data: An iterable of documents.

    Returns:
        A list with the preprocessed documents, in the same order.
    """
    return list(map(preprocess_text, text_data))

# Keep a preprocessed copy of the texts in the DataFrame (useful for inspection)
df_unfpa['Textos_preprocesados'] = df_unfpa['Textos_espanol'].apply(preprocess_text)

# Split the data into training and test sets.
# The RAW texts are split on purpose: the pipeline's first step already runs
# the preprocessing, so feeding it the preprocessed column would clean every
# training document twice, while raw text at inference time is cleaned once.
X_train, X_test, y_train, y_test = train_test_split(
    df_unfpa['Textos_espanol'],
    df_unfpa['sdg'],
    test_size=0.2,
    random_state=42,
)

# Define the pipeline: preprocessing -> TF-IDF features -> SVM classifier
pipeline = Pipeline([
    ('preprocessing', FunctionTransformer(preprocess_text_data, validate=False)),
    ('vectorizer', TfidfVectorizer(max_features=5000)),
    # probability=True enables predict_proba for confidence estimates; the
    # calibration behind it shuffles the data, so the seed is fixed to keep
    # the fitted model reproducible.
    ('classifier', SVC(probability=True, random_state=42)),
])

# Train the pipeline on raw training texts
pipeline.fit(X_train, y_train)

# Export the pipeline.
# NOTE: pickle stores preprocess_text_data by reference (module + name), not
# its code. Whoever loads this file must have the same function (and the
# helpers/globals it uses) importable under the same module path.
joblib.dump(pipeline, 'unfpa_text_classification_pipeline.pkl')
print("Pipeline guardado exitosamente.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


Pipeline guardado exitosamente.
