# Demostración de clasificación y tópicos

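Ejemplo interactivo del flujo de `topic_modeling.py`: clasificación multietiqueta de los mensajes del cliente (TF-IDF + regresión logística) y extracción de tópicos con LDA.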

In [None]:
import pandas as pd
from topic_modeling import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation


In [None]:
# Toy conversation: each turn records who spoke, the message text and zero
# or more topic labels.
data = [
    {"speaker": "interno", "text": "Hola, ¿en qué puedo ayudarte?", "topics": []},
    {"speaker": "externo", "text": "¿Cuál es el precio del plan básico?", "topics": ["precio"]},
    {"speaker": "interno", "text": "El plan básico cuesta 10 dólares.", "topics": []},
    {"speaker": "externo", "text": "Gracias, ¿y si quiero soporte premium?", "topics": ["soporte", "precio"]},
    {"speaker": "externo", "text": "El servicio fue muy malo", "topics": ["reclamo"]},
    {"speaker": "externo", "text": "Quiero saber la factura pendiente", "topics": ["facturacion"]},
    {"speaker": "externo", "text": "Necesito hablar con ventas", "topics": ["ventas"]},
    {"speaker": "externo", "text": "¿Cuál es el más económico?", "topics": ["ventas"]},
]

df = pd.DataFrame(data)

# Keep only the customer ("externo") turns; the agent ("interno") turns in
# this sample carry no topic labels.
client_msgs = df[df['speaker'] == 'externo']

# Turn each message's list of labels into one row of a binary indicator
# matrix (one column per distinct label, in the order of mlb.classes_).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(client_msgs['topics'])

pipeline = Pipeline([
    # token_pattern=None: the custom tokenizer replaces the default regex, so
    # leaving the default pattern in place only triggers scikit-learn's
    # "token_pattern will not be used" UserWarning on every fit.
    ('tfidf', TfidfVectorizer(tokenizer=tokenize, token_pattern=None)),
    # One independent binary logistic regression per label.
    ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear'))),
])
pipeline.fit(client_msgs['text'], y)

# Predict on the very messages used for training (demo only, not an
# evaluation). Left as the last bare expression so the notebook displays it.
pipeline.predict(client_msgs['text'])


In [None]:
# Bag-of-words counts over the customer messages, tokenized with the
# project's tokenizer. token_pattern=None avoids scikit-learn's
# "token_pattern will not be used" UserWarning, since the custom tokenizer
# overrides the default regex anyway.
cv = CountVectorizer(tokenizer=tokenize, token_pattern=None)
X = cv.fit_transform(client_msgs['text'])

# Two-topic LDA; random_state is fixed so reruns give the same topics.
lda = LatentDirichletAllocation(n_components=2, random_state=0)
lda.fit(X)

feature_names = cv.get_feature_names_out()
n_top_words = 5
for idx, topic in enumerate(lda.components_):
    # argsort() is ascending, so the reversed slice walks back from the end
    # and yields the indices of the n_top_words largest weights, highest first.
    terms = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    print(f"Tópico {idx}: {' '.join(terms)}")
