In [None]:
# Imports
import re
import nltk
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
import gensim
from gensim import corpora
from gensim.models import LdaModel
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK stopword corpus first; the stopwords.words() call below reads it.
nltk.download('stopwords')
# English stopword list; cleanText checks tokens against it when filtering.
stop_words = stopwords.words('english')
# Fetch WordNet for the WordNetLemmatizer created later in the notebook.
# Kept as the cell's last expression, so its return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# The four newsgroups used in this notebook.
categories = [
    'rec.sport.baseball',
    'sci.electronics',
    'comp.graphics',
    'talk.politics.misc',
]
# Load the training split for those groups, asking scikit-learn to remove
# headers, footers and quoted text from every post.
newsgroups_data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
lemmatizer = WordNetLemmatizer()

def cleanText(text):
  """Tokenize ``text`` and return its lemmatized content tokens.

  The text is tokenized with ``gensim.utils.simple_preprocess`` (called with
  ``deacc=True``). Tokens found in the module-level ``stop_words`` collection,
  or that are two characters long or shorter, are dropped; every remaining
  token is passed through ``lemmatizer.lemmatize``.

  Args:
    text: Raw document text.

  Returns:
    A list of lemmatized tokens, in the order they appear in the text.
  """
  # Snapshot the stopwords into a set once per call so each membership test is
  # a hash lookup instead of a scan of the whole list for every token.
  # Rebuilding it per call keeps any later change to ``stop_words`` visible.
  blocked = set(stop_words)

  # Tokenize the text
  tokens = gensim.utils.simple_preprocess(text, deacc=True)

  # Drop stopwords and short tokens, then lemmatize what is left
  return [
      lemmatizer.lemmatize(token)
      for token in tokens
      if token not in blocked and len(token) > 2
  ]

# Clean every newsgroup post (one token list per document)
processed_docs = list(map(cleanText, newsgroups_data.data))

In [None]:
# Build the token dictionary from the cleaned documents
dictionary = corpora.Dictionary(processed_docs)
# Prune the vocabulary with the thresholds below
# (see gensim's Dictionary.filter_extremes for their exact meaning)
dictionary.filter_extremes(
    no_below=15,
    no_above=0.5,
    keep_n=100000,
)
# Encode every cleaned document as bag-of-words using the pruned dictionary
corpus = list(map(dictionary.doc2bow, processed_docs))

In [None]:
# One LDA topic per selected newsgroup category.
num_topics = len(categories)

# LDA model
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=num_topics,  # derived from `categories` so the two stay in sync
                     random_state=43,        # fixed seed for reproducible topics
                     chunksize=100)

In [None]:
# Topics found by the model, each listed with 10 of its words
topics = lda_model.print_topics(num_words=10)
print(*topics, sep="\n")

(0, '0.012*"one" + 0.012*"use" + 0.011*"line" + 0.009*"system" + 0.009*"would" + 0.009*"work" + 0.009*"problem" + 0.008*"circuit" + 0.007*"like" + 0.007*"need"')
(1, '0.020*"year" + 0.012*"game" + 0.010*"run" + 0.010*"good" + 0.009*"one" + 0.009*"would" + 0.009*"last" + 0.009*"better" + 0.008*"think" + 0.008*"player"')
(2, '0.012*"people" + 0.012*"would" + 0.011*"think" + 0.009*"government" + 0.009*"say" + 0.009*"president" + 0.009*"going" + 0.009*"state" + 0.008*"make" + 0.007*"american"')
(3, '0.014*"graphic" + 0.014*"edu" + 0.012*"anyone" + 0.011*"thanks" + 0.011*"point" + 0.010*"file" + 0.010*"know" + 0.009*"program" + 0.009*"image" + 0.009*"would"')


In [None]:
# Topic id -> newsgroup label, assigned by hand from the topic words printed in
# the previous step. NOTE(review): the ids belong to that particular model fit;
# re-check this mapping whenever the model is retrained.
topic_dic = dict([
    (0, 'sci.electronics'),     # "circuit", "system", "line"
    (1, 'rec.sport.baseball'),  # "game", "run", "player"
    (2, 'talk.politics.misc'),  # "government", "president", "state"
    (3, 'comp.graphics'),       # "graphic", "file", "image"
])

In [None]:
def predict_topic(new_text):
  """Return the most probable topic label for ``new_text`` and its probability.

  The text is cleaned with ``cleanText``, encoded as bag-of-words with the
  module-level ``dictionary`` and scored by ``lda_model``. The topic id with
  the largest probability is translated to a label through ``topic_dic``.

  Args:
    new_text: Raw document text.

  Returns:
    A ``(label, probability)`` tuple for the top-ranked topic.
  """
  # Clean the text and convert it to bag-of-words
  bow = dictionary.doc2bow(cleanText(new_text))
  # Rank the (topic_id, probability) pairs, most probable first
  ranked = list(lda_model.get_document_topics(bow))
  ranked.sort(key=lambda pair: pair[1], reverse=True)
  best_id, best_prob = ranked[0]
  return topic_dic[best_id], best_prob

In [None]:
# Predict the topic of 10 documents.
# NOTE: these are the first 10 entries of `newsgroups_data`, the same training
# split that is cleaned into `processed_docs` and used to fit the model, so they
# are not actually unseen data despite the variable name.
for i in range(10):
  new_unseen_news = newsgroups_data.data[i]
  predicted_topic, confidence = predict_topic(new_unseen_news)

  # Assemble the four report lines and emit them with a single print call
  report_lines = [
      f"Noticia:\n'{new_unseen_news[:200]}...'",
      "-" * 30,
      f"Tópico Predicho: '{predicted_topic}'",
      f" confiança: {confidence:.2%}",
  ]
  print("\n".join(report_lines))

Noticia:
'




Um, what?  Eddie Murray was a superb first baseman for a *long* time.

Winfield as produced consistently for almsot 20 years, and excellently
on several occasions.  

Dave Kingman's *best* year w...'
------------------------------
Tópico Predicho: 'rec.sport.baseball'
 confiança: 92.73%
Noticia:
'What hardware do plan to run on?  Workstation or PC?  Cost level?
Run-time licensing needs?

Bob...'
------------------------------
Tópico Predicho: 'sci.electronics'
 confiança: 65.92%
Noticia:
'
This is why I asked to be 'enlightened'.  You are making claims about what
'is' or 'is not' part of this program.  But if the "block grants" go to states
and cities, the mayors list is VERY relivent....'
------------------------------
Tópico Predicho: 'talk.politics.misc'
 confiança: 67.83%
Noticia:
'

   An addition to anti-discrimination laws which includes homo and bisexuality
? One would assume it would be because politicians were listening to the people
coming up with rational arg