In [12]:
class LDAmodellerSKL:
  def __init__(self, df, vectorizer, stops=stops, min_df=2, max_df=0.95, num_topics=5, lda_random_state=12345, max_iter=100, topic_word_prior=0.6):
    self.df = df
    
    if vectorizer == 'count':
      self.vectorizer = CountVectorizer(stop_words = list(stops),
                                        min_df = min_df,
                                        max_df = max_df)
      
    elif vectorizer == 'tfidf':
      self.vectorizer = TfidfVectorizer(stop_words = list(stops), 
                                max_df = max_df, 
                                min_df = min_df, 
                                use_idf = True,
                                norm = None)
      
    else:
      raise ValueError('The vectorizer value can be either "count" or "tfidf"')
    
    
    
    self.lda = LatentDirichletAllocation(n_components = num_topics,
                                         random_state = lda_random_state,
                                         max_iter = max_iter,
                                         topic_word_prior = topic_word_prior)
    
    self.vectorized_sentences = None
    self.new_vectorized_sentences = None
  
  def vectorize(self):
    self.new_vectorized_sentences = self.vectorizer.fit_transform(self.df['text'])
    return self.new_vectorized_sentences
    
  
  def vectorize_fit(self):
    self.vectorized_sentences = self.vectorizer.fit_transform(self.df['text'])
    self.lda.fit(self.vectorized_sentences)
    return self.lda
  
  def transform(self, sentences=None):
    if sentences is None:
      vectors = self.lda.transform(self.vectorized_sentences)
    else:
      vectors = self.lda.transform(sentences)
    return vectors

In [1]:
class LDAmodellerGensim:
  def __init__(self, df, vectorizer, stops=stops, min_df=2, max_df=0.95, num_topics=5, lda_random_state=12345, 
               passes=200, iterations=200, per_words_topic=False, alpha=8, eta=0.9, gamma_threshold=8):
    
    self.df = df
    self.stops = stops
    self.min_df = min_df
    self.max_df = max_df
    self.num_topics = num_topics
    self.lda_random_state = lda_random_state
    self.passes = passes
    self.iterations = iterations
    self.per_words_topic = per_words_topic
    self.alpha = alpha
    self.eta = eta
    self.gamma_threshold = gamma_threshold
    
    if vectorizer == 'count':
      self.vectorizer = CountVectorizer(stop_words = list(stops),
                                        min_df = min_df,
                                        max_df = max_df)
      
    elif vectorizer == 'tfidf':
      self.vectorizer = TfidfVectorizer(stop_words = list(stops), 
                                max_df = max_df, 
                                min_df = min_df, 
                                use_idf = True,
                                norm = None)
      
    else:
      raise ValueError('The vectorizer value can be either "count" or "tfidf"')
    
    self.vectorizer.fit(df['text'])
    self.vocab = self.vectorizer.get_feature_names_out()
    
    # Create document-term matrix
  def create_document_term_matrix(self, column_name='text', stops=stops, min_df=2, max_df=0.95):
    data = self.vectorizer.fit_transform(self.df[column_name])
    df_dtm = pd.DataFrame(data.toarray(), columns = self.vectorizer.get_feature_names_out())
    df_dtm.index = self.df.index
    return df_dtm
    
    
  def make_bigrams(self, texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]

  def make_trigrams(self, texts, bigram_mod, trigram_mod):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
  
    
  def create_ngrams(self):
    words = words = [nltk.word_tokenize(sentence) for sentence in self.df['text']]
    bigram = gensim.models.Phrases(words, min_count = 5, threshold = 100)
    trigram = gensim.models.Phrases(bigram[words], threshold = 100)
  
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
   
    word_bigrams = self.make_bigrams(words, bigram_mod = bigram_mod)
    word_trigrams = self.make_trigrams(words, bigram_mod = bigram_mod, trigram_mod = trigram_mod)
      
    self.words = words
    self.word_bigrams = word_bigrams
    self.word_trigrams = word_trigrams
      
    
  def id2word_corpus(self):
    self.id2word = corpora.Dictionary(self.words)
    # OPTIONAL STEP!.
    # Filter out tokens that appear in less than 15 documents, more than 0.5 documents (fraction of total corpus size, not absolute number) and keep only the first 100000 most frequent tokens.
    # id2word.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    self.corpus = [self.id2word.doc2bow(word) for word in self.words]

     
  def train_lda(self):
    self.create_document_term_matrix()
    self.create_ngrams()
    self.id2word_corpus()
    
    self.lda_model = gensim.models.LdaMulticore(corpus = self.corpus, 
                                           id2word = self.id2word, 
                                           num_topics = self.num_topics, 
                                           passes = self.passes,
                                           iterations = self.iterations,
                                           per_word_topics = self.per_words_topic,
                                           random_state = self.lda_random_state,
                                           alpha = self.alpha,
                                           eta = self.eta,
                                           gamma_threshold = self.gamma_threshold
                                          )
      
    pprint(self.lda_model.print_topics())
      
      
      

NameError: name 'stops' is not defined

In [None]:
class LDAmodeller:
  def __init__(self, df, vectorizer, lib, stops=stops, min_df=2, max_df=0.95, 
               num_topics=5, lda_random_state=12345, max_iter=100, topic_word_prior=0.6,
               passes=200, iterations=200, per_words_topic=False, alpha=8, eta=0.9, 
               gamma_threshold=8, verbose=True):
  
    self.df = df
    self.lib = lib
    self.stops = stops
    self.min_df = min_df
    self.max_df = max_df
    self.num_topics = num_topics
    self.lda_random_state = lda_random_state
    self.max_iter = max_iter
    self.topic_word_prior = topic_word_prior
    self.passes = passes
    self.iterations = iterations
    self.per_words_topic = per_words_topic
    self.alpha = alpha
    self.eta = eta
    self.gamma_threshold = gamma_threshold
    self.verbose = verbose
  
    if vectorizer == 'count':
      self.vectorizer = CountVectorizer(stop_words = list(stops),
                                        min_df = min_df,
                                        max_df = max_df)
      
    elif vectorizer == 'tfidf':
      self.vectorizer = TfidfVectorizer(stop_words = list(stops), 
                                max_df = max_df, 
                                min_df = min_df, 
                                use_idf = True,
                                norm = None)
      
    else:
      raise ValueError('The vectorizer value can be either "count" or "tfidf"')
    
    
    self.vectorizer.fit(df['text'])
    self.vocab = self.vectorizer.get_feature_names_out()
    self.vectorized_sentences = self.vectorizer.fit_transform(self.df['text'])
      
      
  def create_document_term_matrix(self, column_name='text', stops=stops, min_df=2, max_df=0.95):
    data = self.vectorizer.fit_transform(self.df[column_name])
    df_dtm = pd.DataFrame(data.toarray(), columns = self.vectorizer.get_feature_names_out())
    df_dtm.index = self.df.index
    return df_dtm
    
    
  def make_bigrams(self, texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]

  
  def make_trigrams(self, texts, bigram_mod, trigram_mod):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
  
  
  def create_ngrams(self):
    words = words = [nltk.word_tokenize(sentence) for sentence in self.df['text']]
    bigram = gensim.models.Phrases(words, min_count = 5, threshold = 100)
    trigram = gensim.models.Phrases(bigram[words], threshold = 100)

    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    word_bigrams = self.make_bigrams(words, bigram_mod = bigram_mod)
    word_trigrams = self.make_trigrams(words, bigram_mod = bigram_mod, trigram_mod = trigram_mod)

    self.words = words
    self.word_bigrams = word_bigrams
    self.word_trigrams = word_trigrams
    
    
  def id2word_corpus(self):
    self.id2word = corpora.Dictionary(self.words)
    # OPTIONAL STEP!.
    # Filter out tokens that appear in less than 15 documents, more than 0.5 documents (fraction of total corpus size, not absolute number) and keep only the first 100000 most frequent tokens.
    # id2word.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    self.corpus = [self.id2word.doc2bow(word) for word in self.words]
    
    
    
  def train_lda(self):
    if self.lib == 'skl':
      self.lda_model = LatentDirichletAllocation(n_components = self.num_topics,
                                           random_state = self.lda_random_state,
                                           max_iter = self.max_iter,
                                           topic_word_prior = self.topic_word_prior)
        
      self.lda_model.fit(self.vectorized_sentences)
      
      if self.verbose:
        plot_top_words(self.lda_model, self.vocab)
      
    elif self.lib == 'gensim':
      self.create_document_term_matrix()
      self.create_ngrams()
      self.id2word_corpus()
      
      self.lda_model = gensim.models.LdaMulticore(corpus = self.corpus, 
                                           id2word = self.id2word, 
                                           num_topics = self.num_topics, 
                                           passes = self.passes,
                                           iterations = self.iterations,
                                           per_word_topics = self.per_words_topic,
                                           random_state = self.lda_random_state,
                                           alpha = self.alpha,
                                           eta = self.eta,
                                           gamma_threshold = self.gamma_threshold
                                          )
      
      if self.verbose:
        pprint(self.lda_model.print_topics())
      
    else:
      raise ValueError('Lib parameter can be either "skl" or "gensim"')