# Latent Dirichlet Allocation (LDA)

🎯 The goal of this challenge is to find topics within a corpus of reviews with the **LDA** algorithm (Unsupervised Learning in NLP)

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [30]:
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import text_to_word_sequence

def load_data(percentage_of_sentences=None):
    train_data, test_data = tfds.load(name="imdb_reviews", split=["train", "test"], batch_size=-1, as_supervised=True)

    train_sentences, y_train = tfds.as_numpy(train_data)
    test_sentences, y_test = tfds.as_numpy(test_data)
    
    # Take only a given percentage of the entire data
    if percentage_of_sentences is not None:
        assert(percentage_of_sentences> 0 and percentage_of_sentences<=100)
        
        len_train = int(percentage_of_sentences/100*len(train_sentences))
        train_sentences, y_train = train_sentences[:len_train], y_train[:len_train]
  
        len_test = int(percentage_of_sentences/100*len(test_sentences))
        test_sentences, y_test = test_sentences[:len_test], y_test[:len_test]
    
    X_train = [text_to_word_sequence(_.decode("utf-8")) for _ in train_sentences]
    X_test = [text_to_word_sequence(_.decode("utf-8")) for _ in test_sentences]
    
    X_train = [' '.join(_) for _ in X_train]
    X_test = [' '.join(_) for _ in X_test]
    
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = load_data(percentage_of_sentences=10)

In [31]:
import pandas as pd

data = pd.DataFrame(X_train)
data.columns = ['text']
data.head()

Unnamed: 0,text
0,this was an absolutely terrible movie don't be...
1,i have been known to fall asleep during films ...
2,mann photographs the alberta rocky mountains i...
3,this is the kind of film for a snowy sunday af...
4,as others have mentioned all the women that go...


In [32]:
data.shape

(2500, 1)

## (1) Preprocessing 

Clean the text with **NLTK**

In [33]:
import string
from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

def preprocessing(sentence):
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercasing 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## removing numbers
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## removing punctuation
    tokenized_sentence = word_tokenize(sentence) ## tokenizing 
    stop_words = set(stopwords.words('english')) ## defining stopwords
    tokenized_sentence_cleaned = [w for w in tokenized_sentence 
                                  if not w in stop_words] ## remove stopwords
    # 1 - Lemmatizing the verbs
    verb_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")  # v --> verbs
              for word in tokenized_sentence_cleaned]
    # 2 - Lemmatizing the nouns
    noun_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n")  # n --> nouns
                for word in verb_lemmatized]
    cleaned_sentence= ' '.join(w for w in noun_lemmatized)
    return cleaned_sentence


In [34]:
data['clean_text']=data.text.apply(preprocessing)
data

Unnamed: 0,text,clean_text
0,this was an absolutely terrible movie don't be...,absolutely terrible movie dont lure christophe...
1,i have been known to fall asleep during films ...,know fall asleep film usually due combination ...
2,mann photographs the alberta rocky mountains i...,mann photograph alberta rocky mountain superb ...
3,this is the kind of film for a snowy sunday af...,kind film snowy sunday afternoon rest world go...
4,as others have mentioned all the women that go...,others mention woman go nude film mostly absol...
...,...,...
2495,bad acting bad writing this was a poorly writt...,bad act bad write poorly write film bad potent...
2496,when i saw that imdb users rated this movie th...,saw imdb user rat movie bottom movie think har...
2497,there is something about pet sematary that i n...,something pet sematary never felt anywhere els...
2498,what the hell of a d movie was that bad acting...,hell movie bad act bad special effect worst di...


## (2) Latent Dirichlet Allocation model

Train a LDA model to extract potential topics

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import LatentDirichletAllocation

vectorizer = TfidfVectorizer().fit(data.clean_text)
vectorized_text = vectorizer.transform(data.clean_text)
vectorized_text = pd.DataFrame(vectorized_text.toarray(), columns=vectorizer.get_feature_names())
vectorized_text

Unnamed: 0,aaaaah,aaah,aag,aames,aapke,aardman,aargh,aaron,aatish,ab,...,zyuranger,zz,zzzzzzzzzzzz,zzzzzzzzzzzzz,zé,ème,élan,émigré,était,étc
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Instantiating the LDA 
n_components = 5
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 100)

# Fitting the LDA on the vectorized documents
lda_model.fit(vectorized_text)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=100,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [39]:
text_topics=lda_model.transform(vectorized_text)
pd.DataFrame(text_topics)

Unnamed: 0,0,1,2,3,4
0,0.025802,0.025802,0.896790,0.025802,0.025803
1,0.027351,0.027351,0.890598,0.027351,0.027351
2,0.024156,0.024156,0.903376,0.024156,0.024156
3,0.027315,0.027315,0.890742,0.027315,0.027315
4,0.032707,0.032707,0.869172,0.032707,0.032707
...,...,...,...,...,...
2495,0.034788,0.034788,0.860846,0.034788,0.034788
2496,0.021930,0.021930,0.912279,0.021930,0.021930
2497,0.024466,0.024466,0.902137,0.024466,0.024466
2498,0.029902,0.029902,0.880393,0.029902,0.029902


##  (3) Visualize potential topics

In [44]:
def print_topics(model, vectorizer):
    topic_mixture = pd.DataFrame(lda_model.components_,
                                 columns = vectorizer.get_feature_names())
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        topic_df = topic_mixture.iloc[idx].sort_values(ascending = False).head(3)
        print(round(topic_df,3))
        print("-"*10)

Print the topics extracted by your LDA.

In [45]:
print(print_topics(lda_model,vectorizer))

Topic 0:
scanner    1.036
cognac     0.981
hou        0.945
Name: 0, dtype: float64
----------
Topic 1:
kamal     1.142
salman    1.073
rani      0.979
Name: 1, dtype: float64
----------
Topic 2:
br       240.958
movie    139.708
film     120.632
Name: 2, dtype: float64
----------
Topic 3:
blackadder    0.827
ollie         0.804
cruella       0.777
Name: 3, dtype: float64
----------
Topic 4:
pokémon     0.900
kangwon     0.645
province    0.645
Name: 4, dtype: float64
----------
None


## (4) Predict the document-topic mixture of a new text

Now that the LDA model is fitted, we can use it to predict the topics of a new text.

1. Vectorize the example
2. Use the LDA on the vectorized example to predict the topics

In [46]:
example = ["My team performed poorly last season. Their best player was out injured and only played one game"]

In [47]:
clean_example=preprocessing(example[0])
vectorized_example=vectorizer.transform([clean_example])
lda_model.transform(vectorized_example)

array([[0.04840481, 0.04840479, 0.80638069, 0.0484048 , 0.04840491]])