# Topic modelling
## Unsupervised learning of topics in a text
### using Latent Dirichlet Allocation (via sklearn)
Topic modelling can be thought of as dimensionality reduction:  
Documents are represented as sets of topics  
Each topic has a weight

In [4]:
!pip install scikit-learn==0.22.2.post1
!pip install nltk==3.6.2
!pip install pandas==1.2.4
import re
import pandas as pd
import sklearn
import csv
import nltk
nltk.download('punkt')
import string
from nltk.stem.snowball import SnowballStemmer

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


[nltk_data] Downloading package punkt to /home/mjams001/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# use CountVectorizer to turn the docs into vectors
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [6]:
# Create a single English Snowball stemmer at module level. Both get_stopwords()
# and tokenize_and_stem() call stemmer.stem(), so the stop-word list and the
# document tokens are reduced by the same stemmer instance.
stemmer = SnowballStemmer('english')

In [7]:
# helper functions
stopwords_file_path = 'en_stopwords.csv'
def read_in_csv(csv_file):
    """Read a comma-separated UTF-8 file and return its rows.

    Parameters
    ----------
    csv_file : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    list of list of str
        One inner list per CSV record, in file order. Blank lines in the
        file come back as empty lists.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # newline='' hands raw line endings to the csv module, as its documentation
    # requires; without it, "\r\n" inside a quoted field is rewritten to "\n".
    with open(csv_file, 'r', encoding='utf-8', newline='') as fp:
        reader = csv.reader(fp, delimiter=',', quotechar='"')
        return list(reader)

def get_stopwords(path=stopwords_file_path):
    """Load stop words from a CSV file and add their stemmed forms.

    The first column of every non-empty row is taken as a stop word; no
    header row is skipped, so a header cell (if the file has one) is
    treated as a stop word too.

    Parameters
    ----------
    path : str
        CSV file to read. The default is the value `stopwords_file_path`
        had when this function was defined.

    Returns
    -------
    list of str
        The raw first-column entries followed by the stemmed version of each
        entry (duplicates are not removed).
    """
    rows = read_in_csv(path)
    # csv.reader yields [] for blank lines; skipping them keeps row[0] from
    # raising IndexError on files with empty or trailing blank lines.
    stopwords = [row[0] for row in rows if row]
    stemmed_stopwords = [stemmer.stem(word) for word in stopwords]
    return stopwords + stemmed_stopwords

def tokenize_and_stem(sentence):
    """Split ``sentence`` into word tokens, drop unwanted ones and stem the rest.

    A token is discarded when it is in the global ``stopwords`` collection,
    when it occurs as a substring of ``string.punctuation``, or when it
    contains no ASCII letter. Surviving tokens are stemmed with the
    module-level ``stemmer`` and returned in their original order.
    """
    stems = []
    for token in nltk.word_tokenize(sentence):
        if token in stopwords:
            continue
        # string.punctuation is a str, so this is a substring test.
        if token in string.punctuation:
            continue
        if not re.search('[a-zA-Z]', token):
            continue
        stems.append(stemmer.stem(token))
    return stems

### We’ll use a public dataset from the BBC comprised of 2,225 articles  
Each labeled under one of 5 categories: business, entertainment, politics, sport or tech

In [8]:
# Load the stop-word list and set the path of the corpus to analyse.
# `stopwords` is read as a global by tokenize_and_stem() and
# create_count_vectorizer().
stopwords_file_path = "en_stopwords.csv"
stopwords = get_stopwords(stopwords_file_path)
# NOTE(review): this is the same file as stopwords_file_path, although the
# surrounding text describes a BBC news dataset -- confirm the intended corpus
# path and that it provides the 'description' column clean_data() reads.
bbc_dataset = "en_stopwords.csv"

In [9]:
# turn the documents into vectors
def create_count_vectorizer(documents, max_features=1500):
    """Fit a CountVectorizer on ``documents`` and return it with the counts.

    The vectorizer uses the global ``stopwords`` list as its stop words and
    ``tokenize_and_stem`` as its tokenizer.

    Parameters
    ----------
    documents : iterable of str
        The texts to vectorize.
    max_features : int or None, optional
        Passed to CountVectorizer as the vocabulary size limit. Defaults to
        1500, the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(count_vectorizer, data)``: the fitted vectorizer and the result of
        ``fit_transform`` on ``documents``.
    """
    count_vectorizer = CountVectorizer(
        stop_words=stopwords,
        tokenizer=tokenize_and_stem,
        max_features=max_features,
    )
    data = count_vectorizer.fit_transform(documents)
    return (count_vectorizer, data)

In [10]:
# remove unwanted characters (keep just words and spaces)
def clean_data(df):
    """Return a copy of ``df`` with a cleaned 'description' column.

    Every character that is neither a word character nor whitespace is
    replaced by a single space, then every digit is removed. Missing
    descriptions become empty strings. The input frame is left unmodified so
    that re-running a cell that still holds the raw frame gives the same
    result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'description' column of text.

    Returns
    -------
    pandas.DataFrame
        A new frame; all columns other than 'description' are copied as-is.

    Raises
    ------
    KeyError
        If ``df`` has no 'description' column.
    """
    cleaned = df.copy()

    def _strip_noise(text):
        # Symbols become a space so neighbouring words stay separated;
        # digits are dropped outright.
        without_symbols = re.sub(r'[^\w\s]', ' ', text)
        return re.sub(r'\d', '', without_symbols)

    # re.sub raises TypeError on NaN/None, so treat missing text as empty.
    cleaned['description'] = cleaned['description'].fillna('').apply(_strip_noise)
    return cleaned

In [11]:
# create the LDA model (note that usually num_topics is unknown)
def create_and_fit_lda(data, num_topics, random_state=None):
    """Create a LatentDirichletAllocation model and fit it to ``data``.

    Parameters
    ----------
    data : array-like or sparse matrix
        Document-term matrix, e.g. the output of create_count_vectorizer().
    num_topics : int
        Number of topics, passed as ``n_components``.
    random_state : int, RandomState instance or None, optional
        Seed forwarded to the estimator. The default ``None`` keeps the
        previous unseeded behaviour, so topics can differ between runs; pass
        an int to make a run repeatable.

    Returns
    -------
    LatentDirichletAllocation
        The fitted model (``n_jobs=-1`` is always used).
    """
    lda = LDA(n_components=num_topics, n_jobs=-1, random_state=random_state)
    lda.fit(data)
    return lda

In [12]:
# identify & print the most common topic words
def get_most_common_words_for_topics(model, vectorizer, n_top_words):
    """Map each topic index to its highest-weighted vocabulary words.

    Parameters
    ----------
    model : fitted topic model
        Must expose ``components_``, an iterable with one weight array per
        topic.
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the older
        ``get_feature_names()``.
    n_top_words : int
        How many words to keep per topic.

    Returns
    -------
    dict
        ``{topic_index: [word, ...]}`` with words ordered from highest to
        lowest weight.
    """
    # scikit-learn 1.0 deprecated get_feature_names() and 1.2 removed it in
    # favour of get_feature_names_out(); older releases only have the former.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # Normalise to plain str so printed topics look the same with either API.
    words = [str(name) for name in feature_names]

    word_dict = {}
    for topic_index, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice takes the last
        # n_top_words indices, largest weight first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        word_dict[topic_index] = [words[i] for i in top_indices]
    return word_dict

def print_topic_words(word_dict):
    """Print every topic key as a header line, then its words on a tab-led line."""
    for topic_id, topic_terms in word_dict.items():
        print(f"Topic {topic_id}")
        print("\t", topic_terms)

In [13]:
# Read the CSV named by `bbc_dataset`, clean its 'description' column with
# clean_data(), and keep that column as the documents to model.
df = pd.read_csv(bbc_dataset)
df = clean_data(df)
documents = df['description']

# Number of topics the LDA model is asked to find; it is fixed at 5 here
# (note that usually the right number is unknown).
number_topics = 5

In [14]:
# Display the cleaned DataFrame to check what was actually loaded.
df

Unnamed: 0,file,name,size,source_url,date,description
0,_none.txt,_None_,0,en/_none.txt,,No stop word removal
1,sphinx.txt,Sphinx,0,http://sphinxsearch.com/docs/current.html#conf...,,Sphinx is an open source search server Top go...
2,ebscohost_medline_cinahl.txt,EBSCOhost,24,https://help.ebsco.com/interfaces/CINAHL_MEDLI...,,The stop words used in EBSCOhost medical datab...
3,corenlp_hardcoded.txt,CoreNLP (Hardcoded),28,https://github.com/stanfordnlp/CoreNLP/blob/ma...,,Hardcoded in src edu stanford nlp coref data W...
4,ranksnl_oldgoogle.txt,Ranks NL (Google),32,http://www.ranks.nl/stopwords,,The short stopwords list below is based on wha...
...,...,...,...,...,...,...
63,ranksnl_large.txt,Ranks NL (Large),667,http://www.ranks.nl/stopwords,,A very long list from ranks nl
64,tonybsk_6.txt,tonybsk_6.txt,671,https://github.com/igorbrigadir/stopwords/blob...,,Unknown origin I lost the reference
65,terrier.txt,Terrier,733,http://terrier.org/docs/v4.1/javadoc/org/terri...,,Terrier Retrieval Engine Stopword list to loa...
66,atire_puurula.txt,ATIRE (Puurula),988,http://www.atire.org/hg/atire/file/tip/source/...,,Included in ATIRE See Paper http www aclwe...


### Step 1: Extract one of the categories  
Select a particular category from the dataframe, e.g. tech

In [15]:
# Display the 'description' Series that will be vectorized.
documents

0                                 No stop word removal 
1     Sphinx is an open source search server  Top go...
2     The stop words used in EBSCOhost medical datab...
3     Hardcoded in src edu stanford nlp coref data W...
4     The short stopwords list below is based on wha...
                            ...                        
63                       A very long list from ranks nl
64               Unknown origin   I lost the reference 
65    Terrier Retrieval Engine  Stopword list to loa...
66    Included in ATIRE See  Paper  http   www aclwe...
67    List of common stop words in various languages...
Name: description, Length: 68, dtype: object

In [16]:
#df = df.loc[df['category']=='tech']

In [17]:
# Vectorize the documents, then fit an LDA model with `number_topics` topics
# on the resulting matrix. `vectorizer` is kept because it is needed later to
# map topic weights back to words and to transform new text.
(vectorizer, data) = create_count_vectorizer(documents)
lda = create_and_fit_lda(data, number_topics)

In [18]:
# Display the fitted estimator to see the hyper-parameters it was built with.
lda

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=-1,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

### Step 2: Inspect the results  
Are they coherent? Do they seem to be different topics?

In [19]:
# Inspect the contents of the topics: collect the 10 highest-weighted words
# of each topic and print them, one topic per block.
topic_words = get_most_common_words_for_topics(lda, vectorizer, 10)
print_topic_words(topic_words)

Topic 0
	 ['word', 'the', 'and', 'are', 'stopword', 'search', 'in', 'stop', 'of', 'a']
Topic 1
	 ['list', 'to', 'smart', 'also', 'same', 'as', 'and', 'bow', 'an', 'edu']
Topic 2
	 ['for', 'a', 'list', 'paper', 'in', 'www', 'use', 'http', 'includ', 'time']
Topic 3
	 ['stopword', 'list', 'snowbal', 'http', 'sphinx', 'postgresql', 'for', 'english', 'from', 'to']
Topic 4
	 ['the', 'in', 'stopword', 'list', 'use', 'and', 'word', 'is', 'of', 'a']


In [24]:
# Test
def test_new_example(lda, vect, example):
    vectorized = vect.transform([example])
    topic = lda.transform(vectorized)
    print(topic)
    return topic

In [25]:
# Sample text used to try the fitted model on a document it was not trained
# on; it is passed to test_new_example(). The string is model input and is
# kept verbatim, including run-together words such as "convincingoutputs".
example = """ what I would encourage you to do.I have actually run this several times and seen much more convincingoutputs than this, so it does vary.I definitely would encourage you to try this for yourself.Rerun it if you think the topics don't make any senseand try it on a new example.Normally, my blog, it's pretty clear.It comes out as tech every time.That's actually the first time it came out as something that,in this characterization, at least, it's a little less clear,somewhere between business and tech.Just one final thing, of course, bear in mind that normallywhen you do topic modeling, you don't know the topics in advance.Here, we did.We used it as a sanity check, but normally, you don't.The whole point is that you'll just see an output like this,and you'll use that to get some insight into the content that we have there.You can think of it as a kind of dimensionality reduction.Also, feel free, of course, to try different values of N.We knew here that there was five topics, but you could try itwith a bigger number and see what happens or you could try it on a sub-corpus.You could try, for example, on the sports documents to seeif they would divide up according to different sport typesor something like that.There it is, topic modeling, very useful technique for dimensionality """

In [26]:
# Score the sample text against the fitted topics. test_new_example() prints
# the result and also returns it, so the notebook shows the array twice.
test_new_example(lda, vectorizer, example)

[[0.34139994 0.0636246  0.02555062 0.03400067 0.53542417]]


array([[0.34139994, 0.0636246 , 0.02555062, 0.03400067, 0.53542417]])

### Step 3: Try a different category  
Select a different category from the dataframe, e.g. sport

### Step 4: Inspect the results  
Are they coherent? Do they seem to be different topics?

### Step 5: Try different values of N  
Return to Step 1 and repeat the process with a different number of topics

In [29]:
## Credits to University of London for this Notebook code