<a href="https://colab.research.google.com/github/EnsarIshakoglu/NLP/blob/topic-modelling/NLP_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Source: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#1introduction
# Run in python console
import nltk; nltk.download('stopwords')
import zipfile
import os

# Run in terminal or command prompt
!python3 -m spacy download en

# SKlearn
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import KFold
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.preprocessing import LabelEncoder
# from sklearn.feature_extraction.text import CountVectorizer
# from imblearn.over_sampling import SMOTE

# Data processing
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import numpy as np

# Stackapi to fetch stackoverflow api
!pip install stackapi
from stackapi import StackAPI

from pprint import pprint

# Gensim
!pip install gensim==3.8.3
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.6 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


### Mount colab drive to google drive

In [23]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Fetch data

In [24]:
# Get the data from stackoverflow sorted by votes
def fetch_data():
  if not exists('/content/NLP-question-data'):
    !git clone https://github.com/EnsarIshakoglu/NLP-question-data.git
  
  data = pd.read_csv('/content/NLP-question-data/questions.csv')
  
  return data

## Remove useless data

In [25]:
def clean_data(df):  
  df = df[['tags', 'body']]

  # Strip html tags with regex:
  df['body'] = df['body'].str.replace(r'<[^<>]*>', '', regex=True)

  # Get first tag for multi-class classification
  row_count = df.shape[0]

  # for i in range(row_count):
  #   df['tags'].iloc[i] = df['tags'].iloc[i][0]
  return df

### Create folder and file from df, unmout drive after

In [26]:
from os.path import exists

!mkdir stackoverflow

if not exists('/content/stackoverflow/questions.csv') or not exists('/content/NLP-question-data'):
  df = fetch_data()[['tags', 'body']]
  df = clean_data(df)
  # df.to_csv('/content/stackoverflow/questions.csv')
  print("Fetched data from stackoverflow, removed the useless data and saved it in stackoverflow/questions.csv")

drive.flush_and_unmount()
df

mkdir: cannot create directory ‘stackoverflow’: File exists
Fetched data from stackoverflow, removed the useless data and saved it in stackoverflow/questions.csv


AttributeError: ignored

                                                    tags                                               body
0      ['java', 'c++', 'performance', 'cpu-architectu...  Here is a piece of C++ code that shows some ve...
1       ['git', 'version-control', 'git-commit', 'undo']  I accidentally committed the wrong files to Gi...
2      ['git', 'version-control', 'git-branch', 'git-...  I want to delete a branch both locally and rem...
3      ['git', 'version-control', 'git-pull', 'git-fe...  What are the differences between git pull and ...
4      ['python', 'iterator', 'generator', 'yield', '...  What is the use of the yield keyword in Python...
...                                                  ...                                                ...
17495           ['python', 'pandas', 'dataframe', 'nan']  I have a Pandas Dataframe as below:\n      itm...
17496        ['java', 'datetime', 'java-8', 'java-time']  What is the best way to convert a java.util.Da...
17497                       

### Load file from drive

In [17]:
print(f"There are {len(df['body'].unique())} rows in the dataset.")

There are 3900 rows in the dataset.


In [18]:
df = df[~df['body'].duplicated()]
print(f"There are {len(df)} rows in the deduplicated dataset.")

There are 3900 rows in the deduplicated dataset.


## Topic modeling

In [19]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

## Prepare data

In [20]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(df['body']))

print(data_words[:1])

[['here', 'is', 'piece', 'of', 'code', 'that', 'shows', 'some', 'very', 'peculiar', 'behavior', 'for', 'some', 'strange', 'reason', 'sorting', 'the', 'data', 'before', 'the', 'timed', 'region', 'miraculously', 'makes', 'the', 'loop', 'almost', 'six', 'times', 'faster', 'include', 'lt', 'algorithm', 'gt', 'include', 'lt', 'ctime', 'gt', 'include', 'lt', 'iostream', 'gt', 'int', 'main', 'generate', 'data', 'const', 'unsigned', 'arraysize', 'int', 'data', 'arraysize', 'for', 'unsigned', 'lt', 'arraysize', 'data', 'std', 'rand', 'with', 'this', 'the', 'next', 'loop', 'runs', 'faster', 'std', 'sort', 'data', 'data', 'arraysize', 'test', 'clock_t', 'start', 'clock', 'long', 'long', 'sum', 'for', 'unsigned', 'lt', 'for', 'unsigned', 'lt', 'arraysize', 'primary', 'loop', 'if', 'data', 'gt', 'sum', 'data', 'double', 'elapsedtime', 'static_cast', 'lt', 'double', 'gt', 'clock', 'start', 'clocks_per_sec', 'std', 'cout', 'lt', 'lt', 'elapsedtime', 'lt', 'lt', 'std', 'cout', 'lt', 'lt', 'quot', 'sum

In [21]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['here', 'is', 'piece', 'of', 'code', 'that', 'shows', 'some', 'very', 'peculiar', 'behavior', 'for', 'some', 'strange', 'reason', 'sorting', 'the', 'data', 'before', 'the', 'timed', 'region', 'miraculously', 'makes', 'the', 'loop', 'almost', 'six', 'times', 'faster', 'include', 'lt', 'algorithm', 'gt', 'include', 'lt', 'ctime', 'gt', 'include', 'lt', 'iostream', 'gt', 'int', 'main', 'generate', 'data', 'const', 'unsigned', 'arraysize', 'int', 'data', 'arraysize', 'for', 'unsigned', 'lt', 'arraysize', 'data', 'std', 'rand', 'with', 'this', 'the', 'next', 'loop', 'runs', 'faster', 'std', 'sort', 'data', 'data', 'arraysize', 'test', 'clock_t', 'start', 'clock', 'long', 'long', 'sum', 'for', 'unsigned', 'lt', 'for', 'unsigned', 'lt', 'arraysize', 'primary', 'loop', 'if', 'data', 'gt', 'sum', 'data', 'double', 'elapsedtime', 'static_cast', 'lt', 'double', 'gt', 'clock', 'start', 'clocks_per_sec', 'std_cout', 'lt', 'lt', 'elapsedtime', 'lt', 'lt', 'std_cout', 'lt', 'lt', 'quot', 'sum', 'quo

In [22]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [27]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'ADV'])

print(data_lemmatized[:1])

[['piece', 'code', 'peculiar', 'behavior', 'strange', 'reason', 'datum', 'region', 'miraculously', 'loop', 'almost', 'time', 'faster', 'main', 'generate', 'datum', 'const', 'datum', 'loop', 'fast', 'datum', 'datum', 'test', 'clock', 'long', 'long', 'sum', 'datum', 'double', 'elapsedtime', 'lt', 'double', 'clock', 'start', 'datum', 'second', 'code', 'second', 'time', 'pass', 'array', 'actually', 'worth', 'calculate', 'unknown', 'array', 'initially', 'compiler', 'random', 'public', 'class', 'main', 'public', 'static', 'void', 'main', 'string', 'arg', 'generate', 'datum', 'int', 'next', 'fast', 'test', 'long', 'start', 'system', 'nanotime', 'long', 'sum', 'lt', 'system', 'println', 'system', 'start', 'system', 'similar', 'less', 'extreme', 'result', 'first', 'thought', 'data', 'silly', 'processing', 'array', 'faster', 'processing', 'code', 'independent', 'term', 'matter', 'followup', 'amp', 'effect', 'different', 'late', 'compiler', 'option', 'processing', 'array', 'speed', 'processing', 

In [28]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 5), (5, 1), (6, 1), (7, 1), (8, 2), (9, 4), (10, 2), (11, 1), (12, 1), (13, 8), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 2), (20, 2), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 4), (31, 2), (32, 2), (33, 3), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 4), (46, 2), (47, 1), (48, 1), (49, 1), (50, 1), (51, 2), (52, 1), (53, 1), (54, 1), (55, 1), (56, 3), (57, 1), (58, 1), (59, 1), (60, 2), (61, 4), (62, 1), (63, 2), (64, 1), (65, 2), (66, 1), (67, 1), (68, 1)]]


In [29]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('actually', 1),
  ('almost', 1),
  ('amp', 1),
  ('arg', 1),
  ('array', 5),
  ('behavior', 1),
  ('calculate', 1),
  ('class', 1),
  ('clock', 2),
  ('code', 4),
  ('compiler', 2),
  ('const', 1),
  ('data', 1),
  ('datum', 8),
  ('different', 1),
  ('double', 2),
  ('effect', 1),
  ('elapsedtime', 1),
  ('extreme', 1),
  ('fast', 2),
  ('faster', 2),
  ('first', 1),
  ('flag', 1),
  ('followup', 1),
  ('generate', 2),
  ('independent', 1),
  ('initially', 1),
  ('int', 1),
  ('late', 1),
  ('less', 1),
  ('long', 4),
  ('loop', 2),
  ('lt', 2),
  ('main', 3),
  ('matter', 1),
  ('miraculously', 1),
  ('modern', 1),
  ('nanotime', 1),
  ('next', 1),
  ('optimization', 1),
  ('option', 1),
  ('pass', 1),
  ('peculiar', 1),
  ('piece', 1),
  ('println', 1),
  ('processing', 4),
  ('public', 2),
  ('random', 1),
  ('reason', 1),
  ('region', 1),
  ('result', 1),
  ('second', 2),
  ('silly', 1),
  ('similar', 1),
  ('slower', 1),
  ('speed', 1),
  ('start', 3),
  ('static', 1),
  ('str

In [30]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [31]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.109*"item" + 0.093*"print" + 0.081*"index" + 0.074*"run" + 0.066*"format" '
  '+ 0.063*"size" + 0.054*"reference" + 0.051*"loop" + 0.050*"bit" + '
  '0.046*"standard"'),
 (1,
  '0.146*"branch" + 0.134*"server" + 0.100*"local" + 0.095*"repository" + '
  '0.081*"import" + 0.065*"remote" + 0.050*"side" + 0.042*"design" + '
  '0.019*"git" + 0.017*"necessary"'),
 (2,
  '0.147*"list" + 0.073*"table" + 0.072*"message" + 0.062*"update" + '
  '0.051*"point" + 0.047*"select" + 0.045*"commit" + 0.039*"process" + '
  '0.034*"set" + 0.033*"row"'),
 (3,
  '0.103*"field" + 0.095*"certain" + 0.081*"color" + 0.054*"performance" + '
  '0.053*"suggestion" + 0.050*"red" + 0.048*"background" + 0.037*"closure" + '
  '0.037*"sign" + 0.035*"conversion"'),
 (4,
  '0.067*"also" + 0.061*"question" + 0.051*"application" + 0.047*"however" + '
  '0.043*"different" + 0.033*"content" + 0.026*"issue" + 0.025*"form" + '
  '0.023*"answer" + 0.022*"wrong"'),
 (5,
  '0.129*"command" + 0.089*"true" + 0.068*"reall

In [32]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -14.225221347247492

Coherence Score:  0.40074780391333203


In [33]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

TypeError: ignored

In [None]:
# Install java
def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()

In [None]:
# Install mallet
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
!unzip mallet-2.0.8.zip

In [None]:
mallet_path = '/content/mallet-2.0.8/bin/mallet'

In [None]:
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

In [None]:
# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=20, step=6)

In [None]:
# Show graph
limit=20; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
optimal_model = model_list[2]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)

    return(sent_topics_df)

In [None]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=texts)

In [None]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

In [None]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head()