<a href="https://colab.research.google.com/github/EnsarIshakoglu/NLP/blob/topic-modelling/NLP_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Source: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#1introduction
# Run in python console
import nltk; nltk.download('stopwords')

# Run in terminal or command prompt
!python3 -m spacy download en

# SKlearn
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import KFold
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.preprocessing import LabelEncoder
# from sklearn.feature_extraction.text import CountVectorizer
# from imblearn.over_sampling import SMOTE

# Data processing
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import numpy as np

# Stackapi to fetch stackoverflow api
!pip install stackapi
from stackapi import StackAPI

from pprint import pprint

# Gensim
!pip install gensim==3.8.3
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 3.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


### Mount colab drive to google drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Fetch data

In [3]:
# Get the data from stackoverflow sorted by votes
def fetch_data():
  data = []
  page_count = 40
  SITE = StackAPI('stackoverflow')

  for i in range(page_count):
    data.append(SITE.fetch('questions', sort="votes", min=20, filter='withbody', page=i + 1))
  
  return data

## Remove useless data

In [4]:
def clean_data(data):
  raw_df = pd.DataFrame(pd.DataFrame.from_dict(data[0]['items']))
  for i in range(len(data) - 1):
    raw_df = raw_df.append(pd.DataFrame.from_dict(data[i + 1]['items']))
  
  df = raw_df[['tags', 'body']]

  # Strip html tags with regex:
  df['body'] = df['body'].str.replace(r'<[^<>]*>', '', regex=True)

  # Get first tag for multi-class classification
  row_count = df.shape[0]

  for i in range(row_count):
    df['tags'].iloc[i] = df['tags'].iloc[i][0]
  
  return df

### Create folder and file from df, unmout drive after

In [5]:
from os.path import exists

!mkdir stackoverflow

if not exists('/content/stackoverflow/questions.csv'):
  data = fetch_data()
  df = clean_data(data)
  df.to_csv('/content/stackoverflow/questions.csv')
  print("Fetched data from stackoverflow, removed the useless data and saved it in stackoverflow/questions.csv")

drive.flush_and_unmount()

mkdir: cannot create directory ‘stackoverflow’: File exists


### Load file from drive

In [6]:
df = pd.read_csv('/content/stackoverflow/questions.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,tags,owner,is_answered,view_count,protected_date,accepted_answer_id,answer_count,score,last_activity_date,creation_date,last_edit_date,question_id,content_license,link,title,body,community_owned_date,locked_date,closed_date,closed_reason,migrated_from
0,0,"['java', 'c++', 'performance', 'cpu-architectu...","{'reputation': 469918, 'user_id': 87234, 'user...",True,1662469,1399067000.0,11227902.0,29,26031,1636434375,1340805096,1634760000.0,11227809,CC BY-SA 4.0,https://stackoverflow.com/questions/11227809/w...,Why is processing a sorted array faster than p...,<p>Here is a piece of C++ code that shows some...,,,,,
1,1,"['git', 'version-control', 'git-commit', 'undo']","{'reputation': 47989, 'user_id': 89904, 'user_...",True,10254767,1370840000.0,927386.0,100,23542,1637838948,1243620554,1625030000.0,927358,CC BY-SA 4.0,https://stackoverflow.com/questions/927358/how...,How do I undo the most recent local commits in...,<p>I accidentally <strong>committed the wrong ...,1363429000.0,,,,
2,2,"['git', 'version-control', 'git-branch', 'git-...","{'reputation': 416782, 'user_id': 95592, 'user...",True,9430056,1358718000.0,2003515.0,41,18667,1638336685,1262653935,1633560000.0,2003505,CC BY-SA 4.0,https://stackoverflow.com/questions/2003505/ho...,How do I delete a Git branch locally and remot...,<p>I want to delete a branch both locally and ...,,,,,
3,3,"['git', 'version-control', 'git-pull', 'git-fe...","{'reputation': 264138, 'user_id': 6068, 'user_...",True,3071149,1362879000.0,292359.0,38,12908,1636728187,1226742669,1606493000.0,292357,CC BY-SA 4.0,https://stackoverflow.com/questions/292357/wha...,What is the difference between &#39;git pull&#...,<p>What are the differences between <code>git ...,,,,,
4,4,"['python', 'iterator', 'generator', 'yield', '...","{'reputation': 131777, 'user_id': 18300, 'user...",True,2731894,1360547000.0,231855.0,43,11664,1636440019,1224800471,1633986000.0,231767,CC BY-SA 4.0,https://stackoverflow.com/questions/231767/wha...,What does the &quot;yield&quot; keyword do?,<p>What is the use of the <code>yield</code> k...,,,,,


In [7]:
print(f"There are {len(df['body'].unique())} rows in the dataset.")

There are 3900 rows in the dataset.


In [8]:
df = df[~df['body'].duplicated()]
print(f"There are {len(df)} rows in the deduplicated dataset.")

There are 3900 rows in the deduplicated dataset.


## Topic modeling

In [9]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

## Prepare data

In [15]:
df_new = df[['title', 'body']]
df_new.head()

Unnamed: 0,title,body
0,Why is processing a sorted array faster than p...,<p>Here is a piece of C++ code that shows some...
1,How do I undo the most recent local commits in...,<p>I accidentally <strong>committed the wrong ...
2,How do I delete a Git branch locally and remot...,<p>I want to delete a branch both locally and ...
3,What is the difference between &#39;git pull&#...,<p>What are the differences between <code>git ...
4,What does the &quot;yield&quot; keyword do?,<p>What is the use of the <code>yield</code> k...


In [16]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(df_new['body']))

print(data_words[:1])

[['here', 'is', 'piece', 'of', 'code', 'that', 'shows', 'some', 'very', 'peculiar', 'behavior', 'for', 'some', 'strange', 'reason', 'sorting', 'the', 'data', 'em', 'before', 'em', 'the', 'timed', 'region', 'miraculously', 'makes', 'the', 'loop', 'almost', 'six', 'times', 'faster', 'pre', 'class', 'lang', 'cpp', 'prettyprint', 'override', 'code', 'include', 'lt', 'algorithm', 'gt', 'include', 'lt', 'ctime', 'gt', 'include', 'lt', 'iostream', 'gt', 'int', 'main', 'generate', 'data', 'const', 'unsigned', 'arraysize', 'int', 'data', 'arraysize', 'for', 'unsigned', 'lt', 'arraysize', 'data', 'std', 'rand', 'with', 'this', 'the', 'next', 'loop', 'runs', 'faster', 'std', 'sort', 'data', 'data', 'arraysize', 'test', 'clock_t', 'start', 'clock', 'long', 'long', 'sum', 'for', 'unsigned', 'lt', 'for', 'unsigned', 'lt', 'arraysize', 'primary', 'loop', 'if', 'data', 'gt', 'sum', 'data', 'double', 'elapsedtime', 'static_cast', 'lt', 'double', 'gt', 'clock', 'start', 'clocks_per_sec', 'std', 'cout', 

In [17]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['here', 'is', 'piece', 'of', 'code', 'that', 'shows', 'some', 'very', 'peculiar', 'behavior', 'for', 'some', 'strange', 'reason', 'sorting', 'the', 'data', 'em', 'before', 'em', 'the', 'timed', 'region', 'miraculously', 'makes', 'the', 'loop', 'almost', 'six', 'times', 'faster', 'pre', 'class', 'lang', 'cpp', 'prettyprint_override', 'code', 'include', 'lt', 'algorithm', 'gt', 'include', 'lt', 'ctime', 'gt', 'include', 'lt', 'iostream', 'gt', 'int', 'main', 'generate', 'data', 'const', 'unsigned', 'arraysize', 'int', 'data', 'arraysize', 'for', 'unsigned', 'lt', 'arraysize', 'data', 'std', 'rand', 'with', 'this', 'the', 'next', 'loop', 'runs', 'faster', 'std', 'sort', 'data', 'data', 'arraysize', 'test', 'clock_t', 'start', 'clock', 'long', 'long', 'sum', 'for', 'unsigned', 'lt', 'for', 'unsigned', 'lt', 'arraysize', 'primary', 'loop', 'if', 'data', 'gt', 'sum', 'data', 'double', 'elapsedtime', 'static_cast', 'lt', 'double', 'gt', 'clock', 'start', 'clocks_per_sec', 'std_cout', 'lt', '

In [18]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [19]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['piece', 'code', 'show', 'peculiar', 'behavior', 'strange', 'reason', 'sort', 'datum', 'time', 'region', 'miraculously', 'make', 'loop', 'almost', 'time', 'faster', 'pre', 'code', 'include', 'include', 'include', 'main', 'generate', 'datum', 'const', 'unsigne', 'datum', 'loop', 'run', 'fast', 'datum', 'datum', 'test', 'start', 'clock', 'long', 'long', 'sum', 'unsigne', 'datum', 'double', 'elapsedtime', 'lt', 'double', 'clock', 'start', 'datum', 'run', 'sort', 'run', 'sort', 'take', 'time', 'pass', 'array', 'actually', 'worth', 'need', 'calculate', 'unknown', 'initially', 'think', 'may', 'language', 'compiler', 'try', 'random', 'public', 'class', 'main', 'public', 'static', 'void', 'main', 'string', 'arg', 'generate', 'datum', 'int', 'arraysize', 'next', 'run', 'fast', 'test', 'long', 'start', 'system', 'nanotime', 'long', 'sum', 'lt', 'system', 'println', 'system', 'start', 'system', 'pre', 'similar', 'less', 'extreme', 'result', 'first', 'think', 'sort', 'bring', 'data', 'href', 'ht

In [20]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 4), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 3), (12, 1), (13, 1), (14, 1), (15, 1), (16, 8), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 2), (23, 2), (24, 1), (25, 1), (26, 1), (27, 3), (28, 1), (29, 1), (30, 1), (31, 3), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 4), (39, 2), (40, 2), (41, 3), (42, 2), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 2), (55, 1), (56, 4), (57, 2), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 4), (64, 1), (65, 1), (66, 1), (67, 6), (68, 1), (69, 4), (70, 1), (71, 1), (72, 1), (73, 2), (74, 3), (75, 4), (76, 1), (77, 1), (78, 2), (79, 3), (80, 3), (81, 1), (82, 1), (83, 2), (84, 2), (85, 1), (86, 1)]]


In [21]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('actually', 1),
  ('almost', 1),
  ('amp', 1),
  ('arg', 1),
  ('array', 4),
  ('arraysize', 1),
  ('behavior', 1),
  ('bring', 1),
  ('calculate', 1),
  ('class', 1),
  ('clock', 2),
  ('code', 3),
  ('com', 1),
  ('compiler', 1),
  ('const', 1),
  ('data', 1),
  ('datum', 8),
  ('different', 1),
  ('double', 2),
  ('effect', 1),
  ('elapsedtime', 1),
  ('extreme', 1),
  ('fast', 2),
  ('faster', 2),
  ('first', 1),
  ('flag', 1),
  ('followup', 1),
  ('generate', 3),
  ('go', 1),
  ('href', 1),
  ('https', 1),
  ('include', 3),
  ('independent', 1),
  ('initially', 1),
  ('int', 1),
  ('language', 1),
  ('late', 1),
  ('less', 1),
  ('long', 4),
  ('loop', 2),
  ('lt', 2),
  ('main', 3),
  ('make', 2),
  ('matter', 1),
  ('may', 1),
  ('miraculously', 1),
  ('nanotime', 1),
  ('need', 1),
  ('next', 1),
  ('optimization', 1),
  ('order', 1),
  ('pass', 1),
  ('peculiar', 1),
  ('piece', 1),
  ('pre', 2),
  ('println', 1),
  ('processing', 4),
  ('public', 2),
  ('random', 1),
  ('

In [22]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [23]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.213*"com" + 0.127*"question" + 0.037*"idea" + 0.034*"answer" + '
  '0.026*"syntax" + 0.025*"search" + 0.024*"help" + 0.024*"support" + '
  '0.024*"correct" + 0.021*"bit"'),
 (1,
  '0.369*"file" + 0.068*"path" + 0.060*"project" + 0.048*"exist" + '
  '0.042*"directory" + 0.035*"source" + 0.029*"open" + 0.024*"log" + '
  '0.023*"thread" + 0.021*"add"'),
 (2,
  '0.191*"version" + 0.141*"number" + 0.105*"remove" + 0.079*"part" + '
  '0.068*"stre" + 0.056*"state" + 0.036*"back" + 0.034*"framework" + '
  '0.030*"previous" + 0.025*"core"'),
 (3,
  '0.148*"table" + 0.097*"like" + 0.090*"second" + 0.087*"default" + '
  '0.084*"null" + 0.074*"row" + 0.051*"insert" + 0.036*"select" + 0.035*"mode" '
  '+ 0.025*"section"'),
 (4,
  '0.069*"find" + 0.051*"make" + 0.048*"see" + 0.039*"could" + 0.036*"go" + '
  '0.032*"application" + 0.028*"difference" + 0.028*"thing" + 0.027*"think" + '
  '0.027*"different"'),
 (5,
  '0.263*"character" + 0.078*"feature" + 0.053*"side" + 0.044*"operator" + '
 

In [24]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -12.13765196906999

Coherence Score:  0.392433019134487


In [25]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [28]:
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = 'content/mallet-2.0.8/bin/mallet' # update this path
!unzip mallet_path.zip
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

CalledProcessError: ignored

In [26]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [27]:
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=40, step=6)

NameError: ignored