# Data Modelisation

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from gensim.utils import ClippedCorpus
import gensim
import tqdm
from gensim.models import Phrases
from gensim.corpora import Dictionary
from nltk.tokenize import RegexpTokenizer
from gensim.models import LdaModel, CoherenceModel, LdaMulticore, TfidfModel, Nmf
from nlp_module import remove_stopwords, plot_top_words
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

## 1) Data preparation

In [2]:
data = pd.read_csv("datasets/posts_clean.csv")
data.head(3)

Unnamed: 0,Id,Title,Body,Tags
0,415192,good way create simple python web service,I use python year I little experience python w...,<python><web-services>
1,415344,log implementation prefer,I implement log class c try decide I curious k...,<debugging><language-agnostic><logging>
2,414981,directly modify list element,I struct struct map public int size public map...,<c#><.net>


In [3]:
# remove < and > around Tags
data["Tags"] = data["Tags"].replace({"<" : " "}, regex=True)
data["Tags"] = data["Tags"].replace({">" : " "}, regex=True)

In [4]:
data["Body"] = data["Body"].str.lower()
data["Body"] = data["Body"].apply(remove_stopwords)

In [5]:
data.head(3)

Unnamed: 0,Id,Title,Body,Tags
0,415192,good way create simple python web service,use python year little experience python web p...,python web-services
1,415344,log implementation prefer,implement log class c try decide curious know ...,debugging language-agnostic logging
2,414981,directly modify list element,struct struct map public int size public map i...,c# .net


In [6]:
docs = data["Body"].to_list()

In [7]:
# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [8]:
# Compute bigrams.
# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [9]:
# Remove rare and common tokens.
# Create a dictionary representation of the documents.
dct = Dictionary(docs)
dct.filter_extremes(no_below=20, no_above=0.5)

In [10]:
# Bag-of-words representation of the documents.
corpus = [dct.doc2bow(doc) for doc in docs]

In [11]:
print('Number of unique tokens: %d' % len(dct))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 25556
Number of documents: 164598


## 2) Topic Modelling

In [12]:
# Set training parameters.
chunksize = 2000
passes = 10
iterations = 200
eval_every = None

# Make a index to word dictionary.
temp = dct[0]  # This is only to "load" the dictionary.
id2word = dct.id2token

### a) Non Negative Matrix Factorization (NMF)

In [15]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf = tfidf_vectorizer.fit_transform(data["Body"])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [30]:
corpus_nmf = tfidf.toarray()

In [31]:
corpus_nmf

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.09108897, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [28]:
# Create a list of the topic numbers we want to try
topic_nums = list(np.arange(2, 10 + 1, 1))

# Run the nmf model and calculate the coherence score
# for each number of topics
coherence_scores = []

for num in topic_nums:
    nmf = Nmf(corpus=corpus_nmf,
              num_topics=num,
              id2word=id2word,
              chunksize=chunksize,
              passes=passes,
              kappa=.1,
              minimum_probability=0.01,
              w_max_iter=300,
              w_stop_condition=0.0001,
              h_max_iter=100,
              h_stop_condition=0.001,
              eval_every=eval_every,
              normalize=True,
              random_state=42)
    
# Run the coherence model to get the score
    cm = CoherenceModel(
        model=nmf,
        texts=docs,
        dictionary=dct,
        coherence='c_v')
    
    coherence_scores.append(round(cm.get_coherence(), 5))

# Get the number of topics with the highest coherence score
scores = list(zip(topic_nums, coherence_scores))
best_num_topics = sorted(scores, key=lambda elem: elem[1], reverse=True)[0][0]

print("Best number of topics:", best_num_topics)

TypeError: 'numpy.bool_' object is not iterable

In [None]:
nmf = Nmf(corpus,
          id2word = id2word,
          chunksize = 2000,
          num_topics=best_num_topics,
          kappa=0.1)

In [None]:
coherence_model_nmf = CoherenceModel(model=nmf,
                                     texts=docs,
                                     dictionary=dct,
                                     coherence='c_v')
coherence_nmf = coherence_model_nmf.get_coherence()
print('Coherence Score: ', coherence_nmf)

In [None]:
n_topics=best_num_topics
n_top_words = 30

nmf = NMF(n_components=n_topics, 
          random_state=42, alpha=.1, 
          l1_ratio=.5).fit(tfidf)
nmf_embedding = nmf.transform(tfidf)

feature_names = tfidf_vectorizer.get_feature_names()

print("Topics found via NMF:")

for topic_idx, topic in enumerate(nmf.components_):
    print("\nTopic {}:".format(topic_idx))
    print(" ".join(['[{}]'.format(feature_names[i]) for i in topic.argsort()[:-n_top_words - 1:-1]]))
print()

In [None]:
plot_top_words(nmf, tfidf_feature_names, n_top_words, "Topics in NMF model")

### b) Latent Dirichlet Allocation (LDA)

In [None]:
# Create a list of the topic numbers we want to try
topic_nums = list(np.arange(2, 10 + 1, 1))

# Run the nmf model and calculate the coherence score
# for each number of topics
coherence_scores = []

for num in topic_nums:
    lda = LdaModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=num, 
                         random_state=42,
                         chunksize=chunksize,
                         passes=passes,
                         iterations=iterations,
                         alpha="auto",
                         eta="auto",
                         eval_every=eval_every)
    
# Run the coherence model to get the score
    cm = CoherenceModel(
        model=lda,
        texts=docs,
        dictionary=dct,
        coherence='c_v')
    
    coherence_scores.append(round(cm.get_coherence(), 5))

# Get the number of topics with the highest coherence score
scores = list(zip(topic_nums, coherence_scores))
best_num_topics = sorted(scores, key=lambda elem: elem[1], reverse=True)[0][0]

print("Best number of topics:", best_num_topics)

In [None]:
lda = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=best_num_topics, 
                     random_state=42,
                     chunksize=chunksize,
                     passes=passes,
                     iterations=iterations,
                     alpha="auto",
                     eta="auto",
                     eval_every=eval_every)

In [None]:
coherence_model_lda = CoherenceModel(model=lda, texts=docs, dictionary=dct, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
lda_vis = gensimvis.prepare(lda, corpus, dct)
lda_vis

## 3) Supervised learning for text classification

In [None]:
from sklearn.model_selection import train_test_split

X =
y = 

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.75, 
                                                    stratify=y)

### a) Support Vector Machine

In [None]:
from sklearn.svm import SVC

svm_clf = SVC(random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV

params = {"kernel" : ["linear", "rbf", "poly", "sigmoid"],
          "decision_function_shape" : ["ovr", "ovo"]}

grid_search = GridSearchCV(svm_clf, 
                           param_grid=params, 
                           scoring="accuracy",
                           cv=5,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_estimator_

In [None]:
grid_search.best_params_

### b) Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

nb_clf = MultinomialNB(random_state=42)