# Data Modelling

- Configuration

In [None]:
import warnings
warnings.filterwarnings("ignore")

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn
import seaborn as sns
import sklearn
from keras.callbacks import EarlyStopping
from keras.layers import SpatialDropout1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from nlp_module import remove_stopwords, plot_top_words, tok, print_evaluation_scores

In [None]:
# Display scikit-learn estimators as HTML diagrams instead of their text repr.
sklearn.set_config(display="diagram")

## 1) Data preparation

In [None]:
# Relative location of the posts CSV that is loaded in the next cell.
path = "datasets/posts_clean.csv"

In [None]:
# Load the posts dataset and preview its first three rows.
data = pd.read_csv(filepath_or_buffer=path)
data.head(n=3)

In [None]:
# Swap the angle brackets that delimit each tag for spaces in a single pass,
# so that tags can later be separated on whitespace.
data["Tags"] = data["Tags"].replace(r"[<>]", " ", regex=True)

In [None]:
# Lower-case every post body, then run it through the project's stop-word remover.
data["Body"] = data["Body"].str.lower().apply(remove_stopwords)

In [None]:
# Fraction of missing values in each column.
data.isna().mean()

In [None]:
# Discard every row that has at least one missing value.
data = data.dropna()

In [None]:
# Preview the frame after cleaning.
data.head(n=3)

In [None]:
# Keep a seeded (reproducible) random 10% of the rows for the rest of the notebook.
data = data.sample(frac=0.1, random_state=42)

In [None]:
# One text document per post: the title and the body joined by a single space.
docs = data["Title"] + " " + data["Body"]

In [None]:
# TF-IDF document-term matrix, with the vocabulary capped at 12,000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=12000)
tfidf = tfidf_vectorizer.fit_transform(docs)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever this installation provides and keep
# the result a plain list, as before.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [None]:
# Percentage of non-zero cells in the TF-IDF matrix.
# The count is taken on the sparse matrix itself: densifying an
# (n_documents x n_terms) matrix only to count its entries can exhaust memory.
# TF-IDF weights are never negative, so "non-zero" and "> 0" select the same cells.
n_cells = tfidf.shape[0] * tfidf.shape[1]
nonzero_pct = round(100 * tfidf.count_nonzero() / n_cells, 3)
print("Sparsicity: ", nonzero_pct, "%")

## 2) Topic Modelling

### a) Non Negative Matrix Factorization (NMF)

In [None]:
# Number of components (topics) used by the NMF model below.
best_num_topics = 10

In [None]:
# NMF topic model with `best_num_topics` components and a fixed seed.
# NOTE(review): the `alpha` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 (superseded by `alpha_W` / `alpha_H`, which are scaled
# differently) — confirm the scikit-learn version this notebook is pinned to.
nmf = NMF(n_components=best_num_topics,
          random_state=42,
          alpha=0.1)

In [None]:
# Fit the NMF model on the TF-IDF document-term matrix.
nmf.fit(tfidf)

In [None]:
# Plot the fitted NMF topics with the project helper, asking for 30 top words per topic.
plot_top_words(nmf, tfidf_feature_names, n_top_words=30, title="Topics in NMF model")

### b) Latent Dirichlet Allocation (LDA)

In [None]:
# Bag-of-words counts over unigrams, tokenised with the project's `tok` helper.
bow_vectorizer = CountVectorizer(tokenizer=tok, ngram_range=(1, 1))
docs_bow = bow_vectorizer.fit_transform(raw_documents=docs)

In [None]:
# Base LDA estimator for the grid search below: fixed seed and a batch size of 400.
# The number of topics and the learning settings are supplied by the search grid.
lda = LatentDirichletAllocation(random_state=42,
                                batch_size=400)

In [None]:
# `learning_decay` only influences the online variational Bayes updates, so it
# is searched only together with learning_method="online". Crossing it with
# "batch" in one flat grid would refit the same batch model six times for every
# topic count without changing the result.
topic_counts = [7, 8, 9, 10, 11, 12]
params = [
    {"n_components": topic_counts,
     "learning_method": ["batch"]},
    {"n_components": topic_counts,
     "learning_method": ["online"],
     "learning_decay": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]},
]

# No `scoring` is given, so candidates are ranked with the estimator's own
# `score` method, averaged over 5 folds.
gridsearch_lda = GridSearchCV(lda,
                              param_grid=params,
                              cv=5,
                              verbose=1)
gridsearch_lda.fit(docs_bow)

In [None]:
# Report the winning configuration and its cross-validated score ...
print("Best Model's Params: ", gridsearch_lda.best_params_)
print("Best Log Likelihood Score: ", gridsearch_lda.best_score_)

# ... and keep the refitted best estimator for the visualisation below.
best_lda_model = gridsearch_lda.best_estimator_

In [None]:
# Visualize the topics
# Build the interactive pyLDAvis panel from the best LDA model, the
# bag-of-words matrix and its vectorizer; topics are laid out with t-SNE.
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(best_lda_model, docs_bow, bow_vectorizer, mds='tsne')
panel

- Topic 1 : OS - Top topic
- Topic 2 : Script Language/Python
- Topic 3 : Web language/Front-end
- Topic 4 : Java
- Topic 5 : Web language/Back-end
- Topic 6 : SQL
- Topic 7 : Script Language

## 3) Supervised learning for text classification

- CRF (Conditional Random Fields)
- Sequential data

In [None]:
# Keep only the post identifier and its tag string.
tags = data[["Id", "Tags"]]

In [None]:
# Split the whitespace-separated tag string into one column per tag position
# (exactly five positions are expected), then put the post Id in front.
tags = tags["Tags"].str.split(expand=True)
tags.columns = ["1st", "2nd", "3rd", "4th", "5th"]
tags.insert(0, "Id", data["Id"])
tags.head()

In [None]:
# Posts with fewer than five tags get an empty string in the unused positions.
tags = tags.fillna("")

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Unused tag positions hold empty placeholders. Drop them before binarising:
# otherwise "" is learned as a label of its own and becomes a spurious target
# column attached to every post that has fewer than five tags.
tag_lists = [
    [tag for tag in row if isinstance(tag, str) and tag]
    for row in tags[["1st", "2nd", "3rd", "4th", "5th"]].values
]

mlb = MultiLabelBinarizer()

# One 0/1 indicator column per distinct tag, indexed by post Id.
new_tags = pd.DataFrame(mlb.fit_transform(tag_lists),
                        columns=mlb.classes_,
                        index=tags["Id"])

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw documents; targets are the binarised tag indicators.
X = docs
y = new_tags

# Hold out 25% of the posts for evaluation. The split is seeded like every
# other random step in this notebook so that the reported scores are
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=.25,
                                                    random_state=42)

### a) Support Vector Machine

In [None]:
# Learn the TF-IDF vocabulary on the training documents only and vectorise
# them in the same pass.
vectorizer = TfidfVectorizer(tokenizer=tok, ngram_range=(1, 1), max_features=12000)
X_train_vec = vectorizer.fit_transform(X_train)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# One seeded SVC per tag column (one-vs-rest), fitted on the TF-IDF training matrix.
svm_clf = OneVsRestClassifier(estimator=SVC(random_state=42))
svm_clf.fit(X=X_train_vec, y=y_train)

In [None]:
# Vectorise the held-out documents with the vocabulary fitted on the training
# set, then predict their tag indicators with the baseline SVM.
X_test_vec = vectorizer.transform(X_test)
y_pred = svm_clf.predict(X_test_vec)

In [None]:
# Report the baseline SVM's scores on the held-out set via the project helper.
print_evaluation_scores(y_test, y_pred)

- Grid Search CV

In [None]:
from sklearn.metrics import make_scorer, f1_score

# Scorer used by the hyper-parameter searches below: F1 with `average="weighted"`.
scorer_multilabel = make_scorer(f1_score, average="weighted")

In [None]:
import scipy.stats as stats

# Coarse exploration of the SVM hyper-parameters: C is drawn log-uniformly
# over [1e-5, 100]; both gamma heuristics are tried with balanced class weights.
params = {"estimator__C": stats.loguniform(1e-5, 100),
          "estimator__gamma": ["auto", "scale"],
          "estimator__class_weight": ["balanced"]}

# `random_state` fixes the sampled candidates; without it every run draws
# different C values and the selected model is not reproducible.
rnd_search = RandomizedSearchCV(estimator=svm_clf,
                                param_distributions=params,
                                scoring=scorer_multilabel,
                                cv=5,
                                random_state=42,
                                verbose=1)
rnd_search.fit(X_train_vec, y_train)

In [None]:
# Display the best estimator found by the randomized search.
rnd_search.best_estimator_

In [None]:
# Display the hyper-parameters of that best estimator.
rnd_search.best_params_

In [None]:
# Fine search around the randomized-search optimum. Every grid entry must be a
# non-empty list, otherwise GridSearchCV raises ValueError.
# C is scanned on a log scale within a factor of three either side of the best
# sampled value (the middle grid point is that value itself); gamma and
# class_weight are pinned to the best values already found.
best_rnd_params = rnd_search.best_params_
best_C = best_rnd_params["estimator__C"]

params = {"estimator__C": list(np.geomspace(best_C / 3, best_C * 3, num=5)),
          "estimator__gamma": [best_rnd_params["estimator__gamma"]],
          "estimator__class_weight": [best_rnd_params["estimator__class_weight"]]}

gridsearch_svm = GridSearchCV(estimator=svm_clf,
                              param_grid=params,
                              scoring=scorer_multilabel,
                              cv=5,
                              verbose=1)
gridsearch_svm.fit(X_train_vec, y_train)

In [None]:
# Display the best estimator found by the SVM grid search.
gridsearch_svm.best_estimator_

In [None]:
# Display the hyper-parameters of that best estimator.
gridsearch_svm.best_params_

In [None]:
# Keep the best SVM from the grid search for the final evaluation.
best_model_svm = gridsearch_svm.best_estimator_

- Model Evaluation

In [None]:
# Predict the held-out tag indicators with the tuned SVM (overwrites the baseline predictions).
y_pred = best_model_svm.predict(X_test_vec)

In [None]:
# Report the tuned SVM's scores on the held-out set.
print_evaluation_scores(y_test, y_pred)

### b) Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

# One multinomial Naive Bayes model per tag column (one-vs-rest); class priors
# are learned from the data rather than supplied.
nb_clf = OneVsRestClassifier(estimator=MultinomialNB(class_prior=None, fit_prior=True))
nb_clf.fit(X=X_train_vec, y=y_train)

In [None]:
# Predict the held-out tag indicators with the baseline Naive Bayes model.
y_pred_nb = nb_clf.predict(X_test_vec)

In [None]:
# Report the baseline Naive Bayes scores on the held-out set.
print_evaluation_scores(y_test, y_pred_nb)

- Grid Search

In [None]:
# Exhaustive 5-fold search over the smoothing strength of the per-tag Naive
# Bayes models, ranked with the weighted-F1 scorer.
params = dict(estimator__alpha=[0.3, 0.5, 0.7, 0.9, 1.0])

gridsearch_nb = GridSearchCV(nb_clf, params, scoring=scorer_multilabel, cv=5, verbose=1)
gridsearch_nb.fit(X_train_vec, y_train)

In [None]:
# Display the best estimator found by the Naive Bayes grid search.
gridsearch_nb.best_estimator_

In [None]:
# Display the hyper-parameters of that best estimator.
gridsearch_nb.best_params_

In [None]:
# Keep the best Naive Bayes model from the grid search for the final evaluation.
best_model_nb = gridsearch_nb.best_estimator_

- Model Evaluation

In [None]:
# Predict the held-out tag indicators with the tuned Naive Bayes model (overwrites the baseline predictions).
y_pred_nb = best_model_nb.predict(X_test_vec)

In [None]:
# Report the tuned Naive Bayes scores on the held-out set.
print_evaluation_scores(y_test, y_pred_nb)

## 4) Deep Learning for text classification

#### a) Using Word2vec + LSTM

In [None]:
import string

from keras.callbacks import LambdaCallback
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Activation
from keras.models import Sequential

In [None]:
# Vocabulary cap and fixed sequence length of the network input. They are set
# here because the tokenizer and the padding need them before the model is built.
# NOTE(review): 250 tokens is a starting value — tune it to the document lengths.
MAX_NB_WORDS = 12000
MAX_SEQUENCE_LENGTH = 250

# Map every document to a sequence of word indices (restricted to the
# MAX_NB_WORDS most frequent words), then pad/truncate to a common length.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True)
tokenizer.fit_on_texts(docs)

X = tokenizer.texts_to_sequences(docs)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

# Re-split on the padded sequences: the X_train / X_test produced for the
# classical models hold raw text, which an embedding layer cannot consume.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=.25,
                                                    random_state=42)

- Word2vec

In [None]:
# Corpus used to train the word embeddings: the title+body text of every post.
documents = docs.tolist()

In [None]:
# Tokenise each document into a list of lower-cased, whitespace-separated words.
sentences = [document.lower().split() for document in documents]

In [None]:
# Train 100-dimensional Word2Vec embeddings on the tokenised corpus.
# gensim 4 renamed `size` -> `vector_size` and `iter` -> `epochs`; mixing the
# old and new spellings is rejected by every gensim release, so the gensim 4
# names are used consistently.
word_model = gensim.models.Word2Vec(sentences, vector_size=100, min_count=1,
                                    window=5, epochs=100)

- LSTM

In [None]:
# Size of the vocabulary kept by the embedding layer.
MAX_NB_WORDS = 12000
# Number of tokens every document is padded/truncated to.
# NOTE(review): 250 is a starting value — tune it to the document lengths.
MAX_SEQUENCE_LENGTH = 250
# Dimensionality of the learned word embeddings.
EMBEDDING_DIM = 100

In [None]:
# Embedding -> spatial dropout -> LSTM -> one output unit per tag.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS,
                    EMBEDDING_DIM,
                    input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100,
               dropout=0.2,
               recurrent_dropout=0.2))
# The target is a multi-label indicator matrix (a post can carry several tags),
# so the output layer is sized from the target itself and each tag gets an
# independent sigmoid probability trained with binary cross-entropy. A softmax
# with categorical cross-entropy would force exactly one tag per post.
model.add(Dense(y_train.shape[1], activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

epochs = 5
batch_size = 64

# 10% of the training data is held back for validation; training stops early
# when the validation loss has not improved by at least 1e-4 for 3 epochs.
history = model.fit(X_train, y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_split=0.1,
                    callbacks=[EarlyStopping(monitor='val_loss',
                                             patience=3,
                                             min_delta=0.0001)])

In [None]:
# Evaluate on the held-out set. The target variable is `y_test` (lower-case);
# `Y_test` is never defined in this notebook and raises NameError.
accr = model.evaluate(X_test, y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

In [None]:
# Training vs. validation loss per epoch. The `val_loss` curve comes from the
# validation split carved out of the training data during `fit`, not from the
# held-out test set, so it is labelled accordingly.
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
# Keras records this metric under 'acc' in older releases and 'accuracy' in
# newer ones; pick whichever key this run produced instead of raising KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

plt.title('Accuracy')
plt.plot(history.history[acc_key], label='train')
# The validation curve comes from the split held back during `fit`, not from
# the test set.
plt.plot(history.history['val_' + acc_key], label='validation')
plt.legend()
plt.show()