# Suggestion de Tags - Approche non supervisée

#### Divers: Imports, options

In [1]:
%load_ext pycodestyle_magic
%flake8_on --max_line_length 130

In [2]:
# suppression des "FutureWarning"
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import time as time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk

In [4]:
pd.set_option('display.max_columns', None)

## Chargement des données

In [5]:
# Load the questions CSV; the first CSV column is used as the row index.
# NOTE(review): judging by the file name this is the already-cleaned export - confirm.
df = pd.read_csv("questions_clean.csv", index_col=0)
df

Unnamed: 0,Title,Body,Tags,Text
0,java record array,give follow code public static void main strin...,<java><arrays><java-14><java-record>,java record array give follow code public stat...
1,blue flicker stripe mbp chrome,new macbook pro late catalina recently find we...,<google-chrome><graphics><macos-catalina>,blue flicker stripe mbp chrome new macbook pro...
2,create dependson relation ec rds use aws cdk,currently use aws cdk typescript create stack ...,<amazon-web-services><amazon-ec2><amazon-rds><...,create dependson relation ec rds use aws cdk c...
3,colab class weight cause valueerror truth valu...,run cnn kera sequential google colab get follo...,<python><tensorflow><keras><google-colaborator...,colab class weight cause valueerror truth valu...
4,pas custom function inside foreach object para...,find way pas function variable idea without pu...,<powershell><powershell-core>,pas custom function inside foreach object para...
...,...,...,...,...
8064,daemon stop immediately jvm garbage collector ...,build multi module java project use gradle add...,<java><gradle>,daemon stop immediately jvm garbage collector ...
8065,difference mono flux fromcallable mono defer,usually requirement generate mono flux whose v...,<spring-webflux><project-reactor>,difference mono flux fromcallable mono defer u...
8066,compare generic structs c,want compare structs generic way do something ...,<c++><templates><struct><padding><memcmp>,compare generic structs c want compare structs...
8067,set custom http error kubernetes,want create custom error page currently alread...,<kubernetes><kubernetes-ingress>,set custom http error kubernetes want create c...


In [6]:
# Full text of the post labelled 0 (single label-based lookup instead of chained indexing)
df.loc[0, 'Text']

'java record array give follow code public static void main string args record foo int ints var ints new int var foo new foo ints system println foo foo ints system println new foo new int equal new foo new int false system println new foo ints equal new foo ints true system println foo equal foo true seem obviously array tostring equal method use instead static method array equal array deepequals array tostring guess java record jep work well arrays respective method generate ide least intellij default generates useful method e use static method array solution'

## Tokenization

In [7]:
# Split every text into tokens made of consecutive word characters (regex \w+)
tokenizer = nltk.RegexpTokenizer(r'\w+')
df['Text_tok'] = df['Text'].map(tokenizer.tokenize)

In [8]:
# Per-post token statistics derived from the token lists:
#   FreqDist        - token -> number of occurrences
#   WordCount       - total number of tokens
#   UniqueWordCount - number of distinct tokens
df['FreqDist'] = df['Text_tok'].map(nltk.FreqDist)
df['WordCount'] = df['Text_tok'].map(len)
df['UniqueWordCount'] = df['FreqDist'].map(len)

In [9]:
df

Unnamed: 0,Title,Body,Tags,Text,Text_tok,FreqDist,WordCount,UniqueWordCount
0,java record array,give follow code public static void main strin...,<java><arrays><java-14><java-record>,java record array give follow code public stat...,"[java, record, array, give, follow, code, publ...","{'java': 2, 'record': 3, 'array': 6, 'give': 1...",97,44
1,blue flicker stripe mbp chrome,new macbook pro late catalina recently find we...,<google-chrome><graphics><macos-catalina>,blue flicker stripe mbp chrome new macbook pro...,"[blue, flicker, stripe, mbp, chrome, new, macb...","{'blue': 2, 'flicker': 2, 'stripe': 2, 'mbp': ...",58,49
2,create dependson relation ec rds use aws cdk,currently use aws cdk typescript create stack ...,<amazon-web-services><amazon-ec2><amazon-rds><...,create dependson relation ec rds use aws cdk c...,"[create, dependson, relation, ec, rds, use, aw...","{'create': 2, 'dependson': 2, 'relation': 1, '...",70,43
3,colab class weight cause valueerror truth valu...,run cnn kera sequential google colab get follo...,<python><tensorflow><keras><google-colaborator...,colab class weight cause valueerror truth valu...,"[colab, class, weight, cause, valueerror, trut...","{'colab': 2, 'class': 11, 'weight': 8, 'cause'...",447,187
4,pas custom function inside foreach object para...,find way pas function variable idea without pu...,<powershell><powershell-core>,pas custom function inside foreach object para...,"[pas, custom, function, inside, foreach, objec...","{'pas': 2, 'custom': 1, 'function': 4, 'inside...",36,23
...,...,...,...,...,...,...,...,...
8064,daemon stop immediately jvm garbage collector ...,build multi module java project use gradle add...,<java><gradle>,daemon stop immediately jvm garbage collector ...,"[daemon, stop, immediately, jvm, garbage, coll...","{'daemon': 16, 'stop': 3, 'immediately': 2, 'j...",144,50
8065,difference mono flux fromcallable mono defer,usually requirement generate mono flux whose v...,<spring-webflux><project-reactor>,difference mono flux fromcallable mono defer u...,"[difference, mono, flux, fromcallable, mono, d...","{'difference': 3, 'mono': 9, 'flux': 2, 'fromc...",69,42
8066,compare generic structs c,want compare structs generic way do something ...,<c++><templates><struct><padding><memcmp>,compare generic structs c want compare structs...,"[compare, generic, structs, c, want, compare, ...","{'compare': 4, 'generic': 3, 'structs': 4, 'c'...",103,69
8067,set custom http error kubernetes,want create custom error page currently alread...,<kubernetes><kubernetes-ingress>,set custom http error kubernetes want create c...,"[set, custom, http, error, kubernetes, want, c...","{'set': 1, 'custom': 5, 'http': 7, 'error': 5,...",183,83


## Analyse exploratoire : Visualisations

In [None]:
%%time

# Word counts per post, sorted by increasing total word count
dfPlot = df.sort_values(by="WordCount")[['UniqueWordCount', 'WordCount']]
display(dfPlot)
# Stacked bar chart, one bar per row of dfPlot, y axis clipped at 1500.
# NOTE(review): every post is plotted although the title says "Top 50" - confirm which one is intended.
dfPlot.plot(kind='bar', title='Top 50 post par nombre de mots', stacked=True, ylim=(0, 1500))
plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False) # labels along the bottom edge are off
plt.show()

Unnamed: 0,UniqueWordCount,WordCount
2476,6,8
3024,7,9
3191,10,10
5464,8,10
3299,8,11
...,...,...
2405,388,2913
7546,219,2914
5603,351,3095
4824,680,3143


In [None]:
def var_desc(feat, df, text=False, hist=False, boxplot=False, bpShowFliers=False):
    """Describe one column of a DataFrame as text and/or plots.

    Parameters
    ----------
    feat : label of the column to describe.
    df : DataFrame holding that column.
    text : when True, print min/max, mean, median, mode(s), and the variance and
        standard deviation computed with ``ddof=0``.
    hist : when True, draw a 100-bin histogram of the column.
    boxplot : when True, draw a horizontal box plot of the non-null values,
        with the mean shown as a marker.
    bpShowFliers : forwarded as ``showfliers`` to the box plot.

    Returns
    -------
    None - everything is printed or plotted.
    """
    if text:
        values = df[feat]
        print("Description basique de", feat)
        print("  min max", values.min(), values.max())
        print("  moyenne", round(values.mean(), 3))
        print("  medianne", values.median())
        # mode() returns a Series: print its values one after the other so that the
        # Series repr (index, dtype) does not leak into the report.
        mode_part = "".join(str(value) + " " for value in values.mode())
        print("  mode " + mode_part + " ")
        print("  variance empirique", round(values.var(ddof=0), 3))
        print("  ecart-type", round(values.std(ddof=0), 3))

    if hist:
        df[feat].hist(figsize=(8, 6), bins=100, legend=True)
        plt.show()

    if boxplot:
        figure, axis = plt.subplots(figsize=(4, 3), dpi=80)
        axis.set_title(feat)
        axis.boxplot(
            df[feat].dropna(),
            vert=False,
            showfliers=bpShowFliers,
            medianprops={'color': "black"},
            patch_artist=True,
            showmeans=True,
            meanprops={'marker': 'o', 'markeredgecolor': 'black', 'markerfacecolor': 'firebrick'},
        )
        plt.show()

In [None]:
# Text summary plus box plot without outliers, then the same box plot with outliers shown
var_desc('WordCount', df, text=True, boxplot=True, bpShowFliers=False)
var_desc('WordCount', df, text=False, boxplot=True, bpShowFliers=True)

In [None]:
# Text summary plus box plot without outliers, then the same box plot with outliers shown
var_desc('UniqueWordCount', df, text=True, boxplot=True, bpShowFliers=False)
var_desc('UniqueWordCount', df, text=False, boxplot=True, bpShowFliers=True)

In [None]:
# UniqueWordCount (y) against WordCount (x); dfPlot rows are already sorted by WordCount
dfPlot.plot('WordCount', 'UniqueWordCount')
plt.show()

In [None]:
# Import the wordcloud library
from wordcloud import WordCloud
# Join every value of the 'Text' column into one comma-separated string
long_string = ','.join(list(df['Text'].values))
# Create a WordCloud object (white background, at most 5000 words, 800x400)
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue', width=800, height=400)
# Generate the word cloud from the joined string
wordcloud.generate(long_string)
# Render the word cloud as an image (last expression, so the notebook displays it)
wordcloud.to_image()

## Tf-Idf pour KMeans

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

# TF-IDF vectorizer used as input of the KMeans clustering
vectorizer = TfidfVectorizer(
    max_df=0.5,  # upper document-frequency cut-off (proportion of documents)
    # max_features=opts.n_features,
    min_df=2,  # lower document-frequency cut-off (absolute number of documents)
    stop_words="english",
    use_idf=True,
)

In [None]:
%%time

# Learn the vocabulary on the 'Text' column and build the document-term matrix
X = vectorizer.fit_transform(df['Text'])

In [None]:
# X.shape is unpacked into the two %d placeholders
print("n_samples: %d, n_features: %d" % X.shape)


In [None]:
%%time

# Elbow method
# --------------- /!\ can be very slow depending on the chosen range /!\ ---------------
import seaborn as sns

# The search is disabled by default; flip the flag to recompute the curve.
RUN_ELBOW_SEARCH = False

if RUN_ELBOW_SEARCH:
    sse = {}
    for k in range(10, 400, 50):
        print(k, end=" ")
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X)
        # inertia_: sum of squared distances of the samples to their closest centroid
        sse[k] = kmeans.inertia_
    plt.title('The Elbow Method')
    plt.xlabel('k')
    plt.ylabel('SSE')
    sns.pointplot(x=list(sse), y=list(sse.values()))
    plt.show()

In [None]:
# 40 clusters are chosen based on the elbow plot
nk = 40

In [None]:
%%time

# Seeded like the elbow search above: with a single initialisation (n_init=1) the
# resulting clusters otherwise change from one run of the notebook to the next.
km = KMeans(
    n_clusters=nk,
    init="k-means++",
    max_iter=100,
    n_init=1,
    random_state=42,
)
print("Clustering sparse data with %s" % km)
km.fit(X)

In [None]:
%%time

# The score is estimated on a random sample of 1000 documents; the sampling is seeded
# so that the printed value is reproducible.
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000, random_state=42))

In [None]:
# For every cluster, feature indices sorted by decreasing centroid weight
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
# One line per cluster: its 10 highest-weighted terms
for i in range(nk):
    top_terms = "".join(" %s" % terms[ind] for ind in order_centroids[i, :10])
    print("Topic %d:" % i + top_terms)


## Bag-of-words with LDA

In [None]:
%%time

from gensim.corpora.dictionary import Dictionary
from gensim import models

# Build the gensim dictionary from the tokenized posts, then convert every post
# into its bag-of-words representation with doc2bow
g_text = df['Text_tok'].to_list()
dictionary = Dictionary(g_text)
bow_corpus = [dictionary.doc2bow(text) for text in g_text]

In [None]:
# Sanity check of the dictionary: print its first 11 (key, value) entries
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break

In [None]:
len(dictionary)

In [None]:
%%time

# Train the model on the corpus. The seed matches the one used by
# compute_coherence_values so that the topics are reproducible between runs.
lda = models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary, random_state=42)

In [None]:
lda.get_topics().shape

In [None]:
# For each topic, show the words occurring in that topic and their relative weights
for topic_id, topic_words in lda.print_topics(-1):
    print("Topic: {} \nWords: {}".format(topic_id, topic_words))
    print("\n")


In [None]:
import pyLDAvis as pyLDAv
import pyLDAvis.gensim_models as gensimvis

# Enable inline rendering, then build the visualisation of the LDA model
# (last expression of the cell, so the notebook displays it)
pyLDAv.enable_notebook()
gensimvis.prepare(lda, bow_corpus, dictionary)

In [None]:
from gensim.models import CoherenceModel

# Compute Perplexity
# NOTE(review): gensim's log_perplexity returns the per-word likelihood bound, not the
# perplexity itself (perplexity = 2^(-bound)), so for the printed value higher is better - confirm.
print('\nPerplexity: ', lda.log_perplexity(bow_corpus))

# Compute the c_v coherence score of the model on the tokenized texts
coherence_model_lda = CoherenceModel(model=lda, texts=g_text, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

#### Avec le LDA de Mallet

In [None]:
import os
from gensim.models.wrappers import LdaMallet

# Location of the Mallet installation. An already-defined MALLET_HOME environment
# variable takes precedence, so the notebook can run on another machine without
# editing this cell; the path below is only the fallback.
_DEFAULT_MALLET_HOME = r'C:\Users\beb4e\Documents\OpenClassrooms\P5\mallet-2.0.8\mallet-2.0.8'
os.environ.setdefault('MALLET_HOME', _DEFAULT_MALLET_HOME)

# Mallet launcher, derived from MALLET_HOME so that both always stay in sync
mallet_path = os.path.join(os.environ['MALLET_HOME'], 'bin', 'mallet')

In [None]:
%%time

# Seeded like the models trained in compute_coherence_values so that the run is reproducible
mallet_lda = models.wrappers.LdaMallet(mallet_path, corpus=bow_corpus, num_topics=10, id2word=dictionary,
                                       random_seed=42)

In [None]:
# Convert the Mallet LDA model back to a gensim LDA model for display:
# pyLDAvis does not handle Mallet models.
mallet_lda_backTo_lda = models.wrappers.ldamallet.malletmodel2ldamodel(mallet_lda)
gensimvis.prepare(mallet_lda_backTo_lda, bow_corpus, dictionary)

In [None]:
# Compute Perplexity
# Raises an error with Mallet, so this measure is skipped here

# Compute the c_v coherence score of the Mallet model on the tokenized texts
coherence_model_mallet_lda = CoherenceModel(model=mallet_lda, texts=g_text, dictionary=dictionary, coherence='c_v')
coherence_mallet_lda = coherence_model_mallet_lda.get_coherence()
print('\nCoherence Score: ', coherence_mallet_lda)

### Recherche du nombre optimal de topics

In [None]:
def coherencePlot(coherence_values, start, limit, step):
    """Plot coherence scores against the number of topics.

    The x axis is ``range(start, limit, step)``; ``coherence_values`` must hold
    one score per value of that range, in the same order.
    """
    topic_counts = range(start, limit, step)
    plt.plot(topic_counts, coherence_values, label="coherence_values")
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(loc='best')
    plt.show()

In [None]:
def compute_coherence_values(dictionary, corpus, texts, with_mallet, limit, start=2, step=3, random_state=42):
    """
    Train one topic model per candidate number of topics and score each with c_v coherence.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    with_mallet : If true, train Mallet LDA models (uses the module-level ``mallet_path``),
                  otherwise train gensim LdaModel instances
    limit : Upper bound (exclusive) of the number of topics
    start : First number of topics tried
    step : Increment between two candidate numbers of topics
    random_state : Seed passed to the model (``random_seed`` for Mallet, ``random_state`` for LdaModel)

    Returns:
    -------
    model_list : List of trained topic models, one per value of range(start, limit, step)
    coherence_values : c_v coherence of each model, in the same order as model_list
    """
    def _train(num_topics):
        # Both back-ends receive the same corpus, dictionary and seed
        if with_mallet:
            return models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary,
                                             random_seed=random_state)
        return models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, random_state=random_state)

    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        trained = _train(num_topics)
        model_list.append(trained)
        scorer = CoherenceModel(model=trained, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())

    return model_list, coherence_values

In [None]:
%%time

# Coarse search with gensim LDA over range(2, 40, 6) topics
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=bow_corpus, texts=g_text,
                                                        with_mallet=False,
                                                        start=2, limit=40, step=6)

In [None]:
# Show graph (start, limit, step must match the values used for the search)
coherencePlot(coherence_values, 2, 40, 6)

In [None]:
%%time

# Same coarse search with Mallet LDA; this overwrites model_list and coherence_values
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=bow_corpus, texts=g_text,
                                                        with_mallet=True,
                                                        start=2, limit=40, step=6)

In [None]:
# Show graph (start, limit, step must match the values used for the search)
coherencePlot(coherence_values, 2, 40, 6)

In [None]:
%%time

# Given the previous results, refine the search on a narrower range with a step of 1
limit = 25
start = 5
step = 1


print("----------- LDA -------------")
model_list_lda, coherence_values_lda = compute_coherence_values(
    dictionary=dictionary, corpus=bow_corpus, texts=g_text,
    with_mallet=False,
    start=start, limit=limit, step=step,
)
coherencePlot(coherence_values_lda, start, limit, step)


print("----------- Mallet LDA ------")
model_list_mlda, coherence_values_mlda = compute_coherence_values(
    dictionary=dictionary, corpus=bow_corpus, texts=g_text,
    with_mallet=True,
    start=start, limit=limit, step=step,
)
coherencePlot(coherence_values_mlda, start, limit, step)

In [None]:
# Select the best model: the one with the highest coherence across both families
print("Modele: ", end="")
max_lda = max(coherence_values_lda)
max_mlda = max(coherence_values_mlda)
if max_lda > max_mlda:
    idx = coherence_values_lda.index(max_lda)
    optimal_model = model_list_lda[idx]
    print("LDA (%f)" % max_lda)
else:
    idx = coherence_values_mlda.index(max_mlda)
    optimal_model = model_list_mlda[idx]
    print("Mallet LDA (%f)" % max_mlda)

# The models were trained for range(start, limit, step) topics, so the idx-th model
# has start + idx * step topics (the step must be taken into account when it is not 1).
n_topic = start + idx * step
print("Nombre de topics:", n_topic)

In [None]:

# pyLDAvis needs a gensim LDA model: only convert when the selected model really is a
# Mallet model (the conversion fails on a model that is already a plain LdaModel).
if isinstance(optimal_model, models.wrappers.LdaMallet):
    mallet_lda_backTo_lda = models.wrappers.ldamallet.malletmodel2ldamodel(optimal_model)
else:
    mallet_lda_backTo_lda = optimal_model
gensimvis.prepare(mallet_lda_backTo_lda, bow_corpus, dictionary)

## Finding the dominant topic in each sentence

Le 'taggage' à proprement parler, même si on n'a pas de 'tag' pour le moment, juste une liste de mots-clés.

In [None]:
def format_topics_sentences(ldamodel, corpus, texts):
    """Build a table giving, for every document, its dominant topic.

    Parameters
    ----------
    ldamodel : topic model; ``ldamodel[corpus]`` must yield, per document, a list of
        ``(topic_id, proportion)`` pairs and ``ldamodel.show_topic(topic_id)`` a list
        of ``(word, weight)`` pairs.
    corpus : bag-of-words corpus passed to the model.
    texts : one entry per document, appended as the last column.

    Returns
    -------
    DataFrame with the columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to
    4 decimals), ``Topic_Keywords`` (comma-separated words of the dominant topic) and
    a last column labelled ``0`` holding ``texts``.
    """
    keywords_by_topic = {}  # the keyword string of a topic is the same for every document
    records = []

    for doc_topics in ldamodel[corpus]:
        if len(doc_topics) == 0:
            # Keep a placeholder row so that the rows stay aligned with `texts`
            records.append((None, None, None))
            continue
        # Dominant topic = highest proportion (first one wins in case of a tie)
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame in one go: DataFrame.append no longer exists in pandas 2 and
    # growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# Dominant topic of every document, computed with the selected model
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=bow_corpus, texts=g_text)

# Format: expose the row index as a document number and rename the columns
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic

## Find the most representative document for each topic

TODO : sélectionner les X premiers mots-clés et/ou ceux dont le poids est supérieur à un seuil

      remettre ça avec le vrai document original, plus lisible, pour en extraire le (ou les) tag(s) pertinent(s)


In [None]:
# Load the original questions: the readable version is used to derive relevant tags
df_orig = pd.read_csv("Questions.csv")
df_orig

In [None]:
# Column-wise concatenation (aligned on the row index): original columns first, topic columns after.
# NOTE(review): assumes Questions.csv has the same rows, in the same order, as the cleaned corpus - confirm.
df_topic_sents_keywords_readable = pd.concat([df_orig, df_topic_sents_keywords], axis=1)
df_topic_sents_keywords_readable

In [None]:
# Group Top X sentences under each topic
num_TopSentence = 2

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# For every topic keep the documents with the highest contribution, then stack the
# per-topic slices with a single concat (instead of growing an empty frame in a loop).
sent_topics_sorteddf_mallet = pd.concat(
    [grp.sort_values('Perc_Contribution', ascending=False).head(num_TopSentence)
     for _, grp in sent_topics_outdf_grpd],
    axis=0,
).reset_index(drop=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet

In [None]:
# Group Top X sentences under each topic, readable version
def readableBestNQuestions(n=1):
    """Return, for every topic, its ``n`` most representative questions.

    Reads the module-level ``df_topic_sents_keywords_readable`` frame, groups it by
    ``Dominant_Topic`` and keeps, per topic, the ``n`` rows with the highest
    ``Perc_Contribution``.

    Returns
    -------
    DataFrame with a fresh 0..N-1 index and the columns ``Id``, ``Title``, ``Body``,
    ``ExistingTags``, ``Topic_Num``, ``Topic_Perc_Contrib``, ``Keywords``, ``Text``.
    The renaming expects the source frame to have exactly 8 columns.
    """
    per_topic = df_topic_sents_keywords_readable.groupby('Dominant_Topic')

    # One concat for all the per-topic slices (instead of growing an empty frame in a loop)
    best = pd.concat(
        [grp.sort_values('Perc_Contribution', ascending=False).head(n) for _, grp in per_topic],
        axis=0,
    ).reset_index(drop=True)

    # Format
    best.columns = ['Id', 'Title', 'Body', 'ExistingTags',
                    'Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

    return best

In [None]:
def printBold(text):
    """Print ``text`` in bold (ANSI escape codes), padded to at least 10 characters."""
    bold_template = '\033[1m{:10s}\033[0m'
    print(bold_template.format(text))

In [None]:
from IPython.display import display, HTML
from bs4 import BeautifulSoup


def showReadableDesc(topic, n, showBody=True, showHtml=False):
    """Print up to ``n`` of the most representative questions of one topic.

    Parameters
    ----------
    topic : topic number, compared with the ``Topic_Num`` column.
    n : maximum number of questions to show.
    showBody : when True, also show the body of each question.
    showHtml : with ``showBody``, render the body as HTML instead of plain text.

    A topic may have fewer than ``n`` documents (or none at all): only the available
    ones are shown instead of failing on a missing row.
    """
    # Get Top n sentences for topic
    readable = readableBestNQuestions(n)
    res = readable[readable['Topic_Num'] == topic]
    available = len(res)

    if available == 0:
        printBold("Topic "+str(topic)+": aucun exemple disponible")
        return

    for i in range(available):
        printBold("Topic "+str(topic)+" exemple "+str(i+1)+"/"+str(available))
        print("Keywords:", res['Keywords'].iloc[0])
        print("  Id:", res['Id'].iloc[i], "Contrib:", res['Topic_Perc_Contrib'].iloc[i])
        print("  StackOverflow Tags:", res['ExistingTags'].iloc[i])
        print("  Titre:", res['Title'].iloc[i])
        printBold("**********************************************************************")
        if showBody and showHtml:
            display(HTML(res['Body'].iloc[i]))
        elif showBody:
            # Strip the HTML markup to print the body as plain text
            print(BeautifulSoup(res['Body'].iloc[i], 'html.parser').get_text())


# Example: the 2 most representative questions of topic 1, without their body
showReadableDesc(1, 2, showBody=False)

In [None]:
# Top 3 questions of every topic number from 0 to n_topic - 1, without their body
for i in range(0, n_topic):
    showReadableDesc(i, 3, showBody=False)

## Topic distribution across documents

In [None]:
# Number of documents for each topic (indexed by topic number)
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Share of documents for each topic (indexed by topic number)
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Keywords of each topic, indexed by topic number as well. The three pieces must share
# the same index: concatenating a positionally-indexed keyword table with counts indexed
# by topic number attaches the counts to the wrong topics.
topic_num_keywords = (
    df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]
    .dropna(subset=['Dominant_Topic'])
    .drop_duplicates(subset=['Dominant_Topic'])
    .set_index('Dominant_Topic')
)

# Concatenate column wise (aligned on the topic number)
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change column names
df_dominant_topics.columns = ['Topic_Keywords', 'Num_Documents', 'Perc_Documents']
df_dominant_topics.index.name = 'Dominant_Topic'

# Show
df_dominant_topics.sort_values(['Perc_Documents'], ascending=False)