In [1]:
import numpy as np
import pandas as pd
from gensim import corpora # https://pypi.org/project/gensim/ #https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html#sphx-glr-auto-examples-core-run-topics-and-transformations-py
from sklearn import (
    metrics,
    feature_extraction,
    linear_model,
    model_selection,
    preprocessing,
    cluster,
    multiclass,
    svm,
)

In [2]:
pip install jupyter-black jupyter

Collecting jupyter-black
  Downloading jupyter_black-0.3.3-py3-none-any.whl (8.3 kB)
Collecting tokenize-rt>=4.1.0
  Downloading tokenize_rt-5.0.0-py2.py3-none-any.whl (5.8 kB)
Installing collected packages: tokenize-rt, jupyter-black
Successfully installed jupyter-black-0.3.3 tokenize-rt-5.0.0
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
# for black formating
import jupyter_black  # pip install jupyter-black jupyter

jupyter_black.load()
# ??jupyter_black.load #show config

In [4]:
df = pd.read_csv(
    "/kaggle/input/p5-nlp/stackoverflowCleaned.csv", index_col=0, low_memory=False
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118442 entries, 0 to 118441
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Id           118442 non-null  int64 
 1   Title        118442 non-null  object
 2   Body         118442 non-null  object
 3   0            118440 non-null  object
 4   1            118383 non-null  object
 5   2            118292 non-null  object
 6   3            118292 non-null  object
 7   4            76389 non-null   object
 8   5            4 non-null       object
 9   Title_clean  118389 non-null  object
 10  Code         88202 non-null   object
 11  Body_clean   118392 non-null  object
dtypes: int64(1), object(11)
memory usage: 11.7+ MB


In [5]:
df["0"]

0                java
1                 git
2                 git
3                 git
4                 c++
             ...     
118437         iphone
118438    google-maps
118439     javascript
118440           java
118441             c#
Name: 0, Length: 118442, dtype: object

In [6]:
from ipywidgets import interact


@interact(i=(0, df.shape[0] - 1))
def test_tr_print(i):
    print(
        f'Title:{df["Title"][i]}\n{"*"*30}\nTitle_clean:{df["Title_clean"][i]}\n{"*"*30}\nBody:{df["Body"][i]}\n{"*"*30}\nBody_clean:{df["Body_clean"][i]}\n{"*"*30}\nCode:{df["Code"][i]}\n{"*"*30}\nTags:{df.loc[i,"0"]} {df.loc[i,"1"]} {df.loc[i,"2"]} {df.loc[i,"3"]} {df.loc[i,"4"]}'
    )

interactive(children=(IntSlider(value=59220, description='i', max=118441), Output()), _dom_classes=('widget-in…

In [7]:
sum(df["Title_clean"].isna())

53

In [8]:
df[df["Title_clean"].isna()][
    ["Title", "Title_clean", "Body", "Body_clean", "0", "1", "2", "3", "4"]
]

Unnamed: 0,Title,Title_clean,Body,Body_clean,0,1,2,3,4
302,Is < faster than <=?,,<p>Is <code>if (a &lt; 901)</code> faster than...,example performance change loop code suppose g...,c++,c,performance,assembly,relational-operators
393,What is setup.py?,,<p>What is <code>setup.py</code> and how can i...,configure,python,pypi,setup.py,python-packaging,
1437,What is related_name used for?,,<p>What is the <code>related_name</code> argum...,argument manytomanyfield foreignkey field exam...,python,django,foreign-keys,many-to-many,
4062,"What is ""android:allowBackup""?",,"<p>Since the <a href=""http://tools.android.com...",adt preview version version lint warning tell ...,android,adt,compiler-warnings,android-lint,android-backup-service
4870,Why is 0 < -0x80000000?,,<p>I have below a simple program:</p>\n\n<pre>...,program condition if(bal int32_min work change...,c,signed,numeric-limits,numeric-conversion,
5007,Make UINavigationBar transparent,,<p>How do you make a <strong>UINavigationBar t...,uinavigationbar want bar item remain,ios,iphone,objective-c,uinavigationbar,transparency
7467,"vim ""modifiable"" is off",,<p>I am trying to create a new file with NERDT...,try create file nerdtree hit key create file m...,vim,vi,macvim,nerdtree,
9122,~x + ~y == ~(x + y) is always false?,,<p>Does this code always evaluate to false? Bo...,code evaluate variable complement sign int fee...,c,bit-manipulation,signed,twos-complement,
12942,'do...while' vs. 'while',,<blockquote>\n <p><strong>Possible Duplicates...,duplicate vs. use loop program year work year ...,c#,c++,c,while-loop,do-while
15258,"Hashable, immutable",,"<p>From a recent SO question (see <a href=""htt...",question create dictionary python index list r...,python,data-structures,hash,immutability,


In [9]:
print(sum(df["Body_clean"].isna()))
df[df["Body_clean"].isna()][["Title", "Title_clean", "Body", "Body_clean", "Code"]]

50


Unnamed: 0,Title,Title_clean,Body,Body_clean,Code
5,What and where are the stack and heap?,stack heap,<ul>\n<li>What are the stack and heap?</li>\n<...,,
168,What is The Rule of Three?,rule,<ul>\n<li>What does <em>copying an object</em>...,,
828,Why are these constructs using pre and post-in...,construct pre behavior,<pre><code>#include &lt;stdio.h&gt;\n\nint mai...,,#include <stdio.h>\n\nint main(void)\n{\n in...
1207,How to use ADB Shell when Multiple Devices are...,use adb shell multiple device connect fail err...,<pre><code>$ adb --help\n</code></pre>\n<hr />...,,$ adb --help\n -s SERIAL use device with give...
1640,What are the advantages of list initialization...,advantage list initialization brace,<pre><code>MyClass a1 {a}; // clearer and ...,,MyClass a1 {a}; // clearer and less error-...
2129,How to use GROUP BY to concatenate strings in ...,use group concatenate string sql server,<p>How do I get:</p>\n\n<pre><code>id Na...,,id Name Value\n1 A ...
2242,curl: (60) SSL certificate problem: unable to ...,curl ssl certificate problem certificate,<pre><code>root@sclrdev:/home/sclr/certs/Fresh...,,root@sclrdev:/home/sclr/certs/FreshCerts# curl...
3014,Picasso v/s Imageloader v/s Fresco vs Glide vs...,picasso v s imageloader v s fresco vs glide vs...,<h2>Findings:</h2>\n<ol>\n<li>Difference betwe...,,
5990,Algorithm to implement a word cloud like Wordle,algorithm implement word cloud like wordle,<h2>Context</h2>\n\n<ul>\n<li>Take a look at W...,,
8810,Checking if array is multidimensional or not?,check array,<ol>\n<li>What is the most efficient way to ch...,,is_array()


In [10]:
df.shape[0]

118442

In [11]:
df["Title_clean"] + df["Body_clean"]

0         process array process arraypiece c++ code show...
1         undo commit gitcommit file git push commit ser...
2         delete git branchdelete remote origin bugfix b...
3         difference git pull git fetchdifference git pu...
4         operator c++read hidden features dark corners ...
                                ...                        
118437    nsstring boundingrectwithsize cut height coret...
118438    mvxbind error view type find fragmenttry add m...
118439    marker show iecreate google maps application s...
118440    javamail ntlm failtry send email java network ...
118441    understand objectlook question like find throw...
Length: 118442, dtype: object

In [12]:
df["Token"] = df["Title_clean"].fillna("") + df["Body_clean"].fillna("")
LONG = False
for min_df in [0.01, 0.05, 0.005, 0.0005, 0.00001] if LONG else [0.00001]:
    for max_df in [0.95, 0.995, 0.9995] if LONG else [0.995]:
        print(f"\n{'*'*30}\n for min_df:{min_df}; max_df:{max_df}")
        count_vectorizer = feature_extraction.text.CountVectorizer(
            min_df=min_df, max_df=max_df, max_features=None
        )

        # title_vectors = count_vectorizer.fit_transform(df["Title_clean"].fillna(""))
        # body_vectors = count_vectorizer.fit_transform(df["Body_clean"].fillna(""))
        # print(
        #    f"\ttokens unique:\n\t\t{title_vectors.shape[1]} dans title\n\t\t{body_vectors.shape[1]} dans body"
        # )
        token_vectors = count_vectorizer.fit_transform(df["Token"])
        print(
            f"\ttokens unique:\n\t\t{token_vectors.shape[1]} dans title et body regroupé"
        )


******************************
 for min_df:1e-05; max_df:0.995
	tokens unique:
		53999 dans title et body regroupé


In [13]:
# title_tfid_transformer = feature_extraction.text.TfidfTransformer(
#     norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=False
# ).fit(title_vectors)
# title_tfid = title_tfid_transformer.transform(title_vectors)
# title_tfid.todense()

In [14]:
# body_tfid = feature_extraction.text.TfidfTransformer(
#     norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=False
# ).fit_transform(body_vectors)
# body_tfid.todense()

In [15]:
tfidf = feature_extraction.text.TfidfTransformer(
   norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=False
).fit_transform(token_vectors)

In [16]:
tfidf[0:100]

<100x53999 sparse matrix of type '<class 'numpy.float64'>'
	with 1607 stored elements in Compressed Sparse Row format>

In [17]:
[text for text in [token for token in tfidf[0:100]]]

[<1x53999 sparse matrix of type '<class 'numpy.float64'>'
 	with 36 stored elements in Compressed Sparse Row format>,
 <1x53999 sparse matrix of type '<class 'numpy.float64'>'
 	with 8 stored elements in Compressed Sparse Row format>,
 <1x53999 sparse matrix of type '<class 'numpy.float64'>'
 	with 6 stored elements in Compressed Sparse Row format>,
 <1x53999 sparse matrix of type '<class 'numpy.float64'>'
 	with 4 stored elements in Compressed Sparse Row format>,
 <1x53999 sparse matrix of type '<class 'numpy.float64'>'
 	with 23 stored elements in Compressed Sparse Row format>,
 <1x53999 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 <1x53999 sparse matrix of type '<class 'numpy.float64'>'
 	with 15 stored elements in Compressed Sparse Row format>,
 <1x53999 sparse matrix of type '<class 'numpy.float64'>'
 	with 6 stored elements in Compressed Sparse Row format>,
 <1x53999 sparse matrix of type '<class 'numpy.float64'>'
 	w

In [18]:
[token for token in df["Token"]]

['process array process arraypiece c++ code show behavior reason sort datum region make loop time sort take time pass array need calculate array think language compiler anomaly try java result thought sorting bring datum cache array generate code sum term order matter relate follow q&a effect compiler option',
 'undo commit gitcommit file git push commit server undo commit repository',
 'delete git branchdelete remote origin bugfix branch',
 'difference git pull git fetchdifference git pull git fetch',
 'operator c++read hidden features dark corners c++/stl comp.lang.c++.moderate follow snippet compile work visual studio g++ code output assume c work gcc define standard come',
 'stack heap',
 'force git pull overwrite fileforce overwrite file git pull repository contain file filename server error working tree file example.txt overwrite merge',
 'check element hide jquerytoggle visibility element .show test element',
 'use javascript reasoningrun javascript code crockford jslint give fo

In [19]:
#corpora.Dictionary(token["Token"])

In [20]:
def tag_is_in(df: pd.DataFrame(), tag: str, nb_cols: int = 6):
    return eval("|".join(f'(df["{i}"] == "{tag}")' for i in range(0, nb_cols)))

def tags_are_in(df: pd.DataFrame(), tags: list):
    return pd.DataFrame({tag:tag_is_in(df,tag) for tag in tags})

# tags_are_in(df, ["javascript","java"])
tags_are_in(df[10001:15050], ["javascript","java","c#","python","html","git","php"])

Unnamed: 0,javascript,java,c#,python,html,git,php
10001,True,False,False,False,True,False,False
10002,False,True,False,False,False,False,False
10003,False,False,True,False,False,False,False
10004,False,False,False,False,True,False,False
10005,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
15045,False,False,False,False,False,False,False
15046,False,False,False,False,False,False,False
15047,True,False,False,False,False,False,False
15048,False,False,False,False,False,False,False


In [21]:
OvsRestAll = multiclass.OneVsRestClassifier(svm.SVC(random_state=42)).fit(
    tfidf[0:10000], tags_are_in(df[0:10000], ["javascript","java","c#","python","html","git","php"])
)

In [22]:
pred = OvsRestAll.predict(tfidf[10001:15050])
pred

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [23]:
len(pred)

5049

In [24]:
target_names = ["javascript","java","c#","python","html","git","php"]
print(metrics.classification_report(tags_are_in(df[10001:15050], target_names), pred , target_names=target_names, zero_division=0)) #zero_division for the warning

              precision    recall  f1-score   support

  javascript       0.82      0.45      0.58       564
        java       0.85      0.46      0.60       584
          c#       0.74      0.15      0.26       544
      python       0.94      0.59      0.73       479
        html       0.76      0.29      0.42       259
         git       0.90      0.70      0.79        74
         php       0.88      0.27      0.41       200

   micro avg       0.85      0.40      0.54      2704
   macro avg       0.84      0.42      0.54      2704
weighted avg       0.83      0.40      0.52      2704
 samples avg       0.21      0.20      0.20      2704



> - Recall: The ability of a model to find all the relevant cases within a data set. Mathematically, we define recall as the number of true positives divided by the number of true positives plus the number of false negatives. 
> - Precision: The ability of a classification model to identify only the relevant data points

In [25]:
# OvsRClf_js = multiclass.OneVsRestClassifier(svm.SVC(random_state=42)).fit(
#     body_tfid[0:5000], tag_is_in(df[0:5000], "javascript")
# )

In [26]:
# pred = OvsRClf_js.predict(body_tfid[5000:6050])
# pred

In [27]:
# tag_is_in(df[5000:6050], "javascript")

In [28]:
# def evaluate_pred(reels=tag_is_in(df[5000:6050], "javascript"), preds=pred):
def evaluate_pred(reels, preds):
    fauxVrai = 0
    vraiFaux = 0
    bonVrai = 0
    bonFaux = 0
    bon = 0
    faux = 0
    for reel, pred in zip(reels, preds):
        # print(a, b)
        # print("test", a == b)
        if reel and pred:
            bonVrai += 1
        if reel and (not pred):
            fauxVrai += 1
        if (not reel) and pred:
            vraiFaux += 1
        if (not reel) and (not pred):
            bonFaux += 1
        if reel == pred:
            bon += 1
        else:
            faux += 1
    print(f"taille {len(preds)}")
    print(
        f"predit faux alors que vrai:{fauxVrai}\npredit vrai alors que faux:{vraiFaux}\nbonVrai:{bonVrai}(/{sum(preds)}) bonFaux:{bonFaux}(/{len(preds) - sum(preds)})"
    )
    print(f"bon:{bon} faux:{faux}")
    # print(f"accuracy:{(bonVrai+bonFaux)/(fauxVrai+vraiFaux):.2%}")
    print(f"Accuracy:{(bon)/(len(preds)):.2%}")
    # print(f"Recall: {bonVrai/(bonVrai+vraiFaux):.2%}")
    print(
        f"Recall: {bonFaux/(len(preds) - sum(preds)):.2%}"
    )  # -> 100% Recall: No false negatives, every negative prediction is correct.
    print(
        f"Precision: {bonVrai/sum(preds):.2%}"
    )  # -> 100% Precision: No false positives, every positive prediction is correct
    # print(f"Precision2: {bonVrai/(bonVrai+fauxVrai):.2%}")


# evaluate_pred(reels=tag_is_in(df[5000:6050], "javascript"), preds=pred)

In [29]:
#OvsRClf_title_js = multiclass.OneVsRestClassifier(svm.SVC(random_state=42)).fit(
#    title_tfid[0:1000], tag_is_in(df[0:1000], "javascript")
#)

In [30]:
#pred_title_js = OvsRClf_title_js.predict(title_tfid[1000:1050])

In [31]:
#evaluate_pred(reels=tag_is_in(df[1000:1050], "javascript"), preds=pred_title_js)

In [32]:
# def OvR_train(
#     start: int = 0, end: int = 50000, tag: str = "java", to_pred=(50001, 100000)
# ):
#     reels = tag_is_in(df[start:end], tag)
#     OvsRClf_body = multiclass.OneVsRestClassifier(svm.SVC(random_state=42)).fit(
#         body_tfid[start:end], reels
#     )
#     OvsRClf_title = multiclass.OneVsRestClassifier(svm.SVC(random_state=42)).fit(
#         title_tfid[start:end], reels
#     )
#     return {
#         "pred_body": OvsRClf_body.predict(body_tfid[to_pred[0] : to_pred[1]]),
#         "pred_title": OvsRClf_title.predict(title_tfid[to_pred[0] : to_pred[1]]),
#         "reels_p": tag_is_in(df[to_pred[0] : to_pred[1]], tag),
#     }


# pred_java = OvR_train(start=0, end=10000,tag="java", to_pred=(10001,100000))

In [33]:
# evaluate_pred(reels=tag_is_in(df[10001:100000], "java"), preds=pred_java["pred_body"])

In [34]:
# evaluate_pred(reels=tag_is_in(df[10001:100000], "java"), preds=pred_java["pred_title"])

3 approches de Word/Sentence Embedding : Word2Vec (ou Doc2Vec, Glove…), BERT et USE. 

https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation