<a href="https://colab.research.google.com/github/AnthonyNama/Categoriser-automatiquement-des-questions/blob/master/P5_03_notebookApi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import html
import os
import pickle
import re

import nltk
import sklearn
import spacy
import xgboost as xgb
from abbreviations import schwartz_hearst
from flask import Flask, render_template, url_for, request
#from gensim.corpora.dictionary import Dictionary
from joblib import load
from numpy import loadtxt

```
Preprocessing
```

In [0]:
# Load spaCy's small English pipeline once so that every preprocessing helper
# in this notebook shares the same `nlp` object. The listed components are
# disabled; the helpers only read token text, lemmas and the stop-word /
# punctuation flags.
# NOTE(review): depending on the installed spaCy version, `token.lemma_` may
# rely on a component disabled here — confirm lemmas are still produced.
nlp = spacy.load('en_core_web_sm', disable=["tagger", "parser", "ner", "textcat"])

In [0]:
def lowercase(val):
    """Return ``val`` lower-cased, with leading and trailing whitespace removed."""
    lowered = val.lower()
    return lowered.strip()

def abbreviation(text):
    """Replace abbreviations in ``text`` by their full value.

    Abbreviation/definition pairs are extracted from ``text`` itself with the
    Schwartz-Hearst helper. A token is replaced by its definition only when it
    is one of the extracted abbreviations AND is present in the module-level
    ``mean_abbr`` collection; every other token is kept unchanged.

    Returns the spaCy tokens (after replacement) joined by single spaces.
    """
    definitions = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=text)

    def expand(word):
        # Both conditions must hold: defined in this text and listed in mean_abbr.
        if word in definitions and word in mean_abbr:
            return definitions[word]
        return word

    return ' '.join(expand(token.text) for token in nlp(text))
    
def remove_contract_form(text):
    """Expand English contracted forms in ``text``.

    The text is split with ``nltk.word_tokenize``; each token then goes through
    the substitutions below, in order (for example ``n't`` becomes ``not`` and
    ``'ll`` becomes ``will``; ``'s`` is replaced by a space). The rewritten
    tokens are joined back together with single spaces.
    """
    # (pattern, replacement) pairs, applied to every token in this exact order.
    rewrites = (
        (r"\'m", "am"),
        (r"\'re", "are"),
        (r"\'s", " "),
        (r"n't", "not"),
        (r"\'ve", "have"),
        (r"\'d", "would"),
        (r"\'ll", "will"),
    )
    expanded = []
    for token in nltk.word_tokenize(text):
        for pattern, replacement in rewrites:
            token = re.sub(pattern, replacement, token)
        expanded.append(token)
    return ' '.join(expanded)

def del_stop_words(text):
    """Delete stop words: keep only tokens whose ``is_stop`` flag is false.

    Returns the remaining token texts joined by single spaces.
    """
    kept = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept.append(token.text)
    return ' '.join(kept)

def del_punct(text):
    """Delete punctuation: keep only tokens whose ``is_punct`` flag is false.

    Returns the remaining token texts joined by single spaces.
    """
    non_punct = filter(lambda token: not token.is_punct, nlp(text))
    return ' '.join(token.text for token in non_punct)

def lemmatization(text):
    """Lemmatize ``text``.

    Every non-punctuation token is replaced by its ``lemma_``; punctuation
    tokens are dropped. The lemmas are joined by single spaces.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

# Remove special characters and bad symbols.
# Raw strings keep the regex escapes intact without invalid-escape warnings.
# The hyphen is escaped so it is a literal '-' instead of forming the
# accidental range ';'..'_' (which also matched 'A'-'Z', '<', '=', '>', '?',
# '^', ...).
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;\-_+]')
# Anything that is not a digit, a lowercase ASCII letter, a space or '#'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #]')
def clean_spec_bad(text):
    """Replace separators and unsupported symbols in ``text`` by spaces.

    First every character matched by ``REPLACE_BY_SPACE_RE`` becomes a space,
    then every character matched by ``BAD_SYMBOLS_RE`` (anything outside
    ``0-9``, ``a-z``, space and ``#``) becomes a space as well. The length of
    the string is preserved; runs of spaces are not collapsed here.
    """
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub(' ', text)
    return text

def remove_numbers(text):
    """Drop every token whose text consists only of digits (``str.isdigit``).

    Returns the remaining token texts joined by single spaces.
    """
    kept = []
    for token in nlp(text):
        word = token.text
        if not word.isdigit():
            kept.append(word)
    return ' '.join(kept)

def delete_multiple_space(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    pieces = text.split()
    return ' '.join(pieces)

def low_length(text):
    """Delete one-character tokens unless they appear in ``tags``.

    Tokens longer than one character are always kept; shorter tokens are kept
    only when their text is found in the module-level ``tags`` collection.
    Returns the kept token texts joined by single spaces.
    """
    kept = [
        token.text
        for token in nlp(text)
        if len(token.text) > 1 or token.text in tags
    ]
    return ' '.join(kept)

def remove_numbers_from_string(text):
    """Strip digit characters from inside every token of ``text``.

    Each token keeps its non-digit characters; a token made only of digits
    becomes an empty string. The results are joined by single spaces, so
    consecutive spaces can appear in the output.
    """
    def strip_digits(word):
        return ''.join(ch for ch in word if not ch.isdigit())

    return ' '.join(strip_digits(token.text) for token in nlp(text))

In [0]:
def preprocessing(text):
    """Run the full text-cleaning pipeline on ``text`` and return the result.

    The steps are applied in a fixed order, each one receiving the output of
    the previous one: lower-casing, abbreviation expansion, contraction
    expansion, symbol cleanup, stop-word removal, digit stripping, whitespace
    collapsing, short-token removal and finally lemmatization.
    """
    pipeline = (
        lowercase,
        abbreviation,
        remove_contract_form,
        clean_spec_bad,
        del_stop_words,
        remove_numbers_from_string,
        delete_multiple_space,
        low_length,
        lemmatization,
    )
    for step in pipeline:
        text = step(text)
    return text

```
Save and load objects
```

In [0]:
def save_obj(obj, name):
    """Pickle ``obj`` to ``obj/<name>.pkl`` using the highest pickle protocol.

    The target directory is created when it does not exist yet, so saving
    works on a fresh checkout / runtime instead of raising FileNotFoundError.
    """
    path = 'obj/'+ name + '.pkl'
    # dirname also covers names that contain a sub-directory.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Unpickle and return the object stored in ``obj/<name>.pkl``.

    Security note: ``pickle.load`` can execute arbitrary code while loading,
    so only use this on files that come from a trusted source.
    """
    path = 'obj/' + name + '.pkl'
    with open(path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [0]:
# Collections consulted by the preprocessing helpers:
# `tags` lets low_length() keep one-character tokens that are known tags;
# `mean_abbr` lists the abbreviations abbreviation() is allowed to expand.
tags = load_obj('tags')
mean_abbr = load_obj('mean_abbr')

In [0]:
# Pickled text vectorizer; prediction() calls vectorizer.transform() on the
# preprocessed text to build the classifier input.
vectorizer = load_obj('vectorizer')

In [0]:
# Pickled classifier; prediction() uses its predict_proba() and classes_ to
# propose tags.
xgb_model = load_obj('model')

In [0]:
# Pickled dictionary; prediction() calls dictionary.doc2bow() to turn the
# preprocessed words into the bag-of-words input of the LDA model.
dictionary = load_obj('dictionary')

2020-03-19 16:58:14,721 : INFO : 'pattern' package found; tag filters are available for English


In [0]:
# Pickled LDA topic model; prediction() queries get_document_topics() on it and
# display_keywords() reads its topics through show_topics().
lda_model = load_obj('lda_model')

```
Constants
```

In [0]:
# Number of most probable classes that prediction() keeps as proposed tags.
n_labels = 3

In [0]:
# Number of words requested per topic when prediction() calls display_keywords().
no_top_words = 10

```
Prediction function and sub-functions
```

In [0]:
def display_keywords(model, no_top_words, num_topic, num_topics=30):
    """Return the top words of one topic of ``model`` as a space-separated string.

    Args:
        model: object exposing ``show_topics(formatted, num_topics, num_words)``
            that yields ``(topic_id, [(word, weight), ...])`` pairs.
        no_top_words: number of words requested per topic.
        num_topic: id of the topic whose words are wanted.
        num_topics: number of topics requested from the model (default 30).

    Returns:
        The words of the first topic whose id equals ``num_topic``, joined by
        spaces, or ``None`` when no such topic is returned by the model.
    """
    topics = model.show_topics(formatted=False, num_topics=num_topics, num_words= no_top_words)
    candidates = (words for topic_id, words in topics if topic_id == num_topic)
    selected = next(candidates, None)
    if selected is None:
        return None
    return " ".join(entry[0] for entry in selected)

In [0]:
def num_topic(topic_pro):
    """Return the id of the most probable topic.

    ``topic_pro`` is an iterable of ``(topic_id, probability)`` entries. The
    entry with the highest probability wins; on a tie the later entry is
    chosen. ``-1`` is returned when there is no entry with a probability of at
    least 0 (for example when ``topic_pro`` is empty).
    """
    best_topic, best_score = -1, 0
    for entry in topic_pro:
        score = entry[1]
        if score >= best_score:
            best_score = score
            best_topic = entry[0]
    return best_topic

In [0]:
def prediction(body, title):
    """Propose keywords and tags for a question.

    ``body`` and ``title`` are concatenated (body first, separated by a space)
    and preprocessed. The cleaned text is then used twice:

    * vectorized and fed to ``xgb_model.predict_proba``; the ``n_labels``
      classes with the highest probability become the proposed tags, most
      probable first;
    * converted to a bag of words for ``lda_model``; the words of its most
      probable topic become the proposed keywords.

    Both results are printed and returned as ``(keywords, labels)``, where
    ``keywords`` is whatever ``display_keywords`` returns and ``labels`` is a
    list of entries from ``xgb_model.classes_``.
    """
    text = preprocessing(body + " " + title)

    # The vectorizer expects a collection of documents, hence the one-item list.
    features = vectorizer.transform([text])
    bag_of_words = dictionary.doc2bow(text.split())

    probabilities = xgb_model.predict_proba(features)
    # Indices of the n_labels highest probabilities, in descending order.
    top_indices = probabilities[0].argsort()[-n_labels:][::-1]

    topic_id = num_topic(lda_model.get_document_topics(bag_of_words))
    keywords = display_keywords(lda_model, no_top_words, topic_id)

    labels = [xgb_model.classes_[index] for index in top_indices]

    print("The keywords proposed : {}".format(keywords))
    print("The tags proposed : {}".format(labels))
    return (keywords, labels)

```
Building API
```

In [0]:
# Flask application object; the route decorators below register their views on it.
app = Flask(__name__)
 
# POST is accepted in addition to GET because the "Try again" form of the
# result page submits to "/" with method POST.
@app.route('/', methods = ['GET','POST'])
def main():
    """Serve the landing page.

    The page is a static HTML form with a ``title`` and a ``body`` textarea
    that is submitted with POST to the relative URL ``predict``.
    """
    return """<!DOCTYPE html>
                <html>
                    <head>
                         <title>Catégoriser auto questions</title>
                    </head>
                    <body>
                        <div align="center" class="bg-info">
                            <h1>Stack Overflow predict keywords-tags</h1>
                        </div>
                        <div class="big" align="center">
                            <form action="predict" method="POST">
                                <h3>Enter Your Title</h3>
                                <textarea name="title" rows="1" cols="70"></textarea>
                                <br>
                                <h3>Enter Your Body</h3>
                                <textarea name="body" rows="20" cols="70"></textarea>
                                   <br><br><br>
                               <input type="submit" name="" value="Predict" class="btn btn-info">
                              </form>
                         </div>
                    </body>
                </html>
                """

def _render_result_page(title, body, keywords, labels):
    """Build the HTML result page for one prediction.

    Args:
        title: question title as typed by the user.
        body: question body as typed by the user.
        keywords: iterable of proposed keywords (may be empty).
        labels: iterable of proposed tags (may be empty).

    Every dynamic value is HTML-escaped before being inserted, so user input
    such as ``</textarea><script>...`` is displayed as text instead of being
    interpreted by the browser.
    """
    def _box(value, prefix=""):
        # One small read-only textarea per keyword / tag.
        return ('<textarea rows="1" cols="10" disabled>'
                + prefix + html.escape(str(value)) + '</textarea>')

    separator = "\n                          "
    keyword_boxes = separator.join(_box(word) for word in keywords)
    label_boxes = separator.join(_box(label, "#") for label in labels)

    return """
            <!DOCTYPE html>
            <html>
                <head>
                     <title>Catégoriser auto questions</title>
                </head>
                <body>
                    <div class="big" align="center">
                                <h4>Title entered</h4>
                                <textarea name="title" rows="1" cols="100" disabled>""" + html.escape(title) + """</textarea>
                                <br>
                                <h4>Body entered</h4>
                                <textarea name="body" rows="7" cols="100" disabled>""" + html.escape(body) + """</textarea>
                                   <br><br>
                    </div>
                     <div class="bg-info" align="center">
                         <h2>The keywords proposed</h2>
                     </div>
                     <div align="center">
                          """ + keyword_boxes + """
                          <h2>The tags proposed</h2>
                          """ + label_boxes + """
                     </div>
                     <br><br><br>
                     <form action="/" method="POST" align="center">
                         <input type="submit" name="" value="Try again" class="btn btn-info">
                     </form>
                </body>
            </html>
        """


@app.route('/predict', methods = ['POST'])
def predict():
    """Handle the submitted form and show the proposed keywords and tags.

    Reads the ``title`` and ``body`` form fields, runs ``prediction`` on them
    and renders the result page. The number of keyword and tag boxes follows
    what ``prediction`` actually returns, so fewer than 10 keywords or 3 tags
    no longer raises ``IndexError``.
    """
    # The route only accepts POST, so the form is always available here.
    title = str(request.form['title'])
    body = str(request.form['body'])

    # prediction() preprocesses the text itself; no separate pass is needed.
    keywords, labels = prediction(body, title)

    # display_keywords() returns None when no topic matches; show no keywords then.
    keys = keywords.split() if keywords else []
    return _render_result_page(title, body, keys, labels)
# Start Flask's built-in server only when this code is executed directly (not
# when it is imported). app.run() keeps running until it is interrupted.
if __name__ == "__main__":
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


2020-03-19 17:11:48,167 : INFO :  * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
2020-03-19 17:11:49,590 : INFO : 127.0.0.1 - - [19/Mar/2020 17:11:49] "[37mGET / HTTP/1.1[0m" 200 -
2020-03-19 17:13:03,149 : INFO : 127.0.0.1 - - [19/Mar/2020 17:13:03] "[37mPOST /predict HTTP/1.1[0m" 200 -


The keywords proposed : library object event create formula task directive use export component
The tags proposed : ['javascript', 'c++', 'python']
