In [1]:
# Python and WSGI libraries
from flask import Flask, render_template, request, send_file
from werkzeug.utils import secure_filename
import warnings
warnings.filterwarnings("ignore")      # suppress every Python warning for the whole process (NOTE: this also hides deprecation notices from the libraries used below)

app = Flask(__name__)       # the Flask application object; the @app.route decorators in this file register their views on it

@app.route('/')
def upload():
    """Serve the landing page: the HTML form through which the user uploads a document."""
    landing_page = "upload-docx.html"
    return render_template(landing_page)

@app.route('/select_model', methods = ['GET', 'POST']) # receives the upload form's POST and prepares the features for the models
def nlp_algos():
    """Store the uploaded .docx file, derive similarity features and show the model picker.

    On POST the file sent in the ``document`` form field is saved in the
    current working directory under its sanitised name.  Its paragraphs longer
    than 201 characters are read, and for every pair of consecutive paragraphs
    the TF-IDF and count-vector cosine similarities are computed twice: once on
    stemmed and once on lemmatized tokens.

    The outcome is published through module-level globals: ``result`` (the
    feature table with the columns ``stem_cosout_tfidf``, ``stem_cosout_cv``,
    ``lemma_cosout_tfidf``, ``lemma_cosout_cv``) and ``df_doc`` (the paragraph
    text).  The ``/output`` view reads both.

    Returns:
        The rendered ``select-model.html`` page after a successful upload, or
        the ``upload-docx.html`` form when there is nothing to process (GET
        request, or no file was chosen).
    """
    if request.method != 'POST':
        # Nothing was uploaded. Show the form instead of falling through and
        # returning None, which Flask rejects as an invalid response.
        return render_template("upload-docx.html")

    f = request.files['document']
    # secure_filename() does more than replace spaces with underscores, so the
    # path that is read back must be the exact name the file was saved under,
    # not a separately re-derived guess.
    loc = secure_filename(f.filename or '')
    if not loc:
        # No file chosen (or the name sanitised to nothing): ask for the upload again.
        return render_template("upload-docx.html")
    f.save(loc)

    # Heavy NLP libraries are imported lazily so that the app starts quickly.
    from docx import Document
    from joblib import load, dump
    import pandas as pd
    from nltk import word_tokenize
    from nltk.stem.porter import PorterStemmer
    from nltk.stem import WordNetLemmatizer
    import re
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import HashingVectorizer
    from nltk.tokenize.treebank import TreebankWordDetokenizer
    import gensim
    from gensim import models
    from gensim.matutils import softcossim
    from gensim import corpora
    import gensim.downloader as api
    from gensim.utils import simple_preprocess
    from gensim.models.word2vec import Word2Vec

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Optional word embeddings needed by glove_sofcos / fasttext_sofcos below.
    # They are disabled because loading took roughly 3 and 7 minutes respectively (observed).
    # model = api.load('glove-wiki-gigaword-300')
    # fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')

    def read_files(path):
        """Return a one-column ('sent') DataFrame holding the paragraphs longer than 201 characters."""
        long_paragraphs = [para.text for para in Document(path).paragraphs
                           if len(para.text) > 201]
        return pd.DataFrame(long_paragraphs, columns=['sent'])

    def tokenize(x, stem=False, lemma=False):
        """Strip tag-like markup from *x*, word-tokenize it and optionally stem and/or lemmatize."""
        x = re.sub(r'(?:<[^>]+>)', '', x)      # drop anything shaped like <...>
        # NOTE(review): this removes the literal word "pattern".  The original
        # code also defined an unused string listing characters (digits, $ ` %
        # {} <> - , : _ \ . = + | /) that was presumably meant to be stripped
        # here instead.  The behaviour is deliberately left unchanged because
        # the pickled models were presumably trained on features produced by
        # exactly this preprocessing -- confirm before changing it.
        x = re.sub(r'(pattern)', '', x)
        tokens = word_tokenize(x)
        if stem:
            tokens = [stemmer.stem(t) for t in tokens]
        if lemma:
            tokens = [lemmatizer.lemmatize(t) for t in tokens]
        return tokens

    def preprocess(df, stem=False, lemma=False):
        """Tokenize every paragraph in ``df.sent`` using the selected root-word algorithm.

        Lemmatizing wins when both flags are set; with neither flag an empty list is returned.
        """
        tokens = []
        if stem:
            tokens = df.sent.apply(lambda x: tokenize(x, True))
        if lemma:
            tokens = df.sent.apply(lambda x: tokenize(x, False, True))
        return tokens

    def cosine(tokens, stem=False, lemma=False):
        """Return TF-IDF and count-vector cosine similarities of consecutive paragraphs.

        The column names carry a 'stem' or 'lemma' prefix so that both runs can
        be concatenated side by side.
        """
        global df_cosine                       # kept module-level for later use and debugging
        cosout_tfidf = []
        cosout_cv = []
        texts = tokens.apply(lambda x: ' '.join(x)).tolist()   # tokens joined back into text
        for sent1, sent2 in zip(texts, texts[1:]):
            documents = [sent1, sent2]
            # Both vectorizers use unigrams up to trigrams with English stop words removed,
            # and are fitted on the current pair only.
            tfidf_matrix = TfidfVectorizer(stop_words='english',
                                           ngram_range=(1, 3)).fit_transform(documents)
            cosout_tfidf.append(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)[0][1])
            sparse_matrix = CountVectorizer(stop_words='english',
                                            ngram_range=(1, 3)).fit_transform(documents)
            cosout_cv.append(cosine_similarity(sparse_matrix[0:1], sparse_matrix)[0][1])
        prefix = 'lemma' if lemma else 'stem'
        df_cosine = pd.DataFrame(list(zip(cosout_tfidf, cosout_cv)),
                                 columns=[prefix + '_cosout_tfidf', prefix + '_cosout_cv'])
        return df_cosine

    def soft_cosine_pairs(tokens, embeddings):
        """Return the gensim soft cosine similarity of consecutive paragraphs using *embeddings*."""
        scores = []
        texts = [' '.join(t) for t in tokens]
        for sent1, sent2 in zip(texts, texts[1:]):
            dictionary = corpora.Dictionary([simple_preprocess(doc) for doc in (sent1, sent2)])
            similarity_matrix = embeddings.similarity_matrix(dictionary, tfidf=None, threshold=0.0,
                                                             exponent=2.0, nonzero_limit=100)
            sent_1 = dictionary.doc2bow(simple_preprocess(sent1))
            sent_2 = dictionary.doc2bow(simple_preprocess(sent2))
            scores.append(softcossim(sent_1, sent_2, similarity_matrix))
        return scores

    def glove_sofcos(tokens, stem=False, lemma=False):
        """Soft cosine similarity on GloVe embeddings (requires the commented-out ``model`` loader above)."""
        global df_glsoftcos
        prefix = 'lemma' if lemma else 'stem'
        df_glsoftcos = pd.DataFrame(soft_cosine_pairs(tokens, model),
                                    columns=[prefix + '_Glove_Soft_Cos'])
        return df_glsoftcos

    def fasttext_sofcos(tokens, stem=False, lemma=False):
        """Soft cosine similarity on fastText embeddings (requires the commented-out ``fasttext_model300`` loader above)."""
        global df_softcos
        prefix = 'lemma' if lemma else 'stem'
        df_softcos = pd.DataFrame(soft_cosine_pairs(tokens, fasttext_model300),
                                  columns=[prefix + '_fastText_Soft_cosine'])
        return df_softcos

    def run(stem=False, lemma=False):
        """Read the saved document and compute its features for one root-word algorithm."""
        global df_doc, x, y, z      # module-level so the later routes can read them
        df_doc = read_files(loc)
        tokens = preprocess(df_doc, stem=stem, lemma=lemma)
        x = cosine(tokens, stem=stem, lemma=lemma)
        # Disabled for speed (observed ~10 and ~5 minutes for 20 rows):
        # y = fasttext_sofcos(tokens, stem=stem, lemma=lemma)
        # z = glove_sofcos(tokens, stem=stem, lemma=lemma)

    def run_all():
        """Build ``result``: the stemmed features followed by the lemmatized features."""
        global result               # module-level so the /output route can read it
        run(stem=True, lemma=False)
        stem_features = x           # extend with y and z when the embedding features are enabled
        run(stem=False, lemma=True)
        result = pd.concat([stem_features, x], axis=1, sort=False)

    run_all()
    return render_template("select-model.html")           # next step: the user picks the ML model

    
@app.route('/output', methods = ['GET', 'POST'])       # receives the model choice from the HTML form and runs the prediction
def ml_models():
    """Run the chosen pickled model on the uploaded document's features and show the result.

    Reads the form field ``Model`` and the module-level globals ``result``
    (feature table) and ``df_doc`` (paragraph text) that the ``/select_model``
    view sets.  Publishes ``predictions``, ``df`` and ``timestr`` as
    module-level globals for the other routes, and writes the table to
    ``output.<timestamp>`` in the current working directory.

    Returns:
        The rendered ``third.html`` page with the prediction table;
        ``select-model.html`` when no known model was chosen (GET request,
        missing field, the placeholder value ``'model'`` or any unknown value);
        ``upload-docx.html`` when no features are available yet.
    """
    ### global scope variables created for other app route access to the information saved in these variables
    global predictions, df, output, timestr

    if request.method != 'POST':
        # Previously a GET fell through and returned None, which Flask rejects.
        return render_template("select-model.html")

    import joblib
    import pandas as pd
    import numpy as np
    import time

####### NOTE:- PICKLED MODEL NEEDS TO BE UPDATED FROM THE ML_MODEL_PYTHON CODE FILE IF GLOVE AND FASTTEXT EMBEDDINGS ARE INCLUDED.

    # The user's choice only selects a key of this table; the paths themselves
    # are fixed, so joblib (pickle) never loads a user-supplied file.
    model_dir = "/Users/anshulgoyal/Downloads/Nowigence/"
    model_files = {
        'lasso': "lasso_model.pkl",
        'ridge': "ridge_model.pkl",
        'tree': "decision_tree_model.pkl",
        'perceptron': "perceptron_model.pkl",
        'logit': "logistic_regression_model.pkl",
        'nBayes': "naive_bayes_model.pkl",
        'forest': "random_forest_model.pkl",
    }

    model = request.form.get('Model')              # the choice made by the user
    if model not in model_files:
        # Covers the placeholder 'model' (nothing selected) as well as a missing
        # or unknown value, which used to leave model_name unbound.
        return render_template("select-model.html")

    features = globals().get('result')
    paragraphs = globals().get('df_doc')
    if features is None or paragraphs is None or features.empty:
        # No document has been processed yet, or it yielded no paragraph pairs.
        return render_template("upload-docx.html")

    model_name = joblib.load(model_dir + model_files[model])
    predictions = model_name.predict(features)
    if model == 'lasso':                                    ## specific to the lasso model
        predictions = np.where(predictions > 0.5, 1, 0)     ## threshold the continuous output into 0/1

    master_list = paragraphs['sent'].tolist()               ## original paragraph text of the uploaded file
    pair_count = len(predictions)
    ### global scope dataframe used for creating another view later to display sections.
    df = pd.DataFrame({'S/N': range(1, pair_count + 1),
                       'Paragraph': master_list[0:pair_count],
                       'Compared to': master_list[1:pair_count + 1],
                       'Section': predictions})
    timestr = time.strftime("%Y-%m-%d_%H-%M-%S")            ## time stamp that identifies the output of this run
    df.to_csv('output.'+ timestr, index=False)              ## saved for the download service
    return render_template('third.html',  tables = [df.to_html(classes='data', header=True, index=False)])
    
@app.route('/download_output', methods = ['GET', 'POST'])   ## DOWNLOAD SERVICE CODE
def download_output():
    """Send the prediction CSV of the most recent model run as a file download.

    The file name is ``output.<timestr>``, where ``timestr`` is the module-level
    time stamp set by the ``/output`` view.  That view writes the file relative
    to the current working directory, so it is looked up there.

    Returns:
        The CSV as an attachment, for GET and POST alike.

    Raises:
        werkzeug.exceptions.NotFound: via ``abort(404)`` when no output has
            been generated yet or the file no longer exists.
    """
    import os
    from flask import abort

    stamp = globals().get('timestr')
    if stamp is None:
        abort(404)                                   # no model has been run yet
    # send_file() resolves relative paths against the application root, not the
    # working directory, hence the explicit absolute path.
    path = os.path.abspath('output.' + stamp)
    if not os.path.isfile(path):
        abort(404)
    return send_file(path, mimetype='text/csv', as_attachment=True)

@app.route('/section_view', methods = ['GET', 'POST'])    ## alternative view of the prediction dataframe "df"
def para_view():
    """Render the document as a list of paragraphs with a divider after each predicted section end.

    Reads the module-level globals ``df`` (prediction table) and ``timestr``
    set by the ``/output`` view, and writes the view to ``section.<timestr>``
    in the current working directory.

    Returns:
        The rendered ``final.html`` page with the sectionized table, or
        ``select-model.html`` when no predictions exist yet.
    """
    import pandas as pd

    table = globals().get('df')
    stamp = globals().get('timestr')
    if table is None or stamp is None:
        # No model has been run yet, so there is nothing to sectionize.
        return render_template("select-model.html")

    l = []                                            # paragraphs interleaved with section dividers
    for paragraph, section in zip(table['Paragraph'], table['Section']):
        l.append(paragraph)
        if section != 0:                              # a non-zero prediction marks a section boundary
            l.append("-    - - - !!! - - -    -")     #######   SECTION DIVIDER STRING   #######
    if len(table):
        # Row i pairs paragraph i with paragraph i+1, so the document's final
        # paragraph appears only in the last row's 'Compared to' column.
        l.append(table['Compared to'].iloc[-1])
    df2 = pd.DataFrame({'SECTION': l})
    df2.to_csv('section.'+ stamp, index=False)
    return render_template('final.html',  tables = [df2.to_html(classes='data', header=True, index=False)])

@app.route('/download_section', methods = ['GET', 'POST'])  ## DOWNLOAD SERVICE CODE
def download_section():
    """Send the sectionized-view CSV of the most recent model run as a file download.

    The file name is ``section.<timestr>``, where ``timestr`` is the
    module-level time stamp set by the ``/output`` view.  The ``/section_view``
    view writes the file relative to the current working directory, so it is
    looked up there.

    Returns:
        The CSV as an attachment, for GET and POST alike.

    Raises:
        werkzeug.exceptions.NotFound: via ``abort(404)`` when no output has
            been generated yet or the section file has not been created.
    """
    import os
    from flask import abort

    stamp = globals().get('timestr')
    if stamp is None:
        abort(404)                                   # no model has been run yet
    # send_file() resolves relative paths against the application root, not the
    # working directory, hence the explicit absolute path.
    path = os.path.abspath('section.' + stamp)
    if not os.path.isfile(path):
        abort(404)                                   # the section view was never generated
    return send_file(path, mimetype='text/csv', as_attachment=True)

if __name__ == '__main__':
    # Executed as a script: start Flask's built-in server with the debugger switched off.
    app.run(debug=False)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [13/May/2020 15:26:36] "GET / HTTP/1.1" 200 -
unable to import 'smart_open.gcs', disabling that module
127.0.0.1 - - [13/May/2020 15:26:59] "POST /select_model HTTP/1.1" 200 -
127.0.0.1 - - [13/May/2020 15:27:02] "POST /output HTTP/1.1" 200 -
127.0.0.1 - - [13/May/2020 15:27:10] "POST /section_view HTTP/1.1" 200 -
127.0.0.1 - - [13/May/2020 15:27:10] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [13/May/2020 15:27:12] "POST /download_section HTTP/1.1" 200 -
127.0.0.1 - - [13/May/2020 15:27:19] "POST /download_output HTTP/1.1" 200 -
