In [1]:
# Download the GloVe embeddings required by the text summarization step
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove*.zip

import re 
import string
import pandas as pd
import numpy as np
import os
from os import path

from IPython.display import clear_output

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
plt.style.use('dark_background')
import plotly.express as px
import plotly.graph_objects as go

from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist

import networkx as nx

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

import gensim
from gensim import corpora

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity

from wordcloud import WordCloud

from ipywidgets import AppLayout, Button, Layout, FileUpload, HBox, Text, VBox, Tab, widgets, Output, Accordion

import warnings
warnings.filterwarnings('ignore')

class NLP_app:
    """Tabbed ipywidgets front end exposing a few NLP utilities for one uploaded text file.

    The widget tree consists of five tabs (upload/statistics, word cloud, topic
    modeling, text summary, text sentiment) plus a "Clear all" button. All tabs
    render into the same ``Output`` widget. Use ``get_layout()`` to obtain the
    root container for display.
    """

    # Printed instead of raising when an action is triggered before any upload.
    _NO_FILE_MESSAGE = 'Please upload a text file first.'
    # Pre-trained GloVe vectors used to embed sentences for the summary.
    _GLOVE_PATH = 'glove.6B.100d.txt'
    _GLOVE_DIM = 100
    # Maximum number of top-ranked sentences printed as the summary.
    _SUMMARY_SENTENCES = 3
    # Tab-separated "review<TAB>label" file used to train the sentiment classifier.
    _SENTIMENT_TRAIN_PATH = 'train/amazon_cells_labelled.txt'
    # Predicted label -> (figure title, image shown for that label).
    _SENTIMENT_OUTCOMES = {
        0: ('The overall sentiment of the text is Negative', 'images/grumpy.jpeg'),
        1: ('The overall sentiment of the text is Positive', 'images/happy.jpg'),
    }

    def __init__(self):
        """Create all widgets, wire the button callbacks and assemble the layout."""
        #first tab
        self.file_upload = FileUpload(accept='*.txt', multiple=False, description='Upload a file')
        self.file_upload.style.button_color = 'lightblue'
        self.preview_file = self._make_button('Preview file', 'lightblue', self.on_preview_file)
        self.clean_preview = self._make_button('Clear preview', 'salmon', self.clean_your_screen)

        self.stats = self._make_button('Base statistics', 'lightblue', self.on_stats)
        self.freq = self._make_button('Frequency distribution', 'lightblue', self.on_freq)
        self.poss = self._make_button('Parts of speech', 'lightblue', self.on_poss)

        #second tab
        self.apply_cloud = self._make_button('Generate a cloud', 'lightblue', self.on_apply_cloud)

        #third tab
        self.apply_topic = self._make_button('Generate the topic', 'lightblue', self.on_apply_topic)

        #fourth tab
        self.apply_summary = self._make_button('Make a summary', 'lightblue', self.on_apply_summary)

        #fifth tab
        self.apply_sentiment = self._make_button('Check sentiment', 'lightblue', self.on_apply_sentiment)

        #clean all
        self.clean_screen = self._make_button('Clear all', 'tomato', self.clean_your_screen)

        # Trained lazily on the first sentiment request and reused afterwards.
        self._sentiment_model = None

        # Single output area shared by every tab.
        self.output = Output()

        self.tab = Tab(layout=Layout(width='60%', height='100%'))

        self.accordion = Accordion(children=[
            self.file_upload,
            HBox([self.preview_file, self.clean_preview]),
            HBox([self.stats, self.freq, self.poss]),
        ])
        for index, title in enumerate(('Input file', 'File preview', 'Statistics')):
            self.accordion.set_title(index, title)

        self.accordion_box = VBox([self.accordion, self.output])
        self.word_cloud = VBox([self.apply_cloud, self.output])
        self.topic = VBox([self.apply_topic, self.output])
        self.summary = VBox([self.apply_summary, self.output])
        self.sentiment = VBox([self.apply_sentiment, self.output])

        self.children = [
            self.accordion_box,
            self.word_cloud,
            self.topic,
            self.summary,
            self.sentiment,
        ]

        self.tab.children = self.children
        tab_titles = ("Upload", "Word cloud", "Topic modeling", "Text summary", "Text sentiment")
        for index, title in enumerate(tab_titles):
            self.tab.set_title(index, title)

        self.container = VBox([self.tab, self.clean_screen])

    # Internal helpers
    @staticmethod
    def _make_button(description, color, handler):
        """Return a ``Button`` with the given label, background color and click handler."""
        button = Button(description=description)
        button.style.button_color = color
        button.on_click(handler)
        return button

    def _uploaded_text(self, file_upload=None):
        """Return the uploaded file decoded as UTF-8, or ``None`` if nothing was uploaded.

        When there is no upload a short hint is printed, so callers only need to
        return early. ``file_upload`` defaults to ``self.file_upload``.
        """
        widget = self.file_upload if file_upload is None else file_upload
        uploaded = widget.value
        if not uploaded:
            print(self._NO_FILE_MESSAGE)
            return None
        # ipywidgets 7 exposes {filename: info}; ipywidgets 8 exposes a tuple of info dicts.
        entries = list(uploaded.values()) if isinstance(uploaded, dict) else list(uploaded)
        # bytes() accepts both the bytes payload (v7) and the memoryview payload (v8).
        return bytes(entries[-1]['content']).decode("utf-8")

    def _run_in_output(self, action):
        """Clear the shared output area and run ``action(self.file_upload)`` inside it."""
        with self.output:
            clear_output()
            action(self.file_upload)

    def _load_glove(self, vocabulary):
        """Return ``{word: vector}`` for the GloVe entries whose word is in ``vocabulary``.

        Only the words that will be looked up are parsed, which keeps the result
        identical to loading the full table while avoiding most of the parsing
        work and memory.
        """
        embeddings = {}
        with open(self._GLOVE_PATH, encoding='utf-8') as glove_file: #using glove embeddings for training
            for line in glove_file:
                values = line.split()
                if values and values[0] in vocabulary:
                    embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
        return embeddings

    def _train_sentiment_model(self):
        """Train and return ``(vectorizer, classifier)`` from the labelled review file."""
        df = pd.read_csv(self._SENTIMENT_TRAIN_PATH, names=['review', 'sentiment'], sep='\t')
        # Same 80% training split as before, so the fitted model is unchanged.
        reviews_train, _, y_train, _ = train_test_split(
            df['review'].values, df['sentiment'].values, test_size=0.2, random_state=1000)

        punctuations = string.punctuation
        parser = English()

        def spacy_tokenizer(utterance):
            # NOTE(review): a blank English() pipeline may leave lemma_ empty
            # (depends on the spaCy version); fall back to the raw token text
            # so the vocabulary does not collapse to ''.
            return [
                (token.lemma_ or token.text).lower().strip()
                for token in parser(utterance)
                if token.text.lower().strip() not in STOP_WORDS
                and token.text not in punctuations
            ]

        vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 1))
        X_train = vectorizer.fit_transform(reviews_train)

        classifier = LogisticRegression()
        classifier.fit(X_train, y_train)
        return vectorizer, classifier

    # Functions
    def preview(self, file_upload):
        """Print the content of the uploaded file."""
        content = self._uploaded_text(file_upload)
        if content is None:
            return
        print(content)

    def text_stats(self, file_upload):
        """Show a table with character, word, unique-word and stop-word counts."""
        content = self._uploaded_text(file_upload)
        if content is None:
            return
        content = content.lower()

        words = re.findall(r'\w+', content)
        # Distinct stop words present in the text (not the number of occurrences).
        stop_words_found = {token for token in word_tokenize(content) if token in STOP_WORDS}
        stats = {
            'Chars': len(content),
            'Words': len(words),
            'Unique words': len(set(words)),
            'Stop words': len(stop_words_found),
        }

        layout = go.Layout(title='Text statistics', height=300)
        table = go.Table(
            header=dict(values=list(stats),
                        line_color='darkslategray',
                        fill_color='lightskyblue',
                        align='left'),
            # One single-row column per statistic.
            cells=dict(values=[[value] for value in stats.values()],
                       line_color='darkslategray',
                       fill_color='white',
                       align='left'))
        go.Figure(data=[table], layout=layout).show()

    def freqs(self, file_upload):
        """Show a scatter plot of token frequencies in the uploaded text."""
        content = self._uploaded_text(file_upload)
        if content is None:
            return
        fdist = FreqDist(word_tokenize(content.lower()))
        df_fdist = pd.DataFrame(fdist.items(), columns=['Word', 'Frequency'])

        fig = px.scatter(df_fdist, x="Word", y="Frequency",
                         title="Frequency Distribution")
        fig.show()

    def posspeech(self, file_upload):
        """Show a scatter plot of part-of-speech tag counts in the uploaded text."""
        content = self._uploaded_text(file_upload)
        if content is None:
            return
        tags = nltk.pos_tag(word_tokenize(content.lower()))
        counts = Counter(tag for _, tag in tags)

        df_pos = pd.DataFrame.from_dict(counts, orient='index').reset_index()
        df_pos = df_pos.rename(columns={'index': 'POS', 0: 'Count'})

        fig = px.scatter(df_pos, x="POS", y="Count",
                         title="Parts of speech")
        fig.show()

    def cloud(self, file_upload):
        """Render a word cloud of the uploaded text."""
        content = self._uploaded_text(file_upload)
        if content is None:
            return
        wordcloud = WordCloud().generate(content)
        plt.figure(figsize=(14, 8))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()

    def topic_model(self, file_upload):
        """Print the single most representative word of a one-topic LDA model."""
        content = self._uploaded_text(file_upload)
        if content is None:
            return

        stop = set(stopwords.words('english'))
        exclude = set(string.punctuation)
        lemma = WordNetLemmatizer()

        # Drop stop words, strip punctuation characters, then lemmatize.
        stop_free = " ".join(word for word in content.lower().split() if word not in stop)
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        doc_clean = [[lemma.lemmatize(word) for word in punc_free.split()]]
        if not doc_clean[0]:
            # LDA cannot be fitted on an empty vocabulary.
            print('The text has no words left to model after cleaning.')
            return

        # Creating the term dictionary of our corpus, where every unique term is assigned an index.
        dictionary = corpora.Dictionary(doc_clean)
        # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
        doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
        # Creating the object for LDA model using gensim library
        ldamodel = gensim.models.LdaModel(doc_term_matrix, num_topics=1, id2word=dictionary, passes=50)
        # Ask for the top word directly rather than parsing the formatted topic string.
        top_word = ldamodel.show_topic(0, topn=1)[0][0]
        print('The main topic of the text is: ' + top_word)

    def generate_summary(self, file_upload):
        """Print up to three top-ranked sentences of the uploaded text.

        Sentences are embedded as the mean of their GloVe word vectors, linked
        by cosine similarity and ranked with PageRank.
        """
        content = self._uploaded_text(file_upload)
        if content is None:
            return
        sentences = sent_tokenize(content)
        if not sentences:
            print('The uploaded text contains no sentences to summarize.')
            return

        #remove punctuations, numbers and special characters, make alphabets lowercase
        # (explicit re.sub: does not depend on the pandas str.replace regex default)
        tokenized = [re.sub(r"[^a-zA-Z]", " ", sentence).lower().split() for sentence in sentences]

        #Extract word vectors
        vocabulary = {word for words in tokenized for word in words}
        word_embeddings = self._load_glove(vocabulary)

        zero_vector = np.zeros((self._GLOVE_DIM,))
        sentence_vectors = []
        for words in tokenized:
            if words:
                total = sum(word_embeddings.get(word, zero_vector) for word in words)
                sentence_vectors.append(total / (len(words) + 0.001))
            else:
                # Sentence without any letters: keep a proper zero vector, not a scalar.
                sentence_vectors.append(zero_vector)

        #similarity matrix (all pairs at once; a sentence is not compared with itself)
        sim_mat = np.array(cosine_similarity(np.vstack(sentence_vectors)), dtype=float)
        np.fill_diagonal(sim_mat, 0.0)

        #Page rank
        scores = nx.pagerank(nx.from_numpy_array(sim_mat))

        #Now, we extract the top N sentences based on their rankings for summary generation.
        ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
        # Slicing tolerates texts with fewer sentences than the summary length.
        for _, sentence in ranked_sentences[:self._SUMMARY_SENTENCES]:
            print(sentence)

    def generate_sentiment(self, file_upload):
        """Classify the uploaded text as negative/positive and show the matching picture."""
        content = self._uploaded_text(file_upload)
        if content is None:
            return

        # Training is deterministic, so do it once and reuse the fitted model.
        if self._sentiment_model is None:
            self._sentiment_model = self._train_sentiment_model()
        vectorizer, classifier = self._sentiment_model

        label = int(classifier.predict(vectorizer.transform([content]))[0])
        outcome = self._SENTIMENT_OUTCOMES.get(label)
        if outcome is None:
            return
        title, image_path = outcome

        plt.figure(figsize=(12, 6))
        plt.title(title)
        plt.imshow(mpimg.imread(image_path))
        plt.axis("off")
        plt.show()

    # Combining buttons with Functions
    def clean_your_screen(self, btn):
        """Click handler: clear the shared output area."""
        with self.output:
            clear_output()

    def on_preview_file(self, btn):
        """Click handler: preview the uploaded file."""
        self._run_in_output(self.preview)

    def on_stats(self, btn):
        """Click handler: show base statistics."""
        self._run_in_output(self.text_stats)

    def on_freq(self, btn):
        """Click handler: show the word frequency plot."""
        self._run_in_output(self.freqs)

    def on_poss(self, btn):
        """Click handler: show the part-of-speech plot."""
        self._run_in_output(self.posspeech)

    def on_apply_cloud(self, btn):
        """Click handler: render the word cloud."""
        self._run_in_output(self.cloud)

    def on_apply_topic(self, btn):
        """Click handler: print the main topic."""
        self._run_in_output(self.topic_model)

    def on_apply_summary(self, btn):
        """Click handler: print the summary."""
        self._run_in_output(self.generate_summary)

    def on_apply_sentiment(self, btn):
        """Click handler: show the sentiment result."""
        self._run_in_output(self.generate_sentiment)

    def get_layout(self):
        """Return the root container widget of the app."""
        return self.container

# Build the app and return its root container; as the last expression of a
# notebook cell, the returned widget is presumably what gets rendered.
NLP_app().get_layout()

VBox(children=(Tab(children=(VBox(children=(Accordion(children=(FileUpload(value={}, accept='*.txt', descripti…