#Import the necessary Libraries

In [1]:
import pandas as pd
import csv
import nltk
nltk.download("all")
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from nltk.tokenize import WordPunctTokenizer,word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sn
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

KeyboardInterrupt: ignored

#Load the Dataset

The dataset is uploaded on the GitHub repository and is fetched directly. To access the content for the articles directly, the index is set to the 'Heading' of the article which helps in accessing the text by using the corresponding heading of the same.

In [0]:
url="https://raw.githubusercontent.com/Aditi2806/Articles-Dataset/master/Articles%20Dataset.csv"
dataframe=pd.read_csv(url,index_col="Heading",encoding="utf-8")

#Pre-processing the text

The text needs to cleaned before taking it as an input. The text is removed of stopwords, punctuations, and other errors. They are tokenized into individual words and n-grams are created to better understand the text. The function 'ngram' is used to implement the same. The function returns a list of the created ngrams which is used to act as the input to our algorithm.

In [0]:
def ngram(temp_pos):
    '''create positive and negative bigram and trigram models for both category of reviews'''
    bigram_pos=gensim.models.Phrases(temp_pos,min_count=1,threshold=1)                  #defining positive bigram model which takes a list of lists as input
    trigram_pos=gensim.models.Phrases(bigram_pos[temp_pos],min_count=1,threshold=1)     #defining positive bigram model which takes a list of lists as input
    bigram_pos_mod=gensim.models.phrases.Phraser(bigram_pos)                            #computes the bigrams present in the list
    trigram_pos_mod=gensim.models.phrases.Phraser(trigram_pos)                          #computes the trigrams taking the computed bigrams as input

    return [trigram_pos_mod[bigram_pos_mod[doc]] for doc in temp_pos]

In [0]:
def lda(temp_pos):
    '''defining necessary parameters to LDA model'''

    id2word=corpora.Dictionary(temp_pos)                                                                    #create a dictionary of the positive review list
    texts=temp_pos                                                                                          #create the corpus of the text
    corpus=[id2word.doc2bow(text) for text in texts]                                                        #convert the dictionary created above to bag of words form
    lda_model=gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word,num_topics=3,passes=10)        #define the LDA model and give the above created parameters as input
    _,pos_keywords=zip(*(lda_model.print_topics()))                                                         #store the keywords for every review in a variable
    pos_keywords=[y for x in pos_keywords for y in x.split("\"") if y !='' and y[0].isalpha()]              #modify the list by extracting only the words from the list and not the score computed for them
    pos_keywords=list(set(pos_keywords))                                                                    #consider only the distinct keywords present in the list and creating the final positive keywords list
    return pos_keywords

#NLTK Algorithm

The function '_create_frequency_table' takes in input the text and applied pre-processing to it. It tokenized the text into words and cleans it to remove stopwords and other terms that add little meaning to the overall text. This function is used to create a frequency table which holds the frequencies of various words present in the text. It tells about the occurrence of each distinct word, which acts as the base of the algorithm. These frequencies are used to score the sentences based upon the number of high frequency words appearing in the sentence.

In [0]:
def _create_frequency_table(text_string) -> dict:

    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable

#Scoring the sentences

>The below function takes in the generated frequency table to calculate the scores for every sentence. To rank the sentences, there must be a value that will be used to score the sentences. The sentences are scored based upon the words and their corresponding frequencies in the frequency table. The overall score is calculated by dividing the sum of individual word values with the total number of words present in the sentence.

>The '_find_average_score' function takes in the calculated sentence scores to find an average score value. This value is used to set the threshold value for deciding whether to keep a sentence in summary or to discard it. Any sentence having sentence score above this threshold are accepted and those having values less than threshold are rejected and not included in the summary.

In [0]:
def _score_sentences(sentences, freqTable) -> dict:
    sentenceValue = dict()

    for sentence in sentences:
        word_count_in_sentence = (len(word_tokenize(sentence)))
        for wordValue in freqTable:
            if wordValue in sentence.lower():
                if sentence[:10] in sentenceValue:
                    sentenceValue[sentence[:10]] += freqTable[wordValue]
                else:
                    sentenceValue[sentence[:10]] = freqTable[wordValue]

        sentenceValue[sentence[:10]] = sentenceValue[sentence[:10]] // word_count_in_sentence

    return sentenceValue

{sentence1:score, sentence2:score, sentence3: score,...........}


In [0]:
def _find_average_score(sentenceValue) -> int:
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original text
    average = int(sumValues / len(sentenceValue))

    return average

#Generating the final summary

This function return the final output generated based upon the sentences and their scores.

In [0]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:10] in sentenceValue and sentenceValue[sentence[:10]] > (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

#Executing the algorithm on the dataset

>The algorithm first creates a list of distinct article heading present in the text to iterate over the dataset easily. Each article is processed at a time and a summary for the same will be generated. Tokenizing algorithms:'Word probability' and 'Tf-Idf' are applied onto the text to create the vector representation of the text to be fed in as input. The word probability method work supon the frequency for a word and plots the 30 most common words present in the text. The Tf-Idf method calculates a tf-idf score for the word and takes an average of all the tf-idf scores for a particular word at the end to provide scores to every word in the text. The graph of words having highest tf-idf scores is plotted using matplotlib.

>These representations are fed into our functions to score the sentences using the above defined functions and create the final summary.

In [0]:
article_name=set([i for i in dataframe.index])
tokenizer=WordPunctTokenizer()
dataframe['NLTK Summary']=""
nltk_summary=[]
for row in article_name:
    word_list=[]
    print("\n\n\nArticle Heading:\t",row)     
    #print("\nContent:",dataframe['Content'][row])                                                            
    
    
    word_list_tfidf=[]
    tfidf_dict={}
    tfidf=TfidfVectorizer(min_df=1,max_df=0.5,ngram_range=(1,3))
    try:
        
        '''word probability'''
        token=tokenizer.tokenize(dataframe['Content'][row].lower())
        for tokens in token:
            if (tokens in stopwords.words('english')) or (tokens in [",",".","'","``","''",";","?","--",")","(",":","!","\"","\'","/"]):
                token.remove(tokens)
        word_list = word_list+token

        '''Tf-Idf Method'''
        word_list_tfidf=nltk.tokenize.sent_tokenize(dataframe['Content'][row])
        '''extraction of positive keywords from positive reviews list and negative keywords from negative reviews list'''
        temp=[]
        
        c=gensim.utils.simple_preprocess(dataframe['Content'][row],deacc=True)
        c=[word for word in c if word not in stopwords.words('english')]       
        temp.append(c)                                                      
        
        '''create positive and negative bigram and trigram models for both category of reviews'''
        temp=ngram(temp)                                                        

        #'''to start the topic modelling using LDA model'''
        #keywords=lda(temp)

        #print("\n\nKeywords extracted from the text are: ",keywords)
        #print("\n\n")

        '''Computing and Plotting Word Probability Scores'''
        frequency = nltk.FreqDist(word_list)
        word_prob_words,_=zip(*(frequency.most_common(30)))
        word_prob_words=list(word_prob_words)
        plt.figure(figsize=(5,5))
        frequency.plot(30,title='Word Probability Method')

        '''Computing and Plotting Tf-Idf scores'''
        features=tfidf.fit_transform(word_list_tfidf)
        tfidf_data=pd.DataFrame(features.toarray(),columns=tfidf.get_feature_names())
        #print(tfidf_data)
        for i in tfidf_data.index:
            for x,y in zip(tfidf_data.iloc[i,:],tfidf.get_feature_names()):
                if x!=0:
                    if y in tfidf_dict.keys():
                        tfidf_dict[y]=(tfidf_dict[y]+x)/2
                    else:
                        tfidf_dict[y]=x
        sorted_c= sorted(tfidf_dict.items(), key=lambda kv: kv[1], reverse=True)
        x,y= (zip(*sorted_c))
        top_x=x[:30]
        top_y=y[:30]
        plt.figure(figsize=(5,5))
        plt.plot(top_x,top_y)
        plt.xticks(top_x,top_x,rotation='vertical')
        plt.xlabel('Sample')
        plt.ylabel('Tf-Idf Score')
        plt.title('TF-IDF Method')
        plt.show()
        
        text = dataframe['Content'][row]
        freq_table = _create_frequency_table(text)
        #print(pd.DataFrame(freq_table))

        sentences = sent_tokenize(text)
        sentence_scores = _score_sentences(sentences, freq_table)
        #print(sentence_scores)
        threshold = _find_average_score(sentence_scores)
        summary = _generate_summary(sentences, sentence_scores, 1.5 * threshold)
        dataframe['NLTK Summary'][row]=summary
        print("\n\nGenerated Summary according to the text:\n ",summary)
    except:
        pass
#print(dataframe)

Output hidden; open in https://colab.research.google.com to view.

In [0]:
dataframe[:10]

Unnamed: 0_level_0,Unnamed: 0,Content,NLTK Summary
Heading,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9 Tips For Training Lightning-Fast Neural Networks In Pytorch,0,"Let’s face it, your model is probably still st...","(yup, that just happened). ). Set Trainer(pre..."
How To Become A One-Drink Wonder,1,"Anyone can publish on Medium per our Policies,...","It’s quite a conundrum.The answer? My friend,..."
Treat Yourself Like a CEO and You’ll Make 10x More Income,2,"As I wrote that headline, an old joke came to ...","Friendly eyes. Warm handshake. Raised chin, s..."
Bored? 7 Fun Things You Can Build,3,There is no real secret when it comes to becom...,"Unfortunately, there are no shortcuts. You’ve..."
First AI Model of the Universe Knows Science it was Never Taught,4,A new 3D model of the Universe developed by an...,A new 3D model of the Universe developed by a...
10 Bad Habits of Unsuccessful People,5,The first successful person I ever met — truly...,“Then 20. Do you have a goal to get healthier...
Amazon Accidentally Sent Out Their Email Template,6,It’s comforting to see that even the titans of...,"As the template states, the headline must sum..."
When Women ‘Dangle the Steak' in Front of Men,7,"I truly thought that by now, there wasn’t an o...",I was sexually assaulted by my boyfriend at t...
Why Do Men’s Legacies Matter More Than Women’s Safety?,8,Almost immediately after Washington Post repor...,Almost immediately after Washington Post repo...
Is A.I. the Antichrist?,9,It may seem that old religious principles woul...,"I went down the rabbit hole, watching videos ..."


#Downloading the Result

The summaries generated are stored in a dataframe and converted into an excel file named 'NLTK Result.xlsx'.

In [0]:
dataframe.to_excel('NLTK Result.xlsx',encoding='utf8')

In [0]:
from google.colab import files
files.download('NLTK Result.xlsx')

In [0]:
{'word1':45,'word2':42,''}
sentence1='it was a good movie'
it, was,good, movie, the, ',', 
it's
it was a good movie.jhdskjbdk
sentence1_score = 20+31+56+45//number of words in the sentence
