## Summarization (web scraping)

In [72]:
# Find a web page to summarise: run a Google search and keep the hit's URL.
try:
    from googlesearch import search
except ImportError:
    print("No module named 'google' found")
    # Re-raise: without `search` the loop below would only fail later with an
    # unrelated NameError, hiding the real cause.
    raise

query = "summary of the alchemist"

# NOTE(review): num/stop/pause are passed straight to googlesearch.search;
# presumably they limit the search to a single hit and throttle requests — confirm.
# The loop variable `j` stays defined afterwards and is read by later cells as the URL.
for j in search(query, tld="co.in", num=1, start=0, stop=1, pause=1):
    print(j)

https://www.sparknotes.com/lit/the-alchemist/summary/?sa=X&ved=2ahUKEwjGqeSkhaPnAhXYyDgGHYQKDGkQFjAAegQIABAB


In [35]:
import requests
from bs4 import BeautifulSoup

In [36]:
url = j  # `j` is the loop variable left over from the search cell, i.e. the last URL it printed

In [37]:
# Download the page and build the parse tree queried by the cells below.
data = requests.get(url, timeout=30)  # bounded wait instead of hanging forever on a stalled server
data.raise_for_status()  # stop here on a 4xx/5xx response rather than parsing an error page
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(data.text, "html.parser")

In [38]:
import urllib.request
from inscriptis import get_text
 
url = j
# Read the raw HTML; the `with` block closes the HTTP response as soon as the body is read.
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')

# Convert the HTML to text with inscriptis.
text = get_text(html)

In [39]:
# Locate the container <div> on the downloaded page and split its text into lines.
quote_divs = soup.find_all("div", attrs={"class": "layout-wrapper-2018"})
if not quote_divs:
    # Same exception type that indexing the empty result raised before,
    # but with a message that names what is actually missing.
    raise IndexError('no <div class="layout-wrapper-2018"> found in the downloaded page')
quote_div = quote_divs[0]
a = quote_div.text
# Only the two ends of the whole text are stripped, so inner lines keep their indentation.
quote_details = a.strip().split("\n")
print(quote_details)

['A recurring dream troubles Santiago, a young and adventurous Andalusian shepherd.', '            He has the dream every time he sleeps under a sycamore tree that grows out of the ruins', '            of a church. During the dream, a child tells him to seek treasure at the foot of the', '            Egyptian pyramids. Santiago consults a gypsy woman to interpret the dream, and to his', '            surprise she tells him to go to Egypt. A strange, magical old man named Melchizedek, who', '            claims to be the King of Salem, echoes the gypsy’s advice and tells Santiago that it is', '            his Personal Legend to journey to the pyramids. Melchizedek convinces Santiago to sell', '            his flock and set off to Tangier. When Santiago arrives in Tangier, a thief robs him,', '            forcing him to find work with a local crystal merchant. The conservative and kindly', '            merchant teaches Santiago several lessons, and Santiago encourages the merchant to take'

In [40]:
# Preview of the scraped text: the first 15 entries of quote_details.
# The bare name on the last line makes the notebook display the list.
quote_text = quote_details[0:15]
quote_text

['A recurring dream troubles Santiago, a young and adventurous Andalusian shepherd.',
 '            He has the dream every time he sleeps under a sycamore tree that grows out of the ruins',
 '            of a church. During the dream, a child tells him to seek treasure at the foot of the',
 '            Egyptian pyramids. Santiago consults a gypsy woman to interpret the dream, and to his',
 '            surprise she tells him to go to Egypt. A strange, magical old man named Melchizedek, who',
 '            claims to be the King of Salem, echoes the gypsy’s advice and tells Santiago that it is',
 '            his Personal Legend to journey to the pyramids. Melchizedek convinces Santiago to sell',
 '            his flock and set off to Tangier. When Santiago arrives in Tangier, a thief robs him,',
 '            forcing him to find work with a local crystal merchant. The conservative and kindly',
 '            merchant teaches Santiago several lessons, and Santiago encourages the merchant

In [41]:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords #unnecessary words removal( it,of etc)
from nltk.stem import PorterStemmer #used for categorizing similar meaning words together(eg: jump,jumps)
from nltk.tokenize.treebank import TreebankWordDetokenizer


In [42]:
# Merge the scraped lines into the single string consumed by the summarisation steps below.
# NOTE(review): detokenize() is handed whole lines rather than word tokens — confirm this join is intended.
text_string=TreebankWordDetokenizer().detokenize(quote_details)

In [43]:
def _create_frequency_table(text_string) -> dict:

    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable

In [44]:
def _score_sentences(sentences, freqTable) -> dict:
    sentenceValue = dict()

    for sentence in sentences:
        word_count_in_sentence = (len(word_tokenize(sentence)))
        
        for wordValue in freqTable:
            if wordValue in sentence.lower():
                if sentence[:10] in sentenceValue:
                    sentenceValue[sentence[:10]] += freqTable[wordValue]
                else:
                    sentenceValue[sentence[:10]] = freqTable[wordValue]

        sentenceValue[sentence[:10]] = sentenceValue[sentence[:10]] // word_count_in_sentence

    return sentenceValue

In [45]:
def _find_average_score(sentenceValue) -> int:
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original text
    average = int(sumValues / len(sentenceValue))

    return average

In [46]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:10] in sentenceValue and sentenceValue[sentence[:10]] > (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [54]:
# Summarisation pipeline: frequency table -> sentence scores -> threshold -> summary.
# Reads `text_string` from the cell above; `summary` is consumed by the categorisation section.

# 1 Create the word frequency table
freq_table = _create_frequency_table(text_string)
print(freq_table)
print("\n")

'''
We already have a sentence tokenizer, so we just need 
to run the sent_tokenize() method to create the array of sentences.
'''

# 2 Tokenize the sentences
sentences = sent_tokenize(text_string)

# 3 Score each sentence from the frequency table
sentence_scores = _score_sentences(sentences, freq_table)
print(sentence_scores)
print("\n")

# 4 Find the threshold: the average sentence score
threshold = _find_average_score(sentence_scores)
print(threshold)
print("\n")

# 5 Generate the summary: keep only sentences scoring above 1.5x the average
summary = _generate_summary(sentences, sentence_scores, 1.5 * threshold)

print(summary)

{'A': 2, 'recur': 1, 'dream': 8, 'troubl': 1, 'santiago': 31, ',': 45, 'young': 1, 'adventur': 1, 'andalusian': 1, 'shepherd': 1, '.': 38, 'He': 9, 'ha': 2, 'everi': 2, 'time': 2, 'sleep': 1, 'sycamor': 2, 'tree': 3, 'grow': 2, 'ruin': 1, 'church': 3, 'dure': 4, 'child': 1, 'tell': 5, 'seek': 1, 'treasur': 5, 'foot': 2, 'egyptian': 1, 'pyramid': 7, 'consult': 1, 'gypsi': 2, 'woman': 1, 'interpret': 1, 'hi': 18, 'surpris': 1, 'go': 2, 'egypt': 2, 'strang': 1, 'magic': 1, 'old': 1, 'man': 2, 'name': 1, 'melchizedek': 2, 'claim': 1, 'king': 1, 'salem': 1, 'echo': 1, '’': 5, 'advic': 1, 'person': 3, 'legend': 3, 'journey': 3, 'convinc': 2, 'sell': 1, 'flock': 1, 'set': 1, 'tangier': 2, 'arriv': 1, 'thief': 1, 'rob': 1, 'forc': 1, 'find': 3, 'work': 2, 'local': 1, 'crystal': 1, 'merchant': 3, 'conserv': 1, 'kindli': 1, 'teach': 2, 'sever': 2, 'lesson': 1, 'encourag': 1, 'take': 6, 'risk': 2, 'busi': 1, 'pay': 1, 'becom': 2, 'rich': 1, 'year': 1, 'decid': 2, 'cash': 1, 'earn': 1, 'continu': 

### Categorization

In [55]:
from nltk.corpus import brown

In [56]:
def find_similarity(X,Y):
    """Cosine similarity between the keyword sets of two strings.

    Both strings are tokenized with ``word_tokenize`` and English stopwords are
    removed. Each string is then treated as a binary vector over the union of
    keywords, so the cosine reduces to ``|X ∩ Y| / sqrt(|X| * |Y|)``.

    Args:
        X: First text.
        Y: Second text.

    Returns:
        A float in [0, 1]; 0.0 when either string has no keywords left after
        stopword removal (previously a ZeroDivisionError).
    """
    # A set gives constant-time membership tests instead of scanning the list per token.
    sw = set(stopwords.words('english'))

    # remove stop words from string
    X_set = {w for w in word_tokenize(X) if w not in sw}
    Y_set = {w for w in word_tokenize(Y) if w not in sw}

    # An empty keyword set has a zero-length vector; define its similarity as 0.
    if not X_set or not Y_set:
        return 0.0

    # cosine formula: dot product of binary vectors == size of the intersection
    shared = len(X_set & Y_set)
    return shared / float((len(X_set) * len(Y_set)) ** 0.5)

In [64]:
from heapq import nlargest

In [68]:
def find_genre(summary):
    """Print the three Brown-corpus genres most similar to ``summary``.

    For each genre, every corpus sentence is compared with every sentence of
    the summary using ``find_similarity``; the similarities are summed and
    divided by the number of sentences in that genre. The three genres with
    the highest averages are printed as a list, best first.

    Args:
        summary: The text to categorise; split into sentences with
            ``sent_tokenize``.

    Returns:
        None. The result is printed.
    """
    # Order matters only for ties: nlargest keeps the earlier genre first.
    genres = ('mystery', 'editorial', 'fiction', 'humor',
              'lore', 'religion', 'romance', 'science_fiction')

    X = sent_tokenize(summary)

    Tv = {}
    for genre in genres:
        genre_sents = brown.sents(categories=genre)

        total = 0
        for tokens in genre_sents:
            # Corpus sentences are token lists; rebuild a string for find_similarity.
            Y = ' '.join(tokens)
            for summary_sentence in X:
                total += find_similarity(Y, summary_sentence)

        # Normalise by this genre's own sentence count. The pasted
        # science_fiction block used the mystery count, which skewed its rank.
        # Computing the average after the loops also keeps it defined when
        # the summary has no sentences.
        genre_len = len(genre_sents)
        Tv[genre] = total / genre_len if genre_len else 0

    three_largest = nlargest(3, Tv, key=Tv.get)
    print(three_largest)


In [73]:
find_genre(summary)  # categorise the summary produced by the summarisation section; prints the top three genres

['mystery', 'romance', 'fiction']
