In [1]:
import requests

In [87]:
# %load ../app/app.py
import requests
import urllib
import re
from bs4 import BeautifulSoup


API_URI = 'https://storyweaver.org.in/api/v1/'


def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext


def get_pages_info(resp):
    '''Parse the request from the storyweaver api and get text of the story book
    '''
    pages = resp['data']['pages']
    parsed_info = {'texts': [],
                   'image_url': None,
                   'title': None}
    for page in pages:
        if page['pageType'] == 'FrontCoverPage':
            if parsed_info['image_url'] is None:
                parsed_info['image_url'] = page['coverImage']['sizes'][1]['url']
            if parsed_info['title'] is None:
                soup = BeautifulSoup(page['html'])
                title = soup.findAll("p", {"class": "cover_title"})[0].text
                parsed_info['title'] = title
        if page['pageType'] == 'StoryPage':
            cleantext = BeautifulSoup(page['html'], "lxml").text.replace('\n', ' ').replace('  ','')
            # remove unicode
            cleantext = unicodedata.normalize('NFKC', cleantext).replace('\"', '')
            parsed_info['texts'].append(cleantext)
    parsed_info['text_str'] = ' '.join(parsed_info['texts'])
    return parsed_info


In [294]:
# %load ../scripts/freq_word_extractor.py
from collections import Counter
from nltk import ngrams
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder


STOP_WORDS = set(stopwords.words('english'))


def get_ngrams(tokens, n):
    return [' '.join(list(words)) for words in list(ngrams(tokens, n))]


def clean_and_tokenize_text(text):
    table = str.maketrans('', '', string.punctuation)
    # remove punctuations, lower and tokenize the text
    stripped = [w.translate(table) for w in text.lower().split()]
    [STOP_WORDS.add(word) for word in ['said', 'says',
                                       'saying', 'ask',
                                       'asking', 'like',
                                       'say']]
    words = [word for word in stripped if word.isalpha() and word not in STOP_WORDS]
    return words


def get_best_keywords(text):
    # pick top n based on distance from the max frequency
    words_df = pd.DataFrame(get_top_k_n_words(text, 20), columns=['word', 'freq'])
    words_df['normalized_freq'] = words_df.apply(lambda x: x.freq + len(x.word.split()), axis=1)
    words_df['z_score'] = (words_df.normalized_freq - words_df.normalized_freq.mean()) / words_df.normalized_freq.std(ddof=0)
    return list(words_df[words_df.z_score > 1].word.values)


def get_top_k_n_words(text, k=5, n=2):
    tokens = clean_and_tokenize_text(text)
    ngrams = get_ngrams(tokens, n)
    freq = Counter(tokens + ngrams)
    return freq.most_common(k)


def get_top_bigrams(text, n):
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(clean_and_tokenize_text(text))
    finder.apply_freq_filter(2)
    return [' '.join(list(words)) for words in finder.nbest(bigram_measures.raw_freq, n)]


def get_freq_keywords(text):
    collocations = get_top_bigrams(text, 5)
    freq = [word for word, freq in get_top_k_n_words(text, 10)]
    combined_tags = set(collocations + freq + get_best_keywords(text))
    return [word for word in list(combined_tags) if len(word) > 3][:10]


In [295]:
def get_text_from_book_link(book_link):
    link = API_URI + 'stories/{}/read'.format(book_link)
    resp = requests.get(link).json()
    parsed_resp = get_pages_info(resp)
    text = ' '.join(parsed_resp['texts'])
    return text


- Extract keyword from title
    - Try Pos Tags
- Extract keywords from text
    - Co-occurence
    - Frequency

In [298]:
book_links = ['28270-anna-s-extraordinary-experiments-with-weather',
              '34911-the-case-of-the-missing-water',
              '7-fat-king-thin-dog',
              '44659-meera-and-ameera',
              '26690-miss-laya-s-fantastic-motorbike-does-not-like-fruits']

for book_link in book_links:
    text = get_text_from_book_link(book_link)  
    collocations = get_top_bigrams(text, 5)
    freq = [word for word, freq in get_top_k_n_words(text, 10)]
    combined_tags = set(collocations + freq + get_best_keywords(text))
    print('tags for {}'.format(book_link))
    print([word for word in list(combined_tags) if len(word) > 3][:10])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


tags for 28270-anna-s-extraordinary-experiments-with-weather
['august anna', 'weather', 'anna mani', 'scientist', 'many', 'anna', 'books anna', 'books books', 'mani', 'birthday']
tags for 34911-the-case-of-the-missing-water
['pump man', 'finally sat', 'climbed tank', 'find', 'tank', 'know water', 'stream', 'school', 'ranj sapna', 'sapna']
tags for 7-fat-king-thin-dog
['thin', 'king thin', 'dog fat', 'thin dog', 'king', 'runs', 'run run', 'fat king', 'bird']
tags for 44659-meera-and-ameera
['everything likes', 'person', 'know', 'make', 'ameera', 'meera', 'favourite person', 'everything', 'favourite', 'world know']
tags for 26690-miss-laya-s-fantastic-motorbike-does-not-like-fruits
['fantastic motorbike', 'dhup', 'times', 'motorbike', 'pineapples', 'chandra', 'fantastic', 'laya', 'claps six', 'clapclap']


In [299]:

from nltk.tokenize import wordpunct_tokenize

In [301]:
??wordpunct_tokenize