## Part 1: For each website, create a JSON object of the page that have

In [9]:
import requests
from html.parser import HTMLParser
import re
import csv
import os
import os.path
import pprint
import json

# Search a topic based on keywords
# Save all urls in a list
# From the content of the urls from that list, grab images, sentence contained, outgoing links, last modified
# Topic: felines

def request_page(url):
    '''makes a get request. Returns request object'''
    try:
        r = requests.get(url, allow_redirects=True)
        return r
    except:
        return 'exception'

def get_sentences(request_result, keywords):
    '''Gets sentences (string of letters that end in a period) that use keywords from a given url requested
    Input: r object
    Output: set of strings (of sentences). String 'exception' if there is an exception'''

    r = request_result

    if r != 'exception':
        sentences = re.findall(" [a-zA-Z ]+\.", str(r.content))
        topic_sentences = [sent for sent in sentences if any(keyword in sent for keyword in keywords)]
        return set(topic_sentences)
    else:
        return 'exception'

def get_last_modified(request_result):
    '''Returns last modified if available, otherwise it returns an empty string'''
    
    r = request_result

    if r != 'exception':
        header = requests.head(url).headers

        if 'Last-Modified' in header:
            return header['Last-Modified']
        else:
            return ''
    else:
        return 'exception'

class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        '''writes all images in a file and outgoing links in another file'''
        if tag.lower() == 'img' and 'src' in (k.lower() for k, v in attrs):
            with open('images.csv', 'a') as img_file:
                img_file.write(dict(attrs).get('src') + ',')

        if tag.lower() == 'a' and 'href' in (k.lower() for k, v in attrs):
            with open('outgoing_links.csv', 'a') as links_file:
                links_file.write(dict(attrs).get('href') + ',')
                
def file_to_list(afile):
    '''Returns a list out of a csv file and deletes the file'''
    with open(afile) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            alist = set(list(csv_reader)[0])
    os.remove(afile)
    return alist

def get_images(request_result):
    '''Gets all images from a request object. 
    Input: r object
    Output: list strings (of image urls)'''

    r = request_result

    if r != 'exception':
        parser = MyHTMLParser()
        parser.feed(str(r.content))

        if os.path.isfile('images.csv'):
            images = list(file_to_list('images.csv'))
        else:
            images = []
        return images
    else:
        return 'exception'

def count_outlinks(request_result):
    '''Gets all images from a given url request result. 
    Input: r object
    Output: integet (count of image urls)'''

    r = request_result

    if r != 'exception':
        parser = MyHTMLParser()
        parser.feed(str(r.content))

        if os.path.isfile('outgoing_links.csv'):
            out_links = file_to_list('outgoing_links.csv')
        else:
            out_links = []

        return len(out_links)
    else:
        return 'exception'


In [10]:
keywords = ['cat', 'tiger', 'lion']
N_SENTENCES = 3
N_IMAGES = 3
topic_urls = []

for keyword in keywords:
    url = 'https://www.google.com/search?q='+ keyword
    r = requests.get(url, allow_redirects=True)
    keyword_urls = re.findall("https://[^<]+", str(r.content))
    topic_urls.extend(keyword_urls)

# topic_pages = [(url1, {'images':  [img1, img2, img3], 
#                   'sentences': [sentence1, sentence2, sentence3], 
#                   'n_outlinks': n, 
#                   'last_modified': date})
#             (url2, {}),
#             (url3, {})...]

topic_pages = []

for url in topic_urls:
    url_dict = {}
    if 'google' not in url and 'gstatic' not in url and 'youtube' not in url:
        r = request_page(url)
        if r != 'exception' and len(get_sentences(r, keywords)) != 0:
            url_dict['sentences'] = list(get_sentences(r, keywords))[:N_SENTENCES]
            img = get_images(r)[:N_IMAGES]
            img.extend((N_IMAGES - len(img))*['']) # to make sure all have the same number of images.
            url_dict['images'] = img
            url_dict['n_outlinks'] = count_outlinks(r)
            url_dict['last_modified'] = get_last_modified(r)
            url_tup = tuple((url, url_dict))
            topic_pages.append(url_tup)

json.dump(topic_pages, open('topic_pages.json', 'w'), indent=4)
pprint.pprint(topic_pages)


[('https://www.cat.com/&amp;sa=U&amp;ved=0ahUKEwiXuZDz-Z7hAhXWvp4KHbYMBMwQFggUMAA&amp;usg=AOvVaw1zB9zKWUPdce9NjLrngZ-E">',
  {'images': ['',
              'https://www.awf.org/sites/default/files/styles/featured_animals/public/media/gallery/wildlife/Lion/Lion_Billy_Dodson.jpg?itok=Q9a6icy1',
              'https://www.awf.org/sites/default/files/styles/featured_animals/public/media/masthead/new_gorilla_masthead.jpg?itok=rTCQj1tC'],
   'last_modified': '',
   'n_outlinks': 189,
   'sentences': [' Check out our extensive online parts catalog.']}),
 ('https://www.cat.com/%252Bcat%26hl%3Den%26ct%3Dclnk&amp;sa=U&amp;ved=0ahUKEwiXuZDz-Z7hAhXWvp4KHbYMBMwQIAgXMAA&amp;usg=AOvVaw3RFbqPYSlgAiP1miZnLkjq">Cached',
  {'images': ['',
              'http://s7d2.scene7.com/is/image/Caterpillar/C555056?$cc-s$',
              'http://s7d2.scene7.com/is/image/Caterpillar/C755332?$cc-s$'],
   'last_modified': '',
   'n_outlinks': 189,
   'sentences': [' Check out our extensive online parts catalog.']}),
 (

## Part 2: Flatten the output of part 1 by sentences and write to a CSV

In [15]:
def flatten_by_sentence(json_obj):
    '''Takes the json object created in part one:
        topic_pages = [
                    (url1, {'images':  ['https://aaa.com/a.jpg'], 
                          'sentences': [' lions roamed the Earth.', sentence12, sentence13], 
                          'n_outlinks': 5, 
                          'last_modified': 10:20:40 pm Apr 3 2019})
                    (url2, {'images': ['https://bbb.com/b.jpg', 'https://ccc.com/c.jpg'], 
                            'sentences': ['a fearsome Bengal tiger.', sentence22, sentence23]
                            'n_outlinks': 5, 
                            'last_modified': ''}),
                    (url3, {})...
                    ]
    and flattens it to be of the form:
    entence,                    image1,                 image2,                 image3, n outgoing links, last modified
    “ lions roamed the Earth.”, “https://aaa.com/a.jpg”, ?,                         ?,          5,          10:20:40 pm Apr 3 2019
    sentence12                , “https://aaa.com/a.jpg”, ?,                         ?,          5,          10:20:40 pm Apr 3 2019
    sentence13                , “https://aaa.com/a.jpg”, ?,                         ?,          5,          10:20:40 pm Apr 3 2019
    “ a fearsome Bengal tiger.”, “https://bbb.com/b.jpg”, “https://ccc.com/c.jpg”,  ?,          5,          ?
    sentence22                 , “https://bbb.com/b.jpg”, “https://ccc.com/c.jpg”,  ?,          5,          ?
    sentence23                 , “https://bbb.com/b.jpg”, “https://ccc.com/c.jpg”,  ?,          5,          ?
    '''

    with open(json_obj) as json_data:
        topic_data = json.load(json_data)

    for url_tup in topic_data:
        sentences = url_tup[1]['sentences']
        images = url_tup[1]['images']
        n_outlinks = url_tup[1]['n_outlinks']
        last_modified = url_tup[1]['last_modified']
        
        chunk = []
        for sentence in sentences:
            line = []
            line.append(sentence)
            line.extend(images)
            line.append(n_outlinks)
            line.append(last_modified)
            chunk.append(line)

        with open('topic_pages.csv', 'a') as csvfile:
            csvwriter = csv.writer(csvfile) 
            csvwriter.writerows(chunk)
            print(chunk)

flatten_by_sentence('topic_pages.json')

[[' Check out our extensive online parts catalog.', '', 'https://www.awf.org/sites/default/files/styles/featured_animals/public/media/gallery/wildlife/Lion/Lion_Billy_Dodson.jpg?itok=Q9a6icy1', 'https://www.awf.org/sites/default/files/styles/featured_animals/public/media/masthead/new_gorilla_masthead.jpg?itok=rTCQj1tC', 189, '']]
[[' Check out our extensive online parts catalog.', '', 'http://s7d2.scene7.com/is/image/Caterpillar/C555056?$cc-s$', 'http://s7d2.scene7.com/is/image/Caterpillar/C755332?$cc-s$', 189, '']]
[[' Check out our extensive online parts catalog.', '', 'http://s7d2.scene7.com/is/image/Caterpillar/C555056?$cc-s$', 'http://s7d2.scene7.com/is/image/Caterpillar/C755332?$cc-s$', 189, '']]
[[' Check out our extensive online parts catalog.', '', 'http://s7d2.scene7.com/is/image/Caterpillar/C555056?$cc-s$', 'http://s7d2.scene7.com/is/image/Caterpillar/C755332?$cc-s$', 189, '']]
[[' Check out our extensive online parts catalog.', '', 'http://s7d2.scene7.com/is/image/Caterpill