# Task 2
## Instructions
Create a new Jupyter Notebook on your system.
Complete your own implementation of a web spider. 
Combine it with the linguistic analysis code we saw previously, and use it to generate summaries of the pages. You can do this in one of two ways, either:

1. Scrape a single page and then run your code on this page.
2. Scrape a series of interlinked pages (e.g. looking for the a href element) and summarise each individual page plus a broad summary of all pages.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

r = requests.get('https://citizen.co.za/category/news-world')
data = r.text
soup = BeautifulSoup(data, 'html.parser')

In [3]:
articles = soup.find('div', {'class':'article-leads'}).find_all('a', 
                        href=re.compile('https://citizen.co.za/news/news-world/*'))
print(articles)

[<a href="https://citizen.co.za/news/news-world/2456371/french-mps-back-setting-age-of-sexual-consent-at-15/" title="Link to French MPs back setting age of sexual consent at 15">
<img alt="French MPs back setting age of sexual consent at 15" class="img-responsive" src="https://citizen.co.za/wp-content/uploads/2021/03/iStock-897099028-389x259.jpg"/>
</a>, <a href="https://citizen.co.za/news/news-world/2456371/french-mps-back-setting-age-of-sexual-consent-at-15/" title="Link to French MPs back setting age of sexual consent at 15">French MPs back setting age of sexual consent at 15</a>, <a href="https://citizen.co.za/news/news-world/2456350/watch-prince-philip-discharged-from-hospital/" title="Link to WATCH: Prince Philip discharged from hospital">
<img alt="WATCH: Prince Philip discharged from hospital" class="img-responsive" src="https://citizen.co.za/wp-content/uploads/2021/03/000_9639CY-389x259.jpg"/>
</a>, <a href="https://citizen.co.za/news/news-world/2456350/watch-prince-philip-dis

In [4]:
links = []
for i in range(8):
    links.append(articles[i].get('href'))

In [5]:
links_select = links[::2]
print(links_select)

['https://citizen.co.za/news/news-world/2456371/french-mps-back-setting-age-of-sexual-consent-at-15/', 'https://citizen.co.za/news/news-world/2456350/watch-prince-philip-discharged-from-hospital/', 'https://citizen.co.za/news/news-world/2456109/33-years-after-iraq-chemical-attack-survivors-still-seeking-justice/', 'https://citizen.co.za/news/news-world/2456098/jabs-in-isolation-how-a-remote-portuguese-island-dodged-covid/']


In [12]:
article_dict = {}
docs = []

for link in links_select:
    r = requests.get(link)
    data = r.text
    soup = BeautifulSoup(data, 'html.parser')
    article_title = soup.find('h1', {'class':'single-headline'}).text
    article_headline = soup.find('h2', {'class':'single-excerpt'}).text
    article_dict[article_title] = article_headline
    
    p_tags = soup.find('div', {'class':'single-content'}).find_all('p')
    p_tags_text = [tag.get_text().strip() for tag in p_tags]
    
    sentence_list = [sentence for sentence in p_tags_text if not '\n' in sentence]
    sentence_list = [sentence for sentence in p_tags_text if '.' in sentence]
    
    article = ' '.join(sentence_list)
    docs.append(article)

In [13]:
#from another student
import numpy as np 
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ChrisGough/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ChrisGough/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ChrisGough/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
tokens = []
texts = []
all_texts = []

for d in docs:
    data = d
    
    #tokenize each doc into words
    word_token = word_tokenize(data)
    
    #change tokens into lower case and remove non alphanumeric characters
    word_token = [t.lower().strip() for t in word_token if t.isalpha()]
    
    #remove stopwords
    word_token = [t for t in word_token if t not in stopwords.words('english')]
    
    #lemmatize the tokens
    wordnet_lemmatizer = WordNetLemmatizer()
    word_token = [wordnet_lemmatizer.lemmatize(t) for t in word_token]
    
    texts.append(word_token)
    
    #append unique tokens in token
    for t in word_token:
        if t not in tokens:
            tokens.append(t)
            
#combine each list in texts into all_texts
for i in texts:
    all_texts += i
    
print(all_texts)

['french', 'lawmaker', 'backed', 'bill', 'late', 'monday', 'setting', 'minimum', 'age', 'sexual', 'consent', 'marking', 'major', 'step', 'country', 'traditionally', 'permissive', 'attitude', 'sex', 'member', 'lower', 'house', 'parliament', 'voted', 'unanimously', 'bring', 'france', 'consent', 'law', 'line', 'western', 'country', 'following', 'wave', 'allegation', 'sexual', 'abuse', 'incest', 'described', 'france', 'second', 'metoo', 'movement', 'bill', 'sex', 'child', 'would', 'considered', 'rape', 'punishable', 'year', 'prison', 'unless', 'small', 'age', 'gap', 'two', 'partner', 'justice', 'minister', 'eric', 'said', 'vote', 'sent', 'clear', 'message', 'child', 'current', 'french', 'law', 'prosecutor', 'prove', 'minor', 'forced', 'threatened', 'tricked', 'sex', 'adult', 'order', 'bring', 'charge', 'rape', 'sexual', 'assault', 'draft', 'law', 'initiated', 'member', 'senate', 'suggested', 'age', 'consent', 'set', 'would', 'one', 'lowest', 'europe', 'president', 'emmanuel', 'macron', 'go

In [15]:
def idf(texts, tokens):
    
    idf_values = {}
    
    for t in tokens:
        doc_freq = 0
        
        for text in texts:
            if t in text:
                doc_freq += 1
        idf_values[t] = np.log10(len(texts)/(doc_freq+1))
    return idf_values

In [16]:
idf = idf(all_texts, tokens)
sort_orders = sorted(idf.items(), key=lambda x: x[1], reverse=True)

In [17]:
sort_orders[0:10]

[('lawmaker', 2.7168377232995247),
 ('backed', 2.7168377232995247),
 ('monday', 2.7168377232995247),
 ('setting', 2.7168377232995247),
 ('minimum', 2.7168377232995247),
 ('marking', 2.7168377232995247),
 ('traditionally', 2.7168377232995247),
 ('permissive', 2.7168377232995247),
 ('attitude', 2.7168377232995247),
 ('parliament', 2.7168377232995247)]