# Extract HTML content

In [42]:
import os
import time

import requests
from bs4 import BeautifulSoup

In [53]:
# phrase that appears if we can't extract all html
# (presumably the fallback text of an embedded audio player - TODO confirm).
# Any paragraph whose text contains it is skipped when article text is collected.
stop_phrase = "Your browser does not support the audio element."

In [55]:
nPages = 128 # 128 pages gives about 3200 articles (25 per page)
count_articles = 0 # count total number of articles we have

# seconds to wait on a single HTTP request before giving up; without a timeout
# a stalled connection would block the whole run indefinitely
request_timeout = 30

# (file name, raw html bytes) for every article, kept in memory so the html
# can be written to disk afterwards
article_html = []

# iterate through url pages to get list of article HTML's
# NOTE(review): listing pages are requested starting at page=2 - confirm page 1 is skipped on purpose
for i in range(2, nPages + 2):
    page_url = f"https://www.nature.com/latest-news?page={i}"

    # create soup object for each page
    r = requests.get(page_url, timeout=request_timeout)
    soup = BeautifulSoup(r.content, "html.parser")

    # get HTML of all articles on current page
    page_html = soup.find_all("div", {"class": "c-article-item__content c-article-item--with-image"})

    # iterate over each article
    for article in page_html:
        # In each 'a' tag, 'href' contains the article link
        link = "https://www.nature.com" + article.find("a")["href"]

        # create soup object for each article
        r2 = requests.get(link, timeout=request_timeout)
        soup2 = BeautifulSoup(r2.content, "html.parser")

        # first h3 tag is the article title
        title = article.find("h3").text

        # remember article title and html so the html can be saved later
        file_name = title.replace(" ", "_")
        article_html.append((file_name, r2.content))

        # get date, check if missing
        date_html = soup2.find_all("li", {"class": "c-article-identifiers__item"})
        date = date_html[-1].text if date_html else None # last element is date

        # some authors are missing (or have no link inside the list item), check for None
        author_html = soup2.find("li", {"class": "c-author-list__item"})
        author_link = author_html.find("a") if author_html is not None else None
        author = author_link.text if author_link is not None else None

        # get body of each article
        # body_html will be None if article is in a different html format than usual
        body_html = soup2.find("div", {"class": "c-article-body u-clearfix"})
        body = []
        if body_html is not None:
            for p in body_html.find_all("p"):
                paragraph = p.text.strip()
                # skip paragraphs that only carry the stop phrase
                if stop_phrase not in paragraph:
                    body.append(paragraph)
        # always (re)build the body string so an article without a body gets an
        # empty body instead of an undefined / previous article's text
        str_body = " ".join(body)

        # write title, author, date, body to csv file
        # use | as delimiter
        # NOTE: fields are written verbatim - a "|" or newline inside a field is not escaped,
        # and the file is opened in append mode, so re-running this cell adds duplicate rows
        with open("articles.txt", "a", encoding="utf-8") as fa:
            fa.write(f"{title}|{author}|{date}|{str_body}\n")

        count_articles += 1
        if count_articles % 200 == 0: print(f"Extracted {count_articles} articles.")

        # sleep for 1 second to not overload website
        time.sleep(1)

print(f"{count_articles} total articles have been saved.")

Extracted 200 articles.
Extracted 400 articles.
Extracted 600 articles.
Extracted 800 articles.
Extracted 1000 articles.
Extracted 1200 articles.
Extracted 1400 articles.
Extracted 1600 articles.
Extracted 1800 articles.
Extracted 2000 articles.
Extracted 2200 articles.
Extracted 2400 articles.
Extracted 2600 articles.
Extracted 2800 articles.
Extracted 3000 articles.
3198 total articles have been saved.


# Write Article HTML to Files

In [None]:
# Save the raw html of every scraped article as project_htmls/<title>.html

# make sure the output folder exists before writing into it
os.makedirs("project_htmls", exist_ok=True)

for title, html in article_html:
    # a "/" inside a title would otherwise be interpreted as a directory separator
    safe_title = title.replace("/", "_")
    # html is raw bytes, so write in binary mode
    with open(f"project_htmls/{safe_title}.html", "wb") as fw:
        fw.write(html)

# Extract sentences from articles

In [48]:
### Get list of all sentences in all articles and save in a file ###
# Relies on `os`, `BeautifulSoup` and `stop_phrase` from the cells at the top of the notebook.
import nltk

# get all html files from folder; sorted because os.listdir returns entries in
# arbitrary order, which would make the sentence order differ between runs
html_files = sorted(file for file in os.listdir("project_htmls") if file.endswith(".html"))

# iterate through all articles to extract text
sentences = []
count = 0 # number of articles that had a recognisable body
for article in html_files:
    # read html content of each article (read-only binary mode is sufficient)
    with open(f"project_htmls/{article}", "rb") as fr:
        soup = BeautifulSoup(fr.read(), "html.parser")

    # get text of each article
    body_html = soup.find("div", {"class": "c-article-body u-clearfix"})

    # body_html will be None if article is in a different html format than usual
    if body_html is None:
        continue

    for p in body_html.find_all("p"):
        paragraph = p.text.strip()
        # skip paragraphs that carry the stop phrase
        if stop_phrase in paragraph:
            continue
        # split the paragraph into sentences and collect them, stripped of white space
        for sent in nltk.sent_tokenize(paragraph):
            sentences.append(sent.strip())

    count += 1
    if count % 100 == 0: print(f"Extracted sentences from {count} articles.")


# write all sentences to a file (each sentence on new line)
# NOTE: a sentence that contains an embedded newline will span several lines in this file
with open("project_sentences.txt", "w") as fw:
    for s in sentences:
        fw.write(f"{s}\n")

print(f"\nExtracted {len(sentences)} sentences from {count} articles.")

Extracted sentences from article 100.
Extracted sentences from article 200.
Extracted sentences from article 300.
Extracted sentences from article 400.
Extracted sentences from article 500.
Extracted sentences from article 600.
Extracted sentences from article 700.
Extracted sentences from article 800.
Extracted sentences from article 900.
Extracted sentences from article 1000.
Extracted sentences from article 1100.
Extracted sentences from article 1200.
Extracted sentences from article 1300.
Extracted sentences from article 1400.
Extracted sentences from article 1500.
Extracted sentences from article 1600.
Extracted sentences from article 1700.
Extracted sentences from article 1800.
Extracted sentences from article 1900.
Extracted sentences from article 2000.
Extracted sentences from article 2100.
Extracted sentences from article 2200.
Extracted sentences from article 2300.
Extracted sentences from article 2400.
Extracted sentences from article 2500.
Extracted sentences from article 2

# Categorize sentences into positive, negative, and neutral

In [39]:
### categorize sentences into positive, negative, and neutral ###
import pandas as pd
from textblob import TextBlob

# read all sentences (one per line); the file is closed again before the
# slow sentiment pass starts
with open("project_sentences.txt", "r") as fr:
    s = [line.strip() for line in fr]

# perform sentiment analysis from TextBlob() for sampling purposes
df = pd.DataFrame({"sentence": s})
# drop duplicates
df = df.drop_duplicates(keep="first")

# analyse every sentence once and keep both scores, instead of building a
# TextBlob twice per sentence
sentiments = [TextBlob(x).sentiment for x in df.sentence]
df["polarity"] = [sentiment.polarity for sentiment in sentiments] # polarity of each sentence
df["subjectivity"] = [sentiment.subjectivity for sentiment in sentiments] # subjectivity of each sentence

# categorize into positive, negative, and neutral based on 0.5 polarity and above 0.6 subjectivity
# note: subjectivity allows us to get less neutral sentences
positive = df[(df.polarity >= 0.5) & (df.subjectivity >= 0.6)]
neutral = df[(-0.5 < df.polarity) & (df.polarity < 0.5) & (df.subjectivity >= 0.6)]
negative = df[(df.polarity <= -0.5) & (df.subjectivity >= 0.6)]

In [40]:
# number of sentences per category, in the order: positive, negative, neutral
print(*(len(category.sentence) for category in (positive, negative, neutral)))

3644 1388 19300


## Sample sentences

In [44]:
### Draw 2000 sentences for manual labeling: 40% positive, 40% negative, 20% neutral ###
### The shuffled sample is exported to an Excel workbook ###

# each category is sampled with its own fixed seed so the draw is repeatable
pos_sample = positive["sentence"].sample(n=800, random_state=1, ignore_index=True)
neg_sample = negative["sentence"].sample(n=800, random_state=3, ignore_index=True)
neutral_sample = neutral["sentence"].sample(n=400, random_state=5, ignore_index=True)

# stack the three samples, then shuffle them so the categories are interleaved
combined = pd.concat([pos_sample, neg_sample, neutral_sample], ignore_index=True)
sample_sentences = combined.sample(frac=1, random_state=10, ignore_index=True)

# Excel file that is labeled by hand afterwards
sample_sentences.to_excel("project_excel_sentences.xlsx")