## Scraping CNA Articles by Topic
There are 2 steps to scraping the CNA articles, in the first portion, we will be scraping article links only for each topic. After obtaining the links, we then pass through each link to sieve out the necessary information. 

In [3]:
# ! pip install beautifulsoup4
# ! pip install requests

import urllib.request,sys,time
import os
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd

import regex as re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk import FreqDist

from nltk.corpus import stopwords
import gensim
from gensim.parsing.preprocessing import STOPWORDS

#### Reading csv file of a manually compiled list of links for different topics

In [4]:
cna_links = pd.read_csv('../data/Trending Topics/links2.txt')
cna_links

Unnamed: 0,topic,cna_search_link,num_pages
0,sports,https://www.channelnewsasia.com/topic/sports?s...,50
1,science,https://www.channelnewsasia.com/topic/science?...,6
2,law,https://www.channelnewsasia.com/topic/law?sort...,10


### Part 1: Creating a function to scrape the URLs

The main url we are scraping from changes for each topic, where articles tagged as under the topic will appear under the url. We will interate through the different pages of the url by changing the ending page number.

The function loops through the pages using the pageNum parameter.

Please update the headers variable with your own user agent. This prevents us from running into error 403: Forbidden when we scrape the URLs. 


To do so, 
1. Press F12 to navigate to the Chrome developer console.
2. Type in <code>navigator.userAgent</code> in the console and execute it by hitting enter.
3. Copy over your user agent and replace the value with your own in the headers dictionary.

For more details, refer to https://stackoverflow.com/questions/38489386/python-requests-403-forbidden

In [5]:
def get_urls(topic, link, ending_page):
    
    #rename csv as name_urls.csv
    with open(f'../data/Trending Topics/cna_urls/{topic}_cna_urls.csv', 'w', newline='') as file: #create a csv to input scrapped urls
        writer = csv.writer(file)
        writer.writerow(["Page", "URL"]) #create first header row with the column names as "Page" to indicate page number and "URL"
    
        for i in range(0, ending_page+1):
            try:
                url = link + str(i)
                headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"}
                
                result = requests.get(url, headers=headers)

                soup=BeautifulSoup(result.text,'html.parser')

                article_urls = soup.findAll('a', attrs={'class' : 'h6__link h6__link-- list-object__heading-link'}) 
                for url in article_urls:
                    #add row in csv with the page number and scrapped url
                    writer.writerow([i, 'https://www.channelnewsasia.com'+url['href']])                                                                                        #link['href'] only gives us the relative path, not the absolute path, so we need to add the missing domain
                                                                                    

            except Exception as e:
                print('exception')
                error_type, error_obj, error_info = sys.exc_info()      # get the exception information
                print ('ERROR FOR LINK:',url)                          #print the link that cause the problem
                print (error_type, 'Line:', error_info.tb_lineno)     #print error info and line that threw the exception
                continue                                              #ignore this page. Abandon this and go back.

            time.sleep(2) 

In [6]:
for topic, link, num_pages in zip(cna_links["topic"], cna_links["cna_search_link"], cna_links["num_pages"]):
    get_urls(topic, link, num_pages)
    print(f"{topic} completed")

sports completed
science completed
law completed


### Part 2: Getting information from each link

Different functions are written to obtain specific information, by accessing the HTML tag on the CNA webpage.

In [7]:
def get_title(soup):
    title = soup.find("h1", attrs={'h1 h1--page-title'})

    return title
        
def get_text(soup):
    text = ''
    article = soup.find_all("div", attrs={"class": "text-long"})

    for i in range(len(article)):
        each_class = article[i]
        articleParagraph = each_class.find_all("p")
        for i in range(len(articleParagraph)):
            text += articleParagraph[i].text + '\n'
            
    return text

def get_related_topics(soup):
    other_keywords = []
    
    related_topics = soup.find("section", attrs={"class": "block block-layout-builder block-field-blocknodearticlefield-topics clearfix block--related-topics"})
    
    try:
        tags = related_topics.find_all("a")

        for tag in tags:
            tag_text = tag.text
            clean_tag = tag_text.replace("\n ", "")
            other_keywords.append(clean_tag)

    except Exception as e: # when article does not have any related topic tags
        other_keywords.append("")

    return other_keywords

In [8]:
def get_url_data(url):
    url_data = {}

    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"}
    result = requests.get(url, headers=headers)
    
    if result.status_code != 404:
    
        soup=BeautifulSoup(result.text,'html.parser')

        url_data['title'] = get_title(soup)
        url_data['text'] = get_text(soup)
        url_data['related_topics'] = get_related_topics(soup)

    return url_data

In [9]:
PATH = "../data/Trending Topics/cna_urls"

# Obtaining article title, text and related topic keywords for each article 
for file in os.listdir(PATH):
    topic = file.split("_")[0] 
    url_df = pd.read_csv(os.path.join(PATH, file))
    
    url_df['data'] = url_df.apply(lambda x: get_url_data(x['URL']), axis=1)

    url_df.to_csv(f"../data/Trending Topics/cna_text/{topic}_text.csv")

## Creating dictionary for each topic
After collecting a repository of words from CNA articles as the source, we will select the top 1000 frequently used words for each topic as a dictionary of words for that topic. The dictionary will serve as a word bank for each topic to tag social media text to their topics.

#### Processing text to remove unnecessary HTML tags and stopwords

##### Combining Stopwords from different libraries
- NLTK
- spaCy (cannot get the package installed)
- Gensim
- scikit-learn

https://towardsdatascience.com/text-pre-processing-stop-words-removal-using-different-libraries-f20bac19929a

In [10]:
nltk_stopwords = set(stopwords.words('english'))

In [11]:
spacy_stopwords = {'‘ll', 'a', 'several', 'three', 'ten', 'few', 'you', 'since', 'until', 'under', 'namely', 'along', 'while', 'bottom', 'also', "'ll", 'this', '’re', 'anyone', 'whoever', 'already', 'their', 'who', 'third', 'therein', 'its', 'say', '’ve', 'fifteen', 'down', 'whole', 'doing', 'can', 'became', 'whereafter', 'eleven', 'during', 'call', 'formerly', 'via', 'on', 'almost', 'not', 'her', 'he', 'make', 'every', 'ca', 'as', 'nine', 'with', 'see', 'n‘t', 'anything', 'our', 'into', 'part', 'by', 'side', 'own', 'another', 'twenty', 'myself', 'other', 'next', 'over', 'regarding', 'afterwards', 'the', 'again', 'himself', 'nothing', 'even', 'an', 'that', "'d", 'meanwhile', '‘d', 'ever', 'has', 'show', '‘s', 'seems', 'so', 'back', 'everyone', '‘ve', 'nowhere', 'yet', 'she', 'both', 'my', 'per', 'be', 'thereupon', 'above', 'them', 'were', 'thereby', 'his', 'we', 'but', "'s", 'becomes', 'him', 'though', 'whether', 'each', 'fifty', 'yours', 'us', 'here', 'how', 'me', 'some', 'quite', 'anyway', 'former', 'wherever', 'onto', 'put', 'where', 'moreover', 'well', 'at', 'one', '’ll', 'around', 'amongst', 'no', 'is', 'mine', 'becoming', 'very', 'was', 'cannot', 'had', 'whenever', 'sometime', 'for', 'they', 'using', 'seemed', 'such', 'thru', 'among', 'hers', 'within', '‘re', 'various', 'front', '‘m', 'alone', 'do', 'then', 'about', 'behind', 'five', 'i', 'always', 'towards', 'least', 'themselves', 'whereby', 'name', 'go', 'whatever', 'upon', 'wherein', 'because', 'latterly', 'to', 'others', 'all', 'first', 'why', 'through', 'herself', 'what', 'somehow', 'in', 'might', 'been', 'mostly', 'elsewhere', 'unless', 'after', 'latter', 'throughout', 'if', 'four', 'hence', 'which', 'yourselves', 'often', 'am', 'perhaps', 'whom', 'are', 'seeming', 'top', 'something', 'amount', 'whereupon', 're', 'get', 'however', 'made', 'done', "'re", 'hereby', 'against', 'anyhow', 'become', 'does', 'keep', 'out', 'your', 'many', 'may', 'must', 'yourself', 'sixty', 'just', 'between', 'toward', 'whereas', "'m", 'last', 'twelve', 'empty', 'would', 'someone', 'otherwise', 'have', 'without', 'nobody', '’s', 'only', 'six', 'and', 'most', 'more', 'besides', 'two', 'although', 'seem', 'everywhere', 'eight', 'should', 'too', 'n’t', 'beforehand', 'now', 'together', 'somewhere', 'whose', 'enough', 'neither', 'nevertheless', '’m', 'still', 'same', 'sometimes', 'due', 'ours', 'either', 'being', 'hereupon', 'when', 'these', 'further', 'noone', 'any', 'therefore', 'there', "n't", 'beside', 'off', 'it', 'thereafter', 'never', 'those', 'from', 'really', 'could', 'whence', 'take', 'below', 'herein', 'beyond', '’d', 'move', 'thence', 'anywhere', 'thus', 'hereafter', 'everything', 'except', "'ve", 'whither', 'up', 'of', 'across', 'or', 'will', 'else', 'before', 'rather', 'none', 'full', 'indeed', 'hundred', 'serious', 'used', 'did', 'once', 'nor', 'forty', 'much', 'please', 'less', 'give', 'ourselves', 'itself', 'than'}

In [12]:
gensim_stopwords = STOPWORDS

In [13]:
print("Number of stopwords from nltk:", len(nltk_stopwords))

nltk_stopwords.update(spacy_stopwords)
print("Number of stopwords from nltk + spaCy:", len(nltk_stopwords))

nltk_stopwords.update(gensim_stopwords)
print("Number of stopwords from nltk + spaCy + gensim:", len(nltk_stopwords))

combined_stopwords = set(nltk_stopwords)
print("Total number of stopwords:", len(combined_stopwords))

Number of stopwords from nltk: 179
Number of stopwords from nltk + spaCy: 382
Number of stopwords from nltk + spaCy + gensim: 412
Total number of stopwords: 412


In [14]:
additional_stopwords = {'singapore', 'http', 'cna', 'said', 'download',  'app', 'subscribe', 'telegram', 'channel', 'latest', 'including', 'good'}
combined_stopwords.update(additional_stopwords)

print("Updated number of stopwords:", len(combined_stopwords))

Updated number of stopwords: 424


In [15]:
def processing(text):
    text = text.replace("{'title': <h1 class=\"h1 h1--page-title\">", " ").replace("\n", " ").replace("</h1>", " ").replace("\'text\':", " ").replace("\\xa0", " ").replace("\\n", " ").replace("'related_topics': ['     ", " ").replace("download app subscribe telegram channel latest", " ").replace("outbreak http asia", " ").replace("coronavirus outbreak http", " ").replace("http", " ")
    wnl = WordNetLemmatizer()

    lowercase_text = text.lower()
    punctuations_removed = re.sub('[^a-z]', ' ', lowercase_text)
    tokens = word_tokenize(punctuations_removed)
    tokens = [token for token in tokens if len(token) !=1]
    stopwords_removed = [token for token in tokens if token not in combined_stopwords]
    lemmatized_tokens = [wnl.lemmatize(w) for w in stopwords_removed]

    return " ".join(lemmatized_tokens)

#### Creating the dictionary of words for each topic

In [16]:
def extract_bigrams(tokens):
    bigram = ngrams(tokens, 2)
    bigram_fdist = FreqDist(bigram).most_common(100)

    bigram_list = []
    for bigram in bigram_fdist:
        bigram_list.append(bigram[0][0] + ' ' + bigram[0][1])

    return bigram_list

def extract_trigrams(tokens):
    trigram = ngrams(tokens, 3)
    trigram_fdist = FreqDist(trigram).most_common(100)
    
    trigram_list = []
    for trigram in trigram_fdist:
        trigram_list.append(trigram[0][0] + ' ' + trigram[0][1] + ' ' + trigram[0][2])

    return trigram_list

In [20]:
for file in os.listdir("../data/Trending Topics/cna_text"): # read article texts for each topic
    topic = file.split("_")[0]
    print(f"Reading text from topic: {topic}")

    article_df = pd.read_csv(f"../data/Trending Topics/cna_text/{file}", index_col=0)
    article_df["processed"] = article_df["data"].apply(processing)
    
    with open(f"../data/Trending Topics/dictionary/{topic}.csv", 'w') as file: # create new csv under dictionary folder to input top 1000 topical words for each topic
        writer = csv.writer(file)
        text = ''
        for article_text, data in zip(article_df["processed"], article_df["data"]): # loop through all rows of article text
            text += article_text # add all words from the article to a variable text 

        tokens = word_tokenize(text) # tokenize text and returns list
        freq_dist = FreqDist(tokens).most_common(500) # returns a list of tuples (word, freq)
        combined_list = [word_freq[0] for word_freq in freq_dist]
        combined_list += extract_bigrams(tokens)
        combined_list += extract_trigrams(tokens)

        writer.writerow(combined_list)

    print("All words added\n")

Reading text from topic: law
All words added

Reading text from topic: science
All words added

Reading text from topic: sports
All words added



### Additional topic words

#### Education

ministry of education, gifted education programme, special assistance, financial assistance, poly, primary 1, primary 2, primary 3, primary 4, primary 5, primary 6, p1, p2, p3, p4, p5, p6, secondary 1, secondary 2, secondary 3, secondary 4, secondary 5, junior college, jc, uni, olevel, olevels, o level, o levels, nlevel, nevels , n level, n levels, a level, alevels, alevel, a levels, pe, nursery, primary education, homeschool, home based learning, mother tongue, primary school leaving examination, dsa, direct school admission, singapore polytechnic, singapore poly, temasek polytechnic, temasek poly, nanyang polytechnic, nanyang poly, sex education

#### COVID-19
covid, corona, coronavirus, corona virus, covid19, asymptomatic, case fatality rate, clinical trial, community spread, confirmed positive case, contact tracing, tracetogether, contactless, epidemic, pandemic, epidemiology, essential business, herd immunity, immunosuppressed, incubation, incubation period, lockdown, national emergency, outbreak, transmit, transmission, variant, alpha, beta, gamma, delta, omicron, respirator, frontline, frontline workers, frontliners, health workers, healthcare workers, self isolation, quarantine, social distancing, safe distancing, wuhan, symptomatic, vaccine, vaccination, pfizer, moderna, sinovac, booster, work from home, working from home, border, world health organisation, vtl, vaccinated travel lane, travel bubble, travel restrictions, border closure, border reopening, ease restrictions, tighten restrictions, ministry of health, moh, local cases, imported cases, pcr test, art test, test kit, testing centre, swab test, positive cases, surgical masks, medical masks

#### Overlapping words

In [5]:
with open('../data/Trending Topics/dictionary/art.csv') as f:
    art_dict = set([i for i in f][0].split(","))
with open('../data/Trending Topics/dictionary/covid19.csv') as f:
    covid19_dict = set([i for i in f][0].split(","))
with open('../data/Trending Topics/dictionary/education.csv') as f:
    edu_dict = set([i for i in f][0].split(","))
with open('../data/Trending Topics/dictionary/environment.csv') as f:
    env_dict = set([i for i in f][0].split(","))
with open('../data/Trending Topics/dictionary/fashion.csv') as f:
    fashion_dict = set([i for i in f][0].split(","))
with open('../data/Trending Topics/dictionary/food.csv') as f:
    food_dict = set([i for i in f][0].split(","))
with open('../data/Trending Topics/dictionary/health.csv') as f:
    health_dict = set([i for i in f][0].split(","))
with open('../data/Trending Topics/dictionary/politics.csv') as f:
    politics_dict = set([i for i in f][0].split(","))
with open('../data/Trending Topics/dictionary/technology.csv') as f:
    tech_dict = set([i for i in f][0].split(","))

In [6]:
topic_dicts = [('art', art_dict), ('covid19', covid19_dict), ('education', edu_dict), ('environment', env_dict), ('fashion', fashion_dict), ('food', food_dict), ('health', health_dict), ('politics', politics_dict), ('technology', tech_dict)]

In [22]:
def find_overlaps(topic_dict):
    topic_overlapped_list = []
    for i in range(len(topic_dict)):
        topic_overlapped = set()
        for k in range(i+1, len(topic_dict)):
            overlapping_words = list(topic_dict[i][1].intersection(topic_dict[k][1]))
            topic_overlapped.update(overlapping_words)
            # print(f"{topic_dict[i][0]} & {topic_dict[k][0]}: {overlapping_words}")
            # print("")
            topic_overlapped_list.append(topic_overlapped)
    return topic_overlapped_list

In [33]:
overlapped_words = find_overlaps(topic_dicts)
overlapped_words[1]

with open('../data/Trending Topics/dictionary/art.csv') as f:
    writer = csv.writer(f)
    art_dict = set([i for i in f][0].split(","))
    art_dict = [word for word in art_dict if word not in overlapped_words[1]]
    writer.writerow(art_dict)

UnsupportedOperation: not writable