## News website scraper

Goes through each domain obtained from http://www.abyznewslinks.com/ and extracts the URLs containing more than 5 hyphens ("-"). Those are normally press articles, since the title of the article is usually repeated in the URL. We then open the URLs one by one to extract the "amphtml" link tag. We limit the number of AMP pages discovered to a sample of 10 per domain. From the original AMP URL (found in the source of each AMP-enabled news article), we can build the "amp_viewer_url" and the AMP Cache URL ("amp_cdn_url").

In [2]:
import pandas as pd
import numpy as np
import requests
from scrapy.http import TextResponse
import re

# Alternative desktop (Linux / Chromium) User-Agent, currently disabled.
#USER_AGENT = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/58: .0.3029.110 Chrome/58.0.3029.110 Safari/537.36'}
# Request headers passed to requests.get by the scraping helpers below:
# an Android / Chrome (mobile) User-Agent.
# NOTE(review): presumably a mobile UA is used so sites serve their
# mobile/AMP-aware markup -- confirm.
USER_AGENT = {'User-Agent': 'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/60.0.3112.107 Mobile Safari/537.36'}

def getArticleLinks(domain, timeout=10, hyphen_threshold=5):
    """Collect probable news-article URLs from a domain's front page.

    The page at ``http://<domain>`` is downloaded and every ``<a href>``
    containing more than ``hyphen_threshold`` hyphens is kept, on the
    heuristic that article URLs embed a hyphen-separated title.

    Args:
        domain: Bare host name (no scheme), e.g. ``"example.com"``.
        timeout: Seconds to wait for the HTTP request; without a timeout
            ``requests`` can block forever on an unresponsive host.
        hyphen_threshold: A link is kept only when its href contains more
            than this many ``-`` characters.

    Returns:
        A ``pandas.Series`` of unique absolute http(s) URLs in first-seen
        order. The Series is empty when the request fails or no link
        qualifies.
    """
    from urllib.parse import urljoin, urlsplit

    try:
        r = requests.get("http://" + domain, headers=USER_AGENT, timeout=timeout)
    except requests.exceptions.RequestException:
        # Best effort: an unreachable domain simply yields no links.
        return pd.Series([], dtype=object)

    response = TextResponse(r.url, body=r.text, encoding='utf-8')
    hrefs = response.xpath('//a[contains(@href, "-")]/@href').extract()

    links = []
    for href in hrefs:
        # More hyphens than the threshold: very likely a news article link.
        if href.count('-') <= hyphen_threshold:
            continue
        try:
            # Resolve against the final (post-redirect) page URL so that
            # relative, root-relative and protocol-relative hrefs all become
            # well-formed absolute URLs.
            absolute = urljoin(r.url, href.strip())
            scheme = urlsplit(absolute).scheme
        except ValueError:
            # Malformed href (e.g. broken IPv6 literal); nothing to fetch.
            continue
        # Drop mailto:, javascript:, tel: ... which cannot be fetched later.
        if scheme in ('http', 'https'):
            links.append(absolute)

    return pd.Series(links, dtype=object).drop_duplicates(keep='first')

def getAMPUrl(link, timeout=10):
    """Return the AMP URL(s) advertised by an article page.

    The page is downloaded and the ``href`` of every ``<link>`` element
    whose ``rel`` attribute contains ``"amphtml"`` is extracted.

    Args:
        link: Absolute URL of the article page.
        timeout: Seconds to wait for the HTTP request; without a timeout
            ``requests`` can block forever on an unresponsive host.

    Returns:
        ``None`` when the request fails, otherwise a (possibly empty) list
        of AMP URLs, each resolved to an absolute URL against the final
        page URL.
    """
    from urllib.parse import urljoin

    try:
        r = requests.get(link, headers=USER_AGENT, timeout=timeout)
    except requests.exceptions.RequestException:
        return None

    response = TextResponse(r.url, body=r.text, encoding='utf-8')
    hrefs = response.xpath('//link[contains(@rel, "amphtml")]/@href').extract()

    amp_urls = []
    for href in hrefs:
        try:
            # amphtml hrefs may be relative or protocol-relative; callers
            # expect a full http(s) URL.
            amp_urls.append(urljoin(r.url, href.strip()))
        except ValueError:
            # Malformed href: skip it rather than abort the whole page.
            continue
    return amp_urls

def getSampleLinksByCC(df, cc, n):
    """Pick at most ``n`` random rows of ``df`` whose ``cc`` column equals ``cc``.

    When fewer than ``n`` rows match, all matching rows are returned in
    their original order; otherwise ``n`` rows are drawn at random with
    ``DataFrame.sample``.
    """
    country_rows = df.loc[df['cc'] == cc]
    enough_rows = len(country_rows) >= n
    return country_rows.sample(n=n) if enough_rows else country_rows

In [3]:
# Load the domain inventory (latin1-encoded CSV) and keep only the columns
# at positions 0, 2, 8, 10 and 11; later cells read 'domain', 'cc' and
# 'geoloc_cc' from this frame.
df_domains = (
    pd.read_csv('data/domain.csv', encoding='latin1')
    .iloc[:, [0, 2, 8, 10, 11]]
)

In [None]:
from urllib.parse import urlsplit

# Crawl every domain, collect up to MAX_AMP_LINKS_PER_DOMAIN AMP-enabled
# articles per domain and write the result to data/links.csv.
MAX_AMP_LINKS_PER_DOMAIN = 10   # sample size per domain
MAX_AMP_MISSES_PER_DOMAIN = 3   # give up on a domain after more misses than this
LINK_COLUMNS = ['domain', 'cc', 'geoloc_cc', 'url',
                'ori_amp_url', 'amp_viewer_url', 'amp_cdn_url']


def _build_amp_urls(ori_amp_url):
    """Return ``(amp_viewer_url, amp_cdn_url)`` for an absolute AMP URL.

    Returns ``None`` when ``ori_amp_url`` is not a usable http(s) URL, so the
    caller can treat it like a page without AMP.
    """
    try:
        parts = urlsplit(ori_amp_url)
    except ValueError:
        return None
    if parts.scheme not in ('http', 'https') or not parts.hostname:
        return None

    # Everything after "scheme://" (host, path, query, fragment).
    remainder = ori_amp_url.split('://', 1)[1]
    # Both Google URL formats flag TLS origins with an extra "s/" segment.
    tls_segment = 's/' if parts.scheme == 'https' else ''
    # AMP Cache subdomain: derived from the AMP document's own host, with
    # "-" doubled first so it cannot be confused with a converted ".".
    cache_subdomain = parts.hostname.replace('-', '--').replace('.', '-')

    viewer_url = "https://www.google.com/amp/" + tls_segment + remainder
    cdn_url = ("https://" + cache_subdomain + ".cdn.ampproject.org/c/"
               + tls_segment + remainder)
    return viewer_url, cdn_url


# Rows are gathered in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
link_rows = []

for _, row in df_domains.iterrows():
    links = getArticleLinks(row['domain'])

    if links is None or links.size < 1:
        continue

    count_amp = 0
    count_amp_notfound = 0

    # Iterate the values directly (Series.iteritems was removed in pandas 2.0).
    for url in links:
        candidates = getAMPUrl(url)
        ori_amp_url = candidates[0].strip() if candidates else None
        amp_urls = _build_amp_urls(ori_amp_url) if ori_amp_url else None

        if amp_urls is None:
            # No AMP version (or an unusable amphtml href): count the miss
            # and stop probing this domain after too many of them.
            count_amp_notfound += 1
            if count_amp_notfound > MAX_AMP_MISSES_PER_DOMAIN:
                break
            continue

        amp_viewer_url, amp_cdn_url = amp_urls
        link_rows.append({'domain': row['domain'],
                          'cc': row['cc'],
                          'geoloc_cc': row['geoloc_cc'],
                          'url': url,
                          'ori_amp_url': ori_amp_url,
                          'amp_viewer_url': amp_viewer_url,
                          'amp_cdn_url': amp_cdn_url})

        # Stop as soon as the sample is full instead of fetching one more page.
        count_amp += 1
        if count_amp >= MAX_AMP_LINKS_PER_DOMAIN:
            break

df_links = pd.DataFrame(link_rows, columns=LINK_COLUMNS)

df_links.to_csv('data/links.csv', sep='|', encoding='utf-8', index=False)

In [4]:
# Reload the crawl results written by the previous cell ('|'-separated, UTF-8).
# keep_default_na=False keeps strings such as "NA" as text instead of NaN.
# NOTE(review): presumably needed so a country code like "NA" is not read as
# missing -- confirm against the data.
df_links = pd.read_csv('data/links.csv',sep='|', encoding='utf-8', keep_default_na=False)

In [6]:
# Draw up to 15 random links per country code and save the combined sample.
# The per-country frames are concatenated in one step: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
samples = [getSampleLinksByCC(df_links, cc, 15)
           for cc in df_links['cc'].drop_duplicates()]

if samples:
    df_links_sample = pd.concat(samples, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep an empty frame with the same columns.
    df_links_sample = pd.DataFrame(columns=df_links.columns)

df_links_sample.to_csv('data/links_sample4.csv', sep='|', encoding='utf-8', index=False)

In [None]:
# Number of distinct domains per country code in the input domain list.
df_domains.groupby('cc')['domain'].nunique()

In [None]:
# Number of distinct article URLs per country code in the sampled links.
df_links_sample.groupby('cc')['url'].nunique()

In [5]:
# Distinct domains present in the crawl results; the index shown is the row
# position of each domain's first occurrence in df_links.
df_links['domain'].drop_duplicates()

0                         263chat.com
10              africanreporter.co.za
15               albertonrecord.co.za
18                     alexnews.co.za
24                        aminata.com
34      bedfordviewedenvalenews.co.za
39              benonicitytimes.co.za
43                    bereamail.co.za
44                 bizwatchnigeria.ng
54      blanknewsonline.wordpress.com
64           boksburgadvertiser.co.za
70                brakpanherald.co.za
76                      buzzkenya.com
86                    buzznigeria.com
96                      citizen.co.za
106                    citybuzz.co.za
110             comarochronicle.co.za
113         completesportsnigeria.com
123               crossriverwatch.com
133                      dailypost.ng
143                  dailystar.com.ng
153                       diplomat.so
163                         ewn.co.za
173      flashpointnews.wordpress.com
183              fourwaysreview.co.za
189           germistoncitynews.co.za
198         