## Web scrapping using python

#### References
1. [Practical Introduction to Web Scraping in Python](https://realpython.com/python-web-scraping-practical-introduction/)
2. [Web Scraping using Python](https://www.datacamp.com/community/tutorials/web-scraping-using-python)

In [141]:
# $ python3 -m venv venv
# $ . ./venv/bin/activate

In [142]:
#Better
#!pip install requests BeautifulSoup4 fire

In [11]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import re
import os, sys

import fire

In [4]:
#%%writefile ../pyscrap_url.py

def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content  #.encode(BeautifulSoup.original_encoding)
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
    
def get_elements(url, tag='',search={}, fname=None):
    """
    Downloads a page specified by the url parameter
    and returns a list of strings, one per tag element
    """
    
    if isinstance(url,str):
        response = simple_get(url)
    else:
        #if already it is a loaded html page
        response = url

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        
        res = []
        if tag:    
            for li in html.select(tag):
                for name in li.text.split('\n'):
                    if len(name) > 0:
                        res.append(name.strip())
                       
                
        if search:
            soup = html            
            
            
            r = ''
            if 'find' in search.keys():
                print('findaing',search['find'])
                soup = soup.find(**search['find'])
                r = soup

                
            if 'find_all' in search.keys():
                print('findaing all of',search['find_all'])
                r = soup.find_all(**search['find_all'])
   
            if r:
                for x in list(r):
                    if len(x) > 0:
                        res.extend(x)
            
        return res

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url))    
    
    
if get_ipython().__class__.__name__ == '__main__':
    fire(get_tag_elements)

In [102]:
res = get_elements('https://africafreak.com/100-most-influential-twitter-users-in-africa', tag = 'h2')

In [135]:
res_cleaned = []
for element in res:
    if re.findall("@", element):
        res_cleaned.append(element)
        
        
df = pd.DataFrame(res_cleaned)
df1 = df[0].str.split('@', expand = True)
#df2 = df1.set_index(0)
influencers = df1[1].tolist()

final = []
for influencer in influencers:
    influencer = influencer.strip(')')
    final.append('@'+influencer)
    
final[:5]

['@gettleman', '@a24media', '@andiMakinana', '@AfricaCheck', '@JamesCopnall']

In [136]:
import csv

In [138]:
file = open('africa_influencers.csv', 'w+', newline = '')

with file:
    write = csv.writer(file, delimiter = ' ',quotechar='|', quoting=csv.QUOTE_MINIMAL)
    write.writerows(final)

### 10 Twitter handles from africafreak.com

In [140]:
afr_influencers = pd.read_csv('africa_influencers.csv').sample(10)
afr_influencers

Unnamed: 0,Twitter handles
98,@ G a r e t h C l i f f
69,@ a r t 2 g e e
70,@ J e n d a y i F r a z e r
9,@ B r e n d a n S A f r i c a
15,@ T h e E I U _ A f r i c a
83,@ C a m f e d
82,@ B o b S k i n s t a d
51,@ t h e a f r i c a r e p o r t
24,@ t _ m c c o n n e l l
48,@ T h e S t a r _ n e w s


In [13]:
url= 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa'
response = simple_get(url)

In [143]:
res_gov = get_elements(response, search={'find_all':{'class_':'twitter-tweet'}})

res_gov_cleaned = []
for element in res_gov:
    if "@" in element:
        res_gov_cleaned.append(element)

df_gov = pd.DataFrame(res_gov_cleaned)
df1_gov = df_gov[0].str.split('@', expand = True)
afr_leaders = df1_gov[1].tolist()

leaders_final = []
for leader in afr_leaders:
    leader = leader.strip()
    leader = leader.strip(')')
    leaders_final.append('@'+leader)
    
leaders_final[:5]

findaing all of {'class_': 'twitter-tweet'}


['@EswatiniGovern1',
 '@MalawiGovt',
 '@hagegeingob',
 '@FinanceSC',
 '@PresidencyZA']

In [144]:
file1 = open('africa_leaders.csv', 'w+', newline = '')

with file1:
    write = csv.writer(file1, delimiter = ' ',quotechar='|', quoting=csv.QUOTE_MINIMAL)
    write.writerows(leaders_final)

### 10 Twitter handles from atlanticcouncil.org

In [145]:
afr_leaders = pd.read_csv('africa_leaders.csv').sample(10)
afr_leaders

Unnamed: 0,Twitter handles
17,@ w i l l y n y a m i t w e
35,@ M S P S _ T o g o
33,@ M a c k y _ S a l l
14,@ T Z S p o k e s p e r s o n
18,@ C h e r i f _ M Z
10,@ P a u l K a g a m e
22,@ r o c h k a b o r e p f
31,@ I s s o u f o u M h m
1,@ M a l a w i G o v t
26,@ N A k u f o A d d o


## Searching and Downloading Tweets by Africa Influencers and Leaders

In [112]:
import json
import matplotlib.pyplot as plt
import string

#Import necessary methods from tweepy library
import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy import Cursor
from tweepy import API

#sentiment analysis package
from textblob import TextBlob

#general text pre-processor
#import nltk
#from nltk.corpus import stopwords
#nltk.download('punkt')

#tweet pre-processor
import preprocessor as p

In [154]:
 #Variables that contains the user credentials to access Twitter API 
consumer_key = '#'
consumer_secret = '#'
access_token = '#'
access_token_secret = '#'
            


#This handles Twitter authetification and the connection to Twitter Streaming API
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
            
auth_api = API(auth) 

In [147]:
#!pip install tabulate
from tabulate import tabulate

In [153]:
count = 0
for influencer in final:
    if count < 1:
        print ('GETTING DATA FOR ' + influencer)
        item = auth_api.get_user(influencer)
        print ("Influencer : " + item.name)
        print("Handle : " + item.screen_name)
        print("tweets_count : " + str(item.statuses_count))
        print("following_count : " + str(item.friends_count))
        print("followers_count : " + str(item.followers_count))
        #print("Likes : " + item.favorite_count)
        #print("Retweets : " + item.retweets)
    count += 1

GETTING DATA FOR @gettleman
Influencer : Jeffrey Gettleman
Handle : gettleman
tweets_count : 3767
following_count : 37
followers_count : 25700


## Web scrapping using bash script
If the web site has a quite simple HTML, you can easily use curl to perform the request and then extract the needed values using bash commands grep, cut , sed, ..

This tutorial is adapted from [this](https://medium.com/@LiliSousa/web-scraping-with-bash-690e4ee7f98d) medium article

In [64]:
%%bash 

# curl the page and save content to tmp_file
#url = "https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa"
#curl -X GET $url -o tmp_file


#!/bin/bash

# write headers to CSV file
echo "Name, twitter_id" >> extractData.csv
n="1"
while [ $n -lt 2 ]
do
  
  #get title
  title=$(cat tmp_file | grep "class=\"twitter-tweet\"" | cut -d ';' -f1 )
  echo $title
  #get author
  #twitter_id=$(cat tmp_file |grep -A1 "class=\"css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0\"" | tail -1)

  #echo "$title, $twitter_id" >> extractData.csv
  #echo "$title, $twitter_id"
    
  n=$[$n+1]

done

<blockquote class="twitter-tweet" data-width="550" data-dnt="true"><p lang="en" dir="ltr">The Deputy Prime Minister Themba Masuku has today met representatives of the private sector and employees&#39 <blockquote class="twitter-tweet" data-width="550" data-dnt="true"><p lang="en" dir="ltr">GUIDELINES FOR SCHOOLS IN <a href="https://twitter.com/hashtag/MALAWI?src=hash&amp <blockquote class="twitter-tweet" data-width="550" data-dnt="true"><p lang="en" dir="ltr">Fellow Namibians, I declared a State of Emergency on <a href="https://twitter.com/hashtag/COVID19?src=hash&amp <blockquote class="twitter-tweet" data-width="550" data-dnt="true"><p lang="en" dir="ltr"><a href="https://twitter.com/hashtag/COVID19measuresSC?src=hash&amp <blockquote class="twitter-tweet" data-width="550" data-dnt="true"><p lang="en" dir="ltr">The Minister for Cooperative Governance &amp <blockquote class="twitter-tweet" data-width="550" data-dnt="true"><p lang="en" dir="ltr">Join the <a href="https://twitter.com/hasht