## Web scrapping using python

#### References
1. [Practical Introduction to Web Scraping in Python](https://realpython.com/python-web-scraping-practical-introduction/)
2. [Web Scraping using Python](https://www.datacamp.com/community/tutorials/web-scraping-using-python)

In [1]:
# $ python3 -m venv venv
# $ . ./venv/bin/activate

In [2]:
#Better
#!pip install requests BeautifulSoup4 fire

In [57]:
#imports
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import os, sys

import fire

In [58]:
#%%writefile ../pyscrap_url.py

def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content  #.encode(BeautifulSoup.original_encoding)
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
    
def get_elements(url, tag='',search={}, fname=None):
    """
    Downloads a page specified by the url parameter
    and returns a list of strings, one per tag element
    """
    
    if isinstance(url,str):
        response = simple_get(url)
    else:
        #if already it is a loaded html page
        response = url

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        
        res = []
        if tag:    
            for li in html.select(tag):
                for name in li.text.split('\n'):
                    if len(name) > 0:
                        res.append(name.strip())
                       
                
        if search:
            soup = html            
            
            
            r = ''
            if 'find' in search.keys():
                print('findaing',search['find'])
                soup = soup.find(**search['find'])
                r = soup

                
            if 'find_all' in search.keys():
                print('finding all of',search['find_all'])
                r = soup.find_all(**search['find_all'])
   
            if r:
                for x in list(r):
                    if len(x) > 0:
                        res.extend(x)
            
        return res

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url))    
    
    
if get_ipython().__class__.__name__ == '__main__':
    fire(get_tag_elements)

### Scraping the data for the top 100 most influential twitter users in Africa

In [59]:
res = get_elements('https://africafreak.com/100-most-influential-twitter-users-in-africa',tag='h2')
res

['100. Jeffrey Gettleman (@gettleman)',
 '99. Africa24 Media (@a24media)',
 '98. Scapegoat (@andiMakinana)',
 '97. Africa Check (@AfricaCheck)',
 '96. James Copnall (@JamesCopnall)',
 '95. Online Africa (@oafrica)',
 '94. Patrick Ngowi (@PatrickNgowi)',
 '93. DOS African Affairs (@StateAfrica)',
 '92. MoadowAJE (@Moadow)',
 '91. Brendan Boyle (@BrendanSAfrica)',
 '90. City of Tshwane (@CityTshwane)',
 '89. VISI Magazine (@VISI_Mag)',
 '88. andBeyond (@andBeyondSafari)',
 '87. This Is Africa (@ThisIsAfricaTIA)',
 '86. Sarah Carter (@sarzss)',
 '85. The EIU Africa team (@TheEIU_Africa)',
 '84. Investing In Africa (@InvestInAfrica)',
 '83. Barry Malone (@malonebarry)',
 '82. ARTsouthAFRICA (@artsouthafrica)',
 '81. Kahn Morbee (@KahnMorbee)',
 '80. Jamal Osman (@JamalMOsman)',
 '79. iamsuedeâ„¢ (@iamsuede)',
 '78. Mike Stopforth (@mikestopforth)',
 '77. Equal Education (@equal_education)',
 '76. Tristan McConnell (@t_mcconnell)',
 '75. Kate Forbes (@forbeesta)',
 '74. Vanessa Raphaely (@h

### Saving the top 10 most influential twitter users to a CSV file

In [69]:
new = pd.DataFrame(res).head(100)
new
#Data manipulation
df1 = new[0].str.split('.', expand=True)
df1.head(100)
df2 = df1[1].str.split('(', expand=True)
df2.head(100)
df2[1] = df2[1].str.strip(')')
df2.head(100)
df2.columns = ['Twitter_name','Twitter_handle']

twitter_handle = df2["Twitter_handle"].astype(str).tolist()

twitter_handle = twitter_handle[::-1]
twitter_handle.pop(2)
sliced_list =  twitter_handle[:10]
sliced_list
df = pd.DataFrame(sliced_list)
df.columns = ["Twitter_Handle"]
df.to_csv("top10_most_influential_twitter_users.csv", index=False)

### Script for mining twitter user's data by user_timeline.

In [21]:
from __future__ import unicode_literals
import sys
import os
import json
import pandas as pd 
import matplotlib.pyplot as plt
import re
import string

import matplotlib.dates as mdates
import seaborn as sns
sns.set()

# to view all columns
pd.set_option("display.max.columns", None)

# Import the necessary methods from tweepy library

import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

# Sentiment analysis package
from textblob import TextBlob

# general text pre-processor

import nltk
from nltk.corpus import stopwords
nltk.download('punkt')

# tweet pre-processor
import preprocessor as p

'''
def print_full(x):
     # This is to print nicely DataFrame wide tables
    pd.set_option('display.max_rows', len(x))
    pd.set_option('display')'

'''

class tweetsearch():

    def __init__(self, cols=None, auth=None):

        if not cols is None:
            self.cols = cols
        else:
            self.cols = [
                'id', 'created_at', 'source', 'original_text', 'clean_text',
                'sentiment', 'polarity', 'subjectivity', 'followers', 'following',
                'favorite_count', 'retweet_count', 'lang', 'hashtgs', 'user_mentions', 
                'place', 'place_coordinates', 'original_author'
            ]

        if auth is None:             
            consumer_key = 'RKDn1CzM8N7oYdnlB9rrKj6p2'
            consumer_secret = 'cSAxjTNoBA3uhscKNPJQZvYYnkzqyROBfrRebFhlojuzLWqubb'
            access_token = '888342306084007936-QQnz4vXq9xZIiO4yswM7ld4klLt3qu9'
            access_token_secret = 'Ct2T6hBm91DeWYZnsQj31amefIHcJe8xQ6pZoprtV6Z4Y'

            # This handles Twitter authentication 
            auth = OAuthHandler(consumer_key, consumer_secret)
            auth.set_access_token(access_token, access_token_secret)

        self.auth = auth
        self.api = tweepy.API(auth, wait_on_rate_limit=True)
        self.filtered_tweet = ''

    def clean_tweets(self, twitter_text):

        # use pre processor
        tweet = p.clean(twitter_text)

        #Happy Emoticons
        emoticons_happy = set([
            ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
            ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
            '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
            'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
            '<3'
            ])

        # Sad Emoticons
        emoticons_sad = set([
            ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
            ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
            ':c', ':{', '>:\\', ';('
            ])

        #Emoji patterns
        emoji_pattern = re.compile("["
                 u"\U0001F600-\U0001F64F"  # emoticons
                 u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                 u"\U0001F680-\U0001F6FF"  # transport & map symbols
                 u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                 u"\U00002702-\U000027B0"
                 u"\U000024C2-\U0001F251"
                 "]+", flags=re.UNICODE)

        # combine sad and happy emoticons
        emoticons = emoticons_happy.union(emoticons_sad)

        stop_words = set(stopwords.words('english'))
        word_tokens = nltk.word_tokenize(tweet)

        tweet = re.sub(r':', '', tweet)
        tweet = re.sub(r'â€šÃ„Â¶', '', tweet)

        #replace consecutive non-ASCII characters with a space
        tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet)

        #remove emojis from tweet
        tweet = emoji_pattern.sub(r'', tweet)

        #filter using NLTK library append it to a string
        filtered_tweet = [w for w in word_tokens if not w in stop_words]

        # looping through conditions
        filtered_tweet = []
        for w in word_tokens:
            # check tokens againts stop words, emoticons and punctuations
            if w not in stop_words and w not in emoticons and w not in string.punctuation:
                filtered_tweet.append(w)

        return ' '.join(filtered_tweet)

    def get_tweets(self, name, csvfile=None):

        df = pd.DataFrame(columns=self.cols)

        if not csvfile is None:
            # If the file exists, then read the existing data from the CSV file
            if os.path.exists(csvfile):
                df = pd.read_csv(csvfile, header=0)

        # page attribute in tweepy.cursor and iteration
        for page in tweepy.Cursor(self.api.user_timeline, id=name).pages(1):

            for status in page:
                new_entry = []
                status = status._json

                # filter by language
                if status['lang'] != 'en':
                    continue

                # if this tweet is a retweet update retweet count
                if status['created_at'] in df['created_at'].values:
                    i = df.loc[df['created_at'] == status['created_at']].index[0]

                    cond1 = status['favorite_count'] != df.at[i, 'favorite_count']
                    cond2 = status['retweet_count'] != df.at[i, 'retweet_count']
                    if cond1 or cond2:
                        df.at[i, 'favorite_count'] = status['favorite_count']
                        df.at[i, 'retweet_count'] = status['retweet_count']
                    continue

                # calculate sentiment
                filtered_tweet = self.clean_tweets(status['text'])
                blob = TextBlob(filtered_tweet)
                Sentiment = blob.sentiment
                polarity = Sentiment.polarity
                subjectivity = Sentiment.subjectivity

                new_entry += [status['id'], status['created_at'],
                              status['source'], status['text'], filtered_tweet,
                              Sentiment, polarity, subjectivity, status['lang'],
                              status['favorite_count'], status['retweet_count']]

                new_entry.append(status['user']['screen_name'])

                hashtags = ", ".join([hashtag_item['text'] for hashtag_item in status['entities']['hashtags']])
                new_entry.append(hashtags) # append the hashtags

                mentions = ", ".join([mention['screen_name'] for mention in status['entities']['user_mentions']])
                new_entry.append(mentions) # append user mentions
                
                try:
                    followers = status['user']['followers_count']
                except TypeError:
                    followers = None

                new_entry.append(followers)

                try: 
                    following = status['user']['following']
                except TypeError:
                    following = None

                new_entry.append(following)
                

                try:
                    xyz = status['place']['bounding_box']['coordinates']
                    coordinates = [coord for loc in xyz for coord in loc]
                except TypeError:
                    coordinates = None

                new_entry.append(coordinates)

                try:
                    location = status['user']['location']
                except TypeError:
                    location = ''

                new_entry.append(location)

                # appending a row to the data frame
                single_user_df = pd.DataFrame([new_entry], columns=self.cols)
                df = df.append(single_user_df, ignore_index=True)

        if not csvfile is None:
            df.to_csv(csvfile, columns=self.cols, index=False, encoding="utf-8")

        return df

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\briodev\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [46]:
tweetsfile = "popularafrica_influencer.json"

ts = tweetsearch()

for handle in twitter_handle:
    df = ts.get_tweets(handle, csvfile=tweetsfile)

TweepError: Twitter error response: status code = 404

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1012 entries, 0 to 1011
Data columns (total 18 columns):
id                   1012 non-null int64
created_at           1012 non-null object
source               1012 non-null object
original_text        1012 non-null object
clean_text           1010 non-null object
sentiment            1012 non-null object
polarity             1012 non-null float64
subjectivity         1012 non-null float64
followers            1012 non-null object
following            1012 non-null int64
favorite_count       1012 non-null int64
retweet_count        1012 non-null object
lang                 239 non-null object
hashtgs              693 non-null object
user_mentions        1012 non-null int64
place                1012 non-null bool
place_coordinates    9 non-null object
original_author      730 non-null object
dtypes: bool(1), float64(2), int64(4), object(11)
memory usage: 135.5+ KB


### Getting the data for African leaders response to corona virus

In [72]:
url= 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa'
response = simple_get(url)

In [73]:
res = get_elements(response, search={'find_all':{'class_':'twitter-tweet'}})
res

finding all of {'class_': 'twitter-tweet'}


[<p dir="ltr" lang="en">The Deputy Prime Minister Themba Masuku has today met representatives of the private sector and employees' unions to map a collaborative effort in the fight against <a href="https://twitter.com/hashtag/COVID19?src=hash&amp;ref_src=twsrc%5Etfw">#COVID19</a>. <a href="https://t.co/EIYNGOEKRN">pic.twitter.com/EIYNGOEKRN</a></p>,
 'â€” Eswatini Government (@EswatiniGovern1) ',
 <a href="https://twitter.com/EswatiniGovern1/status/1241038139889721346?ref_src=twsrc%5Etfw">March 20, 2020</a>,
 <p dir="ltr" lang="en">GUIDELINES FOR SCHOOLS IN <a href="https://twitter.com/hashtag/MALAWI?src=hash&amp;ref_src=twsrc%5Etfw">#MALAWI</a> ON THE PREVENTION AND MANAGEMENT OF <a href="https://twitter.com/hashtag/COVID19?src=hash&amp;ref_src=twsrc%5Etfw">#COVID19</a> <a href="https://twitter.com/hashtag/CORONAVIRUS?src=hash&amp;ref_src=twsrc%5Etfw">#CORONAVIRUS</a> <a href="https://t.co/PL9R4XvGV3">pic.twitter.com/PL9R4XvGV3</a></p>,
 'â€” Malawi Government (@MalawiGovt) ',
 <a hre

In [74]:
mystrings = []
for tag in res:
    if tag.string != None:
        mystrings.append(tag.string)
        
mystrings

['â€” Eswatini Government (@EswatiniGovern1) ',
 'March 20, 2020',
 'â€” Malawi Government (@MalawiGovt) ',
 'March 18, 2020',
 'â€” Hage G. Geingob (@hagegeingob) ',
 'March 18, 2020',
 'â€” Seychelles Ministry of Finance (@FinanceSC) ',
 'March 20, 2020',
 'â€” PresidencyZA (@PresidencyZA) ',
 'March 19, 2020',
 'â€” Ministry of Health Zambia (@mohzambia) ',
 'March 18, 2020',
 'I urge my fellow Zimbabweans to maintain excellent levels of personal hygiene. Wash your hands thoroughly with soap, cover your nose & mouth with a tissue when you cough, & avoid unnecessary travel abroad.  We must keep our nation, safe, secure & healthy.',
 'â€” President of Zimbabwe (@edmnangagwa) ',
 'March 12, 2020',
 'â€” MinSantÃ©dj (@MinSantedj) ',
 'March 20, 2020',
 'The Ministry of Health announced this evening the first confirmed case of a Coronavirus patient who arrived at Asmara International Airport from Norway with Fly Dubai at 7:00 a.m. LT this morning.  The 39-year old patient is an Eritrean 

In [85]:
# Extracting the specific twitter handles.
import re
my_regex = re.compile(r'@[a-zA-Z0-9_]{0,15}')
tags = ''.join(mystrings)
twitter_handles = my_regex.findall(tags)
twitter_handles = pd.DataFrame(twitter_handles)

twitter_handles.columns = ["Twitter_handle"]

my_list = twitter_handles["Twitter_handle"].astype(str).tolist()

my10_official_list = my_list[:10]
my10_official_list = pd.DataFrame(my10_official_list)
my10_official_list.columns = ['Twitter_handles']
my10_official_list.to_csv("10Goverment_Official_Twitter_Handles.csv", index=False)

## Web scrapping using bash script
If the web site has a quite simple HTML, you can easily use curl to perform the request and then extract the needed values using bash commands grep, cut , sed, ..

This tutorial is adapted from [this](https://medium.com/@LiliSousa/web-scraping-with-bash-690e4ee7f98d) medium article

In [64]:
%%bash 

# curl the page and save content to tmp_file
#url = "https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa"
#curl -X GET $url -o tmp_file


#!/bin/bash

# write headers to CSV file
echo "Name, twitter_id" >> extractData.csv
n="1"
while [ $n -lt 2 ]
do
  
  #get title
  title=$(cat tmp_file | grep "class=\"twitter-tweet\"" | cut -d ';' -f1 )
  echo $title
  #get author
  #twitter_id=$(cat tmp_file |grep -A1 "class=\"css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0\"" | tail -1)

  #echo "$title, $twitter_id" >> extractData.csv
  #echo "$title, $twitter_id"
    
  n=$[$n+1]

done

<blockquote class="twitter-tweet" data-width="550" data-dnt="true"><p lang="en" dir="ltr">The Deputy Prime Minister Themba Masuku has today met representatives of the private sector and employees&#39 <blockquote class="twitter-tweet" data-width="550" data-dnt="true"><p lang="en" dir="ltr">GUIDELINES FOR SCHOOLS IN <a href="https://twitter.com/hashtag/MALAWI?src=hash&amp <blockquote class="twitter-tweet" data-width="550" data-dnt="true"><p lang="en" dir="ltr">Fellow Namibians, I declared a State of Emergency on <a href="https://twitter.com/hashtag/COVID19?src=hash&amp <blockquote class="twitter-tweet" data-width="550" data-dnt="true"><p lang="en" dir="ltr"><a href="https://twitter.com/hashtag/COVID19measuresSC?src=hash&amp <blockquote class="twitter-tweet" data-width="550" data-dnt="true"><p lang="en" dir="ltr">The Minister for Cooperative Governance &amp <blockquote class="twitter-tweet" data-width="550" data-dnt="true"><p lang="en" dir="ltr">Join the <a href="https://twitter.com/hasht