## Web scrapping using python

#### References
1. [Practical Introduction to Web Scraping in Python](https://realpython.com/python-web-scraping-practical-introduction/)
2. [Web Scraping using Python](https://www.datacamp.com/community/tutorials/web-scraping-using-python)

> Import libraries

In [3]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import os, sys
import re
# import fire

from dotenv import load_dotenv
load_dotenv()

True

### Gather Data

In [4]:
#%%writefile ../pyscrap_url.py

def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content  #.encode(BeautifulSoup.original_encoding)
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
    
def get_elements(url, tag='',search={}, fname=None):
    """
    Downloads a page specified by the url parameter
    and returns a list of strings, one per tag element
    """
    
    if isinstance(url,str):
        response = simple_get(url)
    else:
        #if already it is a loaded html page
        response = url

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        
        res = []
        if tag:    
            for li in html.select(tag):
                for name in li.text.split('\n'):
                    if len(name) > 0:
                        res.append(name.strip())
                       
                
        if search:
            soup = html            
            
            
            r = ''
            if 'find' in search.keys():
                print('finding',search['find'])
                soup = soup.find(**search['find'])
                r = soup

                
            if 'find_all' in search.keys():
                print('findaing all of',search['find_all'])
                r = soup.find_all(**search['find_all'])
   
            if r:
                for x in list(r):
                    if len(x) > 0:
                        res.extend(x)
            
        return res

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url))    
    
    
if get_ipython().__class__.__name__ == '__main__':
    fire(get_tag_elements)

> Scrape data from [africafreak.com](https://africafreak.com/100-most-influential-twitter-users-in-africa)

In [7]:
res = get_elements('https://africafreak.com/100-most-influential-twitter-users-in-africa', tag='h2')

In [8]:
non_govt_influencers = res

In [9]:
non_govt_influencers = pd.Series(non_govt_influencers)

In [10]:
afriq_users_handle = [i.split('(')[-1].strip(')') for i in non_govt_influencers]
afriq_users_handle=afriq_users_handle[:100]

> correct incorrect tweets

In [44]:
afriq_users_handle[12] = '@beyondsafari'

> parse to DataFrame

In [39]:
df_afriq_users_handle = pd.DataFrame(afriq_users_handle, columns=['handles'])

In [40]:
df_afriq_users_handle

Unnamed: 0,handles
0,@gettleman
1,@a24media
2,@andiMakinana
3,@AfricaCheck
4,@JamesCopnall
...,...
95,@Julius_S_Malema
96,@News24
97,@SAPresident
98,@GarethCliff


In [14]:
df_afriq_users_handle.to_csv('scraped_handles/top_100_influencers.csv')

> Data for Interim Submision

In [171]:
tenx_afriq_users = df_afriq_users_handle[:10]

In [173]:
tenx_afriq_users.to_csv('interim_submission/ten_Influencers.csv')

> Scrape data from [atlanticcouncil.org](https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa)

In [137]:
url= 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa'
response = get(url).content
res = get_elements(response, tag='blockquote')
res[:2]

["The Deputy Prime Minister Themba Masuku has today met representatives of the private sector and employees' unions to map a collaborative effort in the fight against #COVID19. pic.twitter.com/EIYNGOEKRN— Eswatini Government (@EswatiniGovern1) March 20, 2020",
 'GUIDELINES FOR SCHOOLS IN #MALAWI ON THE PREVENTION AND MANAGEMENT OF #COVID19 #CORONAVIRUS pic.twitter.com/PL9R4XvGV3— Malawi Government (@MalawiGovt) March 18, 2020']

In [138]:
afriq_govt = []
afriq_govt_handle = []
for r in res:
    split_data = r.split('— ',maxsplit=1)[1].rsplit('(',maxsplit=1)
    name = split_data[0].split(',')[0].strip()
    handle =  split_data[1].rsplit(')',maxsplit=1)[0]
    user = str(name), str(handle)
    afriq_govt.append(user)
    afriq_govt_handle.append(handle)

In [139]:
res_ = simple_get(url)
res = get_elements(res_, search={'find_all':{'class_':'wp-block-embed__wrapper'}})

findaing all of {'class_': 'wp-block-embed__wrapper'}


In [140]:
x= pd.DataFrame({'names':res})
x['names'] = x[x['names'].apply(lambda x: "twitter.com" in x)]
x.dropna(inplace=True)
links = x.names.values

In [141]:
for link in links:
    name = link.split('/')[3]
    handle = '@'+name
    user= str(name), str(handle)
    afriq_govt.append(user)
    afriq_govt_handle.append(handle)

In [142]:
afriq_govt_handle[:1]

['@EswatiniGovern1']

In [143]:
df_afriq_govt_handle = pd.DataFrame(afriq_govt_handle, columns=['handles'])

In [144]:
df_afriq_govt_handle.to_csv('scraped_handles/africa_govt_covid_resp.csv')

> Interim Submision

In [174]:
tenx_afriq_govt = df_afriq_govt_handle[:10]

In [175]:
tenx_afriq_govt.to_csv('interim_submission/ten_africaGovt.csv')

#### Get Data From Twitter

> Importing libraies & preparing api-keys

In [145]:
import tweepy
from tweepy import OAuthHandler
from tweepy import API
from tweepy import Cursor
from datetime import datetime, date, time, timedelta
from collections import Counter
import sys
import csv

In [146]:
API_key="API_key"
API_secret_key="API_secret_key"
Access_token="Access_token"
Access_token_secret="Access_token_secret"
print(API_key, API_secret_key, Access_token, Access_token_secret)

API_key API_secret_key Access_token Access_token_secret


In [147]:
API_key = os.environ.get(API_key)
API_secret_key = os.environ.get(API_secret_key)
Access_token = os.environ.get(Access_token)
Access_token_secret=os.environ.get(Access_token_secret)

In [148]:
auth = OAuthHandler(API_key, API_secret_key)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)
auth_api = API(auth)

> Testing Api

In [149]:
search_words = "#wildfires"
date_since= "2018-11-16"

In [150]:
# Collect tweets
tweets = tweepy.Cursor(api.search,
              q=search_words,
              lang="en",
              since=date_since).items(2)
# Iterate and print tweets
for tweet in tweets:
    print(tweet.text)

RT @ReubenKSEE24: #NOW Smoke plume from #MineralFire on our visible satellite image this midday. @KSEE24 #cafires #wildfires https://t.co/R…
RT @AmplitudeUSA: Heading back out to make a drop....
#Colorado #wildfires #Meeker https://t.co/5zAKKVSfIS


> influential African Twitter Data

>> Define functions

In [151]:
def get_tweets(handles):
    
    cols = ['id', 'name', 'screen_name', 'description', 
            'statuses_count', 'friends_count', 'followers_count', 
            'account_age_days', 'avg_daily_tweets', 'hashtags',
            'user_mentions','favorite_count', 'retweet_count',]
    
    # dataframe that would be returned at the end
    df = pd.DataFrame(columns=cols)
    #print(df)
    handle_data = []
    off_users = []
            
    if len(handles) > 0: 
        for handle in handles:
            value_list = []
            print("Getting data for " + handle)
            # this helps avoid Tweepy errors like suspended users or user not ound errors
            try:
                item = auth_api.get_user(handle)
            except tweepy.TweepError as e:
                continue
            value_list+= item.id_str, item.name, item.screen_name,\
            item.description, item.statuses_count, item.friends_count, item.followers_count
            
            #get average daily tweets
            
            no_tweets = item.statuses_count
            account_created_date = item.created_at
            delta = datetime.utcnow() - account_created_date
            account_age_days = delta.days
            value_list.append(str(account_age_days))
            #print(str(account_age_days))
            if account_age_days > 0:
                   value_list.append(int(float(no_tweets)/float(account_age_days)))
                    
                    
                    
            hashtags = []
            mentions = []
            favorite_count =[]
            retweet_count=[]
            tweet_count = 0
            end_date = datetime.utcnow() - timedelta(days=30)
            

            for status in Cursor(auth_api.user_timeline, id=handle).items():
                tweet_count+= 1
                if hasattr(status, "entities"):
                    entities = status.entities

                # get hashtags
                if "hashtags" in entities:
                    for ent in entities["hashtags"]:
                        if ent is not None:
                            if "text" in ent:
                                hashtag = ent["text"]
                                if hashtag is not None:
                                    hashtags.append(hashtag)
                # get usermentions
                if "user_mentions" in entities:
                    for ent in entities["user_mentions"]:
                        if ent is not None:
                            if "screen_name" in ent:
                                name = ent["screen_name"]
                                if name is not None:
                                    mentions.append(name)

                # get retweets    
                if hasattr(status, "retweet_count"):
                    retweets = status.retweet_count
                    if retweets is not None:
                        retweet_count.append(retweets)
                        
                # favorite count     
                if hasattr(status, "favorite_count"):
                    likes = status.favorite_count 
                    if likes is not None:
                        favorite_count.append(likes)
                if status.created_at < end_date:
                    break
                    
            
            value_list.append(len(hashtags))
            value_list.append(len(mentions))
            value_list.append(sum(favorite_count))
            value_list.append(sum(retweet_count))
            handle_data.append(value_list)
            #print(handle_data)
            #break
    #ls = {}
            #df_1 = pd.DataFrame([handle_data], columns=cols)
            #print(handle_data)
            #complete_df = pd.concat(df, df_1)
            df = df.append(pd.DataFrame([value_list], columns=cols))
            #print(df)
    return df

In [128]:
# if len(account_list) > 0:
#   for target in account_list:
#     print("Getting data for " + target)
#     item = auth_api.get_user(target)
#     print("name: " + item.name)
#     print("screen_name: " + item.screen_name)
#     print("description: " + item.description)
#     print("statuses_count: " + str(item.statuses_count))
#     print("friends_count: " + str(item.friends_count))
#     print("followers_count: " + str(item.followers_count))
    
    
# tweets = item.statuses_count
# account_created_date = item.created_at
# delta = datetime.utcnow() - account_created_date
# account_age_days = delta.days
# print("Account age (in days): " + str(account_age_days))
# if account_age_days > 0:
# print("Average tweets per day: " + "%.2f"%(float(tweets)/float(account_age_days)))
    
# hashtags = []
# mentions = []
# favorite_count =[]
# retweet_count=[]
# tweet_count = 0
# end_date = datetime.utcnow() - timedelta(days=30)

#     for status in Cursor(auth_api.user_timeline, id=target).items():
#         tweet_count += 1
#         if hasattr(status, "entities"):
#         entities = status.entities
        
#         # get hashtags
#         if "hashtags" in entities:
#             for ent in entities["hashtags"]:
#             if ent is not None:
#                 if "text" in ent:
#                     hashtag = ent["text"]
#                 if hashtag is not None:
#                     hashtags.append(hashtag)
#         value_list+=len(hashtags)
#         # get usermentions
#         if "user_mentions" in entities:
#             for ent in entities["user_mentions"]:
#                 if ent is not None:
#                     if "screen_name" in ent:
#                         name = ent["screen_name"]
#                         if name is not None:
#                             mentions.append(name)
#         value_list+=len(mentions)
                                              
#         # get retweets    
#         if hasattr(status, "retweet_count"):
#             retweets = status.retweet_count
#             if retweets is not None:
#                 retweet_count.append(retweets)
#         value_list+=sum(retweet_count)

#         # favorite count     
#         if hasattr(status, "favorite_count"):
#             likes = status.favorite_count 
#             if likes is not None:
#                 favorite_count.append(likes)
#         value_list+=sum(retweet_count)
#         if status.created_at < end_date:
#         break

In [161]:
df_inf = get_tweets(afriq_users_handle)

Getting data for @gettleman
Getting data for @a24media
Getting data for @andiMakinana
Getting data for @AfricaCheck
Getting data for @JamesCopnall
Getting data for @oafrica
Getting data for @PatrickNgowi
Getting data for @StateAfrica
Getting data for @Moadow
Getting data for @BrendanSAfrica
Getting data for @CityTshwane
Getting data for @VISI_Mag
Getting data for @beyondsafari
Getting data for @ThisIsAfricaTIA
Getting data for @sarzss
Getting data for @TheEIU_Africa
Getting data for @InvestInAfrica
Getting data for @malonebarry
Getting data for @artsouthafrica
Getting data for @KahnMorbee
Getting data for @JamalMOsman
Getting data for @iamsuede
Getting data for @mikestopforth
Getting data for @equal_education
Getting data for @t_mcconnell
Getting data for @forbeesta
Getting data for @hurricanevaness
Getting data for @BBCKarenAllen
Getting data for @jaxpanik
Getting data for @thisisafrica
Getting data for @audisouthafrica
Getting data for @ONEinAfrica
Getting data for @Hamza_Africa
Gett

In [162]:
df_inf

Unnamed: 0,id,name,screen_name,description,statuses_count,friends_count,followers_count,account_age_days,avg_daily_tweets,hashtags,user_mentions,favorite_count,retweet_count
0,305125998,Jeffrey Gettleman,gettleman,South Asia bureau chief for the New York Times...,3769,37,25691,3340,1,5,39,262,111589
0,26475943,A24 Media,a24media,Africa 24 produces compelling content that mak...,16858,3059,31287,4131,4,28,45,105,180
0,72013267,Scapegoat,AndiMakinana,In pursuit of scoops. I do not write headlines...,142218,2840,101226,3966,35,38,628,19132,395510
0,625489039,Africa Check,AfricaCheck,Africa's first independent fact-checking websi...,27310,4590,68047,2935,9,152,210,1535,1491
0,401520924,James Copnall,JamesCopnall,BBC reporter + presenter. Author A Poisonous T...,19434,5045,21960,3182,6,21,83,139,47609
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,117102398,Julius Sello Malema,Julius_S_Malema,Commander in Chief of Economic Freedom Fighter...,37190,652,3125484,3795,9,75,549,175191,292263
0,14697575,News24,News24,South Africa's premier online news resource. F...,322578,632,3574869,4452,72,228,1015,206020,91137
0,1102508781781557248,jdwtweet,SAPresident,,19,14,18,500,0,0,1,0,38
0,17962204,Gareth Cliff,GarethCliff,President of https://t.co/scMZ7lsVKF ⚜. Enquir...,31623,356,1974435,4238,7,41,165,4217,1791


> Save Gathered Twitter Data of Top African Influencers to csv file

In [166]:
df_inf.to_csv('twitter_datasets/afriqTopInfluencers,csv', sep=',')

> .

In [152]:
df_gov = get_tweets(afriq_govt_handle)

Getting data for @EswatiniGovern1
Getting data for @MalawiGovt
Getting data for @hagegeingob
Getting data for @FinanceSC
Getting data for @PresidencyZA
Getting data for @mohzambia
Getting data for @edmnangagwa
Getting data for @MinSantedj
Getting data for @hawelti
Getting data for @StateHouseKenya
Getting data for @PaulKagame
Getting data for @M_Farmaajo
Getting data for @SouthSudanGov
Getting data for @SudanPMHamdok
Getting data for @TZSpokesperson
Getting data for @KagutaMuseveni
Getting data for @angola_Mirex
Getting data for @willynyamitwe
Getting data for @Cherif_MZ
Getting data for @Presidence_RDC
Getting data for @PresidentABO
Getting data for @PresidenceBenin
Getting data for @rochkaborepf
Getting data for @PresidenciaCV
Getting data for @AOuattara_PRCI
Getting data for @Presidency_GMB
Getting data for @NAkufoAddo
Getting data for @President_GN
Getting data for @USEmbalo
Getting data for @PresidenceMali
Getting data for @CheikhGhazouani
Getting data for @IssoufouMhm
Getting dat

In [154]:
df_gov

Unnamed: 0,id,name,screen_name,description,statuses_count,friends_count,followers_count,account_age_days,avg_daily_tweets,hashtags,user_mentions,favorite_count,retweet_count
0,1181832897508065280,Eswatini Government,EswatiniGovern1,This is the official twitter account of the Go...,1672,82,11270,281,5,78,13,5587,1652
0,2189970192,Malawi Government,MalawiGovt,#Malawi Government Twitter Feed | https://t.co...,4030,26,39140,2438,1,85,49,8381,4492
0,2515899612,Hage G. Geingob,hagegeingob,President of the Republic of Namibia,1085,55,192079,2247,0,3,1,13062,1635
0,1239872615248015361,Seychelles Ministry of Finance,FinanceSC,"Ministry of Finance, Trade, Investment and Eco...",124,224,125,121,1,0,0,10,1
0,40839292,Presidency | South Africa 🇿🇦,PresidencyZA,This is the official Twitter page of The Presi...,18845,14,1597241,4077,4,81,129,25170,12086
0,1200316338,Ministry of Health Zambia,mohzambia,The Ministry aims to address and share ideas w...,835,95,7114,2703,0,6,2,97,63
0,447895686,President of Zimbabwe,edmnangagwa,Official Twitter account of Emmerson Dambudzo ...,628,116,545317,3124,0,1,3,25099,6046
0,894266976499060736,MinSantédj,MinSantedj,ORGANISME GOUVERNEMENTAL\nSuivez toutes les ac...,1062,127,2924,1075,0,7,16,218,245
0,438370063,Yemane G. Meskel,hawelti,Minister of Information,4710,434,65970,3135,1,41,8,9903,4117
0,364830542,State House Kenya,StateHouseKenya,,9044,214,1102229,3243,2,8,9,25356,6436


> Save Gathered Tweetn Data of Africa Governmentleaders to CSV File

In [157]:
df_gov.to_csv('twitter_datasets/afriqGovCovid19Resp.csv', sep=',')

> Gather Twitter Hashtags of Influencers and Africa Govt. Covid19 Responders

In [167]:
def get_hashtags(handles):
    
    cols = ['id', 'name', 'screen_name', 'hashtags',' hashtag_counts']
    
    # dataframe that would be returned at the end
    df = pd.DataFrame(columns=cols)
    #print(df)
    handle_data = []
            
    if len(handles) > 0: 
        for handle in handles:
            value_list = {}
            print("Getting data for " + handle)
            # this helps avoid Tweepy errors like suspended users or user not ound errors
            try:
                item = auth_api.get_user(handle)
            except tweepy.TweepError as e:
                continue
            #value_list+= item.id_str, item.name, item.screen_name,
            value_list['id'] = item.id_str
            value_list['name'] = item.name
            value_list['screen_name'] = item.screen_name
            value_list['hashtags'] = []
            
            #get average daily tweets
            no_tweets = item.statuses_count
            account_created_date = item.created_at
            delta = datetime.utcnow() - account_created_date
            account_age_days = delta.days
           
            hashtags = []
            tweet_count = 0
            end_date = datetime.utcnow() - timedelta(days=30)
            

            for status in Cursor(auth_api.user_timeline, id=handle).items():
                tweet_count+= 1
                if hasattr(status, "entities"):
                    entities = status.entities

                # get hashtags
                if "hashtags" in entities:
                    for ent in entities["hashtags"]:
                        if ent is not None:
                            if "text" in ent:
                                hashtag = ent["text"]
                                if hashtag is not None:
                                    hashtags.append(hashtag)
                value_list['hashtags'].append(hashtags)
                value_list['hashtag_counts'] = len(hashtags)
        
        df = df.append(pd.DataFrame(value_list))

In [169]:
# df_gov_hashtags = get_hashtags(afriq_govt_handle)