# Web scraping using Python

#### Collecting data from available sources on the internet
#### I can currently extract the required information from APIs or websites and store it in different forms, such as Excel sheets, CSV files, or a database.

### I am a fan of SpaceX and its great accomplishments. You will find two other files in the same directory as this file in my GitHub repository that show how to collect data about the company.
#### For an example of API data collection, check the 'spacex-data-collection-api.ipynb' notebook.
#### For an example of web scraping, check the 'spacex-webscraping.ipynb' notebook.

## More Examples

### Extracting data from APIs that return XML and JSON

##### Using a geocoding API and extracting the required information from its XML response


In [2]:
import urllib.request, urllib.parse, urllib.error
import xml.etree.ElementTree as ET
import ssl

# Interactive geocoding lookup that parses the service's XML response.
#
# Repeatedly prompts for a location, queries the geocoding endpoint, prints
# the raw XML, and then prints the latitude, longitude and formatted address
# of the first result. An empty input line ends the loop.

api_key = False
# If you have a Google Places API key, enter it here
# api_key = 'AIzaSy___IDByT70'
# https://developers.google.com/maps/documentation/geocoding/intro

# Without a real key, use the py4e-data endpoint with the placeholder key 42;
# otherwise talk to the Google geocoding endpoint directly.
if api_key is False:
    api_key = 42
    serviceurl = 'http://py4e-data.dr-chuck.net/xml?'
else:
    serviceurl = 'https://maps.googleapis.com/maps/api/geocode/xml?'

# Ignore SSL certificate errors
# NOTE(review): this disables both hostname and certificate verification,
# so HTTPS responses are not authenticated. Acceptable for a demo only.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

while True:
    address = input('Enter location: ')
    if len(address) < 1:
        break

    parms = {'address': address}
    if api_key is not False:
        parms['key'] = api_key
    url = serviceurl + urllib.parse.urlencode(parms)
    print('Retrieving', url)

    # Close the connection as soon as the body is read, and keep the prompt
    # loop alive when a single request fails.
    try:
        with urllib.request.urlopen(url, context=ctx) as uh:
            data = uh.read()
    except urllib.error.URLError as err:
        print('==== Failure To Retrieve ====')
        print(err)
        continue

    print('Retrieved', len(data), 'characters')
    print(data.decode())

    try:
        tree = ET.fromstring(data)
    except ET.ParseError:
        tree = None

    # An Element with no children is falsy, so compare against None explicitly.
    results = tree.findall('result') if tree is not None else []
    if tree is None or tree.findtext('status') != 'OK' or not results:
        # The raw payload was already printed above.
        print('==== Failure To Retrieve ====')
        continue

    # findtext() returns None for a missing node instead of raising.
    first = results[0]
    lat = first.findtext('geometry/location/lat')
    lng = first.findtext('geometry/location/lng')
    location = first.findtext('formatted_address')

    print('lat', lat, 'lng', lng)
    print(location)

Retrieving http://py4e-data.dr-chuck.net/xml?address=South+Federal+University&key=42
Retrieved 2311 characters
<?xml version="1.0" encoding="UTF-8"?>
<GeocodeResponse>
 <status>OK</status>
 <result>
  <type>establishment</type>
  <type>point_of_interest</type>
  <type>university</type>
  <formatted_address>ул, Bol'shaya Sadovaya Ulitsa, 105/42, Rostov, Rostovskaya oblast', Russia, 344006</formatted_address>
  <address_component>
   <long_name>105/42</long_name>
   <short_name>105/42</short_name>
   <type>street_number</type>
  </address_component>
  <address_component>
   <long_name>Bol'shaya Sadovaya Ulitsa</long_name>
   <short_name>Bol'shaya Sadovaya Ulitsa</short_name>
   <type>route</type>
  </address_component>
  <address_component>
   <long_name>Rostov</long_name>
   <short_name>Rostov</short_name>
   <type>locality</type>
   <type>political</type>
  </address_component>
  <address_component>
   <long_name>Kirovskiy</long_name>
   <short_name>Kirovskiy</short_name>
   <type>admi

##### Using a geocoding API and extracting the required information from its JSON response

In [3]:
import urllib.request, urllib.parse, urllib.error
import json
import ssl

# Interactive geocoding lookup that parses the service's JSON response.
#
# Repeatedly prompts for a location, queries the geocoding endpoint, pretty-
# prints the JSON, and then prints the latitude, longitude and formatted
# address of the first result. An empty input line ends the loop.

api_key = False
# If you have a Google Places API key, enter it here
# api_key = 'AIzaSy___IDByT70'
# https://developers.google.com/maps/documentation/geocoding/intro

# Without a real key, use the py4e-data endpoint with the placeholder key 42;
# otherwise talk to the Google geocoding endpoint directly.
if api_key is False:
    api_key = 42
    serviceurl = 'http://py4e-data.dr-chuck.net/json?'
else:
    serviceurl = 'https://maps.googleapis.com/maps/api/geocode/json?'

# Ignore SSL certificate errors
# NOTE(review): this disables both hostname and certificate verification,
# so HTTPS responses are not authenticated. Acceptable for a demo only.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

while True:
    address = input('Enter location: ')
    if len(address) < 1:
        break

    parms = {'address': address}
    if api_key is not False:
        parms['key'] = api_key
    url = serviceurl + urllib.parse.urlencode(parms)

    print('Retrieving', url)
    # Close the connection as soon as the body is read, and keep the prompt
    # loop alive when a single request fails.
    try:
        with urllib.request.urlopen(url, context=ctx) as uh:
            data = uh.read().decode()
    except urllib.error.URLError as err:
        print('==== Failure To Retrieve ====')
        print(err)
        continue
    print('Retrieved', len(data), 'characters')

    # Catch only malformed JSON; a bare except would also swallow
    # KeyboardInterrupt and genuine programming errors.
    try:
        js = json.loads(data)
    except json.JSONDecodeError:
        js = None

    # Reject non-object payloads, non-OK statuses and empty result lists
    # before indexing into the first result.
    if (not isinstance(js, dict)
            or js.get('status') != 'OK'
            or not js.get('results')):
        print('==== Failure To Retrieve ====')
        print(data)
        continue

    print(json.dumps(js, indent=4))

    first = js['results'][0]
    lat = first['geometry']['location']['lat']
    lng = first['geometry']['location']['lng']
    print('lat', lat, 'lng', lng)
    location = first['formatted_address']
    print(location)

Retrieving http://py4e-data.dr-chuck.net/json?address=South+Federal+University&key=42
Retrieved 2443 characters
{
    "results": [
        {
            "address_components": [
                {
                    "long_name": "105/42",
                    "short_name": "105/42",
                    "types": [
                        "street_number"
                    ]
                },
                {
                    "long_name": "Bol'shaya Sadovaya Ulitsa",
                    "short_name": "Bol'shaya Sadovaya Ulitsa",
                    "types": [
                        "route"
                    ]
                },
                {
                    "long_name": "Rostov",
                    "short_name": "Rostov",
                    "types": [
                        "locality",
                        "political"
                    ]
                },
                {
                    "long_name": "Kirovskiy",
                    "short_name": "Kirovskiy",

### Web scraping with BeautifulSoup: parsing HTML web pages and extracting information

In [6]:
# To run this, download the BeautifulSoup zip file
# http://www.py4e.com/code3/bs4.zip
# and unzip it in the same directory as this file

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Fetch one web page and print the href attribute of every anchor tag in it.

# Ignore SSL certificate errors
# NOTE(review): this disables both hostname and certificate verification,
# so HTTPS responses are not authenticated. Acceptable for a demo only.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# The repository URL used to be passed to input() as the prompt text, so it
# was displayed but never used. Show a real prompt and fall back to that URL
# when the user just presses Enter.
default_url = 'https://github.com/Binbasri-in/DataScience'
url = input('Enter URL [' + default_url + ']: ').strip() or default_url

# Close the connection as soon as the page body has been read.
with urllib.request.urlopen(url, context=ctx) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    # Anchors without an href attribute print as None.
    print(tag.get('href', None))

#start-of-content
https://github.com/
/signup?ref_cta=Sign+up&ref_loc=header+logged+out&ref_page=%2F%3Cuser-name%3E%2F%3Crepo-name%3E&source=header-repo
/features
/mobile
/features/actions
/features/codespaces
/features/packages
/features/security
/features/code-review
/features/issues
/features/integrations
/sponsors
/customer-stories
/team
/enterprise
/explore
/topics
/collections
/trending
https://lab.github.com/
https://opensource.guide
/readme
/events
https://github.community
https://education.github.com
https://stars.github.com
/marketplace
/pricing
/pricing#compare-features
https://github.com/enterprise/contact
https://education.github.com




/login?return_to=https%3A%2F%2Fgithub.com%2FBinbasri-in%2FDataScience
/signup?ref_cta=Sign+up&ref_loc=header+logged+out&ref_page=%2F%3Cuser-name%3E%2F%3Crepo-name%3E&source=header-repo&source_repo=Binbasri-in%2FDataScience
/Binbasri-in
/Binbasri-in/DataScience
/login?return_to=%2FBinbasri-in%2FDataScience
/login?return_to=%2FBinbasri-in%2F