# Web Scraping

Requests Library

In [None]:
import requests

# Fetch GitHub's public events feed. A timeout is passed explicitly because
# requests waits indefinitely by default if the server never answers.
response = requests.get('https://api.github.com/events', timeout=10)
print(response)        # the Response object's repr
print(type(response))  # the class of the returned object

In [None]:
# Common HTTP status codes:
#   200 - success
#   403 - forbidden (not allowed)
#   404 - not found
#   500 - server error
status = response.status_code
print(status)

# Only an exact 200 is reported as success; anything else is reported as an error.
print("Success" if status == 200 else "Error occurred")

In [None]:
print(response.headers)
print(response.headers['Content-Type'])
print(response.text)  # raw body as text; for this endpoint it is a JSON array

# When the body is JSON, response.json() decodes it into Python objects:
# a single JSON object becomes a dict, while a JSON array becomes a list.
# This endpoint returns an array, so the decoded value is a list.
# Decode once and reuse the result instead of re-parsing the body per print.
events = response.json()
print(events)
print(type(events))  # list

# Guard the index access so an empty feed does not raise IndexError.
if events:
    print(events[0]['type'])

In [None]:
# An exact comparison such as
#     response.headers['Content-Type'] == 'application/json'
# would only match if the charset suffix were included as well, because the
# header value carries it too (e.g. 'application/json; charset=utf-8').
# Testing for the media type as a substring avoids that problem.
content_type = response.headers.get('Content-Type', '')
is_json = 'application/json' in content_type

if is_json:
    data = response.json()
    print("JSON data received")
    print(data)

In [None]:
import requests

# Request the events feed again, bounding the wait with an explicit timeout
# (requests has no default timeout and could otherwise block forever).
response = requests.get('https://api.github.com/events', timeout=10)
print(response)

if response.status_code == 200:
    # Show only the first 100 characters of the body to keep the output short.
    print(response.text[:100])
else:
    print("Failed to retrieve data", response.status_code)

In [None]:
# POST

import requests

# Data to be sent in the form (placeholder values for the demo endpoint).
data = {'username': 'user', 'password': 'pass'}

# URL where the form should be submitted. The payload is passed by keyword
# (data=...) so it is form-encoded explicitly, and a timeout bounds the wait
# because requests does not time out by default.
response = requests.post('https://httpbin.org/post', data=data, timeout=10)

if response.status_code == 200:
    print("Form submitted successfully")
    print(response.text)
else:
    print("Failed to submit form", response.status_code)

URL Parameters

In [None]:
import requests

url = "https://jsonplaceholder.typicode.com/posts"

# Query-string parameters; requests URL-encodes them and appends them to `url`.
# Both values are given as ints for consistency (the encoded URL is the same
# as with string values).
params = {'userId': 1, 'id': 2}

# Explicit timeout: requests waits indefinitely by default.
response = requests.get(url, params=params, timeout=10)

# response.url is the final URL including the encoded query string.
print("Requested URL:", response.url)
print(response.text)

Basic Authentication

In [None]:
import requests

username = 'user'
password = 'pass'

# Use httpbin's basic-auth endpoint, which actually validates the credentials
# embedded in its path. The plain /get endpoint ignores the Authorization
# header and answers 200 for any credentials, so it cannot demonstrate a
# successful or failed login.
url = f'https://httpbin.org/basic-auth/{username}/{password}'

# auth=(user, password) makes requests send HTTP Basic credentials.
# The timeout bounds the wait; requests has no default timeout.
response = requests.get(url, auth=(username, password), timeout=10)

# Check if the request was successful
if response.status_code == 200:
    print("Authenticated successfully")
    print(response.text)
else:
    print("Authentication failed", response.status_code)

Beautiful Soup Library

In [None]:
from bs4 import BeautifulSoup

# Sample HTML document used by the parsing examples below.
# Each <a> tag needs a properly closed href="..." attribute; otherwise the
# parser folds the following class/id text into the href value.
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

In [None]:
# Parse the sample document with Python's built-in HTML parser backend.
soup = BeautifulSoup(html_doc, 'html.parser')

# Attribute access (soup.title, soup.a, soup.p) yields the first matching tag,
# while find_all('a') collects every <a> tag in the document.
for result in (soup.title, soup.a, soup.p, soup.find_all('a')):
    print(result)

Q. Fetch a webpage, check if the request was successful, and extract the main heading text from the page's content. You can assume that the base url is 'http://example.com'.

In [None]:
import requests

url = 'http://example.com'

# Explicit timeout so the cell cannot hang forever on an unresponsive server
# (requests applies no timeout by default).
response = requests.get(url, timeout=10)

if response.status_code == 200:
    print("Request was successful")
    soup = BeautifulSoup(response.text, 'html.parser')

    # find() returns the first <h1> element, or None when the page has none.
    h1_tag = soup.find('h1')

    if h1_tag:
        print(h1_tag.text)
    else:
        print("No <h1> tag found")
else:
    print("Request failed: ", response.status_code)

# API Integration

In [22]:
import requests

url = 'https://api.open-meteo.com/v1/forecast'

# Query parameters: the location to look up and the comma-separated list of
# "current" variables to return.
params = {
    "latitude": 6.0375,
    "longitude": 80.2076,
    "current": "temperature_2m,windspeed_10m",
}

# Explicit timeout: without it requests would wait indefinitely on a stalled
# connection.
response = requests.get(url, params=params, timeout=10)
if response.status_code == 200:
    print("Weather data retrieved successfully")
    print(response.headers.get('Content-Type'))
    weather_data = response.json()

    # Only read the measurements when the payload has a 'current' section.
    if weather_data and 'current' in weather_data:
        current = weather_data['current']
        print(f"temperature_2m: {current['temperature_2m']}")
        print(f"windspeed_10m: {current['windspeed_10m']}")
    else:
        print("No current weather data found")
else:
    print("Failed to retrieve data", response.status_code)


Weather data retrieved successfully
application/json; charset=utf-8
temperature_2m: 27.1
windspeed_10m: 10.9


# Python Error Types

* Syntax error
* Runtime error
* Logical error

# Python Debugger (pdb)

In [None]:
import pdb

# Debugger demonstration: pause before a deliberate division by zero so the
# variables can be inspected interactively.
print("Starting")
x = 5
# Drops into the interactive pdb prompt here; execution resumes only after a
# debugger command such as `n` (next), `c` (continue) or `q` (quit) is entered.
pdb.set_trace()
y = 0

# y is 0, so this division raises ZeroDivisionError when it executes and the
# final print below is never reached.
z = x / y
print("Finished")

Starting
> c:\users\shehan\appdata\local\temp\ipykernel_2748\13252079.py(5)<module>()

