# Daily Challenge

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch the HTML content
url = 'https://github.com/topics'
# Without a timeout, requests waits forever on an unresponsive server,
# which would leave the notebook cell hanging.
response = requests.get(url, timeout=10)

# Check the response status
print(f"Response status code: {response.status_code}.")

# Stop here on a 4xx/5xx answer instead of saving and parsing an error page
# in the following cells.
response.raise_for_status()

Response status code: 200.


In [None]:
# Preview the beginning of the downloaded page
preview = response.text[:100]
print(f"The first 100 characters are: {preview}.")

The first 100 characters are: 

<!DOCTYPE html>
<html
  lang="en"
  
  data-color-mode="auto" data-light-theme="light" data-dark-t.


In [None]:
# Persist the downloaded HTML on disk
html_path = 'webpage.html'
with open(html_path, 'w', encoding='utf-8') as out_file:
    out_file.write(response.text)

# Read the saved page back and hand it to BeautifulSoup
with open(html_path, 'r', encoding='utf-8') as in_file:
    content = in_file.read()

soup = BeautifulSoup(content, 'html.parser')

In [None]:
# Locate the <p> tags carrying the topic titles and keep their cleaned text
title_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
title_tags = soup.find_all('p', attrs={'class': title_class})
titles = [p_tag.get_text().strip() for p_tag in title_tags]

In [None]:
# Display the extracted topic titles as a quick sanity check
print(titles)

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command-line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'C++', 'Cryptocurrency', 'Crystal']


In [None]:
# Locate the <p> tags carrying the topic descriptions and keep their cleaned text
description_class = 'f5 color-fg-muted mb-0 mt-1'
description_tags = soup.find_all('p', attrs={'class': description_class})
descriptions = [p_tag.get_text().strip() for p_tag in description_tags]

In [None]:
# Report, for each extracted list, how many items it holds and what they are
for label, items in (("titles", titles), ("descriptions", descriptions)):
    print(f"Number of extracted {label}: {len(items)}.")
    print(items)

Number of extracted titles: 30.
['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command-line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'C++', 'Cryptocurrency', 'Crystal']
Number of extracted descriptions: 30.
['3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.', 'Ajax is a technique for creating interactive web applications.', 'Algorithms are self-contained sequences that carry out a variety of tasks.', 'Amp is a non-blocking concurrency library for PHP.', 'Android is an operating system built by Google designed for mobile devices.', 'Angular is an open source web application platform.', 'Ansible is a simple and powerful automation engine.', 'An API (Application Programming Interface) is a collection of protoc

In [None]:
# DataFrame creation

# Map each column name to its list of values
data = dict(Title=titles, Description=descriptions)

# Build the DataFrame from that mapping (one row per topic)
df = pd.DataFrame(data)

In [None]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Title,Description
0,3D,3D refers to the use of three-dimensional grap...
1,Ajax,Ajax is a technique for creating interactive w...
2,Algorithm,Algorithms are self-contained sequences that c...
3,Amp,Amp is a non-blocking concurrency library for ...
4,Android,Android is an operating system built by Google...


# Understanding a JavaScript webpage


In [None]:
# One JavaScript playground site
# https://jsfiddle.net/

# Web scraping JavaScript webpages


In [None]:
!pip install selenium
!pip install chromedriver-autoinstaller

Collecting selenium
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.25.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.26.2-py3-none-any.whl (475 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m476.0/476.0 kB[0m [31m20.

## Rotten Tomatoes Scraping

In [None]:
# Imports & Configuration
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pprint  # To tidy up


# Command-line switches passed to Chrome
chrome_flags = (
    '--headless',               # Run Chrome in headless mode
    "--no-sandbox",             # Bypass OS security model
    "--disable-dev-shm-usage",  # Overcome limited resource problems
)

options = webdriver.ChromeOptions()
for flag in chrome_flags:
    options.add_argument(flag)

driver = webdriver.Chrome(options=options)

In [None]:
url = 'https://www.rottentomatoes.com/browse/movies_at_home/affiliates:netflix~critics:certified_fresh'

driver.get(url)

# The movie tiles may be added to the DOM by JavaScript after the initial load,
# so wait (up to 10 seconds) until at least one tile is present before
# driver.page_source is read. Raises TimeoutException if none shows up.
# The trailing semicolon keeps the returned WebElement out of the cell output.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, "[data-qa='discovery-media-list-item']")
    )
);

In [None]:
# Extract with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')
movies = soup.find_all(attrs = {'data-qa' : 'discovery-media-list-item'})


def _text_or_none(tag):
    """Return the stripped text of ``tag``, or None when ``tag`` is None (not found)."""
    return tag.get_text().strip() if tag is not None else None


# Create the lists up front so they exist even when no movie tile was found.
title, score, release_date = [], [], []

for movie in movies:
    # Search inside the current tile only. Searching the whole page here would
    # return every match on the page on each iteration, and the three lists
    # could end up with different lengths. A field missing from a tile is
    # stored as None so that index i always refers to the same movie.
    title.append(_text_or_none(movie.find(class_ = 'p--small')))
    score.append(_text_or_none(movie.find('rt-text', {'slot' : 'criticsScore'})))
    release_date.append(_text_or_none(movie.find(class_ = 'smaller')))

print(f"Title: {title}")
print(f"Score: {score}")
print(f"Release date: {release_date}")


Title: ['Transformers One', 'Rob Peace', 'His Three Daughters', 'Agatha All Along: Season 1', 'Rebel Ridge', 'His Three Daughters', 'Hit Man', 'The Babadook', 'Black Mass', 'Daughters', 'Edge of Tomorrow', 'Pearl', 'His House', 'The Gentlemen', 'Leave the World Behind', 'It Follows', 'The Killer', 'Bone Tomahawk', 'Wicked Little Letters', 'The Autopsy of Jane Doe', 'Dark Waters', 'Under the Shadow', 'Aftersun', 'Hunt for the Wilderpeople', 'I Used to be Funny', 'Spider-Man: Across the Spider-Verse', 'Alone', '3:10 to Yuma', 'American Gangster', 'The Conjuring', 'The Lost Daughter', 'First Man']
Score: [' 96%', ' 97%', ' 95%', ' 98%', ' 73%', ' 100%', ' 91%', ' 93%', ' 100%', ' 75%', ' 73%', ' 95%', ' 85%', ' 91%', ' 80%', ' 86%', ' 89%', ' 99%', ' 96%', ' 97%', ' 84%', ' 95%', ' 94%', ' 89%', ' 81%', ' 86%', ' 94%', ' 87%']
Release date: ['Streaming Sep 6, 2024', 'Streaming Sep 20, 2024', 'Streaming Jun 7, 2024', 'Streaming Nov 5, 2015', 'Streaming Oct 4, 2016', 'Streaming Aug 14, 2024