## Daily Challenge: End-To-End Web Scraping In Python

In [94]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [95]:
# Fetch the HTML content of the chosen website
url = "https://github.com/topics"
# requests has no default timeout; without one this cell can hang forever
# if the server accepts the connection but never answers.
response = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx reply so an error page is never saved or parsed below.
response.raise_for_status()

In [96]:
# Show the HTTP status code returned for the request
print("Status Code:", response.status_code)

Status Code: 200


In [97]:
# Preview the page: show only the first 100 characters of the HTML text
print(response.text[0:100])



<!DOCTYPE html>
<html
  lang="en"
  
  data-color-mode="auto" data-light-theme="light" data-dark-t


In [98]:
# Write the downloaded HTML text to webpage.html (UTF-8 encoded)
with open("webpage.html", mode="w", encoding="utf-8") as html_file:
    html_file.write(response.text)

In [99]:
# Parse the HTML with BeautifulSoup using Python's built-in parser.
# The in-memory response text is parsed here (the same text that was written
# to webpage.html), not the file read back from disk.
soup = BeautifulSoup(markup=response.text, features="html.parser")

In [118]:
# Extract the topic titles and descriptions from the parsed page.
# Both live in <p> tags; each group is selected by the class string passed
# to find_all, and the surrounding whitespace is stripped from the tag text.
selection_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
topic_title_tags = soup.find_all('p', {'class': selection_class})
topic_titles = [title_tag.text.strip() for title_tag in topic_title_tags]

# The class string, the loop variable and the result list each get their own
# name, so `desc` only ever refers to the final list of description strings.
desc_class = "f5 color-fg-muted mb-0 mt-1"
descriptions = soup.find_all('p', {'class': desc_class})
desc = [description_tag.text.strip() for description_tag in descriptions]

In [119]:
# Print the length and content of each extracted list to verify the extraction process.
print(f"Number of Titles: {len(topic_titles)}")
print("Titles:", topic_titles)

print(f"\nNumber of Descriptions: {len(desc)}")
print("Descriptions:", desc)

Number of Titles: 30
Titles: ['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Atom', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'COVID-19', 'C++']

Number of Descriptions: 30
Descriptions: ['3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.', 'Ajax is a technique for creating interactive web applications.', 'Algorithms are self-contained sequences that carry out a variety of tasks.', 'Amp is a non-blocking concurrency library for PHP.', 'Android is an operating system built by Google designed for mobile devices.', 'Angular is an open source web application platform.', 'Ansible is a simple and powerful automation engine.', 'An API (Application Programming Interface) is a collection of protocols and 

In [124]:
# Group the extracted lists into a dictionary keyed by column name
data = dict(title=topic_titles, description=desc)

In [125]:
# Build a pandas DataFrame from the dictionary; each key becomes a column
df = pd.DataFrame.from_dict(data)

In [126]:
# Display the DataFrame to confirm its structure and contents.
# Nothing is printed explicitly: `df` is the cell's last expression, so the
# notebook renders it as the cell output.
df

Unnamed: 0,title,description
0,3D,3D refers to the use of three-dimensional grap...
1,Ajax,Ajax is a technique for creating interactive w...
2,Algorithm,Algorithms are self-contained sequences that c...
3,Amp,Amp is a non-blocking concurrency library for ...
4,Android,Android is an operating system built by Google...
5,Angular,Angular is an open source web application plat...
6,Ansible,Ansible is a simple and powerful automation en...
7,API,An API (Application Programming Interface) is ...
8,Arduino,Arduino is an open source platform for buildin...
9,ASP.NET,ASP.NET is a web framework for building modern...
