<img
    style="display: block;
           margin-left: auto;
           margin-right: auto;
           width: 100%;"
    src="./img/The-Simple-Steps-for-Web-Scraping (1).jpg"
    alt="The simple steps for web scraping"/>

In [1]:
# Import the necessary Python Libraries
import os
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
import requests

#### 1. Get the list of topics from the topics page

In [2]:
# Fetch the GitHub topics landing page and keep its HTML for parsing.
topics_url = 'https://github.com/topics'

# A timeout keeps this cell from hanging forever if GitHub does not answer.
response = requests.get(topics_url, timeout=30)

# Fail here, with a clear HTTP error, instead of parsing an error page later.
response.raise_for_status()

page_contents = response.text

In [4]:
# response = requests.get(topics_url)
# # page_contents = response.text      //Uncomment this


In [None]:
def write_html_to_file_and_parse(page_contents, file_path='webpage.html'):
    """
    Save raw HTML to disk and hand back a parsed version of it.

    The markup is first written to ``file_path`` as UTF-8 (an existing file is
    overwritten) and is then parsed with BeautifulSoup's 'html.parser'.

    Parameters:
    - page_contents (str): HTML markup to store and to parse.
    - file_path (str): Where the copy is saved (default is 'webpage.html').

    Returns:
    - BeautifulSoup object: The parsed form of ``page_contents``.

    Example:
    soup = write_html_to_file_and_parse(response.text, 'output.html')
    print(soup.title)
    """
    # Keep a copy of the markup on disk before parsing it.
    with open(file_path, 'w', encoding="utf-8") as html_file:
        html_file.write(page_contents)

    # The parse works on the in-memory text, not on the file just written.
    return BeautifulSoup(page_contents, 'html.parser')

# Example Usage:
# html_content = get_html_content_from_source()
# soup = write_html_to_file_and_parse(html_content, 'output.html')
# print(soup.title)


## 1. Get The List Of Topics From The Topics Page

In [None]:
def get_topic_titles(soup):
    """
    Collect the topic titles shown on a parsed GitHub topics page.

    Every <p> element whose class attribute is
    'f3 lh-condensed mb-0 mt-1 Link--primary' is treated as a title, and its
    text is returned unchanged (no whitespace stripping).

    Parameters:
    - soup (BeautifulSoup): Parsed HTML of the topics page.

    Returns:
    - list of str: Title texts in document order; empty if none match.

    Example:
    soup = BeautifulSoup(html_content, 'html.parser')
    titles = get_topic_titles(soup)
    print(titles)
    """
    title_attrs = {'class': 'f3 lh-condensed mb-0 mt-1 Link--primary'}
    return [title_tag.text for title_tag in soup.find_all('p', title_attrs)]

In [None]:
# No earlier cell creates `soup`, so build it here from the response fetched
# above; the helper also saves a copy of the HTML to 'webpage.html'.
soup = write_html_to_file_and_parse(response.text)
get_topic_titles(soup)

## 2. Get The List Of Top Repos From The Individual Topic Pages

In [None]:
def get_topic_descs(soup):
    """
    Collect the topic descriptions shown on a parsed GitHub topics page.

    Every <p> element whose class attribute is 'f5 color-fg-muted mb-0 mt-1'
    is treated as a description; surrounding whitespace is stripped from its
    text.

    Parameters:
    - soup (BeautifulSoup): Parsed HTML of the topics page.

    Returns:
    - list of str: Stripped description texts in document order; empty if
      none match.

    Example:
    soup = BeautifulSoup(html_content, 'html.parser')
    descriptions = get_topic_descs(soup)
    print(descriptions)
    """
    desc_attrs = {'class': 'f5 color-fg-muted mb-0 mt-1'}
    return [desc_tag.text.strip() for desc_tag in soup.find_all('p', desc_attrs)]


In [None]:
# Show the topic descriptions found on the parsed topics page; a module-level
# `soup` must already exist when this cell runs.
get_topic_descs(soup)

## 3. For Each Topic, Create A CSV Of The Top Repos For The Topic

In [None]:
def get_topic_urls(soup):
    """
    Extract topic URLs from a BeautifulSoup object representing a GitHub topics page.

    The links are looked up inside the <div> whose class is
    'col-lg-9 position-relative pr-lg-5 mb-6 mr-lg-5'; each <a> with class
    'no-underline flex-grow-0' found there contributes its href, prefixed with
    'https://github.com'.

    Parameters:
    - soup (BeautifulSoup): The BeautifulSoup object containing the parsed HTML.

    Returns:
    - list of str: Absolute topic URLs in document order. An empty list is
      returned when the container <div> is not present on the page.

    Example:
    soup = BeautifulSoup(html_content, 'html.parser')
    urls = get_topic_urls(soup)
    print(urls)
    """
    base_url = 'https://github.com'

    container = soup.find('div', class_='col-lg-9 position-relative pr-lg-5 mb-6 mr-lg-5')

    # Without the container there is nothing to collect. Returning early avoids
    # the UnboundLocalError the function used to raise in this case.
    if not container:
        return []

    # Topic links are the <a> tags inside the container <div>.
    link_tags = container.find_all('a', class_="no-underline flex-grow-0")
    return [base_url + link_tag.get('href') for link_tag in link_tags]

In [None]:
# Show the absolute topic URLs found on the parsed topics page; a module-level
# `soup` must already exist when this cell runs.
get_topic_urls(soup)

In [None]:
def scrape_topics(topics_url='https://github.com/topics', timeout=30):
    """
    Scrape GitHub topics information from the GitHub topics page.

    Parameters:
    - topics_url (str): Page to download (default is the GitHub topics page).
    - timeout (float): Seconds to wait for the server before giving up
      (default is 30).

    Returns:
    - pandas.DataFrame: One row per topic with the columns 'Title',
      'Description' and 'URL'.

    Raises:
    - Exception: If the server answers with a status code other than 200.

    Example:
    topics_data = scrape_topics()
    print(topics_data)
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(topics_url, timeout=timeout)

    # Anything but 200 means the page to parse was not received.
    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(topics_url))

    soup = BeautifulSoup(response.text, 'html.parser')

    # Each helper returns one list; together they form the table's columns.
    topics_dict = {
        'Title': get_topic_titles(soup),
        'Description': get_topic_descs(soup),
        'URL': get_topic_urls(soup)
    }

    return pd.DataFrame.from_dict(topics_dict)

# Example Usage:
# topics_data = scrape_topics()
# print(topics_data)



In [None]:
# Preview the first rows of the topics table; calling scrape_topics() performs
# a live request to GitHub.
scrape_topics().head()

In [None]:
def scrape_topics_repos():
    """
    List, for every GitHub topic, its URL and the CSV path planned for it.

    The topics table is obtained from scrape_topics(). A 'data' directory is
    created if missing, and one line per topic is printed with the topic URL
    followed by ' data/<Title>.csv'. Nothing is downloaded per topic and no
    CSV file is written here.

    Returns:
    - None

    Example:
    scrape_topics_repos()
    """
    print('Scraping list of topics')

    topics_table = scrape_topics()

    # Guard kept for callers that swap in a scrape_topics() returning None.
    if topics_table is None:
        print("Error: scrape_topics() returned None.")
        return

    # Make sure the output folder exists before listing target paths.
    os.makedirs('data', exist_ok=True)

    for _, topic in topics_table.iterrows():
        csv_path = ' data/{}.csv'.format(topic['Title'])
        print(topic['URL'], csv_path)

# Example Usage:
# scrape_topics_repos()



In [None]:
# Print each topic URL next to the CSV path planned for it; this also creates
# the 'data' folder if it is missing.
scrape_topics_repos()

In [None]:
# Folder that receives the CSV output; save_topics_to_csv() below reads this
# module-level name when it builds its file path.
folder_path = 'data'

# Create the folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(folder_path, exist_ok=True)

def save_topics_to_csv():
    """
    Save information about GitHub topics to a CSV file.

    The topics table returned by scrape_topics() is written, without its
    index, to 'Top GitHub repositories.csv' inside the folder named by the
    module-level ``folder_path``. The folder is created first if it does not
    exist, so the function also works after the folder has been removed.

    Returns:
    - None

    Example:
    save_topics_to_csv()
    """
    # Create the target folder here rather than relying on an earlier cell.
    os.makedirs(folder_path, exist_ok=True)

    file_path = os.path.join(folder_path, 'Top GitHub repositories.csv')

    topics_df = scrape_topics()

    # index=False keeps the row numbers out of the CSV.
    topics_df.to_csv(file_path, index=False)

    print(f'DataFrame saved to {file_path}')

# Example Usage:
# save_topics_to_csv()


In [None]:
# Scrape the topics and write them to 'Top GitHub repositories.csv' inside the
# 'data' folder.
save_topics_to_csv()

## Future Undertaking

The code can be extended to also collect the star counts of the individual repositories under each topic and to save that data to one CSV file per topic.

In [None]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup

def scrape_topics(topics_url='https://github.com/topics', timeout=30):
    """
    Scrape GitHub topics and, for each topic, its repository table.

    When this cell runs, this definition replaces the earlier scrape_topics()
    in the notebook. In addition to the topic columns it calls
    scrape_topic_repositories() once per topic URL, so it issues one extra
    HTTP request per topic.

    Parameters:
    - topics_url (str): Page to download (default is the GitHub topics page).
    - timeout (float): Seconds to wait for the topics page before giving up
      (default is 30).

    Returns:
    - pandas.DataFrame: One row per topic with the columns 'Title',
      'Description', 'URL' and 'Repositories'; the last column holds whatever
      scrape_topic_repositories() returned for that topic's URL.

    Raises:
    - Exception: If the server answers with a status code other than 200.

    Example:
    topics_data = scrape_topics()
    print(topics_data)
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(topics_url, timeout=timeout)

    # Anything but 200 means the page to parse was not received.
    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(topics_url))

    soup = BeautifulSoup(response.text, 'html.parser')

    # Each helper returns one list; together they form the table's columns.
    topics_dict = {
        'Title': get_topic_titles(soup),
        'Description': get_topic_descs(soup),
        'URL': get_topic_urls(soup)
    }
    topics_df = pd.DataFrame.from_dict(topics_dict)

    # One repository table per topic, stored as an object in the new column.
    topics_df['Repositories'] = topics_df['URL'].apply(scrape_topic_repositories)

    return topics_df

def scrape_topic_repositories(topic_url, timeout=30):
    """
    Scrape information about individual repositories under a GitHub topic.

    Parameters:
    - topic_url (str): The URL of the GitHub topic.
    - timeout (float): Seconds to wait for the server before giving up
      (default is 30).

    Returns:
    - pandas.DataFrame: A table with the columns 'Subtopic', 'Repository Name',
      'User Name' and 'Star Ratings', filled from get_subtopic(),
      get_repo_names(), get_user_names() and get_star_ratings().

    Raises:
    - Exception: If the server answers with a status code other than 200.

    Example:
    repositories_data = scrape_topic_repositories('https://github.com/topics/python')
    print(repositories_data)
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(topic_url, timeout=timeout)

    # Anything but 200 means the page to parse was not received.
    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(topic_url))

    soup = BeautifulSoup(response.text, 'html.parser')

    # Each helper supplies one column of the repository table.
    repositories_dict = {
        'Subtopic': get_subtopic(soup),
        'Repository Name': get_repo_names(soup),
        'User Name': get_user_names(soup),
        'Star Ratings': get_star_ratings(soup)
    }

    return pd.DataFrame.from_dict(repositories_dict)

# Placeholder helpers for the per-topic repository pages.
#
# Each one returns an empty list rather than None. A dict made only of None
# values makes pd.DataFrame.from_dict raise "If using all scalar values, you
# must pass an index"; empty lists build an empty table with the expected
# columns until the real extraction code is written.
def get_subtopic(soup):
    """Return the subtopic value for each repository (TODO: not implemented; returns [])."""
    # TODO: extract subtopic information from `soup`.
    return []

def get_repo_names(soup):
    """Return the repository names on the page (TODO: not implemented; returns [])."""
    # TODO: extract repository names from `soup`.
    return []

def get_user_names(soup):
    """Return the owner user names on the page (TODO: not implemented; returns [])."""
    # TODO: extract user names from `soup`.
    return []

def get_star_ratings(soup):
    """Return the star counts on the page (TODO: not implemented; returns [])."""
    # TODO: extract star ratings from `soup`.
    return []

# Folder that receives one CSV file per topic.
folder_path = 'data'

# Create the folder if it doesn't exist; exist_ok=True makes re-runs harmless.
os.makedirs(folder_path, exist_ok=True)

# Scrape every topic together with its repository table. The scrape_topics()
# defined in this cell requests the topics page and then one page per topic.
topics_data = scrape_topics()
for index, row in topics_data.iterrows():
    # Build the file name from the topic title: lower-cased, spaces replaced
    # by underscores, with a '_repositories.csv' suffix.
    file_name = row['Title'].lower().replace(' ', '_') + '_repositories.csv'
    file_path = os.path.join(folder_path, file_name)
    # Each 'Repositories' cell is the value returned by
    # scrape_topic_repositories(); write it without the index column.
    row['Repositories'].to_csv(file_path, index=False)
    print(f'DataFrame for {row["Title"]} saved to {file_path}')


## Feel free to edit and modify. I am here to learn.