# Acquire (Web Scraping)

In [1]:
# general imports
import numpy as np
import pandas as pd
import re
import itertools

# for webscraping
from requests import get
from bs4 import BeautifulSoup
import time

# local modules
import acquire as a
from env import github_token, github_username

1. Import the get() function from the requests module, BeautifulSoup from bs4, and pandas.
2. Assign the address of the web page to a variable named url.
3. Request the content of the web page from the server using get(), and store the server’s response in the variable response.
4. Print the response text to ensure you have an HTML page.
5. Take a look at the actual web page contents and inspect the source to understand the structure a bit.
6. Use BeautifulSoup to parse the HTML into a variable ('soup').
7. Identify the key tags you need to extract the data you are looking for.
8. Create a dataframe of the data desired.
9. Run some summary stats and inspect the data to ensure you have what you wanted.
10. Edit the data structure as needed, especially so that **one column has all the text you want included in this analysis**.
11. Create a corpus of the column with the text you want to analyze.
12. Store that corpus for use in a future notebook.

In [2]:
def get_urls():
    '''
    Scrape GitHub's trending pages and collect the repositories listed there.

    One trending page is requested for every combination of language
    (python, javascript, rust) and period (daily, weekly, monthly), nine
    pages in total. Every <h2> element that contains a link contributes that
    link's href, prefixed with '.'.

    Arguments: None

    Returns: A python list of unique repository paths (each one is '.' followed
    by the href found on the page), in the order they were first seen.

    Raises: whatever requests' get() raises on a connection failure or timeout;
    those exceptions are not caught here.
    '''
    # seconds to wait for a single page before giving up
    request_timeout = 30
    # seconds to pause between consecutive page requests
    pause_seconds = 15
    # establish the programming languages to be queried
    languages = ['python', 'javascript', 'rust']
    # establish the periods to be queried
    periods = ['daily', 'weekly', 'monthly']
    # one trending-page url per (language, period) combination
    base_pages = [
        f'https://github.com/trending/{language}?since={period}&spoken_language_code=en'
        for language, period in itertools.product(languages, periods)]
    headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}
    # accumulates repo paths across all pages (may contain duplicates until the end)
    REPOS = []
    last_index = len(base_pages) - 1
    for index, page in enumerate(base_pages):
        # a timeout keeps a stalled connection from hanging the notebook forever
        response = get(page, headers=headers, timeout=request_timeout)
        # print response so the status of every page is visible
        print(response)
        # if response code is 2xx then parse with beautiful soup and add hrefs of repos to REPOS
        if response.status_code // 100 == 2:
            soup = BeautifulSoup(response.content, 'html.parser')
            for heading in soup.find_all('h2'):
                anchor = heading.find('a')
                # skip headings without a link, and links without an href
                href = anchor.get('href') if anchor is not None else None
                if href:
                    REPOS.append('.' + href)
            # running total collected so far
            print(len(REPOS))
        # if response code is not 2xx then report it and move on to the next page
        else:
            print('There was a response error')
        # pause between requests (also after a failed one), but not after the last page
        if index < last_index:
            time.sleep(pause_seconds)
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is the same from run to run for the same page contents
    return list(dict.fromkeys(REPOS))
        


In [3]:
# NOTE(review): left disabled, presumably so the notebook does not repeat get_urls()'s nine
# trending-page requests and 15-second pauses on every run; TODO confirm.
# urls = get_urls()

In [4]:
# len(urls)

In [5]:
# print(urls)

In [6]:
# Build the working table from whatever acquire.scrape_github_data() returns.
df = pd.DataFrame(data=a.scrape_github_data())

In [9]:
# Preview the first five rows of the scraped table.
df.head(n=5)

Unnamed: 0,repo,language,readme_contents
0,./WebGoat/WebGoat,JavaScript,# WebGoat 8: A deliberately insecure Web Appli...
1,./vercel/next-react-server-components,JavaScript,# Next.js App Router + React Server Components...
2,./LaurentMazare/tch-rs,Rust,# tch-rs\nRust bindings for the C++ api of PyT...
3,./ultralytics/yolov5,Python,"<div align=""center"">\n <p>\n <a align=""cen..."
4,./FastForwardTeam/FastForward,JavaScript,"<div align=""center"">\n<img src=""https://avatar..."


In [8]:
# Summarize the table: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164 entries, 0 to 163
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             164 non-null    object
 1   language         164 non-null    object
 2   readme_contents  164 non-null    object
dtypes: object(3)
memory usage: 4.0+ KB


In [11]:
# Persist the scraped table to repos.csv in the current working directory.
# NOTE(review): to_csv writes the index by default, so reading this file back without
# index_col=0 produces an extra unnamed column. Confirm what the consumer expects
# before switching to index=False.
df.to_csv('repos.csv')

In [12]:
# Count how many rows fall under each value of the language column.
df['language'].value_counts()

Python        58
JavaScript    56
Rust          50
Name: language, dtype: int64