# Acquire (Web Scraping)

In [1]:
# general imports
import numpy as np
import pandas as pd
import re

# for webscraping
from requests import get
from bs4 import BeautifulSoup

# local modules
import acquire as a
from env import github_token, github_username

1. Import the get() function from the requests module, BeautifulSoup from bs4, and pandas.
2. Assign the address of the web page to a variable named url.
3. Request the content of the web page from the server by using get(), and store the server’s response in the variable response.
4. Print the response text to ensure you have an html page.
5. Take a look at the actual web page contents and inspect the source to understand the structure a bit.
6. Use BeautifulSoup to parse the HTML into a variable ('soup').
7. Identify the key tags you need to extract the data you are looking for.
8. Create a dataframe of the data desired.
9. Run some summary stats and inspect the data to ensure you have what you wanted.
10. Edit the data structure as needed, especially so that **one column has all the text you want included in this analysis**.
11. Create a corpus of the column with the text you want to analyze.
12. Store that corpus for use in a future notebook.

In [6]:
def get_urls(code_language):
    '''
    Scrape GitHub's daily trending page for one language and return repo paths.

    Parameters
    ----------
    code_language : str
        Language name as used in the trending URL (e.g. 'python'). It is
        percent-encoded before being placed in the URL so that names such as
        'c#' do not cut the address short at the '#'.

    Returns
    -------
    list of str or None
        One './owner/repo'-style string for every <h2> element that contains a
        link with an href, in page order. None when the response status is
        not 2xx.

    Side effects
    ------------
    Prints the response object, then either the number of paths found and the
    list itself, or an error message on a non-2xx response.
    '''
    # Local import keeps this cell self-contained (the import cell is unchanged).
    from urllib.parse import quote

    # '+' is kept literal so a name like 'c++' is passed through as typed;
    # '#', spaces, etc. are escaped so they cannot break the URL.
    language_slug = quote(str(code_language), safe='+')
    search_page = f'https://github.com/trending/{language_slug}?since=daily&spoken_language_code=en'
    headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}
    # Without a timeout a stalled connection would block the kernel forever.
    response = get(search_page, headers=headers, timeout=30)
    # print response
    print(response)

    # Any status outside the 2xx family is treated as a failure.
    if response.status_code // 100 != 2:
        print('There was a response error')
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    repos = []
    for heading in soup.find_all('h2'):
        link = heading.find('a')
        if link is None:
            # Heading without a link: not a repository entry.
            continue
        href = link.get('href')
        if href:
            # Prefix with '.' to produce the './owner/repo' form.
            repos.append('.' + href)
    print(len(repos))
    print(repos)
    return repos
        


In [7]:
# Collect the repo paths that get_urls scrapes from the trending page for Python.
python_urls = get_urls('python')

<Response [200]>
25
['./AUTOMATIC1111/stable-diffusion-webui', './bmaltais/kohya_ss', './donnemartin/system-design-primer', './Z4nzu/hackingtool', './ChanseyIsTheBest/NX-60FPS-RES-GFX-Cheats', './BlinkDL/RWKV-LM', './neonbjb/tortoise-tts', './neuml/txtai', './TapiocaFox/Daijishou', './coqui-ai/TTS', './OpenBB-finance/OpenBBTerminal', './d8ahazard/sd_dreambooth_extension', './catppuccin/gtk', './tinyvision/SOLIDER', './raspberrypi/usbboot', './sdatkinson/NeuralAmpModelerPlugin', './Zero6992/chatGPT-discord-bot', './pittcsc/Summer2023-Internships', './elebumm/RedditVideoMakerBot', './openai/whisper', './InstaPy/InstaPy', './rawandahmad698/PyChatGPT', './Dong-learn9/TVBox-zyjk', './StevenBlack/hosts', './riffusion/riffusion']


In [4]:
# Run scrape_github_data from the local acquire module; as the cell's last
# expression, its return value is displayed in full below.
a.scrape_github_data()

[{'repo': './AUTOMATIC1111/stable-diffusion-webui',
  'language': 'Python',
  'readme_contents': '# Stable Diffusion web UI\r\nA browser interface based on Gradio library for Stable Diffusion.\r\n\r\n![](screenshot.png)\r\n\r\n## Features\r\n[Detailed feature showcase with images](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features):\r\n- Original txt2img and img2img modes\r\n- One click install and run script (but you still must install python and git)\r\n- Outpainting\r\n- Inpainting\r\n- Color Sketch\r\n- Prompt Matrix\r\n- Stable Diffusion Upscale\r\n- Attention, specify parts of text that the model should pay more attention to\r\n    - a man in a `((tuxedo))` - will pay more attention to tuxedo\r\n    - a man in a `(tuxedo:1.21)` - alternative syntax\r\n    - select text and press `Ctrl+Up` or `Ctrl+Down` to automatically adjust attention to selected text (code contributed by anonymous user)\r\n- Loopback, run img2img processing multiple times\r\n- X/Y/Z plot, a 

In [5]:
# Inspect what acquire.get_repo_contents returns for a single repository
# (the './owner/repo' argument uses the same path format that get_urls produces).
a.get_repo_contents('./huggingface/text-generation-inference')

[{'name': '.dockerignore',
  'path': '.dockerignore',
  'sha': 'c69283ec552ca5f4615060c6aabcc5607d6d6962',
  'size': 54,
  'url': 'https://api.github.com/repos/huggingface/text-generation-inference/contents/.dockerignore?ref=main',
  'html_url': 'https://github.com/huggingface/text-generation-inference/blob/main/.dockerignore',
  'git_url': 'https://api.github.com/repos/huggingface/text-generation-inference/git/blobs/c69283ec552ca5f4615060c6aabcc5607d6d6962',
  'download_url': 'https://raw.githubusercontent.com/huggingface/text-generation-inference/main/.dockerignore',
  'type': 'file',
  '_links': {'self': 'https://api.github.com/repos/huggingface/text-generation-inference/contents/.dockerignore?ref=main',
   'git': 'https://api.github.com/repos/huggingface/text-generation-inference/git/blobs/c69283ec552ca5f4615060c6aabcc5607d6d6962',
   'html': 'https://github.com/huggingface/text-generation-inference/blob/main/.dockerignore'}},
 {'name': '.github',
  'path': '.github',
  'sha': '50f