In [1]:
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests
import pandas as pd
import bs4
import time

from env import github_token, github_username

import acquire
import wrangle

In [None]:
# Run the scraper defined in the project's local `acquire` module.
# NOTE(review): presumably returns one record per repository -- confirm in acquire.py.
contents = acquire.scrape_github_data()


In [None]:
# Wrap the scraped result in a DataFrame. This rebinds `contents`, so from here
# on the name refers to the DataFrame rather than the raw scraper output.
contents = pd.DataFrame(contents)

In [None]:
# Show the dtype pandas assigned to each column.
contents.dtypes

In [None]:
# Show the frame's dimensions as (rows, columns).
contents.shape

In [None]:
# Save the frame to 'f1_readmes.csv' (relative to the working directory),
# leaving the index out of the file.
contents.to_csv('f1_readmes.csv', index=False)

In [None]:
# Display the `readme_contents` value stored under index label 1.
# NOTE(review): that is the second row only if the frame still has its default
# integer index -- confirm.
contents.readme_contents[1]

In [None]:
def make_soup(url, timeout=30):
    '''
    Download a page and parse its HTML into a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to request.
    timeout : float, optional
        Seconds to wait for the server before requests gives up and raises
        a timeout exception (default 30).

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document built from the response text. The HTTP status code
        is not checked, so an error page is parsed like any other page.
    '''
    # Identify the scraper with a custom User-Agent header.
    headers = {'User-Agent': 'Codeup Data Science'}
    # requests has no default timeout; without one a stalled connection
    # would block the notebook indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Name the parser explicitly. When it is omitted, BeautifulSoup picks
    # whichever parser happens to be installed (and warns about it), so the
    # same page can parse differently on different machines.
    return bs4.BeautifulSoup(response.text, 'html.parser')


def github_urls_single_page():
    '''
    Scrape the repository links from the first page of GitHub search
    results for "formula 1".

    Returns
    -------
    list of str
        The unique `href` values of the matching anchors, in the order they
        appear on the page. Anchors without an `href` are skipped.
    '''
    # First page of the GitHub repository search for "formula 1".
    url = 'https://github.com/search?q=formula+1&type=repositories'

    # Request and parse the page with the shared helper.
    soup = make_soup(url)

    # Anchors carrying the repository links.
    # NOTE(review): this relies on GitHub's markup using the 'v-align-middle'
    # class for result links -- confirm it still matches the live page.
    anchors = soup.find_all('a', class_='v-align-middle')
    hrefs = (anchor.get('href') for anchor in anchors)

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is deterministic (a set would shuffle the links between runs).
    return list(dict.fromkeys(href for href in hrefs if href))

In [None]:
def github_urls(n_pages=50, delay=5):
    '''
    Scrape repository links from the GitHub search results for "formula 1",
    sorted by stars, across several result pages.

    Parameters
    ----------
    n_pages : int, optional
        Number of search-result pages to fetch, starting at page 1
        (default 50). Extra pages leave room for repositories that are
        later dropped for lacking a readme or language. Values below 1
        fetch nothing.
    delay : float, optional
        Seconds to pause between consecutive page requests (default 5).

    Returns
    -------
    list of str
        The unique `href` values collected from all pages, in first-seen
        order. Anchors without an `href` are skipped.
    '''
    urls = []
    seen = set()

    # range's upper bound is exclusive, so n_pages + 1 is needed to include
    # the last requested page.
    for p in range(1, n_pages + 1):

        # Search URL for this page number.
        url = f'https://github.com/search?p={p}&q=formula+1&s=stars&type=Repositories'

        # Request and parse the page with the shared helper.
        soup = make_soup(url)

        # Collect this page's repository links. `seen` persists across pages,
        # so a repository that shows up on two pages is only kept once.
        for link in soup.find_all('a', class_='v-align-middle'):
            href = link.get('href')
            if href and href not in seen:
                seen.add(href)
                urls.append(href)

        # Pause between requests; there is nothing to wait for after the
        # final page.
        if p < n_pages:
            time.sleep(delay)

    return urls

In [None]:
# Collect the repository links from the GitHub search pages. The scraper pauses
# about 5 seconds per search page, so expect this cell to run for several minutes.
urls = github_urls()

In [None]:
# Number of links collected.
len(urls)


In [None]:
# Display the full list of collected links (this can be a long output).
urls

In [2]:
# Build the working DataFrame with the project's local `wrangle` module.
# NOTE(review): first_clean() is called with no arguments, so it presumably
# loads its own input (e.g. a previously saved file) -- confirm in wrangle.py.
df = wrangle.first_clean()

In [3]:
# Preview the first five rows of the cleaned frame.
df.head()

Unnamed: 0,repo,language,readme_contents,written_language
0,ppatierno/formula1-telemetry-kafka,Java,# Formula 1 - Telemetry with Apache Kafka\n\nT...,en
2,jcnewell/ergast-f1-api,PHP,# Ergast nodeJS API\nIn this period we are wor...,en
5,izenecloud/sf1r-lite,C++,SF1R-Lite(Search Formula-1 Lite Engine)\n=====...,en
7,jonybur/f1-telemetry-client,TypeScript,"# F1 Telemetry Client\n\n<img src=""https://img...",en
8,davidor/formula1-lap-charts,JavaScript,# Formula 1 Lap Charts\n\nInteractive lap char...,en


In [4]:
# Count the rows for each value of `written_language`, most frequent first.
df.written_language.value_counts()

en    163
pt     11
fr      6
es      6
Name: written_language, dtype: int64

In [5]:
# Count the rows for each value of `language`, most frequent first.
df.language.value_counts()

JavaScript          50
Python              27
HTML                20
C++                 14
Java                13
PHP                 13
TypeScript          12
Ruby                11
C#                  10
Jupyter Notebook     9
R                    7
Name: language, dtype: int64