## Plan for acquire:

1. Pull most forked repos. There are over 4 million forked repos, so we can easily get as many repos as we need. 

In [55]:
import numpy as np
import pandas as pd

# acquire
from requests import get
from bs4 import BeautifulSoup
from time import sleep
import os

# prepare
import unicodedata
import re
import json
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# explore
from sklearn.model_selection import train_test_split
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# model
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Space for importing acquire/prep functions that I've saved.

# Acquire

In [None]:
# First off, I'm going to try to grab the HTML from a single repo; then I'll try to do it for a full page of results.

In [44]:
def make_soup(url, timeout=30):
    '''
    This helper function takes in a url, requests the page and parses the
    HTML, returning a soup object.

    Parameters:
        url: address of the page to download.
        timeout: seconds to wait for the server before requests gives up
            and raises a timeout error (default 30). Without a timeout a
            single stalled connection would block the caller indefinitely.

    Returns:
        A BeautifulSoup object built from the response text with the
        'html.parser' backend. The HTTP status code is not checked, so an
        error page is parsed like any other page.
    '''
    headers = {'User-Agent': 'Sir Galahad'}
    response = get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

In [60]:
def get_language_urls(languages=('JavaScript', 'Python'), pages=39):
    '''
    This function builds (it does not request) the GitHub search-result
    urls for repositories sorted by stars, and returns them as a list.

    Parameters:
        languages: iterable of language names placed in the `l=` query
            parameter (default: JavaScript and Python).
        pages: number of result pages per language; pages are numbered
            from 1 up to and including `pages` (default 39).

    Returns:
        A list of url strings, grouped by language in the order given,
        with the pages of each language in ascending order.
    '''
    return [
        f'https://github.com/search?l={language}&p={page}&q=stars%3A%3E0&s=stars&type=Repositories'
        for language in languages
        for page in range(1, pages + 1)
    ]

In [68]:
def get_single_language_urls(lang_choice, pages=39):
    '''
    This function builds (it does not request) the GitHub search-result
    urls for repositories of a single language, sorted by stars, and
    returns them as a list.

    Parameters:
        lang_choice: language name placed in the `l=` query parameter,
            e.g. 'Python'.
        pages: number of result pages; pages are numbered from 1 up to
            and including `pages` (default 39).

    Returns:
        A list of url strings, one per result page, in ascending page order.
    '''
    return [
        f'https://github.com/search?l={lang_choice}&p={page}&q=stars%3A%3E0&s=stars&type=Repositories'
        for page in range(1, pages + 1)
    ]

In [74]:
def get_all_urls(urls):
    '''
    This function scrapes all of the repo urls from
    the list of github search result urls and returns a list of urls.

    Parameters:
        urls: iterable of search-result page urls to request.

    Returns:
        A list of unique 'https://github.com/...' urls in the order they
        were first seen. A url that shows up on more than one page is
        kept only once.
    '''
    repo_urls = []
    seen = set()  # urls already collected, for order-preserving dedup
    for n, url in enumerate(urls, start=1):
        # Make request and soup object using helper
        soup = make_soup(url)
        # Pause between requests so the pages are not fetched back to back.
        sleep(3)
        print(f"Scraping loop number {n}")
        # The anchor elements with this class hold the repo links.
        for link in soup.find_all('a', class_='v-align-middle'):
            href = link.get('href')
            if not href:
                # An anchor without an href cannot be turned into a url.
                continue
            repo_url = 'https://github.com' + href
            if repo_url not in seen:
                seen.add(repo_url)
                repo_urls.append(repo_url)

    return repo_urls

### The Plan:

Run through this process twice: once for JavaScript and once for Python. That *should* give us two separate lists of individual repo urls that are 180 urls long each; then we can concatenate those two lists into a complete list of 360 unique repo urls (theoretically).

In [75]:
# Build the list of GitHub search-result page urls for JavaScript
# (one url per results page); len() shows how many pages were generated.

js_urls = get_single_language_urls('JavaScript')
len(js_urls)

39

In [76]:
# Now scraping every JavaScript search-result page and collecting the
# individual repo urls found on them (one request per page):

js_repos = get_all_urls(js_urls)
len(js_repos)

Scraping loop number 1
Scraping loop number 2
Scraping loop number 3
Scraping loop number 4
Scraping loop number 5
Scraping loop number 6
Scraping loop number 7
Scraping loop number 8
Scraping loop number 9
Scraping loop number 10
Scraping loop number 11
Scraping loop number 12
Scraping loop number 13
Scraping loop number 14
Scraping loop number 15
Scraping loop number 16
Scraping loop number 17
Scraping loop number 18
Scraping loop number 19
Scraping loop number 20
Scraping loop number 21
Scraping loop number 22
Scraping loop number 23
Scraping loop number 24
Scraping loop number 25
Scraping loop number 26
Scraping loop number 27
Scraping loop number 28
Scraping loop number 29
Scraping loop number 30
Scraping loop number 31
Scraping loop number 32
Scraping loop number 33
Scraping loop number 34
Scraping loop number 35
Scraping loop number 36
Scraping loop number 37
Scraping loop number 38
Scraping loop number 39


230

In [77]:
js_repos

['https://github.com/facebook/react-native',
 'https://github.com/twbs/bootstrap',
 'https://github.com/vuejs/vue',
 'https://github.com/facebook/create-react-app',
 'https://github.com/d3/d3',
 'https://github.com/trekhleb/javascript-algorithms',
 'https://github.com/facebook/react',
 'https://github.com/axios/axios',
 'https://github.com/airbnb/javascript',
 'https://github.com/freeCodeCamp/freeCodeCamp',
 'https://github.com/jquery/jquery',
 'https://github.com/angular/angular.js',
 'https://github.com/mrdoob/three.js',
 'https://github.com/vercel/next.js',
 'https://github.com/goldbergyoni/nodebestpractices',
 'https://github.com/FortAwesome/Font-Awesome',
 'https://github.com/webpack/webpack',
 'https://github.com/nodejs/node',
 'https://github.com/mui-org/material-ui',
 'https://github.com/30-seconds/30-seconds-of-code',
 'https://github.com/chartjs/Chart.js',
 'https://github.com/adam-p/markdown-here',
 'https://github.com/hakimel/reveal.js',
 'https://github.com/Semantic-Org/Se

In [78]:
# Now building the list of GitHub search-result page urls for Python
# (one url per results page):

py_urls = get_single_language_urls('Python')
len(py_urls)

39

In [79]:
# Now getting the individual repo urls from the above Python search results:

py_repos = get_all_urls(py_urls)
len(py_repos)

Scraping loop number 1
Scraping loop number 2
Scraping loop number 3
Scraping loop number 4
Scraping loop number 5
Scraping loop number 6
Scraping loop number 7
Scraping loop number 8
Scraping loop number 9
Scraping loop number 10
Scraping loop number 11
Scraping loop number 12
Scraping loop number 13
Scraping loop number 14
Scraping loop number 15
Scraping loop number 16
Scraping loop number 17
Scraping loop number 18
Scraping loop number 19
Scraping loop number 20
Scraping loop number 21
Scraping loop number 22
Scraping loop number 23
Scraping loop number 24
Scraping loop number 25
Scraping loop number 26
Scraping loop number 27
Scraping loop number 28
Scraping loop number 29
Scraping loop number 30
Scraping loop number 31
Scraping loop number 32
Scraping loop number 33
Scraping loop number 34
Scraping loop number 35
Scraping loop number 36
Scraping loop number 37
Scraping loop number 38
Scraping loop number 39


180

In [80]:
py_repos

['https://github.com/ytdl-org/youtube-dl',
 'https://github.com/jackfrued/Python-100-Days',
 'https://github.com/pallets/flask',
 'https://github.com/django/django',
 'https://github.com/TheAlgorithms/Python',
 'https://github.com/public-apis/public-apis',
 'https://github.com/tensorflow/models',
 'https://github.com/vinta/awesome-python',
 'https://github.com/donnemartin/system-design-primer',
 'https://github.com/nvbn/thefuck',
 'https://github.com/josephmisiti/awesome-machine-learning',
 'https://github.com/psf/requests',
 'https://github.com/scikit-learn/scikit-learn',
 'https://github.com/minimaxir/big-list-of-naughty-strings',
 'https://github.com/httpie/httpie',
 'https://github.com/soimort/you-get',
 'https://github.com/ageitgey/face_recognition',
 'https://github.com/ansible/ansible',
 'https://github.com/scrapy/scrapy',
 'https://github.com/home-assistant/core',
 'https://github.com/huggingface/transformers',
 'https://github.com/isocpp/CppCoreGuidelines',
 'https://github.co

In [81]:
# Joining the two lists together (Python repos first, then JavaScript).
# Plain list concatenation: a url present in both lists appears twice.

joined_repos = py_repos + js_repos

In [82]:
len(joined_repos)

410

In [85]:
joined_repos

['https://github.com/ytdl-org/youtube-dl',
 'https://github.com/jackfrued/Python-100-Days',
 'https://github.com/pallets/flask',
 'https://github.com/django/django',
 'https://github.com/TheAlgorithms/Python',
 'https://github.com/public-apis/public-apis',
 'https://github.com/tensorflow/models',
 'https://github.com/vinta/awesome-python',
 'https://github.com/donnemartin/system-design-primer',
 'https://github.com/nvbn/thefuck',
 'https://github.com/josephmisiti/awesome-machine-learning',
 'https://github.com/psf/requests',
 'https://github.com/scikit-learn/scikit-learn',
 'https://github.com/minimaxir/big-list-of-naughty-strings',
 'https://github.com/httpie/httpie',
 'https://github.com/soimort/you-get',
 'https://github.com/ageitgey/face_recognition',
 'https://github.com/ansible/ansible',
 'https://github.com/scrapy/scrapy',
 'https://github.com/home-assistant/core',
 'https://github.com/huggingface/transformers',
 'https://github.com/isocpp/CppCoreGuidelines',
 'https://github.co

In [86]:
import github_acquire

credentials loaded successfully
git acquire module loaded successsfully


In [94]:
# Drop the leading 'https://github.com/' (19 characters) from every url so
# that only the 'owner/repo' part remains.
GITHUB_PREFIX = 'https://github.com/'

new_list = [item[len(GITHUB_PREFIX):] for item in joined_repos]

In [95]:
new_list

['ytdl-org/youtube-dl',
 'jackfrued/Python-100-Days',
 'pallets/flask',
 'django/django',
 'TheAlgorithms/Python',
 'public-apis/public-apis',
 'tensorflow/models',
 'vinta/awesome-python',
 'donnemartin/system-design-primer',
 'nvbn/thefuck',
 'josephmisiti/awesome-machine-learning',
 'psf/requests',
 'scikit-learn/scikit-learn',
 'minimaxir/big-list-of-naughty-strings',
 'httpie/httpie',
 'soimort/you-get',
 'ageitgey/face_recognition',
 'ansible/ansible',
 'scrapy/scrapy',
 'home-assistant/core',
 'huggingface/transformers',
 'isocpp/CppCoreGuidelines',
 'shadowsocks/shadowsocks',
 'python/cpython',
 'deepfakes/faceswap',
 'testerSunshine/12306',
 '0voice/interview_internal_reference',
 'apache/incubator-superset',
 'XX-net/XX-Net',
 '521xueweihan/HelloGitHub',
 'getsentry/sentry',
 'apachecn/AiLearning',
 'fighting41love/funNLP',
 'certbot/certbot',
 '3b1b/manim',
 'localstack/localstack',
 'floodsung/Deep-Learning-Papers-Reading-Roadmap',
 'faif/python-patterns',
 'google-research

In [88]:
count_len = joined_repos[0]

In [93]:
count_len[19:]

'ytdl-org/youtube-dl'

In [96]:
"""
A module for obtaining repo readme and language data from the github API.
Before using this module, read through it, and follow the instructions marked
TODO.
After doing so, run it like this:
    python acquire.py
To create the `data.json` file that contains the data.
"""
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests

from env import github_token, github_username

# TODO: Make a github personal access token.
#     1. Go here and generate a personal access token https://github.com/settings/tokens
#        You do _not_ need to select any scopes, i.e. leave all the checkboxes unchecked
#     2. Save it in your env.py file under the variable `github_token`
# TODO: Add your github username to your env.py file under the variable `github_username`
# TODO: Add more repositories to the `REPOS` list below.

# Repositories to process, as "owner/repo" strings; here the list comes from
# `new_list`, which was built earlier in this notebook.
REPOS = new_list

# Headers sent with every request made by github_api_request.
headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}

# Fail fast when either value from env.py was left as an empty string.
if headers["Authorization"] == "token " or headers["User-Agent"] == "":
    raise Exception(
        "You need to follow the instructions marked TODO in this script before trying to use it"
    )


def github_api_request(url: str) -> Union[List, Dict]:
    """
    Send a GET request to `url` with the module-level `headers` and return
    the decoded JSON body.

    Raises an Exception carrying the status code and the response body when
    the status code is not 200. The body is shown as JSON when it can be
    decoded and as raw text otherwise, so a non-JSON error page no longer
    hides the status code behind a decode error.
    """
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        try:
            body = json.dumps(response.json())
        except ValueError:
            # Error bodies are not guaranteed to be JSON.
            body = response.text
        raise Exception(
            f"Error response from github api! status code: {response.status_code}, "
            f"response: {body}"
        )
    return response.json()


def get_repo_language(repo: str) -> Optional[str]:
    """
    Takes a repo name like "gocodeup/codeup-setup-script" and returns the
    value of the "language" key from the github API's repo information.

    Returns None when the response has no "language" value. Raises an
    Exception when the API response is not a dictionary.
    """
    url = f"https://api.github.com/repos/{repo}"
    repo_info = github_api_request(url)
    if not isinstance(repo_info, dict):
        raise Exception(
            f"Expecting a dictionary response from {url}, instead got {json.dumps(repo_info)}"
        )
    return repo_info.get("language")


def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    """
    Takes a repo name like "gocodeup/codeup-setup-script", requests the
    repo's `contents/` listing from the github API and returns it.

    Raises an Exception when the API response is not a list.
    """
    url = f"https://api.github.com/repos/{repo}/contents/"
    listing = github_api_request(url)
    # Guard clause: anything other than a plain list is an unexpected shape.
    if type(listing) is not list:
        raise Exception(
            f"Expecting a list response from {url}, instead got {json.dumps(listing)}"
        )
    return cast(List, listing)


def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    """
    Takes in a response from the github api that lists the files in a repo and
    returns the url that can be used to download the repo's README file.

    The first entry whose name starts with "readme" (case-insensitive) and
    that has a non-empty "download_url" wins. Entries with a missing or
    null "download_url" are skipped, so that the search can continue to a
    later entry that can actually be downloaded.

    Returns an empty string when no such entry exists.
    """
    for file in files:
        if not file["name"].lower().startswith("readme"):
            continue
        download_url = file.get("download_url")
        if download_url:
            return download_url
    return ""


def process_repo(repo: str) -> Dict[str, str]:
    """
    Build the record for one repo, given its name in the form
    "gocodeup/codeup-setup-script".

    The returned dictionary has the keys "repo", "language" and
    "readme_contents"; "readme_contents" is None when no README download
    url was found in the repo's contents listing.
    """
    listing = get_repo_contents(repo)
    readme_url = get_readme_download_url(listing)
    # An empty url means no README was found, so there is nothing to download.
    readme_contents = requests.get(readme_url).text if readme_url != "" else None
    language = get_repo_language(repo)
    return {"repo": repo, "language": language, "readme_contents": readme_contents}


def scrape_github_data() -> List[Dict[str, str]]:
    """
    Process every repo named in REPOS, in order, and return the list of
    resulting records.
    """
    records = []
    for repo in REPOS:
        records.append(process_repo(repo))
    return records


if __name__ == "__main__":
    # Scrape first, then write, so data.json is only opened (and truncated)
    # once all of the data has been collected.
    data = scrape_github_data()
    # The context manager closes the file, flushing the JSON to disk.
    with open("data.json", "w") as f:
        json.dump(data, f, indent=1)