## Plan for acquire:

1. Pull most forked repos. There are over 4 million forked repos, so we can easily get as many repos as we need. 

In [1]:
import numpy as np
import pandas as pd

# acquire
from requests import get
from bs4 import BeautifulSoup
from time import sleep
import os

# prepare
import unicodedata
import re
import json
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# explore
from sklearn.model_selection import train_test_split
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# model
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Space for importing acquire/prep functions that I've saved.

# Acquire

In [None]:
# First off, going to try and grab the html from a single repo; then I'll try and go it for a full page of results.

In [44]:
def make_soup(url):
    '''
    This helper function takes in a url and requests and parses HTML
    returning a soup object.
    '''
    headers = {'User-Agent': 'Sir Galahad'} 
    response = get(url, headers=headers)    
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [2]:
url = 'https://github.com/rdpeng/ProgrammingAssignment2'
headers = {'User-Agent': 'Codeup Data Science'} # Some websites don't accept the pyhon-requests default user-agent
response = get(url, headers=headers)

In [4]:
response.text

'\n\n\n\n\n\n<!DOCTYPE html>\n<html lang="en">\n  <head>\n    <meta charset="utf-8">\n  <link rel="dns-prefetch" href="https://github.githubassets.com">\n  <link rel="dns-prefetch" href="https://avatars0.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://avatars1.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://avatars2.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://avatars3.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n\n\n\n  <link crossorigin="anonymous" media="all" integrity="sha512-UzMqFF3u6q+PIf7vRROKCCcSAIYk0CGPD1MvMAnv0X7Pqxc6MTt+l1mXE6StaVvPg+m/XgXexi1uO8P/zszHgA==" rel="stylesheet" href="https://github.githubassets.com/assets/frameworks-53332a145deeeaaf8f21feef45138a08.css" />\n  <link crossorigin="anonymous" media="all" integrity="sha512-64lrKWJQDNfV0kiKLzdghJjhBJyD7/qXUKAmjXO+y4Xlnaav6upzJkp9hT

In [12]:
# Running the beautiful soup to get the parser setup.
soup = BeautifulSoup(response.content, 'html.parser')

In [13]:
soup.title.text

'GitHub - rdpeng/ProgrammingAssignment2: Repository for Programming Assignment 2 for R Programming on Coursera'

In [14]:
soup.find('span', class_ = 'text-gray-dark text-bold mr-1').text

'R'

In [15]:
# Now looking for the content of the page. Keep in mind I'm just running functions, I haven't assigned any variables yet, so I haven' "saved" my work in a repeatable way yet.

soup.find('div', class_ = 'Box-body px-5 pb-5').text

'\nIntroduction\nThis second programming assignment will require you to write an R\nfunction that is able to cache potentially time-consuming computations.\nFor example, taking the mean of a numeric vector is typically a fast\noperation. However, for a very long vector, it may take too long to\ncompute the mean, especially if it has to be computed repeatedly (e.g.\nin a loop). If the contents of a vector are not changing, it may make\nsense to cache the value of the mean so that when we need it again, it\ncan be looked up in the cache rather than recomputed. In this\nProgramming Assignment you will take advantage of the scoping rules of\nthe R language and how they can be manipulated to preserve state inside\nof an R object.\nExample: Caching the Mean of a Vector\nIn this example we introduce the <<- operator which can be used to\nassign a value to an object in an environment that is different from the\ncurrent environment. Below are two functions that are used to create a\nspecial obj

In [22]:
# This might actually be the code that I need to use to find the "entire" READme text:

soup.find('article', class_ = 'markdown-body entry-content container-lg').text

'Introduction\nThis second programming assignment will require you to write an R\nfunction that is able to cache potentially time-consuming computations.\nFor example, taking the mean of a numeric vector is typically a fast\noperation. However, for a very long vector, it may take too long to\ncompute the mean, especially if it has to be computed repeatedly (e.g.\nin a loop). If the contents of a vector are not changing, it may make\nsense to cache the value of the mean so that when we need it again, it\ncan be looked up in the cache rather than recomputed. In this\nProgramming Assignment you will take advantage of the scoping rules of\nthe R language and how they can be manipulated to preserve state inside\nof an R object.\nExample: Caching the Mean of a Vector\nIn this example we introduce the <<- operator which can be used to\nassign a value to an object in an environment that is different from the\ncurrent environment. Below are two functions that are used to create a\nspecial objec

In [34]:
# https://github.com/search?q=stars%3A%3E2+language%3Apython&type=Repositories

In [None]:
# https://github.com/search?l=Python&p=2&q=stars%3A%3E2+language%3Apython&type=Repositories

In [35]:
languages  = ['JavaScript', 'Python', 'Java', 'C++']

In [36]:
url = f'https://github.com/search?l={languages}&q=stars%3A%3E2&type=Repositories'

In [41]:
def get_url_pages():
    
    urls = []
    
    languages = ['JavaScript', 'Python', 'Java', 'C++']
    
    for language in languages:
        urls.append(f"https://github.com/search?l={languages}&q=stars%3A%3E2&type=Repositories")
    return urls

In [42]:
url_test = get_url_pages()

In [43]:
url_test

["https://github.com/search?l=['JavaScript', 'Python', 'Java', 'C++']&q=stars%3A%3E2&type=Repositories",
 "https://github.com/search?l=['JavaScript', 'Python', 'Java', 'C++']&q=stars%3A%3E2&type=Repositories",
 "https://github.com/search?l=['JavaScript', 'Python', 'Java', 'C++']&q=stars%3A%3E2&type=Repositories",
 "https://github.com/search?l=['JavaScript', 'Python', 'Java', 'C++']&q=stars%3A%3E2&type=Repositories"]