In [91]:
import numpy as np
import pandas as pd
import json
import warnings
warnings.filterwarnings("ignore")

from acquire import get_language_urls, get_all_urls, get_repo_content
from prepare import prep_repo_data

from requests import get
from bs4 import BeautifulSoup
from time import sleep
import os

In [92]:
def make_soup(url, timeout=30):
    '''
    Request a url and parse the returned HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, default 30
        Seconds to wait for the server before requests raises a
        timeout error. Without a timeout a stalled connection would
        block the whole scrape indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        Soup object built from the response text with 'html.parser'.
    '''
    # send a custom User-Agent header with the request
    headers = {'User-Agent': 'Sir Galahad'}
    response = get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

In [93]:
def get_language_urls(n, m, languages=None):
    '''
    Build the GitHub search-result urls for the most starred repos
    of each language, for search pages n through m inclusive.

    Parameters
    ----------
    n : int
        First search page number.
    m : int
        Last search page number (included). If m < n the result is empty.
    languages : list of str, optional
        Languages to search for; defaults to ['JavaScript', 'Python'].
        Each name is placed in the url verbatim, so it must already be
        url-safe.

    Returns
    -------
    list of str
        All pages for the first language, then all pages for the next.
    '''
    if languages is None:
        languages = ['JavaScript', 'Python']

    # one url per (language, page), grouped by language
    return [
        f'https://github.com/search?l={language}&p={page}&q=stars%3A%3E0&s=stars&type=Repositories'
        for language in languages
        for page in range(n, m + 1)
    ]

In [94]:
# first 15 search pages of urls for JS and Python
#search_urls = get_language_urls(1,15)
#search_urls

In [95]:
# second 15 search pages of urls for JS and Python
#search_urls2 =get_language_urls(16, 30)
#search_urls2

In [96]:
# search pages 31-40 (10 pages per language) for JS and Python
search_urls3 = get_language_urls(31, 40)
search_urls3

['https://github.com/search?l=JavaScript&p=31&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=32&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=33&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=34&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=35&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=36&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=37&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=38&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=39&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=40&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=Python&p=31&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/sear

In [97]:
def get_all_urls(urls, delay=8):
    '''
    Scrape the repo urls out of each GH search results page.

    Parameters
    ----------
    urls : list of str
        Search results page urls to fetch.
    delay : float, default 8
        Seconds to sleep after each fetch.

    Returns
    -------
    list of str
        Full 'https://github.com/...' repo urls, in page order. Urls are
        de-duplicated within a page, not across pages.
    '''
    repo_urls = []
    for n, url in enumerate(urls, start=1):
        # Make request and soup object using helper function
        soup = make_soup(url)
        # pause between fetches (8 seconds by default)
        # presumably to stay under GitHub's rate limit -- TODO confirm
        sleep(delay)
        print(f"Scraping loop number {n}")
        # anchor elements that hold the repo links
        anchors = soup.find_all('a', class_='v-align-middle')
        # dict.fromkeys drops duplicates while keeping document order;
        # anchors without an href are skipped instead of raising TypeError
        page_urls = dict.fromkeys(
            'https://github.com' + anchor.get('href')
            for anchor in anchors
            if anchor.get('href')
        )
        repo_urls.extend(page_urls)
    return repo_urls

In [98]:
#all_urls = get_all_urls(search_urls)
#all_urls2 = get_all_urls(search_urls2)
# scrape the repo urls from the page 31-40 search results
all_urls3 = get_all_urls(search_urls3)

Scraping loop number 1
Scraping loop number 2
Scraping loop number 3
Scraping loop number 4
Scraping loop number 5
Scraping loop number 6
Scraping loop number 7
Scraping loop number 8
Scraping loop number 9
Scraping loop number 10
Scraping loop number 11
Scraping loop number 12
Scraping loop number 13
Scraping loop number 14
Scraping loop number 15
Scraping loop number 16
Scraping loop number 17
Scraping loop number 18
Scraping loop number 19
Scraping loop number 20


In [99]:
#len(all_urls)

In [100]:
#len(all_urls2)

In [101]:
# number of repo urls collected from the 20 search pages
len(all_urls3)

200

In [102]:
#list1 = all_urls[0:101]
#list2 = all_urls[101:201]
#list3 = all_urls[201:301]
#list1 + list2 + list3 == all_urls

In [103]:
#list4 = all_urls2[0:100]
#list5 = all_urls2[100:200]
#list6 = all_urls2[200:301]
#list4 + list5 + list6 == all_urls2

In [104]:
#len(list4), len(list5), len(list6)

In [105]:
def get_repo_content(urls, cached=False, cache_path='gh_repos.json',
                     output_path='gh_repos_test.json'):
    '''
    Scrape the language and readme text for each Github repo url and
    return them as a df with columns language, repo and content.

    Parameters
    ----------
    urls : list of str
        Full repo urls starting with 'https://github.com/'.
        Not used when cached is true.
    cached : bool, default False
        If true, skip the scrape and return the df read from cache_path.
    cache_path : str, default 'gh_repos.json'
        Json file read when cached is true.
    output_path : str, default 'gh_repos_test.json'
        Json file a fresh scrape is written to. The default differs from
        cache_path; pass the same path to both to re-load a scrape later.

    Returns
    -------
    pandas.DataFrame
        One row per repo that had both a language and a readme on its
        page. Repos missing either are reported and skipped.
    '''
    if cached:
        return pd.read_json(cache_path)

    # the part of each url before '<owner>/<name>'
    prefix = 'https://github.com/'
    columns = ['language', 'repo', 'content']

    records = []
    for n, url in enumerate(urls, start=1):
        # Make request and soup object using helper
        soup = make_soup(url)
        sleep(1)
        print(f"Scraping loop number {n}")

        language_tag = soup.find('span', class_='text-gray-dark text-bold mr-1')
        readme_tag = soup.find('article', class_="markdown-body entry-content container-lg")

        # find() returns None when the element is absent; skip that repo
        # rather than losing the whole scrape to an AttributeError
        if language_tag is None or readme_tag is None:
            print(f"Skipping {url}: language or readme not found")
            continue

        records.append({
            'language': language_tag.text,
            'repo': url[len(prefix):],
            'content': readme_tag.text,
        })

    # passing columns keeps the schema even when nothing was scraped
    df = pd.DataFrame(records, columns=columns)

    # Write df to a json file for faster access
    df.to_json(output_path)

    return df

In [106]:
#gh_repos.json
#df1 = get_repo_content(list1)

In [107]:
#gh_repos2.json
#df2 = get_repo_content(list2)

In [108]:
#gh_repos3.json
#df3 = get_repo_content(list3)

In [109]:
#gh_repos4.json
#df4 = get_repo_content(list4)

In [110]:
#gh_repos5.json
#df5 = get_repo_content(list5)

In [111]:
#gh_repos6.json
#df6 = get_repo_content(list6)

In [112]:
#gh_repos_test.json
# fresh scrape (cached defaults to False) of every url in all_urls3
df_test = get_repo_content(all_urls3)

Scraping loop number 1
Scraping loop number 2
Scraping loop number 3
Scraping loop number 4
Scraping loop number 5
Scraping loop number 6
Scraping loop number 7
Scraping loop number 8
Scraping loop number 9
Scraping loop number 10
Scraping loop number 11
Scraping loop number 12
Scraping loop number 13
Scraping loop number 14
Scraping loop number 15
Scraping loop number 16
Scraping loop number 17
Scraping loop number 18
Scraping loop number 19
Scraping loop number 20
Scraping loop number 21
Scraping loop number 22
Scraping loop number 23
Scraping loop number 24
Scraping loop number 25
Scraping loop number 26
Scraping loop number 27
Scraping loop number 28
Scraping loop number 29
Scraping loop number 30
Scraping loop number 31
Scraping loop number 32
Scraping loop number 33
Scraping loop number 34
Scraping loop number 35
Scraping loop number 36
Scraping loop number 37
Scraping loop number 38
Scraping loop number 39
Scraping loop number 40
Scraping loop number 41
Scraping loop number 42
S

In [113]:
# one row per scraped repo; columns are language, repo and content
df_test.shape

(200, 3)

In [114]:
#df = pd.concat([df1, df2, df3, df4, df5, df6]).reset_index().drop(columns='index')

In [115]:
# preview the first rows of the scraped data
df_test.head()

Unnamed: 0,language,repo,content
0,JavaScript,knex/knex,knex.js\n\n\n\n\n\n\n\n\nA SQL query builder t...
1,JavaScript,substack/stream-handbook,stream-handbook\nThis document covers the basi...
2,JavaScript,angular/angular-seed,angular-seed — the seed for AngularJS apps\nTh...
3,JavaScript,qianguyihao/Web,项目介绍\n项目地址：https://github.com/qianguyihao/Web\...
4,JavaScript,browserify/browserify,browserify\nrequire('modules') in the browser\...


In [116]:
# duplicate check: a count above 1 would mean a repo was scraped twice
df_test.repo.value_counts()

MorvanZhou/Reinforcement-learning-with-tensorflow    1
facebookresearch/pytext                              1
newsapps/beeswithmachineguns                         1
qqwweee/keras-yolo3                                  1
netlify/netlify-cms                                  1
                                                    ..
ajenti/ajenti                                        1
pyeve/eve                                            1
infinitered/reactotron                               1
bup/bup                                              1
boto/boto3                                           1
Name: repo, Length: 200, dtype: int64

In [117]:
#df.to_json('repos.json')

In [118]:
# run prep_repo_data (imported from prepare) on the 'content' column
prep_df = prep_repo_data(df_test, 'content')

In [119]:
# save the prepared df to a json file
prep_df.to_json('test_repos_clean.json')

In [120]:
# (rows, columns) of the prepared df
prep_df.shape

(200, 6)

In [121]:
# preview the first rows of the prepared df
prep_df.head()

Unnamed: 0,language,repo,content,stemmed,lemmatized,clean
0,JavaScript,knex/knex,knex.js\n\n\n\n\n\n\n\n\nA SQL query builder t...,knexj a sql queri builder that is flexibl port...,knexjs a sql query builder that is flexible po...,knexjs sql query builder flexible portable fun...
1,JavaScript,substack/stream-handbook,stream-handbook\nThis document covers the basi...,streamhandbook thi document cover the basic of...,streamhandbook this document cover the basic o...,streamhandbook document cover basic write node...
2,JavaScript,angular/angular-seed,angular-seed — the seed for AngularJS apps\nTh...,angularse the seed for angularj app thi projec...,angularseed the seed for angularjs apps this p...,angularseed seed angularjs apps project applic...
3,JavaScript,qianguyihao/Web,项目介绍\n项目地址：https://github.com/qianguyihao/Web\...,httpsgithubcomqianguyihaoweb 1 2 3 androidweb ...,httpsgithubcomqianguyihaoweb 1 2 3 androidweb ...,httpsgithubcomqianguyihaoweb 1 2 3 androidweb ...
4,JavaScript,browserify/browserify,browserify\nrequire('modules') in the browser\...,browserifi requiremodul in the browser use a n...,browserify requiremodules in the browser use a...,browserify requiremodules browser use nodestyl...
