In [1]:
import numpy as np
import pandas as pd
import json
import warnings
warnings.filterwarnings("ignore")

from acquire import get_language_urls, get_all_urls, get_repo_content
from prepare import prep_repo_data

from requests import get
from bs4 import BeautifulSoup
from time import sleep
import os

In [2]:
def make_soup(url, timeout=30):
    '''
    This helper function takes in a url, requests the page, and parses
    the returned HTML, returning a soup object.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : int or float, default 30
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the whole scrape.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the 'html.parser' backend.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    '''
    headers = {'User-Agent': 'Sir Galahad'}
    response = get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of parsing the error page as if
    # it were real content (which would silently yield empty results).
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

In [3]:
def get_language_urls(n, m, languages=None):
    '''
    This function builds the GH search-result urls for the most starred
    repos of each language, for pages n through m (inclusive), and
    returns them as a list.

    Parameters
    ----------
    n : int
        First search-results page number to include.
    m : int
        Last search-results page number to include. If m < n the result
        is an empty list.
    languages : list of str, optional
        Languages to search for. Defaults to ['JavaScript', 'Python'].

    Returns
    -------
    list of str
        One url per (language, page) pair, grouped by language and then
        ordered by page number.
    '''
    from urllib.parse import quote

    if languages is None:
        languages = ['JavaScript', 'Python']

    urls = []
    for language in languages:
        # percent-encode so names such as "C++" or "Jupyter Notebook"
        # still produce a valid query string
        encoded = quote(language, safe='')
        urls.extend(
            f'https://github.com/search?l={encoded}&p={page}&q=stars%3A%3E0&s=stars&type=Repositories'
            for page in range(n, m + 1)
        )
    return urls

In [4]:
# first 15 search pages of urls for JS and Python
#search_urls = get_language_urls(1,15)
#search_urls

In [36]:
# Search-result urls for pages 16 through 30 (the second batch of 15 pages)
# for each language
search_urls2 = get_language_urls(16, 30)
#search_urls2

In [37]:
def get_all_urls(urls, delay=8):
    '''
    This function scrapes the repo links from each of the GH search
    results pages in urls and returns a complete list of repo urls.

    Parameters
    ----------
    urls : list of str
        Search-results page urls to fetch.
    delay : int or float, default 8
        Seconds to sleep after each fetch.

    Returns
    -------
    list of str
        Unique repo urls ('https://github.com' + href), in the order they
        were first encountered. A repo that shows up on more than one
        search page is only returned once.
    '''
    base_url = 'https://github.com'
    # dict keys keep insertion order, which gives a deterministic,
    # order-preserving de-duplication across all pages
    repo_urls = {}
    for n, url in enumerate(urls, start=1):
        # Make request and soup object using helper function
        soup = make_soup(url)
        # pause between fetches
        sleep(delay)
        print(f"Scraping loop number {n}")
        # the anchor elements with this class hold the repo urls
        for link in soup.find_all('a', class_='v-align-middle'):
            href = link.get('href')
            # an anchor without an href has nothing to contribute
            if href:
                repo_urls[base_url + href] = None
    return list(repo_urls)

In [38]:
#all_urls = get_all_urls(search_urls)
# Fetch every search-results page in search_urls2 and collect the repo urls;
# get_all_urls prints one progress line per page fetched.
all_urls2 = get_all_urls(search_urls2)

Scraping loop number 1
Scraping loop number 2
Scraping loop number 3
Scraping loop number 4
Scraping loop number 5
Scraping loop number 6
Scraping loop number 7
Scraping loop number 8
Scraping loop number 9
Scraping loop number 10
Scraping loop number 11
Scraping loop number 12
Scraping loop number 13
Scraping loop number 14
Scraping loop number 15
Scraping loop number 16
Scraping loop number 17
Scraping loop number 18
Scraping loop number 19
Scraping loop number 20
Scraping loop number 21
Scraping loop number 22
Scraping loop number 23
Scraping loop number 24
Scraping loop number 25
Scraping loop number 26
Scraping loop number 27
Scraping loop number 28
Scraping loop number 29
Scraping loop number 30


In [10]:
#len(all_urls)

In [39]:
# number of repo urls collected from the second batch of search pages
len(all_urls2)

300

In [12]:
#list1 = all_urls[0:101]
#list2 = all_urls[101:201]
#list3 = all_urls[201:301]
#list1 + list2 + list3 == all_urls

In [41]:
# Split the scraped urls into three batches so each batch can be scraped
# and cached separately.
list4 = all_urls2[:100]
list5 = all_urls2[100:200]
# open-ended final slice: nothing is silently dropped if more urls than
# expected were collected
list6 = all_urls2[200:]
# sanity check: the batches must reassemble into the full list
list4 + list5 + list6 == all_urls2

True

In [62]:
# NOTE(review): the recorded output (100, 99, 100) does not match the slices
# taken from a 300-item list in the cell above; this cell was executed at a
# different point (In [62] vs In [41]), so the batches had changed in between.
len(list4), len(list5), len(list6)

(100, 99, 100)

In [63]:
def get_repo_content(urls, cached=False, filename='gh_repos.json'):
    '''
    This function takes in a list of Github repo urls and returns a df
    with one row per repo holding its language, repo sub url, and readme
    text.

    Parameters
    ----------
    urls : list of str
        Repo urls to scrape. Ignored when cached is True.
    cached : bool, default False
        If True, skip scraping and return the df stored in filename.
        If False, scrape every url and write the result to filename.
    filename : str, default 'gh_repos.json'
        json file used as the cache. The same file is read when cached
        is True and written after a fresh scrape, so pass a different
        name for each batch to avoid overwriting an earlier one.

    Returns
    -------
    pandas.DataFrame
        Columns 'language', 'repo', and 'content'. Repos whose page lacks
        the language element or the readme element are skipped (with a
        printed notice) instead of aborting the whole scrape.
    '''
    if cached:
        return pd.read_json(filename)

    # cached == False completes a fresh scrape for df
    prefix = 'https://github.com/'
    articles = []
    for n, url in enumerate(urls, start=1):
        # Make request and soup object using helper
        soup = make_soup(url)
        sleep(1)
        print(f"Scraping loop number {n}")

        # element holding the programming language of the repo
        language_tag = soup.find('span', class_='text-gray-dark text-bold mr-1')
        # element holding the rendered readme of the repo
        readme_tag = soup.find('article', class_="markdown-body entry-content container-lg")

        # find() returns None when the element is absent; skip the repo
        # rather than crash and lose everything scraped so far
        if language_tag is None or readme_tag is None:
            print(f"Skipping {url}: language or readme not found")
            continue

        # repo sub url, i.e. the url without the leading site address
        repo = url[len(prefix):] if url.startswith(prefix) else url

        articles.append({
            'language': language_tag.text,
            'repo': repo,
            'content': readme_tag.text,
        })

    # explicit columns keep the schema stable even when nothing was scraped
    df = pd.DataFrame(articles, columns=['language', 'repo', 'content'])

    # Write df to a json file for faster access
    df.to_json(filename)

    return df

In [15]:
#gh_repos.json
#df1 = get_repo_content(list1)

In [16]:
#gh_repos2.json
#df2 = get_repo_content(list2)

In [17]:
#gh_repos3.json
#df3 = get_repo_content(list3)

In [47]:
#gh_repos4.json
#df4 = get_repo_content(list4)

In [89]:
#gh_repos5.json
#df5 = get_repo_content(list5)

In [88]:
#gh_repos6.json
#df6 = get_repo_content(list6)

In [68]:
# NOTE(review): no live cell above this one defines df6 -- the scrape call
# that would create it is commented out and the read_json cell comes later --
# so this cell fails on a fresh kernel run from top to bottom.
df6.shape

(100, 3)

In [26]:
# Load the six cached json batches, one frame per file, in batch order
batch_files = (
    'gh_repos.json',
    'gh_repos2.json',
    'gh_repos3.json',
    'gh_repos4.json',
    'gh_repos5.json',
    'gh_repos6.json',
)
df1, df2, df3, df4, df5, df6 = [pd.read_json(path) for path in batch_files]

In [79]:
# Stack the six batches into one frame with a fresh 0..n-1 index
batches = [df1, df2, df3, df4, df5, df6]
df = pd.concat(batches, ignore_index=True)

In [80]:
# preview the first rows of the combined frame
df.head()

Unnamed: 0,language,repo,content
0,JavaScript,facebook/react,React · \nReact is a JavaScript library for...
1,JavaScript,d3/d3,D3: Data-Driven Documents\n\nD3 (or D3.js) is ...
2,JavaScript,vuejs/vue,\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue.j...
3,JavaScript,axios/axios,axios\n\n\n\n\n\n\n\n\nPromise based HTTP clie...
4,JavaScript,facebook/create-react-app,Create React App \n\nCreate React apps with n...


In [81]:
# Rows per repo: any count above 1 means the same repo appears more than
# once in the combined frame.
df.repo.value_counts()

idank/explainshell          2
sivel/speedtest-cli         2
kovidgoyal/calibre          2
microsoft/recommenders      2
pytorch/fairseq             2
                           ..
highlightjs/highlight.js    1
facebook/draft-js           1
mysqljs/mysql               1
harelba/q                   1
pallets/click               1
Name: repo, Length: 581, dtype: int64

In [82]:
# persist the combined frame (before prep_repo_data is applied)
df.to_json('repos.json')

In [84]:
# prep_repo_data is imported from the local prepare module. Judging by the
# outputs below, it adds stemmed / lemmatized / clean columns derived from
# 'content' and leaves 581 rows -- TODO confirm against prepare.py.
prep_df = prep_repo_data(df, 'content')

In [85]:
# persist the prepared frame
prep_df.to_json('repos_clean.json')

In [86]:
# rows and columns of the prepared frame
prep_df.shape

(581, 6)

In [87]:
# preview the first rows of the prepared frame
prep_df.head()

Unnamed: 0,language,repo,content,stemmed,lemmatized,clean
0,JavaScript,facebook/react,React · \nReact is a JavaScript library for...,react react is a javascript librari for build ...,react react is a javascript library for buildi...,react react javascript library building user i...
1,JavaScript,d3/d3,D3: Data-Driven Documents\n\nD3 (or D3.js) is ...,d3 datadriven document d3 or d3j is a javascri...,d3 datadriven document d3 or d3js is a javascr...,d3 datadriven document d3 d3js javascript libr...
2,JavaScript,vuejs/vue,\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue.j...,support vuej vuej is an mitlicens open sourc p...,supporting vuejs vuejs is an mitlicensed open ...,supporting vuejs vuejs mitlicensed open source...
3,JavaScript,axios/axios,axios\n\n\n\n\n\n\n\n\nPromise based HTTP clie...,axio promis base http client for the browser a...,axios promise based http client for the browse...,axios promise based http client browser nodejs...
4,JavaScript,facebook/create-react-app,Create React App \n\nCreate React apps with n...,creat react app creat react app with no build ...,create react app create react apps with no bui...,create react app create react apps build confi...
