### GitHub README Extraction 

During this summer, the DSGP OSS 2021 team willbe classifying GitHub repositores into different software types. To do this, we will be extracting README files from all of the OSS repos (i.e. those with OSS licenses) and then developing NLP techniques to classify those repos. In this file, we document the extraction process for GitHub README files. 

First, we load our packages 

In [6]:
# load packages 
import os
import glob
import psycopg2 as pg
from sqlalchemy import create_engine
import pandas as pd
import requests as r
import string 
import json
import base64
import urllib.request
import itertools 
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import time
from functools import wraps
import urllib.request
import multiprocessing
from multiprocessing.pool import ThreadPool as Pool
from retrying import retry
from datetime import datetime

print("ready")

ready


Next, we will grab our data from the database. 

In [10]:
# connect to the database, download data 
connection = pg.connect(host = 'postgis1', database = 'sdad', 
                        user = os.environ.get('db_user'), 
                        password = os.environ.get('db_pwd'))

raw_slug_data = '''SELECT * FROM gh_2007_2020.repos_ranked where commits > 1000 AND status != 'Init' '''

# convert to a dataframe, show how many missing we have (none)
raw_slug_data = pd.read_sql_query(raw_slug_data, con=connection)
raw_slug_data.head()

Unnamed: 0,id,spdx,slug,createdat,description,primarylanguage,branch,commits,asof,status
0,MDEwOlJlcG9zaXRvcnk5NDkwMTU1Mw==,ECL-2.0,juancho85/opencast,2017-06-20 14:39:56,The free and open source solution for automate...,Java,MDM6UmVmOTQ5MDE1NTM6cmVmcy9oZWFkcy9kZXZlbG9w,7715,2021-01-03 15:59:42,Done
1,MDEwOlJlcG9zaXRvcnkxMzEwNzEzMjk=,GPL-2.0,jeffrizzo/openocd,2018-04-25 22:42:24,openocd patches,C,MDM6UmVmMTMxMDcxMzI5OnJlZnMvaGVhZHMvbWFzdGVy,7714,2021-01-03 18:44:52,Done
2,MDEwOlJlcG9zaXRvcnkzMDk5NTI1MA==,MIT,supcoin/supcoin,2015-02-19 00:47:12,Supcoin release wallet (NOTE: will not sync un...,C++,MDM6UmVmMzA5OTUyNTA6cmVmcy9oZWFkcy9tYXN0ZXI=,7709,2021-01-03 15:54:35,Done
3,MDEwOlJlcG9zaXRvcnk5MjE4NDg3Mg==,GPL-3.0,bigearth/clone-marlin,2017-05-23 14:50:32,Clone's marlin build,C,MDM6UmVmOTIxODQ4NzI6cmVmcy9oZWFkcy9tYXN0ZXI=,7707,2021-01-03 20:04:50,Done
4,MDEwOlJlcG9zaXRvcnkxMzc2NjEyMTE=,GPL-2.0,gemian/connman,2018-06-17 14:30:28,Connection Manager - building 'master' for bot...,C,MDM6UmVmMTM3NjYxMjExOnJlZnMvaGVhZHMvbWFzdGVy,7707,2021-01-03 18:02:13,Done


In [11]:
raw_slug_data['slug'].count

<bound method Series.count of 0              juancho85/opencast
1               jeffrizzo/openocd
2                 supcoin/supcoin
3           bigearth/clone-marlin
4                  gemian/connman
                  ...            
927           bbvv77/bitwarden_rs
928    helsing97/Unidata-Workshop
929     enricomoko/mokotestappgit
930     manishsangwan/docker-test
931           Legion112/funfunapp
Name: slug, Length: 932, dtype: object>

In [12]:
batch = "batch3"

raw_slugs = raw_slug_data["slug"].tolist()
slugs = []
for s in raw_slugs:
    slugs.append(s.strip())

# exp multiplier that waits 30 seconds, then 30 secs + 30 secs, etc until 5 tries is over 
@retry(stop_max_attempt_number=5, wait_exponential_multiplier=30000, wait_exponential_max=30000) 
def scrape_slugs(slug):
    ''' Scrapes slugs with multiple threads '''
    url = f'https://github.com/{slug}#readme'
    split_slugs = slug.split("/")
    login = split_slugs[0]
    repo = split_slugs[1]
    fullfilename = os.path.join('/project/class/bii_sdad_dspg/ncses_oss_2021/', f'readme_{login}_{repo}.txt')
    urllib.request.urlretrieve(url, fullfilename)
    now = datetime.now()
    current_time = now.strftime("%Y-%m-%d %H:%M:%S")
    return slug, current_time, "Done"

cores_available = multiprocessing.cpu_count() - 1
pool = Pool(cores_available)

logging_slugs = []
logging_time = []
logging_status = []
for result in pool.imap_unordered(scrape_slugs, slugs):
    logging_slugs.append(result[0])
    logging_time.append(result[1])
    logging_status.append(result[2])
    logging_df = pd.DataFrame({'slug': logging_slugs, "batch": batch, 'as_of': logging_time, 'status': logging_status}, 
                              columns=["slug", "batch", "as_of", "status"])
    logging_df.to_csv('/project/class/bii_sdad_dspg/ncses_oss_2021/'+batch+'.csv', index=False)
    print(result)

('SuperElastix/SimpleElastix', '08:09:15', 'Done')
('LivelyKernel/lively.next', '08:09:15', 'Done')
('jeffrizzo/openocd', '08:09:15', 'Done')
('shitolepriya/test-frappe', '08:09:15', 'Done')
('bigearth/clone-marlin', '08:09:15', 'Done')
('Rentu/redis-read', '08:09:15', 'Done')
('jandockx/ppwcode-recovered-from-google-code', '08:09:15', 'Done')
('juancho85/opencast', '08:09:15', 'Done')
('paleobiodb/data_service', '08:09:15', 'Done')
('gemian/connman', '08:09:15', 'Done')
('AltugYildirim/istio', '08:09:15', 'Done')
('supcoin/supcoin', '08:09:15', 'Done')
('levants/lightmare', '08:09:15', 'Done')
('KnothHe/redis-5.0', '08:09:15', 'Done')
('mshbeab/anter', '08:09:15', 'Done')
('zraul123/keytekia', '08:09:15', 'Done')
('ajaysharma00/mastodon', '08:09:15', 'Done')
('puneetgkaur/backup_sugar_sugartoolkit', '08:09:15', 'Done')
('Lordsm926/l', '08:09:15', 'Done')
('addb-swstarlab/ParallelAOF', '08:09:15', 'Done')
('keyteki/keyteki', '08:09:15', 'Done')
('qingqibing/libcxx', '08:09:15', 'Done')

HTTPError: HTTP Error 404: Not Found

In [60]:
pd.read_csv('data.csv')



'uva-bi-sdad/oss-2020'

In [12]:
slugs = ["brandonleekramer/diversity", 
         "uva-bi-sdad/oss-2020", 
         "facebook/react", 
         "RichardLitt/standard-readme"
] 



slugs = ["brandonleekramer/diversity", "uva-bi-sdad/oss-2020", "facebook/react", "RichardLitt/standard-readme"] #test data 
#slugs = raw_slug_data.slug.tolist()

#myPath = '/sfs/qumulo/qhome/kb7hp/git/oss-2020/src/09_repository-scraping/' 
myPath = '/project/class/bii_sdad_dspg/ncses_oss_2021/'

for slug in slugs:
    url = f'https://github.com/{slug}#readme'
    split_slugs = slug.split("/")
    login = split_slugs[0]
    repo = split_slugs[1]
    fullfilename = os.path.join(myPath, f'readme_{login}_{repo}.txt')
    urllib.request.urlretrieve(url, fullfilename)
    print(f'Finished scraping: {login}/{repo}')

Finished scraping: brandonleekramer/diversity
Finished scraping: uva-bi-sdad/oss-2020
Finished scraping: facebook/react
Finished scraping: RichardLitt/standard-readme


Note to Crystal: 

The function above this note works for small-scale scraping but we need to add in the rate limit on API calls before we scale up. 
https://stackoverflow.com/questions/40748687/python-api-rate-limiting-how-to-limit-api-calls-globally

We could also try to add in multiprocessing to speed things up. I'm not sure this link is the right one, but we can chat more about that.
https://stackoverflow.com/questions/54858979/how-to-use-multiprocessing-with-requests-module

In [19]:
myPath = '/project/class/bii_sdad_dspg/ncses_oss_2021/'

repo_name = []
readme_text = [] 
for filename in os.listdir(myPath):
    if filename.endswith('.txt'):
        with open(os.path.join(myPath, filename)) as f:
            content = f.read()
            soup = BeautifulSoup(content, 'html.parser')
            if soup.article is not None:
                clean_html = ''.join(soup.article.findAll(text=True))
            else: 
                clean_html = None 
            repo_name.append(filename)
            readme_text.append(clean_html)
            df = pd.DataFrame({'slug': repo_name, 'readme_text': readme_text}, columns=["slug", "readme_text"])
            df['slug'] = df['slug'].str.replace('readme_','')
            df['slug'] = df['slug'].str.replace('.txt','')
            # this works because slugs can't have underscores
            df['slug'] = df['slug'].str.replace('_','/') 
df 

Unnamed: 0,slug,readme_text
0,Vinetos/emacs,
1,uemanet/eskada,
2,fieldstar/FoodRiskTracing,BfROpenLab\n
3,Nkosi-tshawe/moodle,
4,joshblack/carbon-experimental,carbon-experimental\n\nAll work shown here is ...
...,...,...
2091,ualberta-eclass/moodle-block/demostudent,
2092,MircomBAS/vscode,"Visual Studio Code - Open Source (""Code - OSS""..."
2093,morenfang/tensorflow,\n\n\n\n\n\nDocumentation\n\n\n\n\n\n\n\n\nTen...
2094,jissong/AliRoot,


In [26]:
os.chdir('/project/class/bii_sdad_dspg/ncses_oss_2021/')
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
combined_csv

Unnamed: 0,slug,batch,as_of,status
0,NixOS/nixpkgs,batch1,16:57:57,Done
1,nanosoftsystem/ai,batch1,16:57:57,Done
2,google/llvm-propeller,batch1,16:57:57,Done
3,eugene-matvejev/ultimate-commit-machine,batch1,16:57:57,Done
4,curtclifton/curtclifton.github.io,batch1,16:57:57,Done
...,...,...,...,...
212,kkman2008/coder-Leetcode,batch2,8:10:18,Done
213,starling021/uh,batch2,8:10:18,Done
214,dcgavril/Booked,batch2,8:10:18,Done
215,sathyanlm/k3,batch2,8:10:18,Done


In [27]:
merged_df = df.merge(combined_csv, how='inner', on='slug')
merged_df

Unnamed: 0,slug,readme_text,batch,as_of,status
0,Vinetos/emacs,,batch1,16:59:00,Done
1,joshblack/carbon-experimental,carbon-experimental\n\nAll work shown here is ...,batch2,8:09:16,Done
2,ledermann/docker-rails,Docker-Rails\n\n\nSimple Rails 6.1 application...,batch2,8:09:16,Done
3,jackjack821/pytest,\n\n\n\n\n\n\n\n\n\n\n\nThe pytest framework m...,batch2,8:09:15,Done
4,DamienIrving/ocean-analysis,README\nThis repository contains the code I us...,batch2,8:09:16,Done
...,...,...,...,...,...
365,gemian/connman,,batch2,8:09:15,Done
366,keyteki/keyteki,Keyteki\nWeb based implementation of Keyforge:...,batch2,8:09:15,Done
367,tsuu32/emacs-w32con-vt,Emacs with 256/true color support in Windows C...,batch1,16:58:59,Done
368,AjayBrahmakshatriya/llvm-bb-emitter,,batch1,16:57:59,Done


In [45]:
db_user = os.environ.get('db_user')
db_pwd = os.environ.get('db_pwd')
engine = create_engine('postgresql+psycopg2://{db_user}:{db_pwd}@postgis1/sdad')
merged_df.to_sql('repos_scraped', engine, schema='gh_2007_2020', if_exists='replace', index=False)
print("It worked")

It worked


We need to write this to the database now... 

Try this: https://medium.com/analytics-vidhya/part-3-5-pandas-dataframe-to-postgresql-using-python-d3bc41fcf39 

https://pybay.com/site_media/slides/raymond2017-keynote/process.html

https://github.com/rholder/retrying 

https://stackoverflow.com/questions/45016816/updating-row-using-sqlalchemy-and-scrapy

https://stackoverflow.com/questions/2712524/handling-urllib2s-timeout-python

In [46]:
# connect to the database, download data 
connection = pg.connect(host = 'postgis1', 
                        database = 'sdad', 
                        user = os.environ.get('db_user'), 
                        password = os.environ.get('db_pwd'))

repos_scraped = '''SELECT * FROM gh_2007_2020.repos_scraped '''

# convert to a dataframe, show how many missing we have (none)
repos_scraped = pd.read_sql_query(repos_scraped, con=connection)
repos_scraped.head()

Unnamed: 0,slug,readme_text,batch,as_of,status
0,Vinetos/emacs,,batch1,16:59:00,Done
1,joshblack/carbon-experimental,carbon-experimental\n\nAll work shown here is ...,batch2,8:09:16,Done
2,ledermann/docker-rails,Docker-Rails\n\n\nSimple Rails 6.1 application...,batch2,8:09:16,Done
3,jackjack821/pytest,\n\n\n\n\n\n\n\n\n\n\n\nThe pytest framework m...,batch2,8:09:15,Done
4,DamienIrving/ocean-analysis,README\nThis repository contains the code I us...,batch2,8:09:16,Done
