In [50]:
import os 
import glob
import json 
import lxml
import requests 
from requests.auth import HTTPBasicAuth
from bs4 import BeautifulSoup
from datetime import datetime
import psycopg2 as pg
from sqlalchemy import create_engine
import pandas as pd
from ratelimit import limits, sleep_and_retry
import multiprocessing
from multiprocessing.pool import ThreadPool as Pool
print("ready")

ready


In [112]:
github_username = os.environ.get("GITHUB_USERNAME")
github_token = os.environ.get("GITHUB_TOKEN")

# can only make 2500 calls per hour 
# because the function calls twice each time 
PER_HOUR = 3600

@sleep_and_retry
@limits(calls=2500, period=PER_HOUR)
def scrape_readmes(slug):
    
    # define url based on the slug 
    url = f'https://api.github.com/repos/{slug}/readme'
    response = requests.get(url, auth=(github_username, github_token))
    
    response_code = response.status_code
    if response_code == 404: 
        print(f"404 error on {slug}")
        readme_string = "404 ERROR - NO README"
        now = datetime.now()
        current_time = now.strftime("%Y-%m-%d %H:%M:%S")
        return slug, readme_string, current_time, "Done"
    
    elif response_code == 403:
        now = datetime.now()
        current_time = now.strftime("%Y-%m-%d %H:%M:%S")
        return print(f"403 error on {slug} at {current_time}")
        
    elif response_code != 200:
        raise Exception(response.status_code, response.text)
        return 
    
    elif response_code == 200:
        html_content = response.content
        soup = BeautifulSoup(html_content, 'html.parser')
        site_json=json.loads(soup.text)
        readme_link = site_json['download_url']
    
        # then scrape the readme 
        readme_response = requests.get(readme_link, auth=(github_username, github_token))
        readme_response_code = readme_response.status_code
        if readme_response_code != 200:
            raise Exception(readme_response_code.status_code, readme_response_code.text)
            return 
    
        # pull the content out of the readme 
        readme_content = readme_response.content
        readme_soup = BeautifulSoup(readme_content, 'html.parser')
        readme_string = str(readme_soup)
    
        #give us the the timing and status 
        now = datetime.now()
        current_time = now.strftime("%Y-%m-%d %H:%M:%S")
        #print(html_content)
        return slug, readme_string, current_time, "Done"
    

if __name__ == "__main__":
    print("Started scraping")
    scrape_readmes(slug = 'brandonleekramer/diversity')
    print("Finished scraping")

Started scraping
Finished scraping


In [113]:
# connect to the database, download data 
connection = pg.connect(host = 'postgis1', database = 'sdad', 
                        user = os.environ.get('db_user'), 
                        password = os.environ.get('db_pwd'))

raw_slug_data = '''SELECT * FROM gh_2007_2020.repos_ranked where commits < 500 AND commits > 150 AND status != 'Init' '''

# convert to a dataframe, show how many missing we have (none)
raw_slug_data = pd.read_sql_query(raw_slug_data, con=connection)
raw_slug_data.head()

Unnamed: 0,id,spdx,slug,createdat,description,primarylanguage,branch,commits,asof,status
0,MDEwOlJlcG9zaXRvcnk0Njc0ODc5Ng==,GPL-2.0,jfbvm/planigle,2015-11-23 21:22:29,Automatically exported from code.google.com/p/...,Ruby,MDM6UmVmNDY3NDg3OTY6cmVmcy9oZWFkcy9tYXN0ZXI=,465,2021-01-03 18:56:55,Done
1,MDEwOlJlcG9zaXRvcnkyMDU4Mjg0MzE=,MIT,madscatter/gatsby-starter-netlify-cms,2019-09-02 10:00:38,,JavaScript,MDM6UmVmMjA1ODI4NDMxOnJlZnMvaGVhZHMvbWFzdGVy,465,2021-01-03 14:34:50,Done
2,MDEwOlJlcG9zaXRvcnkxNzI2NDY0MDM=,Apache-2.0,dm1090/friendly-chat,2019-02-26 05:47:44,Google's code lab with enhancements,JavaScript,MDM6UmVmMTcyNjQ2NDAzOnJlZnMvaGVhZHMvbWFzdGVy,465,2021-01-03 20:10:09,Done
3,MDEwOlJlcG9zaXRvcnkxNDE4Mzc4OTA=,MIT,Younes-Charfaoui/Daily-Coding-Problem,2018-07-21 18:44:57,Series of the problem 💯 and solution ✅ asked b...,Python,MDM6UmVmMTQxODM3ODkwOnJlZnMvaGVhZHMvbWFzdGVy,465,2021-01-03 21:45:05,Done
4,MDEwOlJlcG9zaXRvcnkzNjE2NTcwNQ==,MIT,Rplus/dev-pool,2015-05-24 09:50:47,,CSS,MDM6UmVmMzYxNjU3MDU6cmVmcy9oZWFkcy9tYXN0ZXI=,465,2021-01-03 22:22:35,Done


In [114]:
#raw_slug_data = raw_slug_data[raw_slug_data['slug'] != 'cjbd/src']
raw_slug_data['slug'].count()

4866

In [118]:
os.chdir('/project/class/bii_sdad_dspg/ncses_oss_2021/requests_scrape/')
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
combined_csv = combined_csv[combined_csv['status'] == 'Done']
already_scraped_slugs = combined_csv['slug'].tolist()
filtered_slugs = ~raw_slug_data.slug.isin(already_scraped_slugs)
filtered_slugs = raw_slug_data[filtered_slugs]
filtered_slugs.head()
filtered_slugs['slug'].count()

1529

In [116]:
raw_slugs = filtered_slugs["slug"].tolist()
slugs = []
for s in raw_slugs:
    slugs.append(s.strip())

In [117]:
# need to change this for each batch or you will save over what you had  
batch_name = 'oss_readme_batch1_8' 
cores_available = multiprocessing.cpu_count() - 1
pool = Pool(cores_available)

slug_log = []
readme_log = []
asof_log = []
status_log = []
for result in pool.imap_unordered(scrape_readmes, slugs):
    slug_log.append(result[0])
    readme_log.append(result[1])
    asof_log.append(result[2])
    status_log.append(result[3])
    final_log = pd.DataFrame({'slug': slug_log, "readme_text": readme_log, 'batch': batch_name, 'as_of': asof_log, 'status': status_log}, 
                              columns=["slug", "readme_text", "batch", "as_of", "status"])
    final_log.to_csv('/project/class/bii_sdad_dspg/ncses_oss_2021/requests_scrape/'+batch_name+'.csv', sep=',', encoding='utf-8', index=False)
print("Finished scraping", len(final_log), "of", len(slugs), "records")

404 error on crtaylor243/forestry-eval
404 error on Mishagta567/Job4j_avasiliev2M
404 error on 55011212042/rsc
404 error on AxelTidehorn/The-wall
404 error on jc00ke/mongoose
404 error on mengzhuo/rsc
404 error on Timwi/KtaneStuff404 error on ericmbudd/wardogs

404 error on Meeds-io/gatein-sso
404 error on vahidma/TGRoot404 error on thadd3us/quickdic-dictionary.dictionary

404 error on batandwa/just-the-video
404 error on alsfghlasfhgafshg/huahui
404 error on idega/com.idega.block.text
404 error on Pholenk/siakad
404 error on juga0/doctor
404 error on amoschov/webstart-maven-plugin
404 error on THEGAYUK/news
404 error on aliyoussefi/BAST306T-Source
404 error on whentze/Power-Defense
404 error on codistmonk/TransFile
404 error on gonrin/GonrinUI
404 error on gustavosr98/UCAB07-BasesII-Proyecto
404 error on Gloryofthe80s/gatsby-starter-netlify-cms
404 error on Mioze7Ae/android_tools_repo
404 error on tudor-berariu/lifelong-learning
404 error on usco/web-slicer-ui
404 error on Katushe/pha

TypeError: 'NoneType' object is not subscriptable

### Batch Monitoring 

https://docs.google.com/spreadsheets/d/1GJNa0yt_CnBmMTIxKVVFJnZT9cmK7Lt0I7Id88LhQTQ/edit#gid=0

In [128]:
os.chdir('/project/class/bii_sdad_dspg/ncses_oss_2021/requests_scrape/')
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
combined_csv = combined_csv[combined_csv['status'] == 'Done']
combined_csv = combined_csv.sort_values("batch")
combined_csv.count()
#combined_csv.to_csv('/project/class/bii_sdad_dspg/ncses_oss_2021/requests_scrape/oss_readme_aggregated/oss_readme_data_061521.csv', sep=',', encoding='utf-8', index=False)

slug           6207
readme_text    6189
batch          6207
as_of          6207
status         6207
dtype: int64

404 error on david95thinkcode/pharmawine-android
404 error on srujankumar/cell-structure
404 error on fletaio/deprecated_framework
404 error on zhangwei5095/commons-el
404 error on ASStoredProcedures/ASStoredProcedures


In [123]:
os.chdir('/project/class/bii_sdad_dspg/ncses_oss_2021/requests_scrape/')
check = pd.read_csv('oss_readme_batch1_8.csv')
check

Unnamed: 0,slug,readme_text,batch,as_of,status
0,shahabsaf1/copy,# [InfernalTG](https://telegram.me/TeleInferna...,oss_readme_batch1_8,2021-06-14 12:20:52,Done
1,stephan0992/week-1,# Jekyll Now\n\n**Jekyll** is a static site ge...,oss_readme_batch1_8,2021-06-14 12:20:52,Done
2,SynapseProject/handlers.ActiveDirectory.net,# handlers.ActiveDirectory.net\nActive Directo...,oss_readme_batch1_8,2021-06-14 12:20:52,Done
3,mraggi/discreture,[![Build Status](https://travis-ci.org/mraggi/...,oss_readme_batch1_8,2021-06-14 12:20:52,Done
4,karlstroetmann/Lineare-Algebra,Lineare-Algebra\n===============\n\nIn diesem ...,oss_readme_batch1_8,2021-06-14 12:20:52,Done
...,...,...,...,...,...
2174,pkalro/project8,# angular-seed â€” the seed for AngularJS apps...,oss_readme_batch1_8,2021-06-14 12:35:51,Done
2175,Tizzio/WifiTransfer,# WifiTransfer\nDirect wifi file transfer betw...,oss_readme_batch1_8,2021-06-14 12:35:51,Done
2176,PAPARA-ZZ-I/PAPARA-ZZ-I,# PAPARA(ZZ)I\nCopyright 2015-2017 Yann Marcon...,oss_readme_batch1_8,2021-06-14 12:35:51,Done
2177,Capital-T-Industries/docker-elk,"# The ELK stack (Elasticsearch, Logstash, Kiba...",oss_readme_batch1_8,2021-06-14 12:35:51,Done


In [217]:
test_slugs = ["brandonleekramer/diversity", 
              'cjbd/src'
              "uva-bi-sdad/oss-2020", 
              "facebook/react", 
              "RichardLitt/standard-readme",            
]

looped_data = []
for slug in test_slugs:
    results = scrape_readmes(slug)
    looped_data.append(results)
    final_data = pd.DataFrame(looped_data, columns=["slug", "readme_text", "as_of", "status"])

final_data 

brandonleekramer/diversity
404 error on slug cjbd/srcuva-bi-sdad/oss-2020
facebook/react
RichardLitt/standard-readme


Unnamed: 0,slug,readme_text,as_of,status
0,brandonleekramer/diversity,\n#### The Rise of Diversity and Population Te...,2021-06-11 16:38:42,Done
1,cjbd/srcuva-bi-sdad/oss-2020,404 ERROR - NO README,2021-06-11 16:38:42,Done
2,facebook/react,# [React](https://reactjs.org/) · [![GitHub li...,2021-06-11 16:38:42,Done
3,RichardLitt/standard-readme,# Standard Readme\n\n[![standard-readme compli...,2021-06-11 16:38:42,Done


In [None]:
batch_name = 'batch1_5000_commits' 

looped_data = []
for slug in slugs:
    results = scrape_readmes(slug)
    looped_data.append(results)
    final_data = pd.DataFrame(looped_data, columns=["slug", "readme_text", "as_of", "status"])
    final_data.to_csv('/project/class/bii_sdad_dspg/ncses_oss_2021/requests_scrape/'+batch_name+'.csv', sep=',', encoding='utf-8', index=True)

final_data 