In [1]:
import glob 
import os
import re
import json
import shutil
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
def item_filter_2(tag):
    """Tell whether ``tag`` is a list item that links to a Git pull request.

    Intended to be passed as the filter function to ``soup.find_all``.

    Parameters
    ----------
    tag : object with ``name`` and ``find`` (e.g. a BeautifulSoup tag)
        The element being tested.

    Returns
    -------
    bool
        True when ``tag`` is an ``<li>`` whose first ``<a>`` has an ``href``
        whose second-to-last ``/``-separated segment contains ``'pull'``
        (e.g. ``.../legion/pull/68``); False otherwise.
    """
    if tag.name != 'li':
        return False

    # A list item may carry no link at all; that is simply "not a PR"
    # rather than an error that aborts the whole search.
    anchor = tag.find('a')
    if anchor is None:
        return False

    href = anchor.get('href')
    if not href:
        return False

    segments = href.split('/')
    # Need at least two segments before looking at the second-to-last one.
    return len(segments) >= 2 and 'pull' in segments[-2]

In [3]:
# Marker symbol found in a digest row -> state label stored in the output.
state_map = {
    '⚠': 'wip',
    '👀': 'needs review',
    '✔': 'done',
}

# The marker symbols on their own, in the order they appear in state_map.
state_symbols = list(state_map.keys())


def extract_state(row):
    """Map the state marker found in ``row`` to its label.

    Returns the ``state_map`` label when exactly one of the known marker
    symbols occurs in ``row``; returns an empty string when none or more
    than one of them occur.
    """
    found = []
    for marker in state_symbols:
        if marker in row:
            found.append(marker)

    # Zero markers or an ambiguous row: no state is reported.
    if len(found) != 1:
        return ''

    (only_marker,) = found
    return state_map[only_marker]

In [4]:
def extract_contributors(text):
    """Pull the contributor out of the last parenthesised part of ``text``.

    Takes whatever follows the final ``(`` and keeps it up to the first
    ``)``. When that leaves the input unchanged (no parentheses at all),
    an empty string is returned instead.
    """
    after_last_open = text.rpartition('(')[2]
    candidate = after_last_open.partition(')')[0]
    return '' if candidate == text else candidate

In [5]:
def clean_2(text, symbols=None):
    """Reduce a raw digest row to its bare pull-request title.

    Steps, in order:

    1. drop every whitespace-separated token that contains a state symbol;
    2. drop a trailing parenthesised group (the contributor), if present;
    3. drop square brackets and anything inside them;
    4. drop tokens that start with ``#`` (PR numbers);
    5. drop the standalone word ``PR``;
    6. collapse the remaining whitespace.

    Parameters
    ----------
    text : str
        Raw row text.
    symbols : iterable of str, optional
        State symbols to strip. Defaults to the module-level
        ``state_symbols``.

    Returns
    -------
    str
        The cleaned title; an empty string when nothing is left.
    """
    if symbols is None:
        symbols = state_symbols

    # Build a new list instead of removing while iterating: removing during
    # iteration skips the token that follows each removed one.
    tokens = [
        tok for tok in text.split()
        if not any(sym in tok for sym in symbols)
    ]
    text = ' '.join(tokens)

    # Only strip a trailing "(...)" group. Unconditionally deleting the last
    # token loses a real word when the row has no contributor and raises on
    # an empty token list.
    text = re.sub(r'\s*\([^()]*\)\s*$', '', text)

    # Remove square brackets and anything inside
    pattern = r'\[.*?\]'
    text = re.sub(pattern, '', text)

    # Remove any token starting with '#'
    text = ' '.join(tok for tok in text.split() if not tok.startswith('#'))

    # Remove 'PR' only as a whole word so that e.g. 'PRODUCT' stays intact
    text = re.sub(r'\bPR\b', '', text)

    # Removing 'PR' can leave leading or doubled spaces behind
    return ' '.join(text.split())

In [6]:
def clean_file(filename, output_path='./scraped_prs/aggregate_prs.csv'):
    """Extract the pull-request rows from one scraped digest and append them to a CSV.

    The JSON file's ``'full'`` entry is parsed as HTML; every element accepted
    by ``item_filter_2`` becomes one row with its text and link, plus the
    derived ``state``, ``contributors`` and ``text_cleaned`` columns.

    Parameters
    ----------
    filename : str
        Path of the scraped JSON file.
    output_path : str, optional
        CSV file the rows are appended to. Its directory must already exist.

    Returns
    -------
    pandas.DataFrame
        The rows extracted from this file (possibly empty).

    Notes
    -----
    Rows are appended, so processing the same file twice duplicates its rows
    in ``output_path``.
    """
    print(filename)
    # The digests contain emoji markers; do not depend on the platform default.
    with open(filename, encoding='utf-8') as f:
        digest = json.load(f)

    soup = BeautifulSoup(digest['full'], 'html.parser')

    records = [
        {
            'text': item.text,
            'link': item.find('a').get('href'),
        }
        for item in soup.find_all(item_filter_2)
    ]
    # Explicit columns keep a digest without any PR items from failing on a
    # missing 'text' column further down.
    df = pd.DataFrame(records, columns=['text', 'link'])

    df['state'] = df['text'].apply(extract_state)

    df['contributors'] = df['text'].apply(extract_contributors)

    df['text_cleaned'] = df['text'].apply(clean_2)

    # Write the header only when starting the file; otherwise every appended
    # digest would add another header line in the middle of the data.
    write_header = (
        not os.path.exists(output_path) or os.path.getsize(output_path) == 0
    )
    df.to_csv(output_path, header=write_header, index=False, mode='a')

    return df

In [7]:
# Main
# Clean every scraped digest; files that fail are moved aside so they can be
# inspected without blocking the rest of the run.
success_c = 0

# shutil.move needs the destination directory to exist.
os.makedirs('./quarantine/', exist_ok=True)

for filename in glob.glob('./scraped_raw/*.json'):
    try:
        df = clean_file(filename)
        success_c += 1
    except Exception as exc:  # not a bare except: Ctrl-C must still stop the loop
        print('failed')
        # Say why, so a quarantined file can be diagnosed later.
        print('  {}: {}'.format(type(exc).__name__, exc))
        # Quarantine failures
        shutil.move(filename, './quarantine/')

./scraped_raw/DATA Digest from Mon, Jan 04 to Sun, Jan 10, 2021.json
./scraped_raw/PRODUCT Digest from Tue, Feb 02 to Mon, Feb 8, 2021.json
./scraped_raw/DATA Digest - Mon, Mar 08 to Sun, Mar 14, 2021.json
./scraped_raw/DATA Digest from Mon, Jan 18 to Sun, Jan 24, 2021.json
./scraped_raw/DATA Digest - Mon, May 03 to Sun, May 9, 2021.json
./scraped_raw/DATA Digest - Mon, Mar 01 to Sun, Mar 7, 2021.json
./scraped_raw/PRODUCT Digest from Tue, Jan 05 to Mon, Jan 11, 2021.json
./scraped_raw/PRODUCT Digest from Tue, Jan 19 to Mon, Jan 25, 2021.json
./scraped_raw/PRODUCT Digest - Tue, May 18 to Mon, May 24, 2021.json
./scraped_raw/PRODUCT Digest - Tue, May 04 to Mon, May 10, 2021.json
./scraped_raw/PRODUCT Digest from Tue, Dec 15 to Mon, Dec 21, 2020.json
./scraped_raw/DATA Digest - Mon, Apr 12 to Sun, Apr 18, 2021.json
./scraped_raw/DATA Digest - Mon, Feb 22 to Sun, Feb 28, 2021.json
./scraped_raw/DATA Digest from Mon, Dec 14 to Sun, Dec 20, 2020.json
./scraped_raw/PRODUCT Digest from Tue, D

In [8]:
# Display
pd.set_option('display.max_rows', None)

# Failed files have already been moved out of ./scraped_raw/ by the time this
# runs, so the total has to include what sits in ./quarantine/ as well.
# NOTE(review): this also counts files quarantined by earlier runs - confirm
# that is the intended denominator.
remaining_c = len(os.listdir('./scraped_raw/'))
quarantined_c = len(os.listdir('./quarantine/')) if os.path.isdir('./quarantine/') else 0

print('Cleaned {} out of {} files.'.format(success_c, remaining_c + quarantined_c))
df = pd.read_csv('./scraped_prs/aggregate_prs.csv')
df

Cleaned 47 out of 47 files.


Unnamed: 0,text,link,state,contributors,text_cleaned
0,👀 PR #68 Bday sign fix multi-bday logic (tlowa...,https://github.com/EQWorks/legion/pull/68,needs review,tlowande,Bday sign fix multi-bday logic
1,✔️ PR #67 diff - incorporate @eqworks/release ...,https://github.com/EQWorks/legion/pull/67,done,woozyking,diff - incorporate @eqworks/release parse commit
2,✔️ PR #64 Notes - make slack message link flex...,https://github.com/EQWorks/legion/pull/64,done,maluhoss,Notes - make slack message link flexible
3,✔️ PR #63 devops - yarn upgrade-interactive 20...,https://github.com/EQWorks/legion/pull/63,done,woozyking,devops - yarn upgrade-interactive 20210107
4,✔️ PR #9 github - include lone repo updates (s...,https://github.com/EQWorks/updates/pull/9,done,woozyking,github - include lone repo updates (sans-issu...
5,👀 PR #10 Weilin test/supun 1 (wwwmonsterlam),https://github.com/EQWorks/connector-google-an...,needs review,wwwmonsterlam,Weilin test/supun 1
6,👀 PR #9 Google analytics supun (wwwmonsterlam),https://github.com/EQWorks/connector-google-an...,needs review,wwwmonsterlam,Google analytics supun
7,✔️ PR #8 Google analytics - Handling multiple ...,https://github.com/EQWorks/connector-google-an...,done,supuntennakoon,Google analytics - Handling multiple reports ...
8,✔️ PR #24 enrich_demographic - fix check of da...,https://github.com/EQWorks/EnrichData/pull/24,done,bsandi1220,enrich_demographic - fix check of dataset str...
9,✔️ PR #23 get_polygon - fix geometry without d...,https://github.com/EQWorks/EnrichData/pull/23,done,bsandi1220,get_polygon - fix geometry without data
