In [1]:
import glob 
import re
import json
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
def item_filter_2(tag):
    cond = tag.name == 'li'
    cond = cond and tag.find('a').get('href')
    cond = cond and 'pull' in cond.split('/')[-2] # look for Git pull requests
    return cond

In [3]:
def extract_state(row):
    symbol = [sym for sym in state_symbols if (sym in row)]
    if len(symbol) == 1:
        return state_map[symbol[0]]
    return ''

state_map = {
    '⚠': 'wip',
    '👀': 'needs review',
    '✔': 'done',
}

state_symbols = list(state_map.keys())

In [4]:
def extract_contributors(text):
    contributors = text.split('(')[-1].split(')')[0]
    if text == contributors:
        return ''
    return contributors

In [5]:
def clean_2(text):

    text = text.split(' ') # split text on spaces
    for t in text:
        # remove any state symbols
        if any (sym in t for sym in state_symbols):
            text.remove(t)
    
    del text[-1] # remove contributor 
    text = ' '.join(text) # join back into single string
    # Remove square brackets and anything inside
    pattern = r'\[.*?\]'
    text = re.sub(pattern, '', text)
    # Remove any token with '#'
    text = " ".join(filter(lambda x:x[0]!='#', text.split()))
    # Remove any 'PR'
    text = text.replace('PR', "")

    return text

In [6]:
def clean_file(filename):
    print(filename)
    with open(filename) as f:
        data = json.load(f)
                
    soup = BeautifulSoup(data['full'], 'html.parser')
    
    data = []
    for it in soup.find_all(item_filter_2):
        data.append({
            'text': it.text,
            'link': it.find('a').get('href'),
        })
    df = pd.DataFrame(data)
    
    df['state'] = df.text.apply(extract_state)

    df['contributors'] = df.text.apply(extract_contributors)

    df['text_cleaned'] = df.text.apply(clean_2)
    
    df.to_csv(r'./scrape_prs/{}.csv'.format(filename.split('/')[-1]), header=True, index=None, mode='w')
    
    return df

fail_c = 0
for filename in glob.glob('./scrape_raw/*.json'):
    try: 
        df = clean_file(filename)
    except:
        print('failed')
        fail_c += 1
        pass

./scrape_raw/DATA Digest from Mon, Jan 04 to Sun, Jan 10, 2021.json
./scrape_raw/PRODUCT Digest - Tue, Jun 08 to Mon, Jun 14, 2021.json
failed
./scrape_raw/PRODUCT Digest from Tue, Feb 02 to Mon, Feb 8, 2021.json
./scrape_raw/DATA Digest - Mon, Mar 08 to Sun, Mar 14, 2021.json
./scrape_raw/DATA Digest from Mon, Jan 18 to Sun, Jan 24, 2021.json
./scrape_raw/DATA Digest - Mon, May 03 to Sun, May 9, 2021.json
./scrape_raw/DATA Digest - Mon, Mar 01 to Sun, Mar 7, 2021.json
./scrape_raw/PRODUCT Digest from Tue, Jan 05 to Mon, Jan 11, 2021.json
./scrape_raw/PRODUCT Digest - Tue, Jul 06 to Mon, Jul 12, 2021.json
failed
./scrape_raw/PRODUCT Digest from Tue, Jan 19 to Mon, Jan 25, 2021.json
./scrape_raw/PRODUCT Digest - Tue, May 18 to Mon, May 24, 2021.json
./scrape_raw/PRODUCT Digest - Tue, May 04 to Mon, May 10, 2021.json
./scrape_raw/DESIGN Digest - Mon, Jul 19 to Sun, Jul 25, 2021.json
failed
./scrape_raw/PRODUCT Digest from Tue, Dec 15 to Mon, Dec 21, 2020.json
./scrape_raw/DESIGN Digest -

In [7]:
pd.set_option('display.max_rows', None)

fns = []
for filename in glob.glob('./scrape_prs/*.csv'): 
    fns.append(filename)
    
print('Failed to clean {} files.'.format(fail_c))
print('Cleaned {} files.'.format(len(fns)))
df = pd.read_csv(fns[20])
df

Failed to clean 27 files.
Cleaned 47 files.


Unnamed: 0,text,link,state,contributors,text_cleaned
0,✔️ PR #655 Geocohort insights fsa agg (kevlabs),https://github.com/EQWorks/overseer/pull/655,done,kevlabs,Geocohort insights fsa agg
1,✔️ PR #654 geo cohort fsa agg (eq-ianecc),https://github.com/EQWorks/overseer/pull/654,done,eq-ianecc,geo cohort fsa agg
2,✔️ PR #653 Customer/GeoCohort - Geo avail from...,https://github.com/EQWorks/overseer/pull/653,done,kevlabs,Customer/GeoCohort - Geo avail from table
3,👀 PR #1823 devops - upgrade build runtime and ...,https://github.com/EQWorks/overlord/pull/1823,needs review,woozyking,devops - upgrade build runtime and clean up d...
4,✔️ PR #1822 More optimal Geocohort API calls t...,https://github.com/EQWorks/overlord/pull/1822,done,woozyking,More optimal Geocohort API calls through SWR
5,✔️ PR #1821 FSA to postal codes transition for...,https://github.com/EQWorks/overlord/pull/1821,done,geoerika,FSA to postal codes transition for map
6,✔️ PR #1813 whitelabels - add new white label ...,https://github.com/EQWorks/overlord/pull/1813,done,woozyking,whitelabels - add new white label creation fe...
7,👀 PR #1808 geo cohort/reports (DoParkEQ),https://github.com/EQWorks/overlord/pull/1808,needs review,DoParkEQ,geo cohort/reports
8,✔️ PR #52 GeoCohort/Fix - Export handler (kevl...,https://github.com/EQWorks/chronos-lambda/pull/52,done,kevlabs,GeoCohort/Fix - Export handler
9,✔️ PR #50 GeoCohortAvail - Implement lambda (k...,https://github.com/EQWorks/chronos-lambda/pull/50,done,kevlabs,GeoCohortAvail - Implement lambda
