In [1]:
from bs4 import BeautifulSoup
import requests
import re
import json

In [2]:
author_dict = {'basic': {}, 'activity': {'summary': '', 'spec': {}}}

In [3]:
def requests_with_header(url):
    headers = {
        'user-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36'
    }
    response = requests.get(url, headers=headers)    
    if response.status_code != 200:
        print(f'Response from {url} error.', response.status_code)
        return
    return response


def extract_author_basic_from_github(author_login):
    basic_dict = {'ava': '', 'name': '', 'id': '', 'slogan': '', 'followers': '', 'following': '', 'star': '', 'worksFor': '', 'homeLocation': '', 'email': '', 'url': '', 'twitter': '', 'highlight': [], 'Organizations': [], 'Sponsoring': [], 'pin_popular': []}
    basic_url = 'https://github.com/' + author_login
    res = requests_with_header(basic_url)
    soup = BeautifulSoup(res.text, 'html.parser')
    # ava
    ava = soup.find('img', 'avatar avatar-user width-full border color-bg-primary')
    basic_dict['ava'] = ava['src']
    # name
    name = soup.find('span', 'p-name vcard-fullname d-block overflow-hidden')
    basic_dict['name'] = name.get_text().strip()
    # id
    user_id = soup.find('span', 'p-nickname vcard-username d-block')
    basic_dict['id'] = user_id.get_text().strip()
    # slogan
    slogan = soup.find('div', 'p-note user-profile-bio mb-3 js-user-profile-bio f4')
    basic_dict['slogan'] = slogan.get_text().strip()
    # followers following star
    follow = soup.find('div', 'flex-order-1 flex-md-order-none mt-2 mt-md-0')
    temp = [i for i in re.split('[following·followers \n]', follow.get_text().strip()) if i != '']
    basic_dict['followers'] = temp[0]
    basic_dict['following'] = temp[1]
    basic_dict['star'] = temp[2]
    # some
    card = soup.find('ul', 'vcard-details')
    for i in card.find_all('li'):
        basic_dict[i['itemprop']] = i.get_text().replace('\n', '').strip()
    # highlight
    highlight = soup.find('div', 'border-top color-border-secondary pt-3 mt-3 d-none d-md-block')
    for i in highlight.find_all('li', 'mt-2'):
        basic_dict['highlight'].append(i.get_text().strip())
    # other
    other = soup.find_all('div', 'border-top color-border-secondary pt-3 mt-3 clearfix hide-sm hide-md')
    for i in other:
        temp_name = i.get_text().replace('\n', ' ').strip()
        for j in i.find_all('img', 'avatar'):
            basic_dict[temp_name].append(j['alt'])
    # pin and popular
    #pin =  soup.find('ol', 'd-flex flex-wrap list-style-none gutter-condensed mb-4 js-pinned-items-reorder-list')
    pin_popular = soup.find('div', 'js-pinned-items-reorder-container')
    for i in pin_popular.find_all('li'):
        pin_dict = {'repo': '', 'desc': '', 'language': '', 'star': '', 'fork': ''}
        pin_dict['repo'] = i.find('div', 'd-flex width-full flex-items-center position-relative').get_text().replace('\n', '').strip()
        pin_dict['desc'] = i.find('p', 'pinned-item-desc text-gray text-small d-block mt-2 mb-3').get_text().replace('\n', '').strip()
        #temp_content = [j.get_text().replace('\n', '').strip() for j in i.find('p', 'mb-0 f6 text-gray').children if j != '\n']
        #pin_dict['language'] = temp_content[0]
        #pin_dict['star'] = temp_content[1]
        #pin_dict['fork'] = temp_content[2]
        basic_dict['pin_popular'].append(pin_dict)
    #print('*************************************************************')
    return basic_dict
    

def extract_author_activity_from_github(author_login, begin_year, begin_month, end_year, end_month):
    activity_dict = {}
    for a in range(begin_year, end_year + 1):
        if a == begin_year:
            start, end = begin_month, 13
        elif a == end_year:
            start, end = 1, end_month + 1
        else:
            start, end = 1, 13
        for b in range(start, end):
            if b < 10:
                activity_url = 'https://github.com/' + author_login + '?tab=overview&from=' + str(a) + '-0' + str(b) + '-01&to=' + str(a) + '-0' + str(b) + '-31'
            else:
                activity_url = 'https://github.com/' + author_login + '?tab=overview&from=' + str(a) + '-' + str(b) + '-01&to=' + str(a) + '-' + str(b) + '-31'
                
            res = requests_with_header(activity_url)
            soup = BeautifulSoup(res.text, 'html.parser')
            
            
            activity_time = times = str(a) + '-' + str(b)
            month_activity_dict = {'summary': [], 'spec': {}}
            
            # contributions
            for i in soup.find_all('div', 'TimelineItem'):
                j = BeautifulSoup(str(i))
                # summary
                summary = j.find('div', 'TimelineItem-body').find('span', 'color-text-primary')
                if summary == None:
                    #print('there is no activity of this type')
                    #print('*******************************')
                    continue
                month_activity_dict['summary'].append(' '.join([not_null for not_null in summary.get_text().strip().replace('\n', ' ').split(' ') if not_null != '']))

                # spec
                spec_activity = []
                spec = j.find('div', 'TimelineItem-body').find_all('li', 'ml-0 py-1 d-flex')
                for k in spec:
                    spec_activity.append(' '.join([not_null for not_null in k.get_text().strip().replace('\n', ' ').split(' ') if not_null != '']))

                spec = j.find('div', 'TimelineItem-body').find_all('li', 'd-flex py-1')
                for k in spec:
                    spec_activity.append(' '.join([not_null for not_null in k.get_text().strip().replace('\n', ' ').split(' ') if not_null != '']))

                spec = j.find('div', 'TimelineItem-body').find_all('details', 'Details-element details-reset my-2')
                for k in spec:
                    spec_activity.append(' '.join([not_null for not_null in k.get_text().strip().replace('\n', ' ').split(' ') if not_null != '']))

                spec = j.find('div', 'TimelineItem-body').find_all('details', 'Details-element details-reset my-1')
                for k in spec:
                    spec_activity.append(' '.join([not_null for not_null in k.get_text().strip().replace('\n', ' ').split(' ') if not_null != '']))

                activity_name = j.find('div', 'TimelineItem-badge').find('svg')['class'][1].split('octicon-')[1]
                if activity_name not in month_activity_dict['spec'].keys():
                    month_activity_dict['spec'][activity_name] = spec_activity
                #print('*******************************')
            if activity_time not in activity_dict.keys():
                activity_dict[activity_time] = month_activity_dict
    return activity_dict


In [4]:
author_dict = {'basic': extract_author_basic_from_github('hamelsmu'), 'activity': extract_author_activity_from_github('hamelsmu', 2020, 1, 2021, 2)}

ConnectionError: HTTPSConnectionPool(host='github.com', port=443): Max retries exceeded with url: /hamelsmu (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7ff5c977e970>: Failed to establish a new connection: [Errno 110] Connection timed out'))

In [None]:
print(author_dict)

In [17]:
with open('CAKGOD.json', 'w') as f:
    json.dump(author_dict, f)