In [1]:
import json
import requests
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup

### 1. Scraping Number of Repositories, Stars and Followers

In [2]:
# Fetch the public profile page once and read the headline counters from it.
link = 'https://github.com/AshishJangra27'
res = requests.get(link, timeout=30)    # bounded wait instead of hanging indefinitely
soup = BeautifulSoup(res.text, 'html.parser')

# Start every counter at 0 so each name is always defined for the cells below,
# even when a tab or link is missing from the page.
repo = 0
strs = 0
followers = 0

# The parsing below assumes each profile tab reads "<Label><count>" once
# newlines and spaces are stripped, e.g. "Repositories48".
nav = soup.find('nav', class_ = 'UnderlineNav-body width-full p-responsive')
tabs = nav.find_all('a') if nav is not None else []
for tab in tabs:
    txt = tab.text.replace('\n','').replace(' ','')
    try:
        if 'Repositories' in txt:
            repo = int(txt[len('Repositories'):])
        elif 'Stars' in txt:
            strs = int(txt[len('Stars'):])
    except ValueError:
        # Counter missing or not a plain integer: keep the default of 0 for
        # this tab without discarding the values already parsed.
        pass

# The first "secondary" link is taken to be the followers link; its text is
# "<count>followers" after whitespace removal, so the suffix is sliced off.
follow_links = soup.find_all('a', class_ = 'Link--secondary no-underline no-wrap')
try:
    followers = int(follow_links[0].text.replace('\n','').replace(' ','')[:-len('followers')])
except (IndexError, ValueError):
    followers = 0

### 2. Number of Pages Required to Scrape All Repositories

In [3]:
# The repositories tab lists 30 entries per page: take the number of full
# pages and add one more when a partial page is left over.
pages, leftover = divmod(repo, 30)

if leftover:
    pages = pages + 1

### 3. Scraping Repository data from Multiple Pages

In [4]:
# Maps repository title -> dict with its link, licence, stars, description,
# language and archive download link.
repository_dct = {}

for page in range(1, pages + 1):

    # Same URL as link + '?page=N&tab=repositories', built by requests.
    res = requests.get(link, params = {'page': page, 'tab': 'repositories'}, timeout = 30)
    soup = BeautifulSoup(res.text, 'html.parser')

    repo_list = soup.find('div', id = 'user-repositories-list')
    if repo_list is None:
        # The expected container is not in this response; skip the page
        # instead of failing on None.find_all(...).
        continue

    for r in repo_list.find_all('li'):

        # An <li> without an <h3><a> title cannot be keyed. Skip it rather
        # than letting every such item overwrite a single NaN key.
        heading = r.find('h3')
        title_anchor = heading.find('a') if heading is not None else None
        if title_anchor is None:
            continue
        title = title_anchor.text.replace('\n','').strip()

        # NOTE(review): the last 'mr-3' span is taken as the licence; confirm
        # against the live markup that no other span shares this class.
        meta_spans = r.find_all('span', class_ = 'mr-3')
        licence = meta_spans[-1].text.strip() if meta_spans else 'Unlicenced'

        # Kept as the raw link text when present and the integer 0 otherwise.
        # A separate name avoids clobbering the profile-level `strs` counter.
        stars_anchor = r.find('a', class_ = 'Link--muted mr-3')
        repo_stars = stars_anchor.text.strip() if stars_anchor is not None else 0

        language_span = r.find('span', itemprop = 'programmingLanguage')
        language = language_span.text if language_span is not None else np.nan

        description_tag = r.find('p')
        des = description_tag.text.strip() if description_tag is not None else 'Not available'

        first_anchor = r.find('a')
        href = first_anchor.get('href') if first_anchor is not None else None
        repo_link = 'https://github.com' + href if href else np.nan

        # Only derive the archive URL from a real link: NaN + str raises
        # TypeError and would abort the whole scrape.
        # NOTE(review): assumes the default branch is named 'main' - confirm.
        if isinstance(repo_link, str):
            download_link = repo_link + '/archive/refs/heads/main.zip'
        else:
            download_link = np.nan

        repository_dct[title] = {'repo_link' : repo_link, 'licence' : licence, 'stars' : repo_stars,
                                 'description' : des, 'language' : language, 'download_link' : download_link}

### 4. Scraping Followers Data

In [5]:
# Maps follower id -> follower display name.
followers_dct = {}

# `followers` may hold a non-integer fallback if the profile-summary cell
# further down has already run; treat anything that is not an int as 0.
follower_count = followers if isinstance(followers, int) else 0

# Pages with 50 followers each, rounded up. A dedicated name keeps the
# repository page count in `pages` intact if earlier cells are re-run.
follower_pages = (follower_count + 49) // 50


for page in range(1, follower_pages + 1):

    # Same URL as link + '?page=N&tab=followers', built by requests.
    res  = requests.get(link, params = {'page': page, 'tab': 'followers'}, timeout = 30)
    soup = BeautifulSoup(res.text, 'html.parser')
    sp = soup.find_all('div', class_ = 'd-table table-fixed col-12 width-full py-4 border-bottom color-border-muted')

    for f in sp:

        # The first span is read as the display name and the second as the id.
        spans = f.find_all('span')
        if len(spans) < 2:
            # Card does not have both spans; skip it instead of raising IndexError.
            continue

        name_ = spans[0].text.strip()
        id_   = spans[1].text.strip()

        followers_dct[id_] = name_

### 5. Scraping Stargazers for Each Repository

In [6]:
# Attach a 'stargaze' list (names read from the /stargazers page) to every
# repository entry. Only the single page returned by this request is parsed.
for k in tqdm(repository_dct.keys()):

    # Reset per repository so a missing link or list yields [] rather than an
    # exception or the previous repository's names.
    stargaze = []

    stargazers_base = repository_dct[k]['repo_link']
    if isinstance(stargazers_base, str):    # the link may be NaN when scraping it failed

        res = requests.get(stargazers_base + '/stargazers', timeout = 30)
        soup = BeautifulSoup(res.text, 'html.parser')

        listing = soup.find('ol')
        if listing is not None:             # no <ol> in the response: nothing to read
            for item in listing.find_all('li'):
                heading = item.find('h3')
                anchor = heading.find('a') if heading is not None else None
                if anchor is not None:
                    stargaze.append(anchor.text)

    repository_dct[k]['stargaze'] = stargaze

100%|███████████████████████████████████████████| 48/48 [00:26<00:00,  1.84it/s]


### 6. Combining Everything in One Dictionary

In [7]:
# Re-fetch the profile page and pull out the individual profile fields.
# Each field falls back to a placeholder when its element is missing; only the
# lookup errors that signal "element not found / not a number" are caught, so
# unrelated failures are no longer silently swallowed.
link = 'https://github.com/AshishJangra27'
res = requests.get(link, timeout=30)    # bounded wait instead of hanging indefinitely
soup = BeautifulSoup(res.text, 'html.parser')

try:
    # Drop the query string from the avatar URL.
    img_link   = soup.find('img', style = 'height:auto;').get('src').split('?')[0]
except AttributeError:
    img_link   = 'Not Available'

try:
    name       = soup.find('h1').find_all('span')[0].text.strip()
except (AttributeError, IndexError):
    name       = 'Not Available'

try:
    user_id    = soup.find('h1').find_all('span')[1].text.strip()
except (AttributeError, IndexError):
    user_id    = 'Not Available'

try:
    bio        = soup.find('div',class_ = 'p-note user-profile-bio mb-3 js-user-profile-bio f4').text.strip().replace('\n','')
except AttributeError:
    bio        = 'Not Available'

try:
    sp         = soup.find_all('a', class_ = 'Link--secondary no-underline no-wrap')
    # "followers" and "following" are both 9 characters, hence the [:-9] slice.
    followers  = int(sp[0].text.replace('\n','').replace(' ','')[:-9])
    following  = int(sp[1].text.replace('\n','').replace(' ','')[:-9])
except (IndexError, ValueError):
    followers  = 'Not Available'
    following  = 'Not Available'

try:
    location   = soup.find('span',class_ = 'p-label').text.strip()
except AttributeError:
    location   = 'Not Available'

# Default all tab counters to 0 first so every name is defined even when a tab
# is absent, and one unparsable tab no longer resets the others.
repo = 0
proj = 0
pkgs = 0
strs = 0

# The parsing below assumes each tab reads "<Label><count>" once newlines and
# spaces are stripped.
nav = soup.find('nav', class_ = 'UnderlineNav-body width-full p-responsive')
tabs = nav.find_all('a') if nav is not None else []
for tab in tabs:
    txt = tab.text.replace('\n','').replace(' ','')
    try:
        if 'Repositories' in txt:
            repo = int(txt[len('Repositories'):])
        elif 'Projects' in txt:
            proj = int(txt[len('Projects'):])
        elif 'Packages' in txt:
            pkgs = int(txt[len('Packages'):])
        elif 'Stars' in txt:
            strs = int(txt[len('Stars'):])
    except ValueError:
        # Counter missing or not a plain integer: keep 0 for this tab.
        pass

In [8]:
# Assemble the final record: scalar profile fields plus the follower and
# repository dictionaries collected in the earlier sections.
dct = {
    'name'      : name,
    'user_id'   : user_id,
    'bio'       : bio,
    'location'  : location,
    'followers' : followers_dct,
    'repo'      : repository_dct,
    'img_link'  : img_link,
}

### 7. Final Walkthrough

In [9]:
# Display the top-level keys of the combined dictionary as a quick check.
dct.keys()

dict_keys(['name', 'user_id', 'bio', 'location', 'followers', 'repo', 'img_link'])

### 8. Saving the JSON

In [10]:
# Serialise the combined dictionary and write it to disk.
# NOTE: any np.nan placeholders stored earlier are emitted as the bare token
# NaN by json.dumps, which strict JSON parsers reject.
txt = json.dumps(dct)

# The context manager closes the file even if the write raises.
with open('ashish.json', 'w', encoding = 'utf-8') as fd:
    fd.write(txt)