In [1]:
import re, os
import unicodedata
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import nltk.sentiment

from requests import get
from bs4 import BeautifulSoup

from wordcloud import WordCloud


# Default figure size for every plot in the notebook
plt.rc('figure', figsize=(13, 7))
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name this install provides.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)

### The `fetch_user_followers_and_repos` function queries a user's followers and the first 30 repositories that they have publicly available on GitHub

In [2]:
def fetch_user_followers_and_repos(user: str):
    '''Scrape a GitHub user's followers and the first page of their public repositories.

    Followers are collected by walking the paginated ``?tab=followers`` view until a
    page contributes no follower that has not been seen already. Repositories are
    taken from the first ``?tab=repositories`` page only.

    Parameters
    ----------
    user : str
        GitHub username to look up.

    Returns
    -------
    tuple
        ``(all_users_followers, repos)`` where ``all_users_followers`` is a list of
        unique follower usernames in page order and ``repos`` is a set of
        repository names found on the first repositories page.

    Notes
    -----
    The HTTP status code is not inspected: a page that does not contain the expected
    markup (for example an error page) simply yields no followers / repos.
    A stalled connection now raises the requests timeout error instead of blocking forever.
    '''
    # Set headers
    headers = {'User-Agent': 'Codeup Data Science'}
    # Seconds to wait for GitHub before giving up on a request
    request_timeout = 30

    # Followers collected so far (ordered) and a set for O(1) duplicate checks
    all_users_followers = list()
    seen_followers = set()

    # Start with page 1 and keep going until a page adds nothing new
    page = 1
    while True:
        # Builds follower url to find the user's followers
        follower_url = f'https://github.com/{user}?page={page}&tab=followers'
        print(follower_url)
        # Fetch response from url
        response = get(follower_url, headers=headers, timeout=request_timeout)

        # Re-serialise through BeautifulSoup so the attribute layout matches the regex below
        follower_html = str(BeautifulSoup(response.text, 'html.parser'))

        # Pull follower hrefs out of the page and drop the leading '/' from each one;
        # dict.fromkeys removes repeats while keeping the order they appear on the page
        page_followers = dict.fromkeys(
            href[1:]
            for _, href in re.findall(r'(link_type\:self"\shref=")(.*?)"', follower_html)
        )

        # Keep only followers not collected from an earlier page
        new_followers = [name for name in page_followers if name not in seen_followers]

        # An empty page, or a page that only repeats known followers, ends the walk;
        # the second case guards against looping forever on a repeated page
        if not new_followers:
            break

        all_users_followers.extend(new_followers)
        seen_followers.update(new_followers)
        # Move onto the next page
        page += 1

    # Build users repos url
    repo_url = f'https://github.com/{user}?tab=repositories'

    # Get the response from github repos
    response = get(repo_url, headers=headers, timeout=request_timeout)

    # Return the user's repo html page
    repo_str = str(BeautifulSoup(response.text, 'html.parser'))

    # Filter the first page of repos and put into a set
    repos = set([r[1] for r in re.findall(
        r'(itemprop="name\scodeRepository">\n)\s*(.*?)<', repo_str)])

    return all_users_followers, repos

## Next we need to send the usernames to the function to pull all the information out

In [3]:
def add_followers_to_dataframe(df, followers: list):
    '''Add every follower that is not yet in the dataframe index as an unparsed row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by username with a ``parsed`` column. It is modified in place.
    followers : list
        Usernames to register. Names already in the index, and repeated names
        within ``followers``, are skipped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with one new ``parsed=False`` row per new follower.
    '''
    # Snapshot the known usernames once; a set gives constant-time membership checks
    known_users = set(df.index)

    # Iterate through the followers
    for follower in followers:
        # Skip anyone already in the dataframe (or already added in this call)
        if follower in known_users:
            continue
        # Set the default parsed value to False
        df.loc[follower] = {'parsed': False}
        known_users.add(follower)

    # Return the amended dataframe
    return df

In [4]:
def add_repos_to_readme_dataframe(user: str, repos: list, filename: str = 'readme_data_c.csv'):
    '''Register ``user/repo`` entries in the README CSV so their READMEs can be pulled later.

    Each repository is turned into a ``user/repo`` index label. Labels that are not
    already present in the CSV are added with the default column values, and the
    CSV is written back. If the CSV does not exist it is created, even when
    ``repos`` is empty (the file then holds only the header row).

    Parameters
    ----------
    user : str
        Owner of the repositories.
    repos : list
        Repository names (any iterable of strings, e.g. a set, is accepted).
    filename : str, optional
        Path of the README CSV. Defaults to ``'readme_data_c.csv'``.

    Returns
    -------
    None
    '''
    # Define the default columns for the dataframe
    default_cols = {
        'parsed': False,  # Default parsed is False
        'readme': 'None',  # Default readme is 'None'
        'programming_language': 'None'  # Default PREDOMINANT programming language is 'None'
    }
    # NOTE(review): recent pandas versions treat the string 'None' as a missing value
    # when the CSV is read back, so these defaults may come back as NaN - confirm
    # that downstream code expects that.

    # Build the target index labels, dropping repeats while keeping their order
    targets = list(dict.fromkeys(user + '/' + repo for repo in repos))

    # Checks to see if the readme file exists
    if os.path.exists(filename):
        # Pull the readme_df in and set the index col to the user_repo
        readme_df = pd.read_csv(filename, index_col='user_repo')
    else:
        # Create the frame from all targets at once; with no targets this is an
        # empty frame that still carries the expected columns and index name
        readme_df = pd.DataFrame(
            [dict(default_cols) for _ in targets],
            columns=list(default_cols),
            index=pd.Index(targets, name='user_repo'),
        )

    # Labels already present, kept as a set so each lookup is constant time
    known = set(readme_df.index)

    # Add every target that is not in the readme dataframe yet
    for ind in targets:
        if ind not in known:
            # Add the user_repo to the readme with default_cols
            readme_df.loc[ind] = default_cols
            known.add(ind)

    # Save the readme_df
    readme_df.to_csv(filename)
    

In [9]:
def _count_readme_destinations(filename: str) -> int:
    '''Return the number of rows in the README CSV, or 0 if the file is missing or empty.'''
    try:
        return len(pd.read_csv(filename, index_col=['user_repo']))
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No file (or nothing in it) yet means no README destinations so far
        return 0


def crawl_github(target: str, reparse=False, readme_cutoff=1000):
    '''Crawl GitHub outward from ``target``, recording users and their repositories.

    Users waiting to be crawled are tracked in ``users_data_c.csv`` (index ``user``,
    column ``parsed``). On each pass the first unparsed user is fetched, their
    followers are added as new unparsed users, their repositories are added to
    ``readme_data_c.csv``, and the user is marked as parsed. The users file is
    saved after every user, so an interrupted crawl can be resumed.

    Parameters
    ----------
    target : str
        Username to start from. It is added to the users frame if missing.
    reparse : bool, optional
        If True, ``target`` is marked unparsed again so it is crawled even when
        it was processed on an earlier run.
    readme_cutoff : int, optional
        The crawl stops once the README CSV holds at least this many rows.

    Returns
    -------
    pandas.DataFrame
        The users frame as it stands when the crawl stops.
    '''
    # Files holding the crawl state
    parsed_users_file = 'users_data_c.csv'
    readme_file = 'readme_data_c.csv'

    # See how many README entries already exist to check against the cutoff
    readme_len = _count_readme_destinations(readme_file)

    # Show the number of README Destinations
    print('Current number of README destinations is:', readme_len)

    # Load the users seen so far, or start a new frame with just the target
    if os.path.exists(parsed_users_file):
        # Pull the dataframe in
        df = pd.read_csv(parsed_users_file, index_col='user')
        if target not in df.index:
            # Add the target as a whole new row; assigning a single cell to a missing
            # label enlarges by reindexing, which can upcast the boolean column
            df.loc[target] = {'parsed': False}
        elif reparse:
            # Target is already known: queue it again
            df.loc[target, 'parsed'] = False
    else:
        # If the dataframe does not exist, set the first value to the target
        # and set the user as the index
        df = pd.DataFrame([{'user': target, 'parsed': False}]).set_index('user')

    # Keep going while there are unparsed users and the cutoff has not been reached
    while (~df.parsed).any() and readme_len < readme_cutoff:
        # Set the user to parse as the first element in the list of NON parsed users
        user = df[~df.parsed].index[0]

        # Returns a list of followers and a collection of repositories
        followers, repos = fetch_user_followers_and_repos(user)

        # Add any followers that are not in the dataframe already
        df = add_followers_to_dataframe(df, followers)

        # Send current user and their repos to be added to the readme file
        add_repos_to_readme_dataframe(user, repos)

        # Get the number of readme entries and check against cutoff
        readme_len = _count_readme_destinations(readme_file)

        print('Current number of README destinations is:', readme_len)

        # Set the current user parsed to True to iterate to next user
        df.loc[user, 'parsed'] = True

        # Save the dataframe so the crawl can pick up where it left off
        df.to_csv(parsed_users_file)
        # Show that the user was successfully parsed
        print(f'Parsed {user}')

    # Return dataframe of users when done
    return df

In [None]:
# Start (or resume) the crawl from the seed user; it stops once the README CSV
# holds at least `readme_cutoff` rows or there are no unparsed users left.
df = crawl_github(target='abhisheknaiidu', readme_cutoff=10000)
df

Current number of README destinations is: 2023
https://github.com/Doni-zete?page=1&tab=followers
https://github.com/Doni-zete?page=2&tab=followers
Current number of README destinations is: 2053
Parsed Doni-zete
https://github.com/QiangZiBro?page=1&tab=followers
https://github.com/QiangZiBro?page=2&tab=followers
Current number of README destinations is: 2083
Parsed QiangZiBro
https://github.com/riicardoas?page=1&tab=followers
https://github.com/riicardoas?page=2&tab=followers
Current number of README destinations is: 2091
Parsed riicardoas
https://github.com/adrian-unmsm14?page=1&tab=followers
https://github.com/adrian-unmsm14?page=2&tab=followers
Current number of README destinations is: 2091
Parsed adrian-unmsm14
https://github.com/AbhijithGanesh?page=1&tab=followers
https://github.com/AbhijithGanesh?page=2&tab=followers
Current number of README destinations is: 2121
Parsed AbhijithGanesh
https://github.com/BhathiyaTK?page=1&tab=followers
https://github.com/BhathiyaTK?page=2&tab=follo

https://github.com/ckinleydavis?page=2&tab=followers
Current number of README destinations is: 2873
Parsed ckinleydavis
https://github.com/henryyohn?page=1&tab=followers
https://github.com/henryyohn?page=2&tab=followers
Current number of README destinations is: 2903
Parsed henryyohn
https://github.com/recallwei?page=1&tab=followers
https://github.com/recallwei?page=2&tab=followers
Current number of README destinations is: 2916
Parsed recallwei
https://github.com/argha-sarkar?page=1&tab=followers
Current number of README destinations is: 2923
Parsed argha-sarkar
https://github.com/derekmartinjs?page=1&tab=followers
https://github.com/derekmartinjs?page=2&tab=followers
Current number of README destinations is: 2949
Parsed derekmartinjs
https://github.com/Aarush-Goyal?page=1&tab=followers
https://github.com/Aarush-Goyal?page=2&tab=followers
https://github.com/Aarush-Goyal?page=3&tab=followers
https://github.com/Aarush-Goyal?page=4&tab=followers
https://github.com/Aarush-Goyal?page=5&tab=f

Current number of README destinations is: 3730
Parsed Brunosilva7
https://github.com/Gayathri-Chennakrishnam-Sharma?page=1&tab=followers
https://github.com/Gayathri-Chennakrishnam-Sharma?page=2&tab=followers
Current number of README destinations is: 3746
Parsed Gayathri-Chennakrishnam-Sharma
https://github.com/ema9123?page=1&tab=followers
https://github.com/ema9123?page=2&tab=followers
Current number of README destinations is: 3747
Parsed ema9123
https://github.com/pagarevijayy?page=1&tab=followers
https://github.com/pagarevijayy?page=2&tab=followers
Current number of README destinations is: 3777
Parsed pagarevijayy
https://github.com/rahmancoder?page=1&tab=followers
https://github.com/rahmancoder?page=2&tab=followers
Current number of README destinations is: 3797
Parsed rahmancoder
https://github.com/isilreiberth?page=1&tab=followers
https://github.com/isilreiberth?page=2&tab=followers
Current number of README destinations is: 3818
Parsed isilreiberth
https://github.com/tantiich?page=

Current number of README destinations is: 4526
Parsed shiory602
https://github.com/veena456?page=1&tab=followers
https://github.com/veena456?page=2&tab=followers
Current number of README destinations is: 4526
Parsed veena456
https://github.com/yamicoder?page=1&tab=followers
https://github.com/yamicoder?page=2&tab=followers
Current number of README destinations is: 4526
Parsed yamicoder
https://github.com/sadafamininia99?page=1&tab=followers
https://github.com/sadafamininia99?page=2&tab=followers
https://github.com/sadafamininia99?page=3&tab=followers
https://github.com/sadafamininia99?page=4&tab=followers
Current number of README destinations is: 4556
Parsed sadafamininia99
https://github.com/anupshastri96?page=1&tab=followers
Current number of README destinations is: 4580
Parsed anupshastri96
https://github.com/itskaustubh?page=1&tab=followers
https://github.com/itskaustubh?page=2&tab=followers
Current number of README destinations is: 4599
Parsed itskaustubh
https://github.com/kiran-

Current number of README destinations is: 5075
Parsed surajyadavtk
https://github.com/Raju-08562?page=1&tab=followers
https://github.com/Raju-08562?page=2&tab=followers
Current number of README destinations is: 5089
Parsed Raju-08562
https://github.com/saketh12072002?page=1&tab=followers
https://github.com/saketh12072002?page=2&tab=followers
Current number of README destinations is: 5099
Parsed saketh12072002
https://github.com/Ankan1998?page=1&tab=followers
https://github.com/Ankan1998?page=2&tab=followers
Current number of README destinations is: 5129
Parsed Ankan1998
https://github.com/MuriloAredes?page=1&tab=followers
https://github.com/MuriloAredes?page=2&tab=followers
Current number of README destinations is: 5134
Parsed MuriloAredes
https://github.com/DivyaS3919?page=1&tab=followers
Current number of README destinations is: 5134
Parsed DivyaS3919
https://github.com/schmithvillers?page=1&tab=followers
https://github.com/schmithvillers?page=2&tab=followers
Current number of README