In [None]:
from github import Github
import pandas as pd 
import json
import itertools
import os
import requests
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx




##### Load personal token here to authenticate.

In [None]:
%store -r access_token
g = Github(access_token)

In [None]:
g.rate_limiting

#### These variables are global variables.

In [None]:
# Notebook-wide state. repo_initialize() declares each of these names as
# global and fills them in; until then they are empty placeholders.
repo_var, page_num = None, None   # organization repo listing / number of pages found
repo_list = []                    # one entry per fetched page
list_without_lesson = []          # repositories lacking the "lesson" topic
list_with_lesson = []             # repositories tagged with the "lesson" topic
final_repo_list = []              # all pages flattened into a single list

#### Declaring a function

* To get the total repositories count
* Creating a page_num variable to hold the valid page numbers to iterate the repo list.
* Final repo list created from repo list
* List with lesson created from final repo list
* List without lesson created from final repo list

In [None]:
def repo_initialize():
    
    '''
    Fetch every repository of the "carpentries-incubator" organization and
    split them by whether they carry the "lesson" topic.

    Takes no parameters and returns None. Results are published through the
    module-level globals:

    * repo_var            - the repository listing returned by get_repos()
    * page_num            - number of non-empty pages in that listing
    * repo_list           - list of pages (each page is a list of repositories)
    * final_repo_list     - all pages flattened into one list
    * list_with_lesson    - repositories whose topics include "lesson"
    * list_without_lesson - every other repository

    Relies on the authenticated client `g` created earlier in the notebook.
    '''
    
    global repo_var
    global page_num
    global repo_list
    global list_without_lesson
    global list_with_lesson
    global final_repo_list
    
    repo_var = g.get_organization("carpentries-incubator").get_repos()

    # Walk the pages until the first empty one. The page count is not capped,
    # and each page is fetched exactly once.
    repo_list = []
    page_num = 0
    while True:
        page = repo_var.get_page(page_num)
        if len(page) == 0:
            break
        repo_list.append(page)
        page_num += 1
    final_repo_list = list(itertools.chain.from_iterable(repo_list))

    # Start from fresh lists so that re-running this cell does not append the
    # same repositories a second time.
    list_with_lesson = []
    list_without_lesson = []
    for repo in final_repo_list:
        if "lesson" in repo.get_topics():
            list_with_lesson.append(repo)
        else:
            list_without_lesson.append(repo)
            
repo_initialize()

##### Reading all the datasets to get the unique list of contributors (commit, PR and issue) of each repository.

In [None]:
def read_all_the_files(list_with_lesson):
    
    """
    Collect the unique contributors recorded in each repository's JSON file.

    For every repository in `list_with_lesson` the file 'data/<repo.name>.json'
    is read, if it exists. Contributors are gathered from three sections:

    * 'commits_dict' - a mapping; the first element of each value is taken
    * 'pr_dict'      - a list of records; the 'pr_name' field is taken
    * 'issue_dict'   - a list of records; the 'user_name' field is taken

    Each section is processed independently, so a missing or malformed section
    does not discard the contributors found in the others. Individual records
    that lack the expected field are skipped.

    Input:- list of repository objects exposing a `.name` attribute.

    Output:- dictionary mapping repository name to a set of contributors.
             Repositories without a data file are absent from the result; a
             file that is not valid JSON yields an empty set.
    """
    
    # Errors that mean "this record does not have the expected shape".
    bad_record = (KeyError, IndexError, TypeError)

    contributors_count=dict()
    for repo in list_with_lesson:
        path='data/'+repo.name+'.json'
        if not os.path.exists(path):
            continue

        contributors=set()
        with open(path,'r') as f:
            try:
                fildata=json.load(f)
            except ValueError:
                # Covers json.JSONDecodeError and decoding errors: treat the
                # file as carrying no contributor information.
                fildata=None

        if isinstance(fildata,dict):
            commit_dict=fildata.get('commits_dict')
            if isinstance(commit_dict,dict):
                for commit_info in commit_dict.values():
                    try:
                        contributors.add(commit_info[0])
                    except bad_record:
                        continue

            for section,field in (('pr_dict','pr_name'),('issue_dict','user_name')):
                records=fildata.get(section)
                if not isinstance(records,(list,tuple)):
                    continue
                for record in records:
                    try:
                        contributors.add(record[field])
                    except bad_record:
                        continue

        contributors_count[repo.name]=contributors

    return contributors_count
contri_data=read_all_the_files(list_with_lesson)     

#### Function to get the list of repositories of each contributor

In [None]:
def get_list_of_repo_for_each_contributors(contri_data):
    
    """
    Invert a repository -> contributors mapping into contributor -> repositories.

    Input:- dictionary with repository name as key and an iterable of
            contributors as value. Entries whose value is None are ignored.
    Output:- dictionary with contributor as key and the set of repository
             names they appear in as value.
    """
    
    cdict=dict()
    for k_repo,repo_contributors in contri_data.items():
        if repo_contributors is None:
            continue
        # A single pass is enough: record this repository against every
        # contributor listed for it.
        for contributor in repo_contributors:
            cdict.setdefault(contributor,set()).add(k_repo)
    return cdict

all_contributors_list=get_list_of_repo_for_each_contributors(contri_data)

        

#### Function to count the repositories to which each contributor has contributed.

In [None]:
def count_repo_of_each_contributor(all_contributors_list):

    """
    Count the repositories each contributor has contributed to.

    Input:- dictionary with contributor as key and a collection of
            repositories as value.
    Output:- a tuple of two parallel lists (contrilist, repocountlist):
             contrilist holds the contributors in the dictionary's key order
             and repocountlist holds, at the same index, the number of
             repositories for that contributor.
    """
    
    contrilist=list(all_contributors_list.keys())
    repocountlist=[len(all_contributors_list[name]) for name in contrilist]
    return contrilist,repocountlist

# Parallel lists: contributor names and, at the same index, their repository counts.
contrilist,repocountlist=count_repo_of_each_contributor(all_contributors_list)

# Assemble the two lists into a two-column frame; plot_histogram reads 'Repo_Count'.
histo_df=pd.DataFrame()

histo_df['Contributor']=contrilist
histo_df['Repo_Count']=repocountlist


In [None]:
pd.set_option('display.max_rows', None)
histo_df


In [None]:
df8=histo_df

In [None]:
histo_df[histo_df['Repo_Count']==1].shape

### Function to plot the histogram.

In [None]:
def plot_histogram(histo_df):
    """
    Plot how many contributors fall into each repository-count bucket.

    Input:- dataframe with a numeric 'Repo_Count' column.
    Output:- None; the histogram is shown with plt.show().
    """
    # The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in
    # matplotlib 3.6; use whichever name this installation provides.
    for style_name in ('seaborn-v0_8-white','seaborn-white'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break
    histo_df.hist(column='Repo_Count',bins=[1,2,3,4,5,6,10,15,20,25,30,35,40,50,60])
    plt.xlabel("Repo_count")
    plt.ylabel("Contributors")
    plt.title("Contribution of users to total number of repositories")
    # Adjust the layout before showing; after plt.show() it no longer affects
    # the figure that was displayed.
    plt.tight_layout()
    plt.show()
plot_histogram(histo_df)



# The number of lessons to which  each contributor has contributed can be derived from repository 
# list against each contributor.

# So the dictionary all_contributors_list has the contributor name as key and the set of repositories as the value.

# Our observation is that most contributors have contributed to only one repository, followed by those contributing to two repositories.



In [None]:
import networkx as nx

In [None]:
graph=nx.DiGraph()


#### Plotting the collaboration graph to understand collaboration between contributors.

In [None]:


def create_edges(all_contributors_list):
    """
    Build contributor-to-contributor links from shared repositories.

    Whenever two different contributors both list the same repository, a
    (contributor, other_contributor) tuple is produced. A pair sharing several
    repositories appears once per shared repository, and both directions
    (a, b) and (b, a) are emitted. Pairs involving a None contributor, or
    linked only through a None repository, are left out.

    i/p: dictionary with contributor as key and their repositories as value
    o/p: list of (contributor, contributor) tuples
    """
    people = list(all_contributors_list.keys())
    return [
        (first, second)
        for first in people                             # pick a contributor
        for shared_repo in all_contributors_list[first] # each repo they worked on
        for second in people                            # candidate collaborator
        if first != second
        and shared_repo in all_contributors_list[second]
        and shared_repo is not None
        and first is not None
        and second is not None
    ]

edges=create_edges(all_contributors_list)
                    

In [None]:
all_contributors_list

In [None]:
print(len(all_contributors_list.keys()))

In [None]:
all_contributors_list

In [None]:
graph.add_edges_from(edges)


In [None]:
# Fix the seed so the force-directed layout (and therefore the figure) is
# the same on every Restart & Run All.
pos=nx.spring_layout(graph,seed=42)


In [None]:
# Open a 20x20 figure for the collaboration graph.
# NOTE(review): the name 'figsize' is bound to whatever plt.figure() returns, not to a size.
figsize=plt.figure(figsize=[20,20])
# Draw the nodes at the positions computed in `pos`.
# NOTE(review): a cmap is passed but no node_color values are given - confirm it has any visible effect.
nx.draw_networkx_nodes(graph,pos,cmap=plt.get_cmap('jet'),node_size=10)
# Draw every edge of the graph without arrow heads.
nx.draw_networkx_edges(graph,pos,edgelist=graph.edges(),arrows=False)



In [None]:
nx.is_strongly_connected(graph)