In [1]:
import pandas as pd
import sys
import os 
import altair as alt

sys.path.append("..")
from data_generation_scripts.utils import check_rate_limit, check_return_error_file, read_combine_files
from data_generation_scripts.generate_repo_metadata import get_repo_owners
from data_generation_scripts.generate_user_repos_interactions import get_user_repo_activities
from data_generation_scripts.generate_user_users_interactions import get_user_users_activities

In [2]:
user_df = pd.read_csv("../data/entity_files/users_dataset.csv")
repo_df = pd.read_csv('../data/large_files/entity_files/repos_dataset.csv', low_memory=False)
subset_repo_df = pd.read_csv('../data/entity_files/subset_repos_dataset_with_commits.csv')

In [3]:

original_owners = user_df[user_df.login.isin(subset_repo_df['owner.login'])]

### Explore User Interactions


- contributors:
  - has user columns with connection to repos with repo_full_name
- forks:
  - has repo columns with connection to repos with repo_full_name, and to user with owner

In [4]:
contributors_df = pd.read_csv('../data/join_files/repo_contributors_join_dataset.csv')
forks_df = pd.read_csv('../data/join_files/repo_forks_join_dataset.csv')
stargazers_df = pd.read_csv('../data/join_files/repo_stargazers_join_dataset.csv')
subscribers_df = pd.read_csv('../data/join_files/repo_subscribers_join_dataset.csv')
user_followers = pd.read_csv('../data/join_files/user_followers_join_dataset.csv')
user_following = pd.read_csv('../data/join_files/user_following_join_dataset.csv')

In [5]:
issues_comments_df = pd.read_csv('../data/large_files/join_files/issues_comments_join_dataset.csv', low_memory=False)
pulls_comments_df = pd.read_csv('../data/large_files/join_files/pulls_comments_join_dataset.csv', low_memory=False)
repo_issues_df = pd.read_csv('../data/large_files/join_files/repo_issues_join_dataset.csv', low_memory=False)
repo_pulls_df = pd.read_csv('../data/large_files/join_files/repo_pulls_join_dataset.csv', low_memory=False)

### Explore Contributors

In [6]:
contributors_counts = contributors_df.login.value_counts().reset_index()
contributors_counts.columns = ['login', 'contributor_count']

alt.Chart(contributors_counts).mark_bar().encode(
    x=alt.X('contributor_count:Q'),
    y='count()'
)

In [10]:
top_contributors = contributors_counts[contributors_counts.contributor_count > 2]
len(top_contributors)

255

In [11]:
top_contributors_df = contributors_df[contributors_df.login.isin(top_contributors['login'])]

In [12]:
top_contributor_repos = top_contributors_df.repo_full_name.value_counts().reset_index()
top_contributor_repos.columns = ['repo_full_name', 'repo_contributor_count']

In [14]:
top_contributor_repos[top_contributor_repos.repo_contributor_count > 3].head(10)

Unnamed: 0,repo_full_name,repo_contributor_count
0,cms633/cms633.github.io,57
1,cms633/Fall-2019,56
2,cms633/Fall-2018,56
3,carpentries-incubator/sql-humanities-lesson,24
4,carpentries-incubator/spreadsheet-humanities-l...,23
5,ComputeCanada/dh-carpentry,22
6,carpentries-incubator/python-humanities-lesson,21
7,carpentries-incubator/OpenRefine-humanities-le...,20
8,UoMResearchIT/r-tidyverse-digital-humanities,20
9,programminghistorian/jekyll,19


In [17]:
# commits_df = pd.read_csv('../data/temp/repo_commits_join_dataset/programminghistorianjekyll_repo_actors_commits_url.csv')

: 

In [131]:
commits_df[(commits_df['commit.message'].str.contains('twitter|Twitter', na=False)) & (commits_df['author.login'] == 'ZoeLeBlanc')]['commit.message'].tolist()

['Merge pull request #2428 from programminghistorian/removing-twitter-commits\n\ncleaning up twitter commits hoping this time works',
 'Merge pull request #1976 from programminghistorian/issue-1975\n\nadded 400 errors to htmlproofer ignore list to stop twitter from brea…']

In [136]:
top_ph_committers = commits_df['commit.author.name'].value_counts().reset_index().head(20)

In [135]:
top_ph_contributors = contributors_df[contributors_df.repo_full_name.str.contains('programminghistorian/jekyll')].sort_values(by='contributions', ascending=False)[['login', 'contributions']].head(20)

In [138]:
top_ph_committers.columns = ['login', 'commit_count']

In [4]:
import time
from urllib.parse import parse_qs
import pandas as pd
import requests
import os
from tqdm import tqdm
import apikey
import sys
sys.path.append("..")
from data_generation_scripts.utils import *
import shutil


auth_token = apikey.load("DH_GITHUB_DATA_PERSONAL_TOKEN")

auth_headers = {'Authorization': f'token {auth_token}','User-Agent': 'request'}

In [7]:
url = subset_repo_df[0:1].contributors_url.values[0]

In [5]:
url = 'https://api.github.com/repos/wildmary/Portfolio-DH/commits'

In [6]:
response = requests.get(url, headers=auth_headers)

In [7]:
response_df = pd.json_normalize(response.json())

In [8]:
query = response_df[0:1].url.values[0]

In [9]:
query = response_df.url.values[0]

In [10]:
response_commit = requests.get(query, headers=auth_headers)

In [11]:
response_commit_df = pd.json_normalize(response_commit.json())

In [12]:
response_commit_df

Unnamed: 0,sha,node_id,url,html_url,comments_url,parents,files,commit.author.name,commit.author.email,commit.author.date,...,committer.subscriptions_url,committer.organizations_url,committer.repos_url,committer.events_url,committer.received_events_url,committer.type,committer.site_admin,stats.total,stats.additions,stats.deletions
0,a4cb55355ba8fbe6cf19691600d19524fc110aef,MDY6Q29tbWl0MjgzOTAzMjQxOmE0Y2I1NTM1NWJhOGZiZT...,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH/commi...,https://api.github.com/repos/wildmary/Portfoli...,[{'sha': '3bb3421fa786b5762d2cef6af5846ce0254a...,[{'sha': '666217c728b52486a3a55f0d406aec1a4ef7...,Mary Levchenko,wildmary@yandex.ru,2020-07-31T02:24:21Z,...,https://api.github.com/users/web-flow/subscrip...,https://api.github.com/users/web-flow/orgs,https://api.github.com/users/web-flow/repos,https://api.github.com/users/web-flow/events{/...,https://api.github.com/users/web-flow/received...,User,False,1,1,0


In [40]:
join_cols = list(set(response_commit_df.columns) & set(response_df.columns))

In [35]:
set(response_commit_df.columns) - set(response_df.columns)

{'files', 'stats.additions', 'stats.deletions', 'stats.total'}

In [23]:
test = pd.read_csv('../data/temp/repo_commits_join_dataset/wildmaryPortfolio-DH_repo_actors_commits_url.csv')

In [24]:
test

Unnamed: 0,sha,node_id,url,html_url,comments_url,parents,commit.author.name,commit.author.email,commit.author.date,commit.committer.name,...,committer.site_admin,files,stats.total,stats.additions,stats.deletions,repo_id,repo_url,repo_html_url,repo_full_name,commits_url
0,a4cb55355ba8fbe6cf19691600d19524fc110aef,MDY6Q29tbWl0MjgzOTAzMjQxOmE0Y2I1NTM1NWJhOGZiZT...,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH/commi...,https://api.github.com/repos/wildmary/Portfoli...,[{'sha': '3bb3421fa786b5762d2cef6af5846ce0254a...,Mary Levchenko,wildmary@yandex.ru,2020-07-31T02:24:21Z,GitHub,...,False,[{'sha': '666217c728b52486a3a55f0d406aec1a4ef7...,1,1,0,283903241.0,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH,wildmary/Portfolio-DH,https://api.github.com/repos/wildmary/Portfoli...
1,3bb3421fa786b5762d2cef6af5846ce0254a8bd1,MDY6Q29tbWl0MjgzOTAzMjQxOjNiYjM0MjFmYTc4NmI1Nz...,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH/commi...,https://api.github.com/repos/wildmary/Portfoli...,[{'sha': '88bca5394425f3ab43dd5e3acb54b01ad3f4...,Mary Levchenko,wildmary@yandex.ru,2020-07-31T02:23:02Z,GitHub,...,False,[{'sha': '4bbb2fdf3f7c89bb7adcef3544f19223562e...,16,13,3,283903241.0,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH,wildmary/Portfolio-DH,https://api.github.com/repos/wildmary/Portfoli...
2,88bca5394425f3ab43dd5e3acb54b01ad3f4c3d4,MDY6Q29tbWl0MjgzOTAzMjQxOjg4YmNhNTM5NDQyNWYzYW...,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH/commi...,https://api.github.com/repos/wildmary/Portfoli...,[{'sha': '9142bdd3534f3d5145f048d24f0ea823c247...,Mary Levchenko,wildmary@yandex.ru,2020-07-31T02:20:25Z,GitHub,...,False,[{'sha': '50a7a1202cd726d82010b2ecbc90932613d4...,0,0,0,283903241.0,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH,wildmary/Portfolio-DH,https://api.github.com/repos/wildmary/Portfoli...
3,9142bdd3534f3d5145f048d24f0ea823c247e20e,MDY6Q29tbWl0MjgzOTAzMjQxOjkxNDJiZGQzNTM0ZjNkNT...,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH/commi...,https://api.github.com/repos/wildmary/Portfoli...,[{'sha': 'd2eb752a2e8e605c0395d4e5bd1b45a1ec03...,Mary Levchenko,wildmary@yandex.ru,2020-07-31T02:14:34Z,GitHub,...,False,[{'sha': '9098d36566a2d0421859db64c3a82b30206c...,0,0,0,283903241.0,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH,wildmary/Portfolio-DH,https://api.github.com/repos/wildmary/Portfoli...
4,d2eb752a2e8e605c0395d4e5bd1b45a1ec03aae9,MDY6Q29tbWl0MjgzOTAzMjQxOmQyZWI3NTJhMmU4ZTYwNW...,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH/commi...,https://api.github.com/repos/wildmary/Portfoli...,[{'sha': '86057fe64bb634209a352237c0ab4481aad1...,Mary Levchenko,wildmary@yandex.ru,2020-07-31T01:48:50Z,GitHub,...,False,[{'sha': 'f9cf0c83946bcda3b3653549d2395e230219...,14,14,0,283903241.0,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH,wildmary/Portfolio-DH,https://api.github.com/repos/wildmary/Portfoli...
5,86057fe64bb634209a352237c0ab4481aad17d36,MDY6Q29tbWl0MjgzOTAzMjQxOjg2MDU3ZmU2NGJiNjM0Mj...,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH/commi...,https://api.github.com/repos/wildmary/Portfoli...,[{'sha': '10558c74586b7475396db4760edb14d5d1e0...,Mary Levchenko,wildmary@yandex.ru,2020-07-31T01:34:21Z,GitHub,...,False,[{'sha': '5b81fbb60a084fdf763abe6f4d22eb95917a...,3,3,0,283903241.0,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH,wildmary/Portfolio-DH,https://api.github.com/repos/wildmary/Portfoli...
6,10558c74586b7475396db4760edb14d5d1e0ac07,MDY6Q29tbWl0MjgzOTAzMjQxOjEwNTU4Yzc0NTg2Yjc0Nz...,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH/commi...,https://api.github.com/repos/wildmary/Portfoli...,[{'sha': '0552821a246a28502c8a7c289df100f09c7a...,Mary Levchenko,wildmary@yandex.ru,2020-07-31T01:33:37Z,GitHub,...,False,[{'sha': 'dbf8289cc3c09c9f0acbfe13d4753cd26882...,15,15,0,283903241.0,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH,wildmary/Portfolio-DH,https://api.github.com/repos/wildmary/Portfoli...
7,0552821a246a28502c8a7c289df100f09c7a95ec,MDY6Q29tbWl0MjgzOTAzMjQxOjA1NTI4MjFhMjQ2YTI4NT...,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH/commi...,https://api.github.com/repos/wildmary/Portfoli...,[{'sha': 'd3e64f6f28b0551e60b32c6e60da22cdfefe...,Mary Levchenko,wildmary@yandex.ru,2020-07-31T01:30:26Z,GitHub,...,False,[{'sha': '2f4076f73e2c5e00a175c0cca023e89e1b42...,0,0,0,283903241.0,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH,wildmary/Portfolio-DH,https://api.github.com/repos/wildmary/Portfoli...
8,d3e64f6f28b0551e60b32c6e60da22cdfefe528d,MDY6Q29tbWl0MjgzOTAzMjQxOmQzZTY0ZjZmMjhiMDU1MW...,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH/commi...,https://api.github.com/repos/wildmary/Portfoli...,[{'sha': '436b95cfa867c8867e90f6b72e74ec688404...,Mary Levchenko,wildmary@yandex.ru,2020-07-31T01:04:43Z,GitHub,...,False,[{'sha': '05c633952312fbe8ffc8d5ffd12149a9b651...,1,1,0,283903241.0,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH,wildmary/Portfolio-DH,https://api.github.com/repos/wildmary/Portfoli...
9,436b95cfa867c8867e90f6b72e74ec688404a3ff,MDY6Q29tbWl0MjgzOTAzMjQxOjQzNmI5NWNmYTg2N2M4OD...,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH/commi...,https://api.github.com/repos/wildmary/Portfoli...,[{'sha': '92d50e814a98ab1c661052ee686fd099c2b3...,Mary Levchenko,wildmary@yandex.ru,2020-07-31T01:04:05Z,GitHub,...,False,[{'sha': '7f3d83668fd725139746a2a1d3ad3b7c4681...,7,7,0,283903241.0,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH,wildmary/Portfolio-DH,https://api.github.com/repos/wildmary/Portfoli...


In [13]:
join_cols = list(set(response_commit_df.columns) - set(response_df.columns))
join_cols = join_cols + ['sha']
join_cols

['files', 'stats.total', 'stats.additions', 'stats.deletions', 'sha']

In [21]:
dfs = []
for _, row in response_df[0:2].iterrows():
    query = row.url
    response_commit = requests.get(query, headers=auth_headers)
    response_commit_df = pd.json_normalize(response_commit.json())
    # response_commit_df = response_commit_df[join_cols].to_dict()
    row['files'] = response_commit_df['files'].values[0]
    row['stats.total'] = response_commit_df['stats.total'].values[0]
    row['stats.additions'] = response_commit_df['stats.additions'].values[0]
    row['stats.deletions'] = response_commit_df['stats.deletions'].values[0]
    updated_df = pd.DataFrame([row.to_dict()])
    dfs.append(updated_df)

In [22]:
pd.concat(dfs)

Unnamed: 0,sha,node_id,url,html_url,comments_url,parents,commit.author.name,commit.author.email,commit.author.date,commit.committer.name,...,committer.organizations_url,committer.repos_url,committer.events_url,committer.received_events_url,committer.type,committer.site_admin,files,stats.total,stats.additions,stats.deletions
0,a4cb55355ba8fbe6cf19691600d19524fc110aef,MDY6Q29tbWl0MjgzOTAzMjQxOmE0Y2I1NTM1NWJhOGZiZT...,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH/commi...,https://api.github.com/repos/wildmary/Portfoli...,[{'sha': '3bb3421fa786b5762d2cef6af5846ce0254a...,Mary Levchenko,wildmary@yandex.ru,2020-07-31T02:24:21Z,GitHub,...,https://api.github.com/users/web-flow/orgs,https://api.github.com/users/web-flow/repos,https://api.github.com/users/web-flow/events{/...,https://api.github.com/users/web-flow/received...,User,False,[{'sha': '666217c728b52486a3a55f0d406aec1a4ef7...,1,1,0
0,3bb3421fa786b5762d2cef6af5846ce0254a8bd1,MDY6Q29tbWl0MjgzOTAzMjQxOjNiYjM0MjFmYTc4NmI1Nz...,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH/commi...,https://api.github.com/repos/wildmary/Portfoli...,[{'sha': '88bca5394425f3ab43dd5e3acb54b01ad3f4...,Mary Levchenko,wildmary@yandex.ru,2020-07-31T02:23:02Z,GitHub,...,https://api.github.com/users/web-flow/orgs,https://api.github.com/users/web-flow/repos,https://api.github.com/users/web-flow/events{/...,https://api.github.com/users/web-flow/received...,User,False,[{'sha': '4bbb2fdf3f7c89bb7adcef3544f19223562e...,16,13,3


In [45]:
pd.merge(test, response_commit_df[join_cols], on='sha', how='left')

Unnamed: 0,sha,node_id,url,html_url,comments_url,parents,commit.author.name,commit.author.email,commit.author.date,commit.committer.name,...,committer.organizations_url,committer.repos_url,committer.events_url,committer.received_events_url,committer.type,committer.site_admin,files,stats.deletions,stats.additions,stats.total
0,a4cb55355ba8fbe6cf19691600d19524fc110aef,MDY6Q29tbWl0MjgzOTAzMjQxOmE0Y2I1NTM1NWJhOGZiZT...,https://api.github.com/repos/wildmary/Portfoli...,https://github.com/wildmary/Portfolio-DH/commi...,https://api.github.com/repos/wildmary/Portfoli...,[{'sha': '3bb3421fa786b5762d2cef6af5846ce0254a...,Mary Levchenko,wildmary@yandex.ru,2020-07-31T02:24:21Z,GitHub,...,https://api.github.com/users/web-flow/orgs,https://api.github.com/users/web-flow/repos,https://api.github.com/users/web-flow/events{/...,https://api.github.com/users/web-flow/received...,User,False,[{'sha': '666217c728b52486a3a55f0d406aec1a4ef7...,0,1,1


In [21]:
response_df.to_dict()

{'login': {0: 'wildmary'},
 'id': {0: 38806620},
 'node_id': {0: 'MDQ6VXNlcjM4ODA2NjIw'},
 'avatar_url': {0: 'https://avatars.githubusercontent.com/u/38806620?v=4'},
 'gravatar_id': {0: ''},
 'url': {0: 'https://api.github.com/users/wildmary'},
 'html_url': {0: 'https://github.com/wildmary'},
 'followers_url': {0: 'https://api.github.com/users/wildmary/followers'},
 'following_url': {0: 'https://api.github.com/users/wildmary/following{/other_user}'},
 'gists_url': {0: 'https://api.github.com/users/wildmary/gists{/gist_id}'},
 'starred_url': {0: 'https://api.github.com/users/wildmary/starred{/owner}{/repo}'},
 'subscriptions_url': {0: 'https://api.github.com/users/wildmary/subscriptions'},
 'organizations_url': {0: 'https://api.github.com/users/wildmary/orgs'},
 'repos_url': {0: 'https://api.github.com/users/wildmary/repos'},
 'events_url': {0: 'https://api.github.com/users/wildmary/events{/privacy}'},
 'received_events_url': {0: 'https://api.github.com/users/wildmary/received_events'},

In [20]:
response_user_df.to_dict()

{'login': {0: 'wildmary'},
 'id': {0: 38806620},
 'node_id': {0: 'MDQ6VXNlcjM4ODA2NjIw'},
 'avatar_url': {0: 'https://avatars.githubusercontent.com/u/38806620?v=4'},
 'gravatar_id': {0: ''},
 'url': {0: 'https://api.github.com/users/wildmary'},
 'html_url': {0: 'https://github.com/wildmary'},
 'followers_url': {0: 'https://api.github.com/users/wildmary/followers'},
 'following_url': {0: 'https://api.github.com/users/wildmary/following{/other_user}'},
 'gists_url': {0: 'https://api.github.com/users/wildmary/gists{/gist_id}'},
 'starred_url': {0: 'https://api.github.com/users/wildmary/starred{/owner}{/repo}'},
 'subscriptions_url': {0: 'https://api.github.com/users/wildmary/subscriptions'},
 'organizations_url': {0: 'https://api.github.com/users/wildmary/orgs'},
 'repos_url': {0: 'https://api.github.com/users/wildmary/repos'},
 'events_url': {0: 'https://api.github.com/users/wildmary/events{/privacy}'},
 'received_events_url': {0: 'https://api.github.com/users/wildmary/received_events'},

In [166]:
user_df.columns

Index(['login', 'id', 'node_id', 'avatar_url', 'url', 'html_url',
       'followers_url', 'following_url', 'gists_url', 'starred_url',
       'subscriptions_url', 'organizations_url', 'repos_url', 'events_url',
       'received_events_url', 'type', 'site_admin', 'public_repos',
       'public_gists', 'followers', 'following', 'created_at', 'updated_at',
       'starred_at', 'gravatar_id', 'contributions', 'pushed_at'],
      dtype='object')

In [171]:
user_cols = pd.DataFrame(data=None, columns=response_user_df.columns)

In [173]:
user_cols.to_csv('../data/metadata_files/users_dataset_cols.csv', index=False)

In [169]:
set(user_df.columns.tolist()) & set(user_df.columns.tolist())

{'contributions', 'pushed_at', 'starred_at'}

In [144]:
user_df[0:1].to_dict()

{'login': {0: 'wildmary'},
 'id': {0: 38806620},
 'node_id': {0: 'MDQ6VXNlcjM4ODA2NjIw'},
 'avatar_url': {0: 'https://avatars.githubusercontent.com/u/38806620?v=4'},
 'url': {0: 'https://api.github.com/users/wildmary'},
 'html_url': {0: 'https://github.com/wildmary'},
 'followers_url': {0: 'https://api.github.com/users/wildmary/followers'},
 'following_url': {0: 'https://api.github.com/users/wildmary/following{/other_user}'},
 'gists_url': {0: 'https://api.github.com/users/wildmary/gists{/gist_id}'},
 'starred_url': {0: 'https://api.github.com/users/wildmary/starred{/owner}{/repo}'},
 'subscriptions_url': {0: 'https://api.github.com/users/wildmary/subscriptions'},
 'organizations_url': {0: 'https://api.github.com/users/wildmary/orgs'},
 'repos_url': {0: 'https://api.github.com/users/wildmary/repos'},
 'events_url': {0: 'https://api.github.com/users/wildmary/events{/privacy}'},
 'received_events_url': {0: 'https://api.github.com/users/wildmary/received_events'},
 'type': {0: 'User'},
 '

In [142]:
contributors_df[0:1].to_dict()

{'login': {0: 'wildmary'},
 'id': {0: 38806620},
 'node_id': {0: 'MDQ6VXNlcjM4ODA2NjIw'},
 'avatar_url': {0: 'https://avatars.githubusercontent.com/u/38806620?v=4'},
 'gravatar_id': {0: nan},
 'url': {0: 'https://api.github.com/users/wildmary'},
 'html_url': {0: 'https://github.com/wildmary'},
 'followers_url': {0: 'https://api.github.com/users/wildmary/followers'},
 'following_url': {0: 'https://api.github.com/users/wildmary/following{/other_user}'},
 'gists_url': {0: 'https://api.github.com/users/wildmary/gists{/gist_id}'},
 'starred_url': {0: 'https://api.github.com/users/wildmary/starred{/owner}{/repo}'},
 'subscriptions_url': {0: 'https://api.github.com/users/wildmary/subscriptions'},
 'organizations_url': {0: 'https://api.github.com/users/wildmary/orgs'},
 'repos_url': {0: 'https://api.github.com/users/wildmary/repos'},
 'events_url': {0: 'https://api.github.com/users/wildmary/events{/privacy}'},
 'received_events_url': {0: 'https://api.github.com/users/wildmary/received_events'}

In [139]:
pd.merge(top_ph_committers, top_ph_contributors, on='login', how='outer')

Unnamed: 0,login,commit_count,contributions
0,Adam Crymble,1490.0,
1,Matthew Lincoln,921.0,
2,Anisa Hawes,789.0,
3,Caleb McDaniel,397.0,
4,Jennifer Isasi,351.0,
5,Víctor Gayol,334.0,
6,spapastamkou,290.0,442.0
7,Daniel Alves,273.0,
8,Antonio Rojas Castro,270.0,
9,James Baker,221.0,


### Explore User Issues

In [95]:
issues_comments_counts = issues_comments_df['user.login'].value_counts().reset_index()
issues_comments_counts.columns = ['login', 'issues_comments_count']

alt.Chart(issues_comments_counts).mark_bar().encode(
    x=alt.X('issues_comments_count:Q'),
    y='count()'
)

In [104]:
top_issues_comments = issues_comments_counts[issues_comments_counts.issues_comments_count > 20]
len(top_issues_comments)

278

In [105]:
top_issues_comments.head(20)

Unnamed: 0,login,issues_comments_count
0,rlskoeser,3206
1,acrymble,2699
2,drjwbaker,1514
3,dependabot[bot],1441
4,mdlincoln,1392
5,benel,1331
6,gissoo,1002
7,thatbudakguy,865
8,walshbr,848
9,dependabot-preview[bot],771


In [107]:
top_issues_comments[top_issues_comments.login.str.contains('Zoe')]

Unnamed: 0,login,issues_comments_count
35,ZoeLeBlanc,265


In [106]:
top_issues_comments[top_issues_comments.login.isin(top_contributors['login'])]

Unnamed: 0,login,issues_comments_count
0,rlskoeser,3206
2,drjwbaker,1514
3,dependabot[bot],1441
5,benel,1331
6,gissoo,1002
7,thatbudakguy,865
8,walshbr,848
20,dwhieb,482
24,RJP43,410
25,blms,388


In [98]:
top_issues_comments_df = issues_comments_df[issues_comments_df['user.login'].isin(top_issues_comments['login'])]

In [99]:
top_issues_comments_repos = top_issues_comments_df['repo_full_name'].value_counts().reset_index()
top_issues_comments_repos.columns = ['repo_full_name', 'repo_issues_comments_count']

In [102]:
top_issues_comments_repos[top_issues_comments_repos.repo_issues_comments_count > 20]

Unnamed: 0,repo_full_name,repo_issues_comments_count
0,programminghistorian/jekyll,12764
1,Princeton-CDH/geniza,3369
2,Princeton-CDH/mep-django,2526
3,archivesunleashed/aut,1734
4,Hypertopic/Porphyry,1414
...,...,...
95,DH-Box/dh-usb,28
96,JonathanReeve/corpus-db,28
97,minicomp/wax_tasks,27
98,lmullen/americas-public-bible,23


In [None]:
user_dfs = []
repo_dfs = []
columns_dfs = []
dirs = ['../data/join_files', '../data/large_files/join_files']
for dir in dirs:
    for interal_dir, subdir, files in os.walk(dir):
        for f in files:
            if '.csv' in f:
                file_name = f.split('_join')[0]
                df = pd.read_csv(interal_dir + '/' + f, low_memory=False)
                col_df = pd.DataFrame([{'original_columns': df.columns.tolist(), 'file_type': file_name}])
                columns_dfs.append(col_df)
                if 'owner.login' in df.columns:
                    print('owner.login', file_name)
                if 'repo_full_name' in df.columns:
                    print('repo_full_name', file_name)
                if 'user_login' in df.columns:
                    print('user_login', file_name)
                if 'user.login' in df.columns:
                    print('user.login', file_name)
                # df['file_type'] = file_name
                # if 'repo_full_name' in df.columns :
                #     repo_dfs.append(df)
                # if 'user.login' in df.columns:
                #     user_dfs.append(df)

In [None]:
cols_df = pd.concat(columns_dfs)

In [None]:
cols_df.to_dict()

In [None]:
cols_df = cols_df.explode('original_columns')

In [None]:
cols_df.groupby(['original_columns', 'file_type']).size().reset_index().sort_values('original_columns')

In [None]:
user_join_df = pd.concat(user_dfs)
repo_join_df = pd.concat(repo_dfs)

In [None]:
user_join_df.columns.tolist()

In [None]:
stargazers_df = pd.read_csv('data/join_files/repo_stargazers_join_dataset.csv')

In [None]:
user_join_df.columns.tolist()

In [None]:
stargazers_df[0:1].to_dict()

In [None]:
user_join_df[0:1].to_dict()

In [None]:
repo_join_df[repo_join_df['user.following_url'].notna()].columns.tolist()