# Exploring all GitHub repositories with topic "digital-humanities"

Top-level exploration of repository metadata:
- Date of creation
  - activity?
  - frequency of commits
- Number of contributors (tbd)
  - number of contributions
- Length of description
- How many other topics on the repo
- What language in the repo
- How many forks
- How many PRs and how many issues
  - frequency of issues
- Wikis?

In [1]:
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None
import altair as alt
# Select Altair's 'mimetype' renderer for chart output.
alt.renderers.enable('mimetype')
import os
import sys

# Put the parent directory on the import path once so the project-local
# `data_generation_scripts` package can be imported from this notebook
# (presumably it lives one level up -- confirm against the repo layout).
# The membership check keeps re-running this cell from appending duplicates.
if ".." not in sys.path:
    sys.path.append("..")
from data_generation_scripts.get_repo_data import (
    check_rate_limit,
    get_dh_repos_data,
    get_repo_contributors,
    get_repos_commits,
    check_total_pages,
    get_search_api_data,
)
from data_generation_scripts.generate_search_data import generate_dh_queries, get_search_df
from data_generation_scripts.generate_language_data import get_repo_languages

In [2]:
# Query the rate-limit helper once; the result is handed to the data-collection
# helpers called later in this notebook (presumably GitHub API limits -- confirm
# against check_rate_limit).
rates_df = check_rate_limit()

In [3]:
# initial_output_path = '../data/repo_data/'
# df = generate_dh_queries(initial_output_path, rates_df)


In [4]:
# Load the repository table used by every later cell.
# NOTE(review): presumably this reads previously generated search results
# rather than querying the API -- confirm against get_search_df.
repo_df = get_search_df()

In [5]:
# Add language information to the repository table. The CSV path is passed to
# the helper (presumably where the combined result is stored -- confirm).
# The recorded run below processed 2051 items in roughly ten minutes.
output_path = "../data/combined_search_tagged_dh_repos_with_languages.csv"
repo_df = get_repo_languages(repo_df, output_path, rates_df)

Getting Languages: 100%|██████████| 2051/2051 [10:17<00:00,  3.32it/s]  


In [None]:
# repo_df = get_dh_repos_data('../data/repos_topic_dh.csv', rates_df)

In [None]:
# Collect contributor data for the repositories in repo_df; the CSV path is
# passed to the helper (presumably its output/cache file -- confirm).
contributors_df = get_repo_contributors(repo_df, '../data/repos_topic_dh_contributors.csv', rates_df)

In [None]:
# Collect commit data for the repositories in repo_df. Note the CSV path points
# into ../private_data/ rather than ../data/ (presumably kept out of the public
# data folder on purpose -- confirm).
commits_df = get_repos_commits(repo_df, '../private_data/repos_topic_dh_commits.csv', rates_df)

In [None]:
# Report how many rows (repositories) repo_df holds.
print(f"Number of identified repositories with the topic digital humanities: {len(repo_df)}")

### Date of Repo Creation

In [None]:
# Bar chart counting repositories per creation year-month; bars are coloured
# by the same year-month value and the colour legend is hidden.
alt.Chart(repo_df).mark_bar().encode(
    alt.X("yearmonth(created_at):T", axis=alt.Axis(title="Date")),
    alt.Y("count()", axis=alt.Axis(title="")),
    alt.Color(
        "yearmonth(created_at):T",
        legend=None,
        scale=alt.Scale(scheme="plasma"),
    ),
).properties(title="Frequency of DH Topic Repositories Created by Year and Month")

In [None]:
# Keep only the columns needed for the metric comparison. An explicit copy is
# taken because a 'year' column is added to subset_df afterwards; the copy makes
# that assignment target an independent frame instead of a selection of repo_df.
subset_df = repo_df[['forks_count', 'stargazers_count', 'watchers_count', 'size', 'html_url', 'created_at', 'full_name']].copy()

In [None]:
# Derive the four-digit creation year as a string from created_at; it is used
# as a nominal field for colouring and tooltips in the chart below.
subset_df['year'] = pd.to_datetime(subset_df['created_at']).dt.strftime('%Y')

In [None]:
# Repository metric columns to plot against each other.
cols = [
    "forks_count",
    "stargazers_count",
    "watchers_count",
    "size",
]
# The same metrics in the opposite order.
reverse_cols = list(reversed(cols))

In [None]:
# Grid of scatter plots: one row per entry in `cols`, one column per entry in
# `reverse_cols`, with points coloured by creation year and repository details
# shown in the tooltip.
alt.Chart(subset_df).mark_circle().encode(
    x=alt.X(alt.repeat("column"), type="quantitative"),
    y=alt.Y(alt.repeat("row"), type="quantitative"),
    color=alt.Color("year:N", scale=alt.Scale(scheme="plasma")),
    tooltip=["year:N", "html_url:N", "created_at:N", "full_name:N"],
).properties(width=125, height=125).repeat(row=cols, column=reverse_cols)

In [None]:
# Bar chart of how many repositories have each fork count. Reads the
# `forks_count` column, the name the other cells of this notebook use for forks.
alt.Chart(repo_df).mark_bar().encode(
    x='forks_count',
    y='count()',
)

In [None]:
# Bar chart of how many repositories have each stargazer count.
alt.Chart(repo_df).mark_bar().encode(
    x="stargazers_count",
    y="count()",
)