# Exploring all GitHub repositories with topic "digital-humanities"

Top-level exploration of repository metadata:
- Date of creation
  - activity?
  - frequency of commits
- Number of contributors (tbd)
  - number of contributions
- Length of description
- How many other topics on the repo
- What language in the repo
- How many forks
- How many PRs and how many issues
  - frequency of issues
- Wikis?

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import altair as alt
alt.renderers.enable('mimetype')
import os
import sys

sys.path.append("..")
from data_generation_scripts.utils import check_rate_limit, check_return_error_file
from data_generation_scripts.generate_search_data import get_initial_repo_df, combine_search_df
from data_generation_scripts.generate_repo_metadata import get_repo_languages, get_repo_labels, get_repo_tags, get_repos_user_actors
from data_generation_scripts.generate_commits_data import get_repos_commits
from data_generation_scripts.generate_contributor_data import get_repo_contributors

In [2]:
# Result of the project's rate-limit helper; `rates_df` is passed to the
# data-collection helpers called in the cells below.
rates_df = check_rate_limit()

In [3]:
# Locations of the search-result files handed to the loader below.
initial_output_path = '../data/repo_data/'
repo_output_path = '../data/repos_dataset.csv'
join_output_path = "../data/search_queries_join_dataset.csv"
# NOTE(review): presumably True means "reuse the saved CSVs" — confirm in the helper.
load_existing_data = True

# The helper returns two frames: the repo table and the query/repo join table.
repo_df, search_queries_repo_df = get_initial_repo_df(
    repo_output_path,
    join_output_path,
    initial_output_path,
    rates_df,
    load_existing_data,
)


In [4]:
# Summary: number of distinct values in the `query` column, number of rows in the
# query/repo join table, and number of rows in `repo_df`.
print(f"From {len(search_queries_repo_df['query'].unique())} unique queries with results, we found {len(search_queries_repo_df)} repos, of which {len(repo_df)} are unique.")

From 41 unique queries with results, we found 2190 repos, of which 2104 are unique.


### Explore distribution of search results
How often do users tag their repos as DH, and how often do they not?

*It is also worth noting that we do not search for "DH" itself, even though that is likely to be a common term.*

In [5]:
def _clean_query(query):
    """Reduce a raw query string to a readable label ending in ' (topic)' or ' (search)'."""
    if 'topic' in query:
        # Keep what follows the last 'topic' up to the next '&', then drop the
        # ':' / '%3A' separator and turn hyphens into spaces.
        term = query.split('topic')[-1].split('&')[0]
        term = term.replace(':', '').replace('%3A', '').replace('-', ' ')
        return term + ' (topic)'
    # Otherwise keep what follows '?q=' and cut the trailing '%22+created' / '&per_page' parts.
    term = query.split('?q=')[-1].split('%22+created')[0].split('&per_page')[0]
    term = term.replace('%22', '').replace('+', ' ')
    return term + ' (search)'


all_queries = search_queries_repo_df['query'].unique().tolist()
final_queries = [
    {'query': query, 'cleaned_query': _clean_query(query)}
    for query in all_queries
]
final_queries_df = pd.DataFrame(final_queries)
# Attach the readable label to every query/repo row.
combined_queries = pd.merge(
    search_queries_repo_df,
    final_queries_df,
    on='query',
    how='left',
)
    

In [6]:
# Number of query/repo rows per cleaned query label, as a two-column frame
# (`queries`, `counts`).
cleaned_query_counts = combined_queries['cleaned_query'].value_counts()
queries_distribution = (
    cleaned_query_counts
    .rename_axis('queries')
    .reset_index(name='counts')
)

In [7]:
import json
import codecs

# Read the JSON file inside a `with` block so the file handle is closed as soon
# as parsing finishes (the handle was previously left open).
with codecs.open('../data/en.Digital humanities.json', 'r', 'utf-8-sig') as dh_terms_file:
    dh_terms = json.load(dh_terms_file)

# One-row frame (one column per JSON key), melted to one row per key/value pair.
# NOTE(review): presumably the keys are language codes and the values the
# translated term — confirm against the data file.
dh_df = pd.DataFrame([dh_terms])
dh_df = dh_df.melt()
dh_df.columns = ['language', 'dh_term']
# Combine German and English terms because of identical spelling (should maybe make this a programmatic check)
dh_df.loc[dh_df.language == 'de', 'language'] = 'de_en'

In [8]:
def _label_terms(terms_df, suffix):
    """Return a copy of `terms_df` with `dh_term` replaced by a `queries` column (term + suffix)."""
    labelled = terms_df.copy()
    labelled['queries'] = labelled['dh_term'] + suffix
    return labelled.drop(columns=['dh_term'])


# Build one label per term for each query kind, matching the ' (topic)' /
# ' (search)' suffixes used for the cleaned query labels.
dh_df_topics = _label_terms(dh_df, ' (topic)')
dh_df_search = _label_terms(dh_df, ' (search)')
concat_dh_df = pd.concat([dh_df_topics, dh_df_search])

In [9]:
# Outer join keeps labels that appear on only one side: queries with results but
# no term entry, and terms with no results.
merged_df = pd.merge(
    queries_distribution,
    concat_dh_df,
    on='queries',
    how='outer',
)

In [10]:
# Rows that came only from the term list have no count after the outer merge;
# treat them as zero results. Assign the result back to the column: an in-place
# fillna on `merged_df.counts` is a chained operation that pandas deprecates and
# that has no effect on `merged_df` under Copy-on-Write.
merged_df['counts'] = merged_df['counts'].fillna(0)

In [11]:
# Language codes set by hand for specific topic labels.
topic_language_overrides = {
    'digital humanities (topic)': 'de_en',
    'humanidades digitales (topic)': 'es',
    'humanites numeriques (topic)': 'fr',
    'digital humanities centers (topic)': 'de_en',
    'digital public humanities (topic)': 'de_en',
}
for topic_label, language_code in topic_language_overrides.items():
    merged_df.loc[merged_df.queries == topic_label, 'language'] = language_code

In [12]:

# Table that is joined on `language` in the next cell.
# NOTE(review): judging by the file name this holds ISO 639 codes — confirm its columns.
lang_options_path = '../data/repo_data/iso_639_choices.csv'
lang_options = pd.read_csv(lang_options_path)

In [13]:
# Left join: every row of merged_df is kept, with the lang_options columns
# attached where `language` matches.
merged_df = pd.merge(
    merged_df,
    lang_options,
    on='language',
    how='left',
)

In [14]:
# Display names set by hand for two language codes ('de_en' is the combined code
# created earlier in this notebook).
language_name_overrides = {
    'sh': 'Serbo-Croatian',
    'de_en': 'German and English',
}
for language_code, display_name in language_name_overrides.items():
    merged_df.loc[merged_df.language == language_code, 'name'] = display_name

In [15]:
def _query_bar_chart(data):
    """Bar chart of `counts` per `queries` label, sorted by count and coloured by language name."""
    return alt.Chart(data).mark_bar().encode(
        y=alt.Y('queries', sort='-x'),
        x='counts',
        color=alt.Color('name', scale=alt.Scale(scheme='plasma'), legend=alt.Legend(title='Language'))
    )


# Left panel: every query label.
all_data = _query_bar_chart(merged_df)
# Right panel: only labels with fewer than 500 results, so the smaller bars are readable.
non_eng = _query_bar_chart(merged_df[merged_df.counts < 500])

all_data | non_eng

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


#### Explore cooccurrence of terms

In [16]:
# Rows per repo in the query/repo table, most frequent first.
rows_per_repo = combined_queries.groupby('full_name').size()
grouped_repos = rows_per_repo.reset_index(name='counts').sort_values('counts', ascending=False)

# Repos that appear in more than one row.
grouped_list = grouped_repos[grouped_repos['counts'] > 1].full_name.tolist()

# For those repos, collect the cleaned query labels into one list per repo.
in_several_rows = combined_queries['full_name'].isin(grouped_list)
freq_queries = (
    combined_queries[in_several_rows]
    .groupby('full_name')['cleaned_query']
    .apply(list)
    .reset_index()
)

In [17]:
# Back to one row per (repo, query label), then count each pair.
exploded_df = freq_queries.explode('cleaned_query')
grouped_exploded = (
    exploded_df
    .groupby(['full_name', 'cleaned_query'])
    .size()
    .reset_index(name='val')
)

# Repo-by-label matrix of those counts; missing pairs become 0.
pivoted_df = (
    grouped_exploded
    .pivot(index='full_name', columns='cleaned_query', values='val')
    .fillna(0)
    .astype(int)
)

# Label-by-label matrix: transpose of the repo-by-label matrix times itself.
label_by_repo = pivoted_df.T
final_df = label_by_repo.dot(pivoted_df)

In [18]:
# Move the label index into a regular `cleaned_query` column and clear the
# columns-axis name.
reset_final = final_df.reset_index().rename_axis(None, axis=1)
# Every column except the first (`cleaned_query`) is a label column to unpivot.
cols = reset_final.columns[1:]
# Long format: one row per (cleaned_query, variable) pair with its `value`.
melted_df = pd.melt(
    reset_final,
    id_vars=['cleaned_query'],
    value_vars=cols,
)

In [19]:
def _cooccurrence_heatmap(data):
    """Heatmap of `value` for each (`cleaned_query`, `variable`) pair."""
    return alt.Chart(data).mark_rect().encode(
        x='cleaned_query',
        y='variable',
        color='value'
    )


# Left panel: all pairs.
top_results = _cooccurrence_heatmap(melted_df)
# Right panel: only pairs with a value below 10.
rest_results = _cooccurrence_heatmap(melted_df[melted_df.value < 10])

# Independent colour scales so each panel uses its own value range.
alt.hconcat(top_results, rest_results).resolve_scale(color='independent')

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


### Get Contributors

In [20]:
# Settings for this call to the user-actor helper.
# NOTE(review): `get_url_field` presumably names the repo_df column holding the
# API URL to follow — confirm in the helper.
get_url_field = 'contributors_url'
load_existing_data = True
is_stargazers = False

contributors_df, users_df = get_repos_user_actors(
    repo_df,
    '../data/repo_contributors_join_dataset.csv',
    '../data/users_dataset.csv',
    rates_df,
    get_url_field,
    load_existing_data,
    is_stargazers,
)
# Contents of the matching error log, used for the summary printed below.
contributors_errors_df = check_return_error_file(
    '../data/error_logs/repo_contributors_join_dataset_errors.csv'
)

In [21]:
# Summary: rows in repo_df, rows in contributors_df, rows of users_df whose login
# appears in contributors_df, and rows in the contributors error log.
print(f"From {len(repo_df)} repos, we found {len(contributors_df)} contributors, of which {len(users_df[users_df.login.isin(contributors_df.login)])} are unique. There were {len(contributors_errors_df)} errors in getting contributors (likely user accounts that no longer exist).")

From 2104 repos, we found 4685 contributors, of which 3454 are unique. There were 67 errors in getting contributors (likely user accounts that no longer exist).


In [22]:
# Settings for this call to the user-actor helper; this is the only call made
# with is_stargazers=True.
get_url_field = 'stargazers_url'
load_existing_data = True
is_stargazers = True

stargazers_df, users_df = get_repos_user_actors(
    repo_df,
    '../data/repo_stargazers_join_dataset.csv',
    '../data/users_dataset.csv',
    rates_df,
    get_url_field,
    load_existing_data,
    is_stargazers,
)
# Contents of the matching error log, used for the summary printed below.
stargazers_errors_df = check_return_error_file(
    '../data/error_logs/repo_stargazers_join_dataset_errors.csv'
)

In [23]:
# Summary: rows in repo_df, rows in stargazers_df, rows of users_df whose login
# appears in stargazers_df, and rows in the stargazers error log.
print(f"From {len(repo_df)} repos, we found {len(stargazers_df)} stargazers, of which {len(users_df[users_df.login.isin(stargazers_df.login)])} are unique. There were {len(stargazers_errors_df)} errors in getting stargazers (likely user accounts that no longer exist).")

From 2104 repos, we found 7561 stargazers, of which 3150 are unique. There were 12 errors in getting stargazers (likely user accounts that no longer exist).


In [24]:
# Settings for this call to the user-actor helper.
get_url_field = 'subscribers_url'
load_existing_data = True
is_stargazers = False

subscribers_df, users_df = get_repos_user_actors(
    repo_df,
    '../data/repo_subscribers_join_dataset.csv',
    '../data/users_dataset.csv',
    rates_df,
    get_url_field,
    load_existing_data,
    is_stargazers,
)
# Contents of the matching error log, used for the summary printed below.
subscribers_errors_df = check_return_error_file(
    '../data/error_logs/repo_subscribers_join_dataset_errors.csv'
)

In [25]:
# Summary: rows in repo_df, rows in subscribers_df, rows of users_df whose login
# appears in subscribers_df, and rows in the subscribers error log.
print(f"From {len(repo_df)} repos, we found {len(subscribers_df)} subscribers, of which {len(users_df[users_df.login.isin(subscribers_df.login)])} are unique. There were {len(subscribers_errors_df)} errors in getting subscribers (likely user accounts that no longer exist).")

From 2104 repos, we found 2684 subscribers, of which 1769 are unique. There were 0 errors in getting subscribers (likely user accounts that no longer exist).


In [None]:
# Settings for this call to the user-actor helper. Unlike the contributor,
# stargazer and subscriber cells, load_existing_data is False here.
get_url_field = 'collaborators_url'
load_existing_data = False
is_stargazers = False

collaborators_df, users_df = get_repos_user_actors(
    repo_df,
    '../data/repo_collaborators_join_dataset.csv',
    '../data/users_dataset.csv',
    rates_df,
    get_url_field,
    load_existing_data,
    is_stargazers,
)
# Contents of the matching error log.
collaborators_errors_df = check_return_error_file(
    '../data/error_logs/repo_collaborators_join_dataset_errors.csv'
)

In [None]:
# Replace repo_df with the helper's result.
# NOTE(review): presumably adds per-repo language data and writes to
# repo_output_path — confirm in generate_repo_metadata.
repo_df = get_repo_languages(repo_df, repo_output_path, rates_df)

In [None]:
# Replace repo_df with the helper's result.
# NOTE(review): presumably adds per-repo label data and writes to
# repo_output_path — confirm in generate_repo_metadata.
repo_df = get_repo_labels(repo_df, repo_output_path, rates_df)

In [None]:
# Replace repo_df with the helper's result.
# NOTE(review): presumably adds per-repo tag data and writes to
# repo_output_path — confirm in generate_repo_metadata.
repo_df = get_repo_tags(repo_df, repo_output_path, rates_df)

In [None]:
# Commit data for the repos in repo_df. Note the path is under ../private_data/,
# not ../data/ like the other datasets in this notebook.
commits_df = get_repos_commits(repo_df, '../private_data/search_tagged_dh_repos_commits.csv', rates_df)

### Date of Repo Creation

In [None]:
# Bars count repos per creation year-month; the same field drives both the
# x position and the bar colour.
created_month_field = "yearmonth(created_at):T"
created_month_axis = alt.X(created_month_field, axis=alt.Axis(title="Date"))
repo_count_axis = alt.Y("count()", axis=alt.Axis(title=""))
created_month_colour = alt.Color(created_month_field, legend=None, scale=alt.Scale(scheme='plasma'))

alt.Chart(repo_df).mark_bar().encode(
    x=created_month_axis,
    y=repo_count_axis,
    color=created_month_colour,
).properties(
    title="Frequency of DH Topic Repositories Created by Year and Month",
)

In [None]:
# Columns used by the scatter-matrix below. Take an explicit copy so that the
# `year` column added in the next cell is written to an independent frame,
# rather than to a slice of repo_df (which pandas flags as chained assignment).
subset_df = repo_df[['forks_count', 'stargazers_count', 'watchers_count', 'size', 'html_url', 'created_at', 'full_name']].copy()

In [None]:
# Four-digit creation year as a string, used as the colour category in the
# scatter-matrix.
created_timestamps = pd.to_datetime(subset_df['created_at'])
subset_df['year'] = created_timestamps.dt.strftime('%Y')

In [None]:
# Numeric repo metrics used as the rows of the scatter-matrix; the columns use
# the same metrics in reverse order.
cols = [
    'forks_count',
    'stargazers_count',
    'watchers_count',
    'size',
]
reverse_cols = list(reversed(cols))

In [None]:
# Single scatter panel whose x/y fields are filled in by the repeat below;
# points are coloured by creation year.
scatter_panel = alt.Chart(subset_df).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative'),
    color=alt.Color('year:N', scale=alt.Scale(scheme='plasma')),
    tooltip=['year:N', 'html_url:N', 'created_at:N', 'full_name:N']
).properties(width=125, height=125)

# Grid of panels: one row per metric in `cols`, one column per metric in `reverse_cols`.
scatter_panel.repeat(row=cols, column=reverse_cols)

In [None]:
# Number of repos for each value of the `forks` column.
forks_histogram = alt.Chart(repo_df).mark_bar().encode(x='forks', y='count()')
forks_histogram

In [None]:
# Number of repos for each value of the `stargazers_count` column.
stargazers_histogram = alt.Chart(repo_df).mark_bar().encode(x='stargazers_count', y='count()')
stargazers_histogram