# Exploring the Overview of Core Materials

### Load Libraries and Datasets

In [3]:
import pandas as pd
pd.options.mode.chained_assignment = None
import altair as alt
alt.renderers.enable('mimetype')
alt.data_transformers.disable_max_rows()
import os
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('../../')
from searching_for_DH.data_generation_scripts.utils import save_chart, read_combine_files

In [4]:
# Root folder that every dataset path in this notebook is joined onto.
data_path = "../../datasets/"

In [5]:
# Entity tables: users, repositories and organizations.
user_df = pd.read_csv(
    os.path.join(data_path, "large_files/entity_files/users_dataset.csv")
)
repo_df = pd.read_csv(
    os.path.join(data_path, "large_files/entity_files/repos_dataset.csv"),
    low_memory=False,
)
org_df = pd.read_csv(
    os.path.join(data_path, "entity_files/orgs_dataset.csv"),
    low_memory=False,
)

# Join tables linking the search queries to the repositories / users.
search_queries_repo_join_df = pd.read_csv(
    os.path.join(data_path, "derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv")
)
search_queries_user_join_df = pd.read_csv(
    os.path.join(data_path, "derived_files/updated_search_queries_user_join_subset_dh_dataset.csv")
)

In [6]:
def process_core_materials(core_repos, core_users, core_orgs, include_month=True, include_day=False):
    """Count how many users, organizations and repositories were created per period.

    Each input frame must provide a ``created_at`` and an ``id`` column. The
    inputs are read only: the timestamps are parsed on private copies, so the
    caller's DataFrames are left untouched.

    Parameters
    ----------
    core_repos, core_users, core_orgs : pandas.DataFrame
        Repository, user and organization tables.
    include_month : bool, default True
        Bucket by calendar month. When False (and ``include_day`` is False) the
        counts are aggregated per year and ``month`` is reported as 1.
    include_day : bool, default False
        Bucket by calendar day. Daily buckets only make sense within a month,
        so this implies month granularity regardless of ``include_month``.

    Returns
    -------
    pandas.DataFrame
        One row per (data_type, period) with the columns ``data_type``,
        ``year``, ``month``, ``day``, ``counts`` and ``date``. ``date`` is the
        first instant of the bucket, localized to America/New_York. Rows whose
        ``created_at`` cannot be parsed are not counted.
    """
    def _tagged(frame, label):
        # Copy only the needed columns so the parsing and labelling below
        # never write back into the caller's DataFrame.
        tagged = frame[['created_at', 'id']].copy()
        # utc=True keeps all three tables on one tz-aware dtype, so the concat
        # below cannot degrade to object dtype (which would break `.dt`).
        tagged['created_at'] = pd.to_datetime(tagged['created_at'], errors='coerce', utc=True)
        tagged['data_type'] = label
        return tagged

    all_data = pd.concat(
        [
            _tagged(core_users, 'User'),
            _tagged(core_orgs, 'Organization'),
            _tagged(core_repos, 'Repository'),
        ],
        ignore_index=True,
    )
    # Unparseable timestamps carry no period information; drop them explicitly
    # (groupby would silently discard them anyway) so the parts stay integers.
    all_data = all_data.dropna(subset=['created_at'])

    # Group only by the granularity that was requested. Grouping by month and
    # then overwriting it with 1 would leave several rows for the same year.
    period_keys = ['year']
    if include_month or include_day:
        period_keys.append('month')
    if include_day:
        period_keys.append('day')
    for key in period_keys:
        all_data[key] = getattr(all_data['created_at'].dt, key)

    grouped_data = all_data.groupby(['data_type'] + period_keys).size().reset_index(name='counts')

    # Pin the components that were not grouped on to the start of the bucket so
    # a full date can be assembled.
    for component in ('month', 'day'):
        if component not in grouped_data.columns:
            grouped_data[component] = 1

    grouped_data['date'] = pd.to_datetime(grouped_data[['year', 'month', 'day']])
    grouped_data['date'] = grouped_data['date'].dt.tz_localize("America/New_York")
    return grouped_data

In [7]:
def overtime_chart(grouped_data, title):
    """Build a layered line chart of entity counts over time with a hover read-out.

    Parameters
    ----------
    grouped_data : pandas.DataFrame
        Table with ``date``, ``counts`` and ``data_type`` columns (the fields
        the encodings below refer to).
    title : str
        Chart title.

    Returns
    -------
    alt.LayerChart
        One line per ``data_type``; hovering shows a vertical rule plus the
        point and count of every line at the nearest date.
    """
    # Selection that follows the pointer along x. `empty=False` makes the
    # conditions below evaluate to false while nothing is hovered; it replaces
    # the deprecated string form `empty='none'`.
    nearest = alt.selection_point(nearest=True, on='mouseover', encodings=['x'], empty=False)

    line = alt.Chart(grouped_data).mark_line(interpolate='basis').encode(
        x=alt.X('date:T', title='Date Created', axis=alt.Axis(format='%Y')),
        y=alt.Y('counts:Q', title='Counts'),
        color=alt.Color('data_type:N', title='Type of Entity')
    )

    # Invisible points spanning the chart; they exist only to carry the
    # selection so the pointer's x-position can be tracked.
    selectors = alt.Chart(grouped_data).mark_point().encode(
        x='date:T',
        opacity=alt.value(0),
    ).add_params(
        nearest
    )

    # Markers on the lines, visible only at the hovered date.
    points = line.mark_point().encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0))
    )

    # Count labels next to the markers, blank everywhere else.
    text = line.mark_text(align='left', dx=5, dy=-5).encode(
        text=alt.condition(nearest, 'counts:Q', alt.value(' '))
    )

    # Vertical rule at the hovered date.
    rules = alt.Chart(grouped_data).mark_rule(color='gray').encode(
        x='date:T',
    ).transform_filter(
        nearest
    )

    chart = alt.layer(line, selectors, points, rules, text).properties(
        width=600,
        height=300,
        title=title
    )
    return chart

### Initial Core Materials

In [8]:
# Users, repositories and organizations found by the initial search queries.
initial_core_users = pd.read_csv(
    os.path.join(data_path, "derived_files/initial_core_users.csv")
)
initial_core_repos = pd.read_csv(
    os.path.join(data_path, "derived_files/initial_core_repos.csv")
)
initial_core_orgs = pd.read_csv(
    os.path.join(data_path, "derived_files/initial_core_orgs.csv")
)


In [28]:
# Frequency of each finalized language label among the initial core users.
initial_core_users["finalized_language"].value_counts()

en        630
it         32
es         15
pt         13
fr          8
en, it      7
en, de      6
de          6
en, es      4
it, en      3
None        2
en, ru      2
he, en      1
pl, en      1
ru          1
en, el      1
no          1
ko          1
de, en      1
en, fr      1
Name: finalized_language, dtype: int64

In [10]:
# Monthly creation counts for the entities found by the initial search queries.
initial_grouped_data = process_core_materials(
    core_repos=initial_core_repos,
    core_users=initial_core_users,
    core_orgs=initial_core_orgs,
    include_month=True,
    include_day=False,
)

In [41]:
# Line chart of the initial-search counts over time.
initial_chart = overtime_chart(
    initial_grouped_data,
    "Number of Users, Organizations, and Repositories identified as DH, Created Over Time via Initial Search Queries",
)

In [42]:
# Display the initial-search chart as this cell's output.
initial_chart

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


### First Pass Core Materials

In [43]:
# Users, repositories and organizations added by the first pass.
first_pass_core_users = pd.read_csv(
    os.path.join(data_path, "derived_files/firstpass_core_users.csv")
)
first_pass_core_repos = pd.read_csv(
    os.path.join(data_path, "derived_files/firstpass_core_repos.csv")
)
first_pass_core_orgs = pd.read_csv(
    os.path.join(data_path, "derived_files/firstpass_core_orgs.csv")
)

In [45]:
# Monthly creation counts for the first-pass entities.
first_pass_grouped_data = process_core_materials(
    core_repos=first_pass_core_repos,
    core_users=first_pass_core_users,
    core_orgs=first_pass_core_orgs,
    include_month=True,
)

In [46]:
# Line chart of the first-pass counts over time.
first_pass_chart = overtime_chart(
    first_pass_grouped_data,
    "Number of Users, Organizations, and Repositories identified as DH, Created Over Time via First Pass",
)

In [47]:
# Display the first-pass chart as this cell's output.
first_pass_chart

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


### Final Pass Core Materials

In [48]:
# Users, repositories and organizations added by the final pass.
# Note that the repository file is read from the large_files folder.
finalpass_core_users = pd.read_csv(
    os.path.join(data_path, "derived_files/finalpass_core_users.csv")
)
finalpass_core_repos = pd.read_csv(
    os.path.join(data_path, "large_files/derived_files/finalpass_core_repos.csv")
)
finalpass_core_orgs = pd.read_csv(
    os.path.join(data_path, "derived_files/finalpass_core_orgs.csv")
)

In [49]:
# Monthly creation counts for the final-pass entities.
finalpass_grouped_data = process_core_materials(
    core_repos=finalpass_core_repos,
    core_users=finalpass_core_users,
    core_orgs=finalpass_core_orgs,
    include_month=True,
)

In [50]:
# Line chart of the final-pass counts over time.
finalpass_chart = overtime_chart(
    finalpass_grouped_data,
    "Number of Users, Organizations, and Repositories identified as DH, Created Over Time via Final Pass Thresholding Algorithm",
)

In [51]:
# Display the final-pass chart as this cell's output.
finalpass_chart

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


### Combined Core Materials

In [52]:
# Record, on every table, which collection method produced its rows so the
# three sources stay distinguishable after they are concatenated.
initial_core_users['origin'] = initial_core_repos['origin'] = initial_core_orgs['origin'] = 'initial'
first_pass_core_users['origin'] = first_pass_core_repos['origin'] = first_pass_core_orgs['origin'] = 'first_pass'
finalpass_core_users['origin'] = finalpass_core_repos['origin'] = finalpass_core_orgs['origin'] = 'final_pass'

In [53]:
# Stack the three collection stages into one table per entity type.
core_users = pd.concat(
    [initial_core_users, first_pass_core_users, finalpass_core_users]
)
core_repos = pd.concat(
    [initial_core_repos, first_pass_core_repos, finalpass_core_repos]
)
core_orgs = pd.concat(
    [initial_core_orgs, first_pass_core_orgs, finalpass_core_orgs]
)


In [54]:
# Monthly creation counts across all three collection stages together.
combined_grouped_data = process_core_materials(
    core_repos=core_repos,
    core_users=core_users,
    core_orgs=core_orgs,
    include_month=True,
)

In [55]:
# Line chart of the combined counts over time.
combined_chart = overtime_chart(
    combined_grouped_data,
    "Number of Users, Organizations, and Repositories identified as DH, Created Over Time via All Three Methods",
)

In [56]:
# Display the combined chart as this cell's output.
combined_chart

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [57]:
# Same preparation as process_core_materials, but done inline so that the
# `origin` column can take part in the grouping.

# Parse the timestamps and label each combined table with its entity type.
core_users['created_at'] = pd.to_datetime(core_users['created_at'], errors='coerce')
core_users['data_type'] = 'User'
core_orgs['created_at'] = pd.to_datetime(core_orgs['created_at'], errors='coerce')
core_orgs['data_type'] = 'Organization'
core_repos['created_at'] = pd.to_datetime(core_repos['created_at'], errors='coerce')
core_repos['data_type'] = 'Repository'

# One long table holding just the columns shared by all three entity types.
all_data = pd.concat([
    core_users[['data_type', 'created_at', 'id', 'origin']],
    core_orgs[['data_type', 'created_at', 'id', 'origin']],
    core_repos[['data_type', 'created_at', 'id', 'origin']],
])

# Attach the size of each entity table to its rows.
all_data['total_counts'] = 0
all_data.loc[all_data['data_type'] == 'User', 'total_counts'] = len(core_users)
all_data.loc[all_data['data_type'] == 'Organization', 'total_counts'] = len(core_orgs)
all_data.loc[all_data['data_type'] == 'Repository', 'total_counts'] = len(core_repos)

# Only month and year are extracted; every bucket is pinned to day 1 below.
all_data['month'] = all_data['created_at'].dt.month
all_data['year'] = all_data['created_at'].dt.year

# Rows per (origin, entity type, year, month).
grouped_data = (
    all_data
    .groupby(['origin', 'data_type', 'year', 'month'])
    .size()
    .reset_index(name='counts')
)
grouped_data['day'] = 1
grouped_data['date'] = (
    pd.to_datetime(grouped_data[['year', 'month', 'day']])
    .dt.tz_localize("America/New_York")
)

In [59]:
# One line chart per entity type, with one coloured line per collection method.
charts = []
for data_type in grouped_data['data_type'].unique().tolist():
    subset_data = grouped_data.loc[grouped_data['data_type'] == data_type]
    chart = (
        alt.Chart(subset_data)
        .mark_line(interpolate='basis')
        .encode(
            x=alt.X('date:T', title='Date Created', axis=alt.Axis(format='%Y')),
            y=alt.Y('counts', scale=alt.Scale(zero=False)),
            column='data_type',
            color='origin',
            tooltip=["date:T", "counts", "origin", "data_type"],
        )
        # No per-chart title is set; the stacked figure gets a single overall title.
        .properties(width=600)
    )
    charts.append(chart)

In [79]:
# Stack the per-type charts vertically; each keeps its own x and y scale.
final_chart = (
    alt.vconcat(*charts)
    .resolve_scale(y='independent', x='independent')
    .properties(
        title="Number of Users, Organizations, and Repositories identified as DH, Created Over Time via All Three Methods, by Data Type"
    )
)

final_chart

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [75]:
# Human-readable labels for the three collection methods.
origin_mapping = {
    'initial': 'Initial Search Queries',
    'first_pass': 'First Pass Thresholding',
    'final_pass': 'Final Pass Thresholding'
}
# Fall back to the current value for anything that is not a raw key. A plain
# `.map(origin_mapping)` turns already-translated labels into NaN, so running
# this cell a second time would wipe out the `origin` column.
grouped_data['origin'] = grouped_data['origin'].map(lambda origin: origin_mapping.get(origin, origin))

In [77]:
# Stacked bars of monthly counts, one column per collection method and one
# colour per entity type.
chart = (
    alt.Chart(grouped_data)
    .mark_bar()
    .encode(
        x=alt.X('date:T', title='Date First Created', axis=alt.Axis(format='%Y')),
        y=alt.Y('counts', scale=alt.Scale(zero=False), stack=True),
        column=alt.Column(
            'origin',
            sort=['Initial Search Queries', 'First Pass Thresholding', 'Final Pass Thresholding'],
            title='Method of Data Collection',
        ),
        color=alt.Color('data_type', legend=alt.Legend(title="Type of GitHub Entity")),
        tooltip=["date:T", "counts", "origin", "data_type"],
    )
    .properties(
        width=300,
        title="Number of Users, Organizations, and Repositories identified as DH, Created Over Time via All Three Methods of Data Collection",
    )
)

# Shared typography for the title, axes and legend.
chart = chart.configure(
    title=alt.TitleConfig(fontSize=20, font='Garamond', anchor='middle'),
    axis=alt.AxisConfig(labelFont='Garamond', titleFont='Garamond'),
    legend=alt.LegendConfig(labelFont='Garamond', titleFont='Garamond'),
)

In [78]:
# Export the faceted bar chart with the project's save_chart helper.
save_chart(
    chart,
    '../outputs/number_of_users_orgs_repos_created_over_time.png',
    scale_factor=2,
)