# Exploring the Overview of Core Materials

### Load Libraries and Datasets

In [1]:
# Third-party libraries: pandas for tabular data, Altair for charting.
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
import altair as alt
# Emit charts through the 'mimetype' renderer and disable Altair's maximum-rows check on chart data.
alt.renderers.enable('mimetype')
alt.data_transformers.disable_max_rows()
import os
import sys
import warnings
# NOTE(review): this silences every warning raised after this point, including
# deprecation notices from pandas/Altair and from the project imports below.
warnings.filterwarnings('ignore')
# Put the parent directory on the import path so the data_generation_scripts
# package imported below can be resolved (must run before those imports).
sys.path.append("..")
from data_generation_scripts.utils import save_chart, read_combine_files
from data_generation_scripts.generate_translations import check_detect_language

In [2]:
# Entity-level datasets: users, repositories, and organizations.
user_df = pd.read_csv("../data/entity_files/users_dataset.csv")
repo_df = pd.read_csv(
    "../data/large_files/entity_files/repos_dataset.csv",
    low_memory=False,
)
org_df = pd.read_csv(
    "../data/entity_files/orgs_dataset.csv",
    low_memory=False,
)

# Derived join tables linking search queries to repositories and to users.
search_queries_repo_join_df = pd.read_csv(
    "../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv"
)
search_queries_user_join_df = pd.read_csv(
    "../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv"
)

In [15]:
def process_core_materials(core_repos, core_users, core_orgs, include_month=True):
    """Count how many users, organizations, and repositories were created per period.

    Parameters
    ----------
    core_repos, core_users, core_orgs : pandas.DataFrame
        Each frame must provide ``created_at`` and ``id`` columns. The frames
        are not modified: the function works on copies of those two columns.
    include_month : bool, default True
        If true, counts are bucketed by (year, month). If false, counts are
        bucketed by year only and each bucket is dated January 1st of its year.

    Returns
    -------
    pandas.DataFrame
        One row per (data_type, period) with the columns ``data_type``
        ('User', 'Organization' or 'Repository'), ``year``, ``month``,
        ``counts``, ``day`` (always 1) and ``date`` (first day of the period,
        localized to America/New_York). Rows whose ``created_at`` cannot be
        parsed are left out of the counts.
    """
    labelled_frames = []
    for label, frame in (
        ('User', core_users),
        ('Organization', core_orgs),
        ('Repository', core_repos),
    ):
        # Copy only the needed columns so the caller's frame is never mutated.
        subset = frame[['created_at', 'id']].copy()
        # utc=True gives every frame the same tz-aware dtype, so concatenating
        # an empty or timezone-naive frame cannot degrade the column to object
        # dtype (which would break the .dt accessor below).
        # NOTE(review): timestamps are assumed to be UTC or naive; values with
        # another offset are converted to UTC before bucketing.
        subset['created_at'] = pd.to_datetime(subset['created_at'], errors='coerce', utc=True)
        subset['data_type'] = label
        labelled_frames.append(subset[['data_type', 'created_at', 'id']])
    all_data = pd.concat(labelled_frames)

    all_data['month'] = all_data['created_at'].dt.month
    all_data['year'] = all_data['created_at'].dt.year

    # Group at the requested granularity. Grouping by month and then overwriting
    # the month would leave several un-summed rows per year sharing one date.
    group_keys = ['data_type', 'year', 'month'] if include_month else ['data_type', 'year']
    grouped_data = all_data.groupby(group_keys).size().reset_index(name='counts')
    if not include_month:
        grouped_data['month'] = 1
    grouped_data['day'] = 1

    # year/month are floats whenever unparseable dates (NaT) were present; the
    # groupby has already dropped those rows, so the integer cast is safe.
    grouped_data['year'] = grouped_data['year'].astype(int)
    grouped_data['month'] = grouped_data['month'].astype(int)

    grouped_data['date'] = pd.to_datetime(grouped_data[['year', 'month', 'day']])
    grouped_data['date'] = grouped_data['date'].dt.tz_localize("America/New_York")
    return grouped_data[['data_type', 'year', 'month', 'counts', 'day', 'date']]

In [12]:
def overtime_chart(grouped_data, title):
    """Build a layered, interactive line chart of entity counts over time.

    Parameters
    ----------
    grouped_data : pandas.DataFrame
        Data for the chart; the encodings below reference its ``date``,
        ``counts`` and ``data_type`` fields.
    title : str
        Title shown above the chart.

    Returns
    -------
    altair.LayerChart
        Five layers (lines, invisible selector points, highlighted points,
        a vertical rule, and count labels), sized 600x300.
    """
    # Point selection that snaps to the nearest x value under the pointer.
    # empty=False (the Altair 5 spelling of the deprecated empty='none') means
    # nothing counts as selected until the pointer is over the chart.
    nearest = alt.selection_point(nearest=True, on='mouseover', encodings=['x'], empty=False)

    # Base layer: one smoothed line per entity type.
    line = alt.Chart(grouped_data).mark_line(interpolate='basis').encode(
        x=alt.X('date:T', title='Date Created', axis=alt.Axis(format='%Y')),
        y=alt.Y('counts:Q', title='Counts'),
        color=alt.Color('data_type:N', title='Type of Entity')
    )

    # Fully transparent points across the x axis; they carry the selection.
    selectors = alt.Chart(grouped_data).mark_point().encode(
        x='date:T',
        opacity=alt.value(0),
    ).add_params(
        nearest
    )

    # Points on the lines, visible only at the selected x position.
    points = line.mark_point().encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0))
    )

    # Count labels next to the highlighted points; a blank string elsewhere.
    text = line.mark_text(align='left', dx=5, dy=-5).encode(
        text=alt.condition(nearest, 'counts', alt.value(' '))
    )

    # Vertical gray rule drawn at the selected x position.
    rules = alt.Chart(grouped_data).mark_rule(color='gray').encode(
        x='date:T',
    ).transform_filter(
        nearest
    )

    chart = alt.layer(line, selectors, points, rules, text).properties(
        width=600,
        height=300,
        title=title
    )
    return chart

### Initial Core Users, Repositories, and Organizations

In [3]:
# Load the initial core users, repositories, and organizations (in that order).
initial_core_users, initial_core_repos, initial_core_orgs = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/derived_files/initial_core_users.csv",
        "../data/derived_files/initial_core_repos.csv",
        "../data/derived_files/initial_core_orgs.csv",
    )
)

In [16]:
# Monthly creation counts for the initial core entities.
initial_grouped_data = process_core_materials(
    core_repos=initial_core_repos,
    core_users=initial_core_users,
    core_orgs=initial_core_orgs,
    include_month=True,
)

In [17]:
# Build the over-time chart for the initial search-query results.
initial_chart = overtime_chart(
    grouped_data=initial_grouped_data,
    title="Number of Users, Organizations, and Repositories identified as DH, Created Over Time via Initial Search Queries",
)

In [18]:
# Bare last expression so Jupyter renders the chart as this cell's output.
initial_chart

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


### First Pass Core Users, Repositories, and Organizations

In [19]:
# Load the first-pass core users, repositories, and organizations (in that order).
first_pass_core_users, first_pass_core_repos, first_pass_core_orgs = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/derived_files/core_users.csv",
        "../data/derived_files/core_repos.csv",
        "../data/derived_files/core_orgs.csv",
    )
)

In [20]:
# Monthly creation counts for the first-pass core entities.
first_pass_grouped_data = process_core_materials(
    core_repos=first_pass_core_repos,
    core_users=first_pass_core_users,
    core_orgs=first_pass_core_orgs,
    include_month=True,
)

In [21]:
# Build the over-time chart for the first-pass results.
first_pass_chart = overtime_chart(
    grouped_data=first_pass_grouped_data,
    title="Number of Users, Organizations, and Repositories identified as DH, Created Over Time via First Pass",
)

In [22]:
# Bare last expression so Jupyter renders the chart as this cell's output.
first_pass_chart

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


### Export Chart (Optional)

In [23]:
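# Optional export. `chart` is not defined in this notebook; pass `initial_chart` or `first_pass_chart` instead, e.g.:
# save_chart(first_pass_chart, '../outputs/number_of_users_orgs_repos_created_over_time.png', scale_factor=2)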