In [1]:
# Standard library
import os
import sys
import warnings

# Third-party
import pandas as pd
import altair as alt

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

# Altair setup: use the 'mimetype' renderer and remove the row limit on chart data.
alt.renderers.enable('mimetype')
alt.data_transformers.disable_max_rows()

# Suppress every remaining Python warning.
warnings.filterwarnings('ignore')

# Put the parent directory on the import path so the project helper below resolves.
sys.path.append("..")
from data_generation_scripts.utils import save_chart

In [19]:
# Read the derived account and repository tables from the shared data directory.
derived_dir = '../data/derived_files'
core_users = pd.read_csv(f'{derived_dir}/core_users.csv')
core_repos = pd.read_csv(f'{derived_dir}/core_repos.csv')

In [3]:
# Split the loaded accounts by their `type` column into organizations and users.
# `.copy()` makes each subset an independent frame, so the columns assigned to them
# in later cells are ordinary writes instead of writes to a slice of the loaded
# frame (which only pass silently because chained-assignment warnings are disabled).
# NOTE: `core_users` is rebound to its own 'User' subset here, so running this cell
# a second time without re-reading the CSV leaves `core_orgs` empty.
core_orgs = core_users[core_users.type == 'Organization'].copy()
core_users = core_users[core_users.type == 'User'].copy()

In [4]:
# Convert each table's `created_at` column to datetimes; values that cannot be
# parsed become NaT because of errors='coerce'.
for frame in (core_users, core_orgs, core_repos):
    frame['created_at'] = pd.to_datetime(frame['created_at'], errors='coerce')

In [5]:
# Label every row with the kind of entity its table holds.
for label, frame in (
    ('User', core_users),
    ('Organization', core_orgs),
    ('Repository', core_repos),
):
    frame['data_type'] = label

In [20]:
# (rows with an email, rows without an email) in core_users
users_with_email = core_users.email.notna()
len(core_users[users_with_email]), len(core_users[~users_with_email])

(1138, 3249)

In [11]:
# (rows with an email, rows without an email) in core_orgs
orgs_with_email = core_orgs.email.notna()
len(core_orgs[orgs_with_email]), len(core_orgs[~orgs_with_email])

(131, 290)

In [15]:
# Number of rows in the repository table.
core_repos.shape[0]

2326

In [18]:
# Stack the three tables, keeping only the columns they are compared on.
shared_columns = ['data_type', 'created_at', 'id']
all_data = pd.concat(
    [frame[shared_columns] for frame in (core_users, core_orgs, core_repos)]
)

In [19]:
# Attach to every row the total number of rows in the table it came from.
all_data['total_counts'] = 0
for label, frame in (
    ('User', core_users),
    ('Organization', core_orgs),
    ('Repository', core_repos),
):
    all_data.loc[all_data.data_type == label, 'total_counts'] = frame.shape[0]

In [20]:
# Break the creation timestamp into calendar month and year columns.
created = all_data['created_at'].dt
all_data['month'] = created.month
all_data['year'] = created.year

In [21]:
# Count rows per entity type and calendar month; `day` is fixed to 1 so each
# (year, month) pair can be assembled into a first-of-month date.
grouped_data = (
    all_data
    .groupby(['data_type', 'year', 'month'])
    .size()
    .reset_index(name='counts')
    .assign(day=1)
)

# Build the date from its parts, then mark it as America/New_York local time.
month_start = pd.to_datetime(grouped_data[['year', 'month', 'day']])
grouped_data['date'] = month_start.dt.tz_localize("America/New_York")



In [22]:
# Hover selection that snaps to the x (date) position nearest the mouse pointer.
# `empty=False` is the Altair 5 boolean replacement for the deprecated string
# `empty='none'`: while nothing is hovered, the selection matches no data.
nearest = alt.selection_point(nearest=True, on='mouseover', encodings=['x'], empty=False)

# Every layer draws from the same monthly counts table.
base = alt.Chart(grouped_data)

# One smoothed line per entity type, with year-only tick labels on the date axis.
line = base.mark_line(interpolate='basis').encode(
    x=alt.X('date:T', title='Date Created', axis=alt.Axis(format='%Y')),
    y=alt.Y('counts', title='Counts'),
    color=alt.Color('data_type', title='Type of Entity')
)

# Fully transparent points along the x axis; this layer carries the hover selection.
selectors = base.mark_point().encode(
    x='date:T',
    opacity=alt.value(0),
).add_params(
    nearest
)

# Markers on each line, opaque only at the hovered date.
points = line.mark_point().encode(
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
)

# Count labels beside the hovered markers; a blank string everywhere else.
text = line.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(nearest, 'counts', alt.value(' '))
)

# Vertical gray rule drawn only at the hovered date.
rules = base.mark_rule(color='gray').encode(
    x='date:T',
).transform_filter(
    nearest
)

# Stack the layers into one chart and display it as the cell output.
chart = alt.layer(line, selectors, points, rules, text).properties(
    width=600,
    height=300,
    title='Number of Users, Organizations, and Repositories identified as DH, Created Over Time'
)
chart


<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [23]:
# Hand the layered chart to the project's save_chart helper, targeting a .png
# path under ../outputs with scale_factor=2.
save_chart(
    chart,
    '../outputs/number_of_users_orgs_repos_created_over_time.png',
    scale_factor=2,
)

In [24]:
# Number of rows per entity type in the combined table.
all_data['data_type'].value_counts()

User            3966
Repository      2326
Organization     421
Name: data_type, dtype: int64