# Explore Patterns in DH Repos

- Date Created and Updated
- Size
- Programming Language
- Natural Language
- Number of user interactions

### Load Libraries and Datasets

In [4]:
import pandas as pd
# Turn off pandas' chained-assignment check so SettingWithCopyWarning is not raised.
pd.options.mode.chained_assignment = None
import altair as alt
# Select Altair's 'mimetype' renderer for displaying charts in this notebook.
alt.renderers.enable('mimetype')
# Lift Altair's row limit on datasets embedded in charts.
alt.data_transformers.disable_max_rows()
import warnings
# Silence every Python warning from here on (this also hides deprecation
# notices, including any raised while importing the libraries below).
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler


In [70]:

def _read_with_origin(csv_path, origin_label, **read_csv_options):
    """Read a CSV file and tag every row with an ``origin`` column.

    ``read_csv_options`` are forwarded unchanged to ``pandas.read_csv``.
    """
    return pd.read_csv(csv_path, **read_csv_options).assign(origin=origin_label)


# Full repository entity table (no origin tag).
repo_df = pd.read_csv("../data/large_files/entity_files/repos_dataset.csv", low_memory=False)

# Initial core: users, repos and orgs.
initial_core_users = _read_with_origin("../data/derived_files/initial_core_users.csv", 'initial_core')
initial_core_repos = _read_with_origin("../data/derived_files/initial_core_repos.csv", 'initial_core')
initial_core_orgs = _read_with_origin("../data/derived_files/initial_core_orgs.csv", 'initial_core')

# First-pass core: users, repos and orgs.
firstpass_core_users = _read_with_origin("../data/derived_files/firstpass_core_users.csv", 'firstpass_core')
firstpass_core_repos = _read_with_origin("../data/derived_files/firstpass_core_repos.csv", 'firstpass_core')
firstpass_core_orgs = _read_with_origin("../data/derived_files/firstpass_core_orgs.csv", 'firstpass_core')

# Final-pass core: users, repos and orgs. The repos file lives under
# large_files and is read with on_bad_lines='skip', so malformed rows are
# dropped rather than raising.
finalpass_core_users = _read_with_origin("../data/derived_files/finalpass_core_users.csv", 'finalpass_core')
finalpass_core_repos = _read_with_origin(
    "../data/large_files/derived_files/finalpass_core_repos.csv",
    'finalpass_core',
    low_memory=False,
    on_bad_lines='skip',
)
finalpass_core_orgs = _read_with_origin("../data/derived_files/finalpass_core_orgs.csv", 'finalpass_core')

In [71]:
# Number of rows (repositories) in the initial core set.
initial_core_repos.shape[0]

2485

In [72]:
# Number of rows (repositories) in the first-pass core set.
firstpass_core_repos.shape[0]

12910

In [77]:
# Stack the three collection passes for each entity type. Each frame carries
# its own 'origin' tag; the row labels of the source frames are kept as-is.
_user_frames = [initial_core_users, firstpass_core_users, finalpass_core_users]
_repo_frames = [initial_core_repos, firstpass_core_repos, finalpass_core_repos]
_org_frames = [initial_core_orgs, firstpass_core_orgs, finalpass_core_orgs]

core_users = pd.concat(_user_frames)
core_repos = pd.concat(_repo_frames)
core_orgs = pd.concat(_org_frames)

### Check for Correlation in Data

In [78]:
def generate_heatmap_chart(corr_matrix, origin_type, entity_label="users"):
    """Build an annotated Altair heatmap from a correlation matrix.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation matrix, e.g. the result of ``DataFrame.corr()``.
        Row and column labels become the heatmap axes.
    origin_type : str
        Label of the data subset; interpolated into the chart title.
    entity_label : str, default "users"
        Kind of entity the matrix describes (e.g. ``"repos"``); interpolated
        into the chart title. The default reproduces the original title.

    Returns
    -------
    Altair layered chart
        The coloured rectangles with the correlation values drawn on top.
    """
    # Clear any axis names first: reset_index() only produces a column called
    # 'index' when the index is unnamed, and melt('index') relies on that.
    long_df = (
        corr_matrix.rename_axis(index=None, columns=None)
        .reset_index()
        .melt('index')
    )

    # Rename the columns for Altair
    long_df.columns = ['x', 'y', 'correlation']

    # One coloured cell per (x, y) pair of variables.
    heatmap = alt.Chart(long_df).mark_rect().encode(
        x=alt.X('x:O', title=None),
        y=alt.Y('y:O', sort=alt.EncodingSortField('y', order='descending'), title=None),
        color='correlation:Q',
        tooltip=[
            alt.Tooltip('x', title='Variable 1'),
            alt.Tooltip('y', title='Variable 2'),
            alt.Tooltip('correlation', title='Correlation'),
        ],
    )

    # Overlay the correlation value on each cell; text is black on cells
    # whose correlation exceeds 0.95 and white everywhere else.
    text = heatmap.mark_text(baseline='middle').encode(
        text=alt.Text('correlation:Q', format='.2f'),
        color=alt.condition(
            alt.datum.correlation > 0.95,
            alt.value('black'),
            alt.value('white'),
        ),
    )

    # Fixed 40px step per cell so the chart grows with the number of variables.
    heatmap_chart = (heatmap + text).properties(
        width=alt.Step(40),
        height=alt.Step(40),
        title=f"Correlation matrix heatmap for {origin_type} core {entity_label}",
    )
    return heatmap_chart

In [81]:
def _flag_to_number(value):
    """Map a boolean-like cell (True/False, 'True'/'False', 1/0) to 1.0/0.0; missing or unrecognised -> NaN."""
    if isinstance(value, str):
        lowered = value.strip().lower()
        if lowered == 'true':
            return 1.0
        if lowered == 'false':
            return 0.0
        return float('nan')
    if pd.isna(value):
        return float('nan')
    return float(bool(value))


types_repos = core_repos.origin.unique().tolist()
categorical_features = ['description', 'archived', 'has_wiki', 'has_downloads', 'has_projects', 'has_issues', 'homepage', 'fork']  # update this with actual column names
# Free-text fields: only whether a value is present is encoded.
presence_features = ['description', 'homepage']
# Boolean flags: encode the flag's VALUE. Encoding them with notna() would
# yield 1 for every populated row, i.e. a constant column whose correlations
# are all NaN.
boolean_features = [name for name in categorical_features if name not in presence_features]
# TODO: try exploding topics
numerical_features = ['forks', 'watchers', 'open_issues_count', 'watchers_count', 'stargazers_count', 'size', 'health_percentage']  # update this with actual column names

matrices = []
for origin_type in types_repos:
    # Work on a copy so the encoding below never touches core_repos.
    matrix_repos = core_repos[core_repos.origin == origin_type].copy()

    # Presence indicators: 1 when the field is filled in, 0 when missing.
    for feature in presence_features:
        matrix_repos[feature] = matrix_repos[feature].notna().astype(int)

    # Boolean flags: 1.0 / 0.0 by value, NaN where the value is missing.
    for feature in boolean_features:
        matrix_repos[feature] = matrix_repos[feature].map(_flag_to_number)

    # Scale numerical features. (Pearson correlation is unaffected by this
    # rescaling; it is kept so matrix_repos matches the original workflow.)
    scaler = StandardScaler()
    matrix_repos[numerical_features] = scaler.fit_transform(matrix_repos[numerical_features])

    # Create a correlation matrix
    corr_matrix = matrix_repos[categorical_features + numerical_features].corr()
    # The helper's default title refers to "core users"; this data is
    # repositories, so set the title explicitly here.
    heatmap_chart = generate_heatmap_chart(corr_matrix, origin_type).properties(
        title=f"Correlation matrix heatmap for {origin_type} core repos"
    )
    matrices.append(heatmap_chart)

In [82]:
# Lay the per-origin heatmaps out side by side in a single chart.
alt.hconcat(*matrices)

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [83]:
# (rows with no 'languages' value, rows with a 'languages' value)
_languages_col = core_repos['languages']
_languages_col.isna().sum(), _languages_col.notna().sum()

(11675, 30334)