# Notebook for Reorganizing Data in Coding DH

Realized we needed to improve how we archived our data so that we could get historic snapshots. This notebook is for reorganizing the data.

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import rich
from rich.console import Console
console = Console()
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

In [38]:
# Standard library imports
import os
import re
import shutil
import time
import warnings
from datetime import datetime
from typing import List, Optional, Union

# Related third-party imports
import altair as alt
import apikey
import numpy as np
import pandas as pd
import requests
from rich import print
from rich.console import Console
from tqdm import tqdm

# Local application/library specific imports
import vl_convert as vlc

# Filter warnings
warnings.filterwarnings('ignore')

# Load auth token
auth_token = apikey.load("DH_GITHUB_DATA_PERSONAL_TOKEN")

 -H "Accept: application/vnd.github+json" \
  -H "Authorization: Bearer <YOUR-TOKEN>" \
  -H "X-GitHub-Api-Version: 2022-11-28" \
auth_headers = {'Authorization': f'token {auth_token}','User-Agent': 'request', 'Accept': 'application/vnd.github+json', 'X-GitHub-Api-Version': '2022-11-28'}

import sys
sys.path.append("../")
from data_generation_scripts.general_utils import *

In [115]:
url = "https://api.github.com/repos/octocat/Hello-World"

In [116]:
# Get total number of pages
response = make_request_with_rate_limiting(f'{url}', auth_headers)
# # If response is None or there are no links, return 1
# if response is None or len(response.links) == 0:
#     return 0
# # Otherwise, get the last page number
# match = re.search(r'\d+$', response.links['last']['url'])

In [117]:
response_df = pd.json_normalize(response.json())

In [120]:
response_df.columns.tolist()

['id',
 'node_id',
 'name',
 'full_name',
 'private',
 'html_url',
 'description',
 'fork',
 'url',
 'forks_url',
 'keys_url',
 'collaborators_url',
 'teams_url',
 'hooks_url',
 'issue_events_url',
 'events_url',
 'assignees_url',
 'branches_url',
 'tags_url',
 'blobs_url',
 'git_tags_url',
 'git_refs_url',
 'trees_url',
 'statuses_url',
 'languages_url',
 'stargazers_url',
 'contributors_url',
 'subscribers_url',
 'subscription_url',
 'commits_url',
 'git_commits_url',
 'comments_url',
 'issue_comment_url',
 'contents_url',
 'compare_url',
 'merges_url',
 'archive_url',
 'downloads_url',
 'issues_url',
 'pulls_url',
 'milestones_url',
 'notifications_url',
 'labels_url',
 'releases_url',
 'deployments_url',
 'created_at',
 'updated_at',
 'pushed_at',
 'git_url',
 'ssh_url',
 'clone_url',
 'svn_url',
 'homepage',
 'size',
 'stargazers_count',
 'watchers_count',
 'language',
 'has_issues',
 'has_projects',
 'has_downloads',
 'has_wiki',
 'has_pages',
 'has_discussions',
 'forks_count',


In [104]:
match = re.search(r'\d+$', response.links['last']['url'])
int(match.group()) if match is not None else 0

599

<re.Match object; span=(73, 75), match='41'>

In [114]:
name = "programminghistorian_ph-submissions_coding_dh_repo"
df = pd.read_csv(f"../../new_datasets/historic_data/entity_files/all_repos/{name}.csv")
cols_df = pd.read_csv("../../new_datasets/metadata_files/repo_url_cols.csv")
skip_types = ['review_comments_count', 'commits_count', 'collaborators_count']
cols_df = cols_df[~cols_df.count_column.isin(skip_types)]
remove_cols = ['comments_count',
 'review_count',
 'pulls_count',
 'owner.organization_count', 'issues_count', 'contributors_count']
df.loc[:, ~df.columns.isin(skip_types + remove_cols)].to_csv(f"../../new_datasets/historic_data/entity_files/all_repos/{name}.csv", index=False)

In [105]:
cols_df

Unnamed: 0,cols,url_column,count_column,check_state
0,"['login', 'id', 'node_id', 'url', 'html_url', ...",issues_url,issues_count,True
1,"['login', 'id', 'node_id', 'url', 'html_url', ...",subscribers_url,subscribers_count,False
2,"['login', 'id', 'node_id', 'url', 'html_url', ...",contributors_url,contributors_count,False
3,"['login', 'id', 'node_id', 'url', 'html_url', ...",forks_url,forks_count,False
4,"['starred_at', 'login', 'id', 'url', 'html_url...",stargazers_url,stargazers_count,False
5,"['url', 'html_url', 'issue_url', 'id', 'node_i...",comments_url,comments_count,True
6,"['url', 'pull_request_review_id', 'id', 'node_...",review_comments_url,review_count,True
7,"['url', 'id', 'node_id', 'html_url', 'diff_url...",pulls_url,pulls_count,True
9,"['login', 'id', 'node_id', 'url', 'repos_url',...",owner.organizations_url,owner.organization_count,False


In [84]:
cols_df.count_column.tolist()

['issues_count',
 'subscribers_count',
 'contributors_count',
 'forks_count',
 'stargazers_count',
 'comments_count',
 'review_count',
 'pulls_count',
 'owner.organization_count']

In [83]:
df.columns

Index(['id', 'node_id', 'name', 'full_name', 'private', 'html_url',
       'description', 'fork', 'url', 'forks_url',
       ...
       'template_repository.open_issues', 'template_repository.clone_url',
       'template_repository.pulls_url', 'coding_dh_id', 'issues_count',
       'subscribers_count', 'contributors_count', 'comments_count',
       'pulls_count', 'owner.organization_count'],
      dtype='object', length=215)

In [80]:
cols_df

Unnamed: 0,cols,url_column,count_column,check_state
0,"['login', 'id', 'node_id', 'url', 'html_url', ...",issues_url,issues_count,True
1,"['login', 'id', 'node_id', 'url', 'html_url', ...",subscribers_url,subscribers_count,False
2,"['login', 'id', 'node_id', 'url', 'html_url', ...",contributors_url,contributors_count,False
3,"['login', 'id', 'node_id', 'url', 'html_url', ...",forks_url,forks_count,False
4,"['starred_at', 'login', 'id', 'url', 'html_url...",stargazers_url,stargazers_count,False
5,"['url', 'html_url', 'issue_url', 'id', 'node_i...",comments_url,comments_count,True
6,"['url', 'pull_request_review_id', 'id', 'node_...",review_comments_url,review_count,True
7,"['url', 'id', 'node_id', 'html_url', 'diff_url...",pulls_url,pulls_count,True
9,"['login', 'id', 'node_id', 'url', 'repos_url',...",owner.organizations_url,owner.organization_count,False


In [81]:
df['stargazers_count'] = 0
df['watchers_count'] = 0
df['forks_count'] = 0
df['subscribers_count'] = 2
df.to_csv(f"../../new_datasets/historic_data/entity_files/all_repos/{name}.csv", index=False)

In [62]:
cols_df.count_column.tolist()

['issues_count',
 'subscribers_count',
 'contributors_count',
 'forks_count',
 'stargazers_count',
 'comments_count',
 'review_count',
 'pulls_count',
 'owner.organization_count']

In [63]:
df['subscribers_count']

0    0.0
Name: subscribers_count, dtype: float64

In [26]:
df.loc[:, ~df.columns.isin(skip_types)].to_csv("../../new_datasets/historic_data/entity_files/all_repos/Ifthakharmahmud_public-history_coding_dh_repo.csv", index=False)

In [70]:
os.path.exists("../../datasets/temp/temp_users/giuliataurino_potential_users.csv_user.csv")

False

In [71]:
os.path.exists("../../new_datasets/historic_data/entity_files/all_users/melaniewalsh_coding_dh_users.csv")

True

In [118]:
test3 = pd.read_csv("../../new_datasets/historic_data/entity_files/all_users/ZoeLeBlanc_coding_dh_users.csv")
test2 = pd.read_csv("../../datasets/temp/temp_users/ZoeLeBlanc_coding_dh_user.csv")

In [100]:
test3

Unnamed: 0,login,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,gists_url,...,private_gists,total_private_repos,owned_private_repos,disk_usage,collaborators,two_factor_authentication,plan.name,plan.space,plan.collaborators,plan.private_repos
0,ZoeLeBlanc,8355129.0,MDQ6VXNlcjgzNTUxMjk=,https://avatars.githubusercontent.com/u/835512...,,https://api.github.com/users/ZoeLeBlanc,https://github.com/ZoeLeBlanc,https://api.github.com/users/ZoeLeBlanc/followers,https://api.github.com/users/ZoeLeBlanc/follow...,https://api.github.com/users/ZoeLeBlanc/gists{...,...,,,,,,,,,,
1,ZoeLeBlanc,8355129.0,MDQ6VXNlcjgzNTUxMjk=,https://avatars.githubusercontent.com/u/835512...,,https://api.github.com/users/ZoeLeBlanc,https://github.com/ZoeLeBlanc,https://api.github.com/users/ZoeLeBlanc/followers,https://api.github.com/users/ZoeLeBlanc/follow...,https://api.github.com/users/ZoeLeBlanc/gists{...,...,,,,,,,,,,
2,ZoeLeBlanc,8355129.0,MDQ6VXNlcjgzNTUxMjk=,https://avatars.githubusercontent.com/u/835512...,,https://api.github.com/users/ZoeLeBlanc,https://github.com/ZoeLeBlanc,https://api.github.com/users/ZoeLeBlanc/followers,https://api.github.com/users/ZoeLeBlanc/follow...,https://api.github.com/users/ZoeLeBlanc/gists{...,...,,,,,,,,,,
3,ZoeLeBlanc,8355129.0,MDQ6VXNlcjgzNTUxMjk=,https://avatars.githubusercontent.com/u/835512...,,https://api.github.com/users/ZoeLeBlanc,https://github.com/ZoeLeBlanc,https://api.github.com/users/ZoeLeBlanc/followers,https://api.github.com/users/ZoeLeBlanc/follow...,https://api.github.com/users/ZoeLeBlanc/gists{...,...,,,,,,,,,,
4,ZoeLeBlanc,8355129.0,MDQ6VXNlcjgzNTUxMjk=,https://avatars.githubusercontent.com/u/835512...,,https://api.github.com/users/ZoeLeBlanc,https://github.com/ZoeLeBlanc,https://api.github.com/users/ZoeLeBlanc/followers,https://api.github.com/users/ZoeLeBlanc/follow...,https://api.github.com/users/ZoeLeBlanc/gists{...,...,1.0,47.0,47.0,2308671.0,1.0,True,pro,976562499.0,0.0,9999.0


In [89]:
testing = pd.concat([test, test2])

In [119]:
test2 = test2.drop(columns=["user_query_time", "coding_dh_id"])

In [121]:
test2.to_csv("../../new_datasets/historic_data/entity_files/all_users/ZoeLeBlanc_coding_dh_user.csv", index=False)

In [102]:
test2

Unnamed: 0,login,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,gists_url,...,permissions.maintain,permissions.push,permissions.triage,permissions.pull,license.key,license.name,license.spdx_id,license.url,license.node_id,coding_dh_id
0,ZoeLeBlanc,8355129.0,MDQ6VXNlcjgzNTUxMjk=,https://avatars.githubusercontent.com/u/835512...,,https://api.github.com/users/ZoeLeBlanc,https://github.com/ZoeLeBlanc,https://api.github.com/users/ZoeLeBlanc/followers,https://api.github.com/users/ZoeLeBlanc/follow...,https://api.github.com/users/ZoeLeBlanc/gists{...,...,,,,,,,,,,0
1,ZoeLeBlanc,8355129.0,MDQ6VXNlcjgzNTUxMjk=,https://avatars.githubusercontent.com/u/835512...,,https://api.github.com/users/ZoeLeBlanc,https://github.com/ZoeLeBlanc,https://api.github.com/users/ZoeLeBlanc/followers,https://api.github.com/users/ZoeLeBlanc/follow...,https://api.github.com/users/ZoeLeBlanc/gists{...,...,,,,,,,,,,1
2,ZoeLeBlanc,8355129.0,MDQ6VXNlcjgzNTUxMjk=,https://avatars.githubusercontent.com/u/835512...,,https://api.github.com/users/ZoeLeBlanc,https://github.com/ZoeLeBlanc,https://api.github.com/users/ZoeLeBlanc/followers,https://api.github.com/users/ZoeLeBlanc/follow...,https://api.github.com/users/ZoeLeBlanc/gists{...,...,,,,,,,,,,2
3,ZoeLeBlanc,8355129.0,MDQ6VXNlcjgzNTUxMjk=,https://avatars.githubusercontent.com/u/835512...,,https://api.github.com/users/ZoeLeBlanc,https://github.com/ZoeLeBlanc,https://api.github.com/users/ZoeLeBlanc/followers,https://api.github.com/users/ZoeLeBlanc/follow...,https://api.github.com/users/ZoeLeBlanc/gists{...,...,,,,,,,,,,3


In [128]:
import os
import datetime

# Get the current time
now = datetime.datetime.now()

# Get the time 24 hours ago
twenty_four_hours_ago = now - datetime.timedelta(hours=24)

# Directory to check
directory = "../../new_datasets/historic_data/entity_files/all_users/"
modified_user_files = []
# Loop over all files in the directory
for filename in os.listdir(directory):
    # Get the full path of the file
    filepath = os.path.join(directory, filename)
    
    # Get the modification time of the file
    filetime = datetime.datetime.fromtimestamp(os.path.getmtime(filepath))
    
    # If the file was modified in the last 24 hours, print its name
    if filetime > twenty_four_hours_ago:
        modified_user_files.append(filename)

In [129]:
len(modified_user_files)

828

In [130]:
modified_user_names = [file.split("_coding_dh_user")[0] for file in modified_user_files]
older_files = os.listdir("../../datasets/temp/temp_users/")


In [131]:
subset_older_files = []

for name in modified_user_names:
    subset_older_files.extend([file for file in older_files if name in file])

In [132]:
len(subset_older_files)

737

In [133]:
import shutil

for file in tqdm(subset_older_files[0:2], desc="Copying files"):
    # Copy file to new directory, replacing any existing file with the same name
    print(file)
    shutil.copy2("../../datasets/temp/temp_users/" + file, "../../new_datasets/historic_data/entity_files/all_users/" + file)

Copying files: 100%|██████████| 2/2 [00:00<00:00, 156.51it/s]

glorieux-f_coding_dh_user.csv
leo8a_coding_dh_user.csv





In [91]:
from typing import List, Optional, Union
def sort_groups_add_coding_dh_id(group: pd.DataFrame, subset_columns: List) -> pd.DataFrame:
    """
    Sorts a DataFrame group based on 'coding_dh_date' and adds a new column 'coding_dh_id' with unique identifiers.
    If the group has more than one unique row (excluding subset_columns), each row gets a unique identifier.
    If the group has only one unique row (excluding subset_columns), it gets the identifier 0.

    Parameters:
    group (pd.DataFrame): DataFrame group to sort and add identifiers to.
    subset_columns (List[str]): List of column names to exclude when checking for unique rows.

    Returns:
    pd.DataFrame: The sorted DataFrame group with the new 'coding_dh_id' column.
    """
    # Initialize list to keep track of columns with lists
    list_cols = []

    # Convert lists to comma-separated strings
    for col in group.columns:
        if group[col].apply(lambda x: isinstance(x, list)).any():
            # Replace nulls with empty lists
            group[col] = group[col].apply(lambda x: x if isinstance(x, list) else [])
            
            # Sort lists alphabetically and join into a string
            group[f'combined_{col}'] = group[col].apply(lambda x: ', '.join(sorted(map(str, x))))
            
            # Add column to list_cols
            subset_columns.append(col)
            list_cols.append(f'combined_{col}')

    group = group.drop_duplicates(subset=group.columns.difference(subset_columns))
    if (group.drop(columns=subset_columns).nunique() > 1).any():
        group = group.sort_values('coding_dh_date')
        group['coding_dh_id'] = np.arange(len(group))
    else:
        group = group.sort_values('coding_dh_date').iloc[0:1]
        group['coding_dh_id'] = 0

    # Drop the combined columns
    group = group.drop(columns=list_cols)

    return group

In [92]:
grouped_dfs = testing.groupby("login")
processed_files = []
for _, group in tqdm(grouped_dfs, desc=f"Grouping files"):
    subset_columns = ["coding_dh_date"]
    group = sort_groups_add_coding_dh_id(group, subset_columns)
    processed_files.append(group)
final_processed_df = pd.concat(processed_files).reset_index(drop=True)

Grouping files: 100%|██████████| 1/1 [00:00<00:00, 18.42it/s]


In [93]:
final_processed_df

Unnamed: 0,login,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,gists_url,...,permissions.maintain,permissions.push,permissions.triage,permissions.pull,license.key,license.name,license.spdx_id,license.url,license.node_id,coding_dh_id
0,ZoeLeBlanc,8355129.0,MDQ6VXNlcjgzNTUxMjk=,https://avatars.githubusercontent.com/u/835512...,,https://api.github.com/users/ZoeLeBlanc,https://github.com/ZoeLeBlanc,https://api.github.com/users/ZoeLeBlanc/followers,https://api.github.com/users/ZoeLeBlanc/follow...,https://api.github.com/users/ZoeLeBlanc/gists{...,...,,,,,,,,,,0
1,ZoeLeBlanc,8355129.0,MDQ6VXNlcjgzNTUxMjk=,https://avatars.githubusercontent.com/u/835512...,,https://api.github.com/users/ZoeLeBlanc,https://github.com/ZoeLeBlanc,https://api.github.com/users/ZoeLeBlanc/followers,https://api.github.com/users/ZoeLeBlanc/follow...,https://api.github.com/users/ZoeLeBlanc/gists{...,...,,,,,,,,,,1
2,ZoeLeBlanc,8355129.0,MDQ6VXNlcjgzNTUxMjk=,https://avatars.githubusercontent.com/u/835512...,,https://api.github.com/users/ZoeLeBlanc,https://github.com/ZoeLeBlanc,https://api.github.com/users/ZoeLeBlanc/followers,https://api.github.com/users/ZoeLeBlanc/follow...,https://api.github.com/users/ZoeLeBlanc/gists{...,...,,,,,,,,,,2
3,ZoeLeBlanc,8355129.0,MDQ6VXNlcjgzNTUxMjk=,https://avatars.githubusercontent.com/u/835512...,,https://api.github.com/users/ZoeLeBlanc,https://github.com/ZoeLeBlanc,https://api.github.com/users/ZoeLeBlanc/followers,https://api.github.com/users/ZoeLeBlanc/follow...,https://api.github.com/users/ZoeLeBlanc/gists{...,...,,,,,,,,,,3


In [79]:
files = os.listdir("../../datasets/temp/temp_users")
# other_files = os.listdir("../../new_datasets/historic_data/entity_files/all_users")
# len(files), len(other_files)

In [80]:
exists = [ file for file in files if 'ZoeLeBlanc' in file]
print(exists)

['ZoeLeBlanc_coding_dh_user.csv']


In [57]:
headers = pd.read_csv("../../new_datasets//metadata_files/repo_headers.csv")

In [58]:
headers

Unnamed: 0,id,node_id,name,full_name,private,html_url,description,fork,url,forks_url,...,permissions.admin,permissions.maintain,permissions.push,permissions.triage,permissions.pull,license.key,license.name,license.spdx_id,license.url,license.node_id
0,,,,,,,,,,,...,,,,,,,,,,


In [56]:
search_repo_queries_df[search_repo_queries_df.full_name == "AlexBurger/obdurodon-statistics"].to_dict()

{'id': {0: 27790226},
 'node_id': {0: 'MDEwOlJlcG9zaXRvcnkyNzc5MDIyNg=='},
 'name': {0: 'obdurodon-statistics'},
 'full_name': {0: 'AlexBurger/obdurodon-statistics'},
 'private': {0: False},
 'html_url': {0: 'https://github.com/AlexBurger/obdurodon-statistics'},
 'description': {0: 'A statistical primer for an undergraduate course in Digital Humanities at the University of Pittsburgh'},
 'fork': {0: False},
 'url': {0: 'https://api.github.com/repos/AlexBurger/obdurodon-statistics'},
 'forks_url': {0: 'https://api.github.com/repos/AlexBurger/obdurodon-statistics/forks'},
 'keys_url': {0: 'https://api.github.com/repos/AlexBurger/obdurodon-statistics/keys{/key_id}'},
 'collaborators_url': {0: 'https://api.github.com/repos/AlexBurger/obdurodon-statistics/collaborators{/collaborator}'},
 'teams_url': {0: 'https://api.github.com/repos/AlexBurger/obdurodon-statistics/teams'},
 'hooks_url': {0: 'https://api.github.com/repos/AlexBurger/obdurodon-statistics/hooks'},
 'issue_events_url': {0: 'htt

In [54]:
len(set(files)), len(set(other_files))

(127377, 128160)

In [15]:
cleaned_terms.search_term_source.value_counts()

Digital Humanities    106
Name: search_term_source, dtype: int64

In [24]:
search_org_queries_df.login.nunique(), len(search_org_queries_df), search_user_queries_df.login.nunique(), len(search_user_queries_df), search_repo_queries_df.full_name.nunique(), len(search_repo_queries_df)

(186, 333, 838, 1421, 2725, 4018)

In [26]:
def get_test_new_entities(entity_type:str, potential_new_entities_df: pd.DataFrame, temp_entity_dir: str, entity_progress_bar: tqdm, error_file_path: str):
    """
    Gets new entities from GitHub API. 

    :param entity_type: Type of entity
    :param potential_new_entities_df: Potential new entities dataframe
    :param temp_entity_dir: Temporary entity directory
    :param entity_progress_bar: Entity progress bar
    :param error_file_path: Path to error file
    :param overwrite_existing_temp_files: Boolean indicating whether to overwrite existing temporary files. Defaults to True.
    :return: Combined entity dataframe
    """

    # Create temporary directory if it doesn't exist
    if not os.path.exists(temp_entity_dir):
        os.makedirs(temp_entity_dir, exist_ok=True)
    
    # Subset headers for orgs and users
    user_cols = ["bio", "followers_url", "following_url", "gists_url", "gravatar_id", "hireable", "organizations_url","received_events_url", "site_admin", "starred_url",
    "subscriptions_url","login",]

    excluded_file_path = f'{data_directory_path}/metadata_files/excluded_{entity_type}.csv'
    error_file_path = f"{data_directory_path}/error_logs/potential_{entity_type}_errors.csv"

    # Get entity column based on entity type
    entity_column = "full_name" if (entity_type == "repos") else "login"

    if os.path.exists(excluded_file_path):
        excluded_entities = read_csv_file(excluded_file_path)
        # Exclude entities and check for errors
        potential_new_entities_df = potential_new_entities_df[~potential_new_entities_df[entity_column].isin(excluded_entities[entity_column])]

    # Get headers
    headers = get_headers(entity_type)

    # Update progress bar
    entity_progress_bar.total = len(potential_new_entities_df)
    entity_progress_bar.refresh()

    # Loop through potential new entities
    for _, row in potential_new_entities_df.iterrows():
        try:
            # Create temporary file path
            temp_entities_path = f"{row[entity_column].replace('/', '_').replace(' ', '_')}_coding_dh_{entity_type}.csv"
            console.print(f"Processing {temp_entities_path}", style="bold blue")
            # Check if file exists
            if os.path.exists(f"{temp_entity_dir}/{temp_entities_path}"):
                existing_temp_entities_df = read_csv_file(f"{temp_entity_dir}/{temp_entities_path}")
            else:
                existing_temp_entities_df = pd.DataFrame()
            # Get query
            query = row.url
            if entity_type == "orgs":
                query = row.url if "/users/" in row.url else row.url.replace("/orgs/", "/users/")
            # Make request
            response = make_request_with_rate_limiting(query, auth_headers)
            # If response is None, update progress bar and continue
            if response is None and entity_type != "orgs":
                entity_progress_bar.update(1)
                continue
            # If response is None and entity type is orgs, create empty dataframe
            elif response is None and entity_type == "orgs":
                response_df = pd.DataFrame(columns=headers.columns, data=None, index=None)
            else:
                response_data = response.json()
                response_df = pd.json_normalize(response_data)
                if "message" in response_df.columns:
                    console.print(f"Error for {row[entity_column]}: {response_df.message.values[0]}", style="bold red")
                    entity_progress_bar.update(1)
                    continue
            
            if entity_type != "orgs":
                final_df = response_df[headers.columns]
            else:
                response_df = response_df[user_cols]
                query = row.url.replace("/users/", "/orgs/") if "/users/" in row.url else row.url
                response = make_request_with_rate_limiting(query, auth_headers)
                if response is None:
                    expanded_df = pd.DataFrame(columns=headers.columns, data=None, index=None)
                else:
                    response_data = response.json()
                    expanded_df = pd.json_normalize(response_data)
                expanded_df = expanded_df[headers.columns]
                common_columns = list(set(response_df.columns).intersection(set(expanded_df.columns)))
                final_df = pd.merge(response_df, expanded_df, on=common_columns, how='left')
                
            final_df["coding_dh_date"] = datetime.now().strftime("%Y-%m-%d")
            combined_df = pd.concat([existing_temp_entities_df, final_df])
            grouped_dfs = combined_df.groupby(entity_column)
            processed_files = []
            for _, group in tqdm(grouped_dfs, desc=f"Grouping files"):
                subset_columns = ["coding_dh_date"]
                group = sort_groups_add_coding_dh_id(group, subset_columns)
                processed_files.append(group)
            final_processed_df = pd.concat(processed_files).reset_index(drop=True)
            final_processed_df.to_csv(f"{temp_entity_dir}/{temp_entities_path}", index=False)
            entity_progress_bar.update(1)
        except Exception as e:
            console.print(f"Error for {row[entity_column]}: {e}", style="bold red")
            error_df = pd.DataFrame([{entity_column: row[entity_column], "error_time": time.time(), "error_url": row.url}])
            if os.path.exists(error_file_path):
                error_df.to_csv(error_file_path, mode='a', header=False, index=False)
            else:
                error_df.to_csv(error_file_path, index=False)
            entity_progress_bar.update(1)
            continue

    # Read in all temporary files
    # combined_entity_df = read_combine_files(temp_entity_dir)
    entity_progress_bar.close()

In [27]:
entity_type = "orgs"
potential_new_entities_df = search_org_queries_df.drop_duplicates(subset=['login'])
potential_new_entities_df = potential_new_entities_df[0:5]
temp_entity_dir = f"{data_directory_path}/historic_data/entity_files/all_orgs/"
entity_progress_bar = tqdm(total=potential_new_entities_df.shape[0], desc="Processing entities")
error_file_path = f"{data_directory_path}/error_logs/org_errors.csv"


Processing entities:   0%|          | 0/5 [00:00<?, ?it/s]

In [40]:
groups = search_user_queries_df[search_user_queries_df.login.str.contains("Zoe")].groupby(["login"])

for _, group in groups:
    print((group.drop(columns=["coding_dh_date", "coding_dh_id"]).nunique() > 1).any())

False


In [31]:
search_org_queries_df[search_org_queries_df.login == "Digitaalhumanitaaria"].to_dict()

{'login': {0: 'Digitaalhumanitaaria'},
 'id': {0: 18483032},
 'node_id': {0: 'MDEyOk9yZ2FuaXphdGlvbjE4NDgzMDMy'},
 'avatar_url': {0: 'https://avatars.githubusercontent.com/u/18483032?v=4'},
 'gravatar_id': {0: nan},
 'url': {0: 'https://api.github.com/users/Digitaalhumanitaaria'},
 'html_url': {0: 'https://github.com/Digitaalhumanitaaria'},
 'followers_url': {0: 'https://api.github.com/users/Digitaalhumanitaaria/followers'},
 'following_url': {0: 'https://api.github.com/users/Digitaalhumanitaaria/following{/other_user}'},
 'gists_url': {0: 'https://api.github.com/users/Digitaalhumanitaaria/gists{/gist_id}'},
 'starred_url': {0: 'https://api.github.com/users/Digitaalhumanitaaria/starred{/owner}{/repo}'},
 'subscriptions_url': {0: 'https://api.github.com/users/Digitaalhumanitaaria/subscriptions'},
 'organizations_url': {0: 'https://api.github.com/users/Digitaalhumanitaaria/orgs'},
 'repos_url': {0: 'https://api.github.com/users/Digitaalhumanitaaria/repos'},
 'events_url': {0: 'https://ap

In [28]:
get_test_new_entities(entity_type, potential_new_entities_df, temp_entity_dir, entity_progress_bar, error_file_path)

Processing entities:   0%|          | 0/5 [00:04<?, ?it/s]

Grouping files: 100%|██████████| 1/1 [00:00<00:00, 75.18it/s]
Processing entities:  20%|██        | 1/5 [00:05<00:20,  5.23s/it]

Grouping files: 100%|██████████| 1/1 [00:00<00:00, 62.94it/s]
Processing entities:  40%|████      | 2/5 [00:05<00:07,  2.37s/it]

Grouping files: 100%|██████████| 1/1 [00:00<00:00, 66.86it/s]
Processing entities:  60%|██████    | 3/5 [00:05<00:02,  1.47s/it]

Grouping files: 100%|██████████| 1/1 [00:00<00:00, 62.76it/s]
Processing entities:  80%|████████  | 4/5 [00:06<00:01,  1.04s/it]

Grouping files: 100%|██████████| 1/1 [00:00<00:00, 91.51it/s]
Processing entities: 100%|██████████| 5/5 [00:06<00:00,  1.35s/it]


In [32]:
test = pd.read_csv("../../new_datasets/historic_data/entity_files/all_orgs/Digitaalhumanitaaria_coding_dh_orgs.csv")

FileNotFoundError: [Errno 2] No such file or directory: '../../new_datasets/historic_data/entity_files/all_orgs/Digitaalhumanitaaria_coding_dh_orgs.csv'