# Process Initial Results

In [112]:
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
import os
import sys
import warnings
# NOTE(review): silences every Python warning in this kernel, including deprecation
# warnings raised by pandas or the project helpers.
warnings.filterwarnings('ignore')

# Put the parent directory on the import path; presumably that is where the
# `data_generation_scripts` package imported below lives — confirm.
sys.path.append("../")
# NOTE(review): star imports. Helpers used later in this notebook without an explicit
# import (e.g. get_data_directory_path, create_queries_directories) presumably come
# from one of these two modules.
from data_generation_scripts.general_utils import *
from data_generation_scripts.generate_entity_metadata import *

In [107]:
# Ask the project helper for the dataset root (presumably provided by the star imports
# at the top of the notebook); the bare name on the last line displays the value.
data_directory_path = get_data_directory_path()
data_directory_path

'../../new_datasets'

### Create Initial Queries Datasets and Get Entities

In [78]:
# Root of the dataset tree, relative to this notebook.
# NOTE(review): hard-coded here even though an earlier cell obtains the same value
# from get_data_directory_path(); kept so this cell can run on its own.
data_directory_path = "../../new_datasets"
# Only terms whose `search_term_source` is one of these fields are kept.
target_terms: list = ["Public History", "Digital History", "Digital Cultural Heritage", "Cultural Analytics", "Computational Humanities", "Computational Social Science", "Digital Humanities"]

# Load in the translated terms
cleaned_terms = pd.read_csv(f'{data_directory_path}/derived_files/grouped_cleaned_translated_terms.csv', encoding='utf-8-sig')

# When the file carries a `keep_term` flag, keep only the rows flagged True.
if 'keep_term' in cleaned_terms.columns:
    cleaned_terms = cleaned_terms[cleaned_terms.keep_term == True]
# check if columns need renaming
columns_to_rename = ['code', 'term', 'term_source']
if all(elem in cleaned_terms.columns for elem in columns_to_rename):
    cleaned_terms = cleaned_terms.rename(columns={'code': 'natural_language', 'term': 'search_term', 'term_source': 'search_term_source'})
cleaned_terms = cleaned_terms[cleaned_terms.search_term_source.isin(target_terms)]
cleaned_terms = cleaned_terms.reset_index(drop=True)

# Decode the HTML entity "&#39;" back into a plain apostrophe.
# `na=False` keeps the mask strictly boolean when a search term is missing (a NaN in
# the mask would make `.loc` raise), and `regex=False` makes both calls literal
# matches regardless of the pandas version's default.
has_escaped_apostrophe = cleaned_terms.search_term.str.contains("&#39;", regex=False, na=False)
cleaned_terms.loc[has_escaped_apostrophe, "search_term"] = (
    cleaned_terms.loc[has_escaped_apostrophe, "search_term"].str.replace("&#39;", "'", regex=False)
)
# Lower-cased copy of each search term.
cleaned_terms['lower_search_term'] = cleaned_terms.search_term.str.lower()

# The "user" query results carry a `type` column that separates organization accounts
# from personal accounts, so a single load feeds both frames below.
account_results = create_queries_directories("user", cleaned_terms)
# Evaluated after the helper has run, in case it touches `cleaned_terms`.
retained_sources = cleaned_terms.search_term_source.unique()
account_in_scope = account_results.search_term_source.isin(retained_sources)

search_org_queries_df = account_results[(account_results['type'] == 'Organization') & account_in_scope]
search_user_queries_df = account_results[(account_results['type'] == 'User') & account_in_scope]

# Repository results get the same source filter; the sources are re-read after this
# helper call as well.
repo_results = create_queries_directories("repo", cleaned_terms)
repo_in_scope = repo_results.search_term_source.isin(cleaned_terms.search_term_source.unique())
search_repo_queries_df = repo_results[repo_in_scope]

Walking through directories: 9it [00:00, 436.49it/s]
Processing queries:   0%|          | 0/17 [00:00<?, ?it/s]

Processing queries: 100%|██████████| 17/17 [00:00<00:00, 303.18it/s]
Walking through directories: 9it [00:00, 276.01it/s]
Processing queries: 100%|██████████| 74/74 [00:00<00:00, 98.60it/s] 


In [79]:
# NOTE(review): this entire cell is commented out, so running the notebook does not
# fetch any entity files. Presumably it was run once to populate the
# `historic_data/entity_files/all_*` directories — confirm before re-enabling.
# `tqdm` and `get_new_entities` are not imported by name in this notebook; presumably
# they come from the star imports at the top.

# entity_type = "repos"
# potential_new_entities_df = search_repo_queries_df.drop_duplicates(subset=['full_name'])
# temp_entity_dir = f"{data_directory_path}/historic_data/entity_files/all_repos/"
# entity_progress_bar = tqdm(total=potential_new_entities_df.shape[0], desc="Processing entities")
# error_file_path = f"{data_directory_path}/error_logs/repo_errors.csv"
# get_new_entities(entity_type, potential_new_entities_df, temp_entity_dir, entity_progress_bar, error_file_path)

# entity_type = "orgs"
# potential_new_entities_df = search_org_queries_df.drop_duplicates(subset=['login'])
# temp_entity_dir = f"{data_directory_path}/historic_data/entity_files/all_orgs/"
# entity_progress_bar = tqdm(total=potential_new_entities_df.shape[0], desc="Processing entities")
# error_file_path = f"{data_directory_path}/error_logs/org_errors.csv"
# get_new_entities(entity_type, potential_new_entities_df, temp_entity_dir, entity_progress_bar, error_file_path)

# entity_type = "users"
# potential_new_entities_df = search_user_queries_df.drop_duplicates(subset=['login'])
# temp_entity_dir = f"{data_directory_path}/historic_data/entity_files/all_users/"
# entity_progress_bar = tqdm(total=potential_new_entities_df.shape[0], desc="Processing entities")
# error_file_path = f"{data_directory_path}/error_logs/user_errors.csv"
# get_new_entities(entity_type, potential_new_entities_df, temp_entity_dir, entity_progress_bar, error_file_path)

In [80]:
# Number of repository search rows per originating search-term source.
search_repo_queries_df["search_term_source"].value_counts()

Digital Humanities              4018
Digital History                 1141
Computational Social Science     786
Cultural Analytics               473
Computational Humanities         354
Public History                    94
Digital Cultural Heritage         44
Name: search_term_source, dtype: int64

In [81]:
# (unique users, user rows, unique orgs, org rows, unique repos, repo rows)
(
    search_user_queries_df["login"].nunique(),
    len(search_user_queries_df),
    search_org_queries_df["login"].nunique(),
    len(search_org_queries_df),
    search_repo_queries_df["full_name"].nunique(),
    len(search_repo_queries_df),
)

(1262, 1952, 246, 426, 4528, 6910)

In [82]:
def _entity_name_from_file(file_name: str) -> str:
    """Return the part of an entity file name that precedes its `_coding_dh_` suffix.

    The split is taken from the right so that a name which itself contains
    `_coding_dh_` is kept whole instead of being cut at its first occurrence.
    A file name without the marker is returned unchanged.
    """
    return file_name.rsplit("_coding_dh_", 1)[0]


# Entity files currently on disk, one directory per entity type.
user_files = os.listdir(f"{data_directory_path}/historic_data/entity_files/all_users/")
org_files = os.listdir(f"{data_directory_path}/historic_data/entity_files/all_orgs/")
repo_files = os.listdir(f"{data_directory_path}/historic_data/entity_files/all_repos/")

# Recover the entity identifiers from the CSV file names; other files are ignored.
cleaned_user_files = [_entity_name_from_file(f) for f in user_files if f.endswith(".csv")]
cleaned_org_files = [_entity_name_from_file(f) for f in org_files if f.endswith(".csv")]
# Repo file names store `owner/repo` with the slash written as an underscore, so the
# first underscore is turned back into a slash.
# Assumes owner logins never contain an underscore — TODO confirm.
cleaned_repo_files = [_entity_name_from_file(f).replace("_", "/", 1) for f in repo_files if f.endswith(".csv")]


In [83]:
# Flag the search rows whose entity already has a file on disk.
user_on_disk = search_user_queries_df["login"].isin(cleaned_user_files)
org_on_disk = search_org_queries_df["login"].isin(cleaned_org_files)
repo_on_disk = search_repo_queries_df["full_name"].isin(cleaned_repo_files)

# Keep only those rows.
existing_search_user_queries_df = search_user_queries_df.loc[user_on_disk]
existing_search_org_queries_df = search_org_queries_df.loc[org_on_disk]
existing_search_repo_queries_df = search_repo_queries_df.loc[repo_on_disk]

In [84]:
# (unique users, user rows, unique orgs, org rows, unique repos, repo rows)
# for the search hits that have an entity file on disk.
(
    existing_search_user_queries_df["login"].nunique(),
    len(existing_search_user_queries_df),
    existing_search_org_queries_df["login"].nunique(),
    len(existing_search_org_queries_df),
    existing_search_repo_queries_df["full_name"].nunique(),
    len(existing_search_repo_queries_df),
)

(1262, 1952, 246, 426, 4525, 6907)

In [85]:
# Users: distinct logins in order of first appearance, then their entity file names.
finalized_user_logins = existing_search_user_queries_df["login"].unique().tolist()
finalized_user_files = [
    f"{login}_coding_dh_user.csv"
    for login in finalized_user_logins
]

# Organizations: same pattern with the org suffix.
finalized_org_logins = existing_search_org_queries_df["login"].unique().tolist()
finalized_org_files = [
    f"{login}_coding_dh_org.csv"
    for login in finalized_org_logins
]

# Repositories: every "/" in the full name is written as "_" in the file name.
finalized_repo_full_names = existing_search_repo_queries_df["full_name"].unique().tolist()
finalized_repo_files = [
    f"{full_name.replace('/', '_')}_coding_dh_repo.csv"
    for full_name in finalized_repo_full_names
]

The initial core datasets will consist of the following:

- `core_repos`: A list of all repos that were returned by the search queries
- `core_users`: A list of all users that were returned by the search queries
- `core_orgs`: A list of all orgs that were returned by the search queries