# Process Initial Results

In [1]:
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
import os
import sys
import warnings
# Silence every Python warning raised in this kernel session.
warnings.filterwarnings('ignore')

# Add the parent directory to the import path so the project package below resolves.
sys.path.append("../")
# NOTE(review): star import; read_csv_file and check_add_new_entities, which are used
# further down without being defined in this notebook, presumably come from here —
# confirm, and consider importing them by name.
from data_generation_scripts.general_utils import *

In [2]:
# Temporary data directory, relative to this notebook's working directory.
path_temp = "../../datasets/temp"
# Last expression of the cell: displays whether that path exists.
os.path.exists(path_temp)

True

In [4]:
# Walk the temp directory and report every leaf directory (one with no
# sub-directories) together with the number of files it holds.
# The first loop variable is named `dirpath` rather than `dir` so the built-in
# dir() is not shadowed for the rest of the kernel session.
for dirpath, subdir, files in os.walk(path_temp):
    if not subdir:
        print(dirpath, subdir, len(files))

Once you've run `generate_expanded_search_data.py` and then `check_clean_search_results.py`, you'll have a series of files in the `data/` directory that contain the results of your search. This notebook helps you process those results into a single file that can be used for analysis.

Example of how to run `generate_expanded_search_data.py`:

```python3
rates_df = check_rate_limit()
initial_repo_output_path = "../data/repo_data/"
repo_output_path = "../data/large_files/entity_files/repos_dataset.csv"
repo_join_output_path = "../data/large_files/join_files/search_queries_repo_join_dataset.csv"

initial_user_output_path = "../data/user_data/"
user_output_path = "../data/entity_files/users_dataset.csv"
user_join_output_path = "../data/join_files/search_queries_user_join_dataset.csv"
load_existing_data = False
overwrite_existing_temp_files = False
org_output_path = "../data/entity_files/orgs_dataset.csv"

get_initial_search_datasets(rates_df, initial_repo_output_path,  repo_output_path, repo_join_output_path, initial_user_output_path, user_output_path, user_join_output_path, org_output_path, overwrite_existing_temp_files, load_existing_data)
```

Once that has finished, run `check_clean_search_results.py`.

### Create Initial Core Results

In [70]:
# Root of the dataset tree; every path in this cell is built relative to it.
data_directory_path = "../../datasets"
# Entity tables: users and repos are read from large_files/entity_files/,
# orgs from entity_files/.
user_df = read_csv_file(f"{data_directory_path}/large_files/entity_files/users_dataset.csv")
repo_df = read_csv_file(f"{data_directory_path}/large_files/entity_files/repos_dataset.csv")
org_df = read_csv_file(f"{data_directory_path}/entity_files/orgs_dataset.csv")
# Search-query join tables, read from the derived "subset_dh" files.
search_queries_repo_join_df = read_csv_file(f"{data_directory_path}/derived_files/initial_search_queries_repo_join_subset_dh_dataset.csv")
search_queries_user_join_df = read_csv_file(f"{data_directory_path}/derived_files/initial_search_queries_user_join_subset_dh_dataset.csv")

In [71]:
# Paths of the derived search-query join files.
repo_join_output_path = f"{data_directory_path}/derived_files/initial_search_queries_repo_join_subset_dh_dataset.csv"
user_join_output_path = f"{data_directory_path}/derived_files/initial_search_queries_user_join_subset_dh_dataset.csv"

# NOTE(review): these are the same two files that are read into
# search_queries_repo_join_df / search_queries_user_join_df when the entity tables
# are loaded; the copies created here are not referenced by the cells that follow
# in this notebook — confirm whether this second read is still needed.
search_queries_repo_df = read_csv_file(repo_join_output_path)
search_queries_user_df = read_csv_file(user_join_output_path)



The initial core datasets comprise the following:

- `core_repos`: A list of all repos that were returned by the search query
- `core_users`: A list of all users that were returned by the search query
- `core_orgs`: A list of all orgs that were returned by the search query

#### Check if all items exist in entity files

In [72]:
# Boolean masks: True where a search hit has no matching row in the entity table.
repo_not_in_entities = ~search_queries_repo_join_df['full_name'].isin(repo_df['full_name'])
account_not_in_entities = ~search_queries_user_join_df['login'].isin(user_df['login'])
account_type = search_queries_user_join_df['type']

missing_repos = search_queries_repo_join_df[repo_not_in_entities]
# NOTE(review): both users and organizations are looked up in user_df (not org_df);
# confirm that user_df is meant to hold both account types.
missing_users = search_queries_user_join_df[account_not_in_entities & (account_type == 'User')]
missing_orgs = search_queries_user_join_df[account_not_in_entities & (account_type == 'Organization')]

# Cell output: number of missing repos, users and orgs.
len(missing_repos), len(missing_users), len(missing_orgs)

(0, 3, 12)

In [74]:
# For each entity kind that has search hits missing from its entity table, hand the
# missing rows and the path of the matching entity file to check_add_new_entities
# and rebind the in-memory frame to whatever it returns.
# NOTE(review): the two trailing positional flags (True, False) are passed as-is;
# their meaning is defined by check_add_new_entities — confirm against its signature.
if len(missing_repos) > 0:
    repo_df = check_add_new_entities(missing_repos, f'{data_directory_path}/large_files/entity_files/repos_dataset.csv', 'repos', True, False)
if len(missing_orgs) > 0:
    org_df = check_add_new_entities(missing_orgs, f'{data_directory_path}/entity_files/orgs_dataset.csv', 'orgs', True, False)
if len(missing_users) > 0:
    # Target the same users file that user_df is loaded from (under large_files/);
    # pointing at entity_files/users_dataset.csv would update a different file
    # than the one the missing-user check was run against.
    user_df = check_add_new_entities(missing_users, f'{data_directory_path}/large_files/entity_files/users_dataset.csv', 'users', True, False)

In [75]:
# Core repos: entity rows that also appear in the repo search-query join table,
# enriched with its language / keep flags and reduced to one row per full_name.
core_repos = (
    repo_df
    .merge(
        search_queries_repo_join_df[['full_name', 'finalized_language', 'keep_resource']],
        on='full_name',
        how='inner',
    )
    .drop_duplicates(subset=['full_name'])
)

# Same construction for accounts, keyed on login; one row per login.
core_users = (
    user_df
    .merge(
        search_queries_user_join_df[['login', 'finalized_language', 'keep_resource']],
        on='login',
        how='inner',
    )
    .drop_duplicates(subset=['login'])
)

# Split the account rows by type. Orgs are taken first because core_users is
# narrowed to 'User' rows on the following line.
core_orgs = core_users[core_users['type'] == 'Organization']
core_users = core_users[core_users['type'] == 'User']

# Cell output: row counts of the three core frames.
len(core_repos), len(core_users), len(core_orgs)

In [None]:
# Output locations of the three core datasets.
core_users_path = f"{data_directory_path}/derived_files/core_users_dataset.csv"
core_repos_path = f"{data_directory_path}/derived_files/core_repos_dataset.csv"
core_orgs_path = f"{data_directory_path}/derived_files/core_orgs_dataset.csv"

# When a core users file already exists, fold its contents into core_users.
if os.path.exists(core_users_path):
    existing_core_users = pd.read_csv(core_users_path)

    # Columns that exist in the saved file but not in the freshly built frame.
    missing_cols = [name for name in existing_core_users.columns if name not in core_users.columns]
    if missing_cols:
        # 'login' is appended so the extra columns can be joined back on it.
        missing_cols = [*missing_cols, 'login']
        added_cols = existing_core_users[missing_cols]
        core_users = (
            core_users
            .merge(added_cols, on='login', how='left')
            .drop_duplicates(subset=['login'])
        )

    # Only when the fresh frame has more rows than the saved one: keep every saved
    # row and append the rows whose login is not in the saved file.
    if len(existing_core_users) < len(core_users):
        updated_core_users = core_users[~core_users['login'].isin(existing_core_users['login'])]
        core_users = pd.concat([existing_core_users, updated_core_users])

In [132]:
# Set to True to replace core dataset files that already exist on disk.
overwrite_files = False

# Write each core frame to its path, but only if the file is absent or
# overwriting has been requested.
for output_path, core_df in (
    (core_repos_path, core_repos),
    (core_users_path, core_users),
    (core_orgs_path, core_orgs),
):
    if not os.path.exists(output_path) or overwrite_files:
        core_df.to_csv(output_path, index=False)