# Process Initial Results

In [1]:
# Notebook setup: library imports, warning suppression, and project-local helpers.
import pandas as pd
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
import os
import sys
import warnings
# Silence every Python warning raised from this point on; comment out while debugging.
warnings.filterwarnings('ignore')

# Add the parent directory to the import path so the `data_generation_scripts`
# imports below can resolve (presumably the package lives one level up — confirm).
sys.path.append("../")
from data_generation_scripts.utils import check_rate_limit, check_add_orgs, check_add_repos, check_add_users
from data_generation_scripts.generate_expanded_search_data import get_initial_search_datasets
from data_generation_scripts.generate_repo_metadata import get_repo_languages, get_repo_labels, get_repo_tags,  get_repo_profile, get_total_commits
from data_generation_scripts.generate_repo_users_interactions import get_repos_user_actors
# NOTE(review): second import from `generate_repo_metadata`; could be merged into the line above.
from data_generation_scripts.generate_repo_metadata import check_total_results

Once you've run `generate_expanded_search_data.py` and then `check_clean_search_results.py`, you'll have a series of files in the `data/` directory that contain the results of your search. This notebook processes those results into the initial core datasets (repos, users, and orgs) that can be used for analysis.

Example of how to run `generate_expanded_search_data.py`:

```python3
rates_df = check_rate_limit()
initial_repo_output_path = "../data/repo_data/"
repo_output_path = "../data/large_files/entity_files/repos_dataset.csv"
repo_join_output_path = "../data/large_files/join_files/search_queries_repo_join_dataset.csv"

initial_user_output_path = "../data/user_data/"
user_output_path = "../data/entity_files/users_dataset.csv"
user_join_output_path = "../data/join_files/search_queries_user_join_dataset.csv"
load_existing_data = False
overwrite_existing_temp_files = False
org_output_path = "../data/entity_files/orgs_dataset.csv"

get_initial_search_datasets(rates_df, initial_repo_output_path,  repo_output_path, repo_join_output_path, initial_user_output_path, user_output_path, user_join_output_path, org_output_path, overwrite_existing_temp_files, load_existing_data)
```

Then run `check_clean_search_results.py`.

### Create Initial Core Results

In [2]:
# Entity tables (users, repos, orgs). `low_memory=False` makes pandas parse each
# file in a single pass so column dtypes are inferred from the whole column.
user_df = pd.read_csv(
    "../data/entity_files/users_dataset.csv",
)
repo_df = pd.read_csv(
    "../data/large_files/entity_files/repos_dataset.csv",
    low_memory=False,
)
org_df = pd.read_csv(
    "../data/entity_files/orgs_dataset.csv",
    low_memory=False,
)

# Search-query join subsets (repo side and user side, per the file names).
search_queries_repo_join_df = pd.read_csv(
    "../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv",
)
search_queries_user_join_df = pd.read_csv(
    "../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv",
)

The initial core datasets consist of the following:

- `core_repos`: A list of all repos that were returned by the search query
- `core_users`: A list of all users that were returned by the search query
- `core_orgs`: A list of all orgs that were returned by the search query

#### Check if all items exist in entity files

In [3]:
# Find search-result rows whose key is absent from the matching entity table.
# Repos are matched on `full_name`; users and orgs are matched on `login` and
# separated by the `type` column of the user join table.
missing_repos = search_queries_repo_join_df[
    ~search_queries_repo_join_df.full_name.isin(repo_df.full_name)
]
missing_users = search_queries_user_join_df[
    (~search_queries_user_join_df.login.isin(user_df.login))
    & (search_queries_user_join_df['type'] == 'User')
]
# Organizations are compared against the org entity table (`org_df`), the same
# table that `check_add_orgs` later extends. The previous version compared them
# against `user_df`, so an org missing from `orgs_dataset.csv` went undetected.
# NOTE(review): assumes `org_df` exposes a `login` column like `user_df` — confirm.
missing_orgs = search_queries_user_join_df[
    (~search_queries_user_join_df.login.isin(org_df.login))
    & (search_queries_user_join_df['type'] == 'Organization')
]

# Displayed as the cell output: (missing repos, missing users, missing orgs).
len(missing_repos), len(missing_users), len(missing_orgs)

(23, 0, 0)

In [4]:
# Backfill each entity file only when the check above found missing rows.
# The helpers come from `data_generation_scripts.utils`; their returned frame
# replaces the in-memory table. The positional boolean flags are passed
# unchanged (presumably "return the dataframe" / "overwrite temp files" —
# TODO confirm against the helper signatures).
if len(missing_repos):
    repo_df = check_add_repos(
        missing_repos,
        '../data/large_files/entity_files/repos_dataset.csv',
        True,
    )
if len(missing_orgs):
    org_df = check_add_orgs(
        missing_orgs,
        '../data/entity_files/orgs_dataset.csv',
        True,
        False,
    )
if len(missing_users):
    user_df = check_add_users(
        missing_users,
        '../data/entity_files/users_dataset.csv',
        True,
        False,
    )

Number of repos: 344846 1685119394.0542452
Number of new repos: 0 1685119394.108879
Number of repos: 344846 1685119395.7622678 inner else statement
Number of repos: 344846, checking if older file exists 1685119395.7625248
Repo file updated 1685119414.689198


In [10]:
# Core repos: entity rows that also appear in the repo search-query join table,
# annotated with `finalized_language` / `keep_resource`, one row per `full_name`.
core_repos = (
    repo_df
    .merge(
        search_queries_repo_join_df[['full_name', 'finalized_language', 'keep_resource']],
        on='full_name',
        how='inner',
    )
    .drop_duplicates(subset=['full_name'])
)

# Same construction for accounts, keyed on `login`; at this point the frame still
# mixes individual users and organizations.
core_users = (
    user_df
    .merge(
        search_queries_user_join_df[['login', 'finalized_language', 'keep_resource']],
        on='login',
        how='inner',
    )
    .drop_duplicates(subset=['login'])
)

# Split the combined frame by account type. Both selections are evaluated on the
# combined frame before either name is rebound.
core_orgs, core_users = (
    core_users[core_users['type'] == 'Organization'],
    core_users[core_users['type'] == 'User'],
)

# Displayed as the cell output: (repos, users, orgs).
len(core_repos), len(core_users), len(core_orgs)

(2264, 667, 126)

In [11]:
# Persist the three initial core datasets as CSV, omitting the DataFrame index.
core_users.to_csv(
    path_or_buf="../data/derived_files/initial_core_users.csv",
    index=False,
)
core_orgs.to_csv(
    path_or_buf="../data/derived_files/initial_core_orgs.csv",
    index=False,
)
core_repos.to_csv(
    path_or_buf="../data/derived_files/initial_core_repos.csv",
    index=False,
)