# Process Initial Results

In [17]:
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.options.mode.chained_assignment = None
import os
import sys
import warnings
# Silence every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Put the directory two levels up on the import path so the `data_generation_scripts`
# imports below resolve (presumably the repository root — TODO confirm).
sys.path.append("../../")
from data_generation_scripts.utils import check_rate_limit, check_add_orgs, check_add_repos, check_add_users
from data_generation_scripts.generate_expanded_search_data import get_initial_search_datasets
from data_generation_scripts.generate_repo_metadata import get_repo_languages, get_repo_labels, get_repo_tags,  get_repo_profile, get_total_commits
from data_generation_scripts.generate_repo_users_interactions import get_repos_user_actors
from data_generation_scripts.generate_repo_metadata import check_total_results

Once you've run `generate_expanded_search_data.py` and then `check_clean_search_results.py`, you'll have a series of files in the `data/` directory that contain the results of your search. This notebook helps you process those results into a single file that can be used for analysis.

Example of how to run `generate_expanded_search_data.py`:

```python3
rates_df = check_rate_limit()
initial_repo_output_path = "../data/repo_data/"
repo_output_path = "../data/large_files/entity_files/repos_dataset.csv"
repo_join_output_path = "../data/large_files/join_files/search_queries_repo_join_dataset.csv"

initial_user_output_path = "../data/user_data/"
user_output_path = "../data/entity_files/users_dataset.csv"
user_join_output_path = "../data/join_files/search_queries_user_join_dataset.csv"
load_existing_data = False
overwrite_existing_temp_files = False
org_output_path = "../data/entity_files/orgs_dataset.csv"

get_initial_search_datasets(rates_df, initial_repo_output_path,  repo_output_path, repo_join_output_path, initial_user_output_path, user_output_path, user_join_output_path, org_output_path, overwrite_existing_temp_files, load_existing_data)
```

Then run `check_clean_search_results.py`.

### Create Initial Core Results

In [4]:
# Entity tables: one CSV each for users, repos and orgs.
user_df = pd.read_csv(
    "../../data/entity_files/users_dataset.csv"
)
repo_df = pd.read_csv(
    "../../data/large_files/entity_files/repos_dataset.csv",
    low_memory=False,
)
org_df = pd.read_csv(
    "../../data/entity_files/orgs_dataset.csv",
    low_memory=False,
)

# Search-query join tables from the derived (updated) DH subset files.
search_queries_repo_join_df = pd.read_csv(
    "../../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv"
)
search_queries_user_join_df = pd.read_csv(
    "../../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv"
)

The initial core datasets consist of the following:

- `core_repos`: A list of all repos that were returned by the search query
- `core_users`: A list of all users that were returned by the search query
- `core_orgs`: A list of all orgs that were returned by the search query

#### Check if all items exist in entity files

In [16]:
# Join rows whose repo is not present in the repo entity file.
missing_repos = search_queries_repo_join_df.loc[
    ~search_queries_repo_join_df['full_name'].isin(repo_df['full_name'])
]

# Join rows typed 'User' whose login is not present in the user entity file.
missing_users = search_queries_user_join_df.loc[
    (search_queries_user_join_df['type'] == 'User')
    & ~search_queries_user_join_df['login'].isin(user_df['login'])
]

# Join rows typed 'Organization' whose login is not present in the user entity file.
# NOTE(review): this compares against `user_df`, not `org_df` — confirm that the users
# file is the intended reference for organizations.
missing_orgs = search_queries_user_join_df.loc[
    (search_queries_user_join_df['type'] == 'Organization')
    & ~search_queries_user_join_df['login'].isin(user_df['login'])
]

(len(missing_repos), len(missing_users), len(missing_orgs))

(23, 0, 0)

In [18]:
# For each entity type with missing rows, call the matching project helper on its
# entity CSV; the helper's return value replaces the in-memory dataframe.
if len(missing_repos):
    repo_df = check_add_repos(
        missing_repos,
        '../../data/large_files/entity_files/repos_dataset.csv',
        True,
    )
if len(missing_orgs):
    org_df = check_add_orgs(
        missing_orgs,
        '../../data/entity_files/orgs_dataset.csv',
        True,
        False,
    )
if len(missing_users):
    user_df = check_add_users(
        missing_users,
        '../../data/entity_files/users_dataset.csv',
        True,
        False,
    )

FileNotFoundError: [Errno 2] No such file or directory: '../data/metadata_files/repo_headers.csv'

: 

In [15]:
# Core repos: every repo entity whose full_name appears in the repo search-query join.
core_repos = repo_df.loc[
    repo_df['full_name'].isin(search_queries_repo_join_df['full_name'].unique())
]

# (core repos found, join rows, distinct full_names in the join)
(
    len(core_repos),
    len(search_queries_repo_join_df),
    search_queries_repo_join_df['full_name'].nunique(),
)

(2264, 2549, 2279)

In [13]:
# Number of repo join rows with no value in `keep_resource`.
search_queries_repo_join_df['keep_resource'].isnull().sum()

342

In [11]:
# Language columns for core-repo join rows whose `keep_resource` is anything other
# than True (missing values included, since NaN != True).
search_queries_repo_join_df.loc[
    search_queries_repo_join_df['full_name'].isin(core_repos['full_name'].unique())
    & (search_queries_repo_join_df['keep_resource'] != True),
    ['natural_language', 'finalized_language', 'keep_resource'],
]

Unnamed: 0,natural_language,finalized_language,keep_resource
2149,en,en,
2150,"en, ny, ha, ig, lb, mg, sm, sn, st, tl, yo",en,
2152,"en, ny, ha, ig, lb, mg, sm, sn, st, tl, yo",en,
2154,"en, ny, ha, ig, lb, mg, sm, sn, st, tl, yo",en,
2155,"en, ny, ha, ig, lb, mg, sm, sn, st, tl, yo",en,
...,...,...,...
2544,ko,ko,
2545,ko,ko,
2546,"en, ny, ha, ig, lb, mg, sm, sn, st, tl, yo",en,
2547,"en, ny, ha, ig, lb, mg, sm, sn, st, tl, yo",en,


In [5]:
# Search-term sources to keep when subsetting the existing query results.
subset_terms = ["Digital Humanities"]

# Entity files.
repo_output_path = "../../data/large_files/entity_files/repos_dataset.csv"
user_output_path = "../../data/entity_files/users_dataset.csv"
org_output_path = "../../data/entity_files/orgs_dataset.csv"

# Directories for the initial repo and user search output.
initial_repo_output_path = "../../data/repo_data/"
initial_user_output_path = "../../data/user_data/"

# Initial search-query join files.
initial_repo_join_output_path = "../../data/large_files/join_files/search_queries_repo_join_dataset.csv"
initial_user_join_output_path = "../../data/join_files/search_queries_user_join_dataset.csv"

# Existing (updated, DH-subset) search-query join files.
existing_search_queries_repo_file_path = "../../data/derived_files/updated_search_queries_repo_join_subset_dh_dataset.csv"
existing_search_queries_user_file_path = "../../data/derived_files/updated_search_queries_user_join_subset_dh_dataset.csv"

In [6]:
# Column that identifies a query in the join tables, and the columns passed as
# `filter_fields` to the older-queries check.
join_unique_field = 'search_query'
filter_fields = ['id', 'cleaned_search_query']

# Join file names handed to the older-queries check.
repo_join_output_path = "search_queries_repo_join_dataset.csv"
user_join_output_path = "search_queries_user_join_dataset.csv"

In [8]:
def _clean_search_query(search_query: pd.Series) -> pd.Series:
    """Return a cleaned copy of a column of raw search-query strings.

    Removes URL-encoded (``%22``) and literal double quotes, decodes ``%3A`` to
    ``:``, and keeps only the text before the first ``&page``.

    Every replacement is a literal match (``regex=False``), so the result does not
    depend on the pandas version's default for ``regex``.
    """
    return (
        search_query
        .str.replace('%22', '', regex=False)
        .str.replace('"', '', regex=False)
        .str.replace('%3A', ':', regex=False)
        .str.split('&page')
        .str[0]
    )


def _load_subset_search_queries(file_path: str, terms: list) -> pd.DataFrame:
    """Read a search-query join CSV, keep rows from `terms`, add `cleaned_search_query`.

    `.assign` builds a new frame, so the column is never written onto a filtered
    slice of the frame returned by `read_csv`.
    """
    return (
        pd.read_csv(file_path)
        .loc[lambda df: df.search_term_source.isin(terms)]
        .assign(cleaned_search_query=lambda df: _clean_search_query(df['search_query']))
    )


# Users and repos go through exactly the same subsetting and cleaning.
existing_search_queries_user_df = _load_subset_search_queries(existing_search_queries_user_file_path, subset_terms)
existing_search_queries_repo_df = _load_subset_search_queries(existing_search_queries_repo_file_path, subset_terms)

# NOTE(review): `check_for_joins_in_older_queries` is not imported in the notebook's
# import cell; confirm which project module provides it and import it there.
updated_search_queries_repo_df = check_for_joins_in_older_queries(repo_join_output_path, existing_search_queries_repo_df, join_unique_field, filter_fields)
updated_search_queries_user_df = check_for_joins_in_older_queries(user_join_output_path, existing_search_queries_user_df, join_unique_field, filter_fields)
# updated_search_queries_repo_df = updated_search_queries_repo_df.drop_duplicates(subset=['id', 'cleaned_search_query'])
# updated_search_queries_user_df = updated_search_queries_user_df.drop_duplicates(subset=['id', 'cleaned_search_query'])