### Importing the Libraries
In this section we will be importing the required libraries to work on creating PHM calculation based on the Offline FS. 

In [1]:
import pandas as pd
import numpy as np
import datetime
import warnings
import ast
import pickle
import sys
from tqdm.auto import tqdm
import os
tqdm.pandas()
from pathlib import Path
import requests
import json
import config
from datetime import datetime

import os
warnings.filterwarnings("ignore")

from ranking_utility.use_cases.historical_feature_store_accessor_use_case import (
    HistoricalFeatureStoreAccessorUseCase,
)

from ranking_utility.features.utils import calculate_historical_features, calculate_features_on_subset
from ranking_utility.features import *

import sys
sys.path.append("../learning_better_ranking/vertex_ai_models/models/gbdt_ranker_v23")
import feature_engineering 

### Initializing Features
In this section, we are creating a dictionary of features that are part of the ranking model to score the developers

In [2]:
features = [
    InterviewScore(),
    AtLeast1YOERatio(),
    MeanRelevantMCQPercentile(),
    MeanAggregateMCQPercentile(), # ["mean_mcq_pct"] < 61
    MaxSkillsYOE(),
    MaxRelevantMCQPercentile(),
    MinRelevantMCQPercentile(), # ["min_relevant_mcq_pct"] < 40
    SumRelevantProjectWords(),
    AtLeast2YOERatio(),
    AtLeast3YOERatio(),
    StartsInWeeks(),
    HourlyRate(),
    SeniorityAverage(),
    MaxAggregateMCQPercentile(),
    TIACCScore(),
    TILCIScore(),
    TIOOOScore(), # ["ti_ooo_score"] < 5
    CosineSimilarityJob(),
    CosineSimilarityQuery(),
    CosineSimilarityQueryFinetuned(),
    CosineSimilarityJobFinetuned(),
    # TIType
]

### Skill - Challenges Mapping
In this section we are retrieving all the skills with the challenge IDs which will be utilized later to get the skill keywords for the respective TupleIDs

In [3]:
def get_challenges():
    """
    returns a dict mapping skills to challenges
    """
    skill_det = pd.io.gbq.read_gbq(
    f"""
    SELECT
        mjs.skill_id,
        bas.skill_name,
        ARRAY_AGG(DISTINCT msc.challenge_id IGNORE NULLS) AS challenge_ids,
    FROM
        `raw.ms2_job_skill` mjs
    LEFT JOIN
        `raw.ms2_skill_challenges` msc
    ON
        mjs.skill_id = msc.skill_id
    LEFT JOIN
        `raw.base_all_skills_v4` bas
    ON
        mjs.skill_id = bas.id
    GROUP BY
      1,2
    """,project_id='turing-230020')
    
    challenge_map = {}
    name_map = {}
    for skill in skill_det.skill_id.values:
        challenge_map[skill] = skill_det[skill_det.skill_id == skill].challenge_ids.values[0]
        name_map[skill] = skill_det[skill_det.skill_id == skill].skill_name.values[0]

    return challenge_map, name_map

challenge_map, name_map = get_challenges()

### Forecasted Jobs
In this section we are collecting the forecasted jobs along with the respective TupleIDs and required skills. 

In [4]:
# must have skills as a list of lists
job_skills = pd.io.gbq.read_gbq(
f"""
  WITH data_ AS (
    SELECT DISTINCT
      job_id, tupleId,  skill_name, CAST(s.id AS int64) AS skill_id
    FROM `turing-230020.product_ds_supply.forecasted_tuples_q4_2023_v0` , UNNEST(SPLIT(Skill_Tuple, '|'))skill_name 
    LEFT JOIN `turing-230020.raw.base_all_skills_v4` s USING(skill_name)
  )

    SELECT
      job_id, tupleId,
      `matchingmetrics`.structurize_skills(TO_JSON_STRING(ARRAY_AGG(DISTINCT skill_id
          ORDER BY
            skill_id))) AS skills
    FROM
      data_
    /*
    WHERE  
    tupleId IN (
      SELECT 
        DISTINCT tupleId 
      FROM `turing-230020.product_ds_supply.forecasted_tuples_q4_baseline_supply` 
      WHERE Prioritized_by_Fulfillment = 'Yes'
    )
    */
    GROUP BY
      job_id, tupleId
"""
,project_id='turing-230020')
job_skills

Unnamed: 0,job_id,tupleId,skills
0,11836,11836_11,"[[114],[189]]"
1,15228,15228_247,"[[268],[2091]]"
2,12506,12506_44,"[[54],[433]]"
3,13510,13510_134,[[268]]
4,15325,15325_261,[[61]]
...,...,...,...
317,15373,15373_271,"[[69],[511],[2091]]"
318,13619,13619_147,"[[411],[559],[710]]"
319,15272,15272_254,"[[160],[1561],[1849]]"
320,11585,11585_3,"[[20],[108],[268]]"


In [5]:
#Generating Skill keywords
job_skills.skills = job_skills.skills.map(ast.literal_eval)
job_skills['num_skills'] = job_skills.skills.map(len)
job_skills['g_skills'] = job_skills.skills.apply(lambda r: [[{"skillId":s,'keyword':name_map[s]} for s in skills] for skills in sorted(r)])
job_skills['keywords'] = job_skills.skills.apply(lambda r: [name_map[s] for skill in r for s in skill])
job_skills['skill_comb'] = job_skills.g_skills.apply(lambda r: str(r))

job_skills_dict = {job['job_id']:job['skills'] for id,job in job_skills.iterrows()}

In [6]:
outcome_jobs = list(job_skills.job_id.unique())

In [7]:
#job_skills = job_skills[0:1]

### Must-Have Skill Retrieval
Based on the must-have skills, need to retrieve developers from the Phase-2 Pool.

In [8]:
#Function Parameters
debug = False
all_features = False
retrieval_size = 50000
ranker_name = "No Ranking" # use "No Ranking" in case of no ranking
p1_or_p2 = "P2"

In [9]:
tqdm.monitor_interval = 0
resp = ""

results = pd.DataFrame({'tupleId':[], 'job_id':[],'dev_id':[],'score':[]})

for id,job in tqdm(job_skills.iterrows(),total=len(job_skills)):
    with requests.Session() as s:

        if p1_or_p2 == "P1":
            payload = json.load(open('p1_retrieval_payload_psl_retrieval.json', 'r'))
        if p1_or_p2 == "P2":
            payload = json.load(open('v20_api_payload_psl_retrieval.json', 'r'))


        
        payload['skillKeywordSearch']['mustHave'] = job['g_skills']

        payload['searchAnywhereParameters']['mustHaveWords'] = job['keywords']

        payload['jobId'] = job['job_id']#.astype(str)
        payload['hideACCCheater'] = True


        payload['featureFlags']['matching-ranker'] = ranker_name
        payload['pageSize'] = retrieval_size        
        payload['debug'] = debug
        

        json_payload = json.dumps(payload)
        content_length = str(len(json_payload))

        headers = {
            'Content-Type': 'application/json',
            'Content-Length': content_length,
            'authorization': config.bearer_token
        }

        resp = s.post(config.url, json_payload, headers = headers)
        try:
            
            if debug == False:
                resp_df = pd.DataFrame(json.loads(json.dumps(resp.json()['developers'])))
                resp_df.rename(columns = {'score' : 'score', 'userId':'dev_id'}, inplace = True)
                resp_df['dev_id'] = resp_df['dev_id'].astype(int)
                resp_df['score'] = resp_df['score'].astype(float)

            else:
                if all_features:
                    resp_df = pd.DataFrame([values for id, values in resp.json()['debug']['feature_df'].items()])
                    resp_df["dev_id"] = [id for id, values in resp.json()['debug']['feature_df'].items()]
                    resp_df.rename(columns = {'modelScore' : 'score'}, inplace = True)
                    


                else:
                    resp_df = pd.DataFrame([(id, values['modelScore']) for id, values in resp.json()['debug']['feature_df'].items()], columns=['dev_id', 'modelScore']) 
                    resp_df.rename(columns = {'modelScore' : 'score'}, inplace = True)
                    resp_df['dev_id'] = resp_df['dev_id'].astype(int)
                    resp_df['score'] = resp_df['score'].astype(float)
                
           


            resp_df['job_id'] = job['job_id']
            resp_df['tupleId'] = job['tupleId']
            
            #results = results.append(resp_df)
            results = pd.concat([results, resp_df], ignore_index=True)
            print(f"job: {job['job_id']} : Done")
        except Exception as e:
            print(e)
            print(f"job: {job['job_id']} : Error - {resp.status_code}: {resp.text}")
            continue
results.shape

  0%|          | 0/322 [00:00<?, ?it/s]

job: 11836 : Done
job: 15228 : Done
job: 12506 : Done
job: 13510 : Done
job: 15325 : Done
job: 12030 : Done
job: 12842 : Done
job: 12766 : Done
job: 15278 : Done
job: 15541 : Done
job: 12622 : Done
job: 12044 : Done
job: 15258 : Done
job: 12849 : Done
job: 14986 : Done
job: 15165 : Done
job: 13666 : Done
job: 11746 : Done
job: 11825 : Done
job: 12753 : Done
job: 12458 : Done
job: 12031 : Done
job: 14996 : Done
job: 15047 : Done
job: 11940 : Done
job: 15105 : Done
job: 12773 : Done
job: 15316 : Done
job: 12036 : Done
job: 12585 : Done
job: 15542 : Done
job: 12045 : Done
job: 15174 : Done
job: 13374 : Done
job: 13016 : Done
job: 14030 : Done
job: 12236 : Done
job: 13717 : Done
job: 13367 : Done
job: 14618 : Done
job: 14685 : Done
job: 12851 : Done
job: 15679 : Done
job: 14665 : Done
job: 13226 : Done
job: 15499 : Done
job: 13227 : Done
job: 15501 : Done
job: 13045 : Done
job: 13562 : Done
job: 13193 : Done
job: 11863 : Done
job: 15138 : Done
job: 15191 : Done
job: 12954 : Done
job: 15029

(2598119, 4)

In [10]:
results.head(5)

Unnamed: 0,tupleId,job_id,dev_id,score
0,11836_11,11836.0,3784659.0,146.63704
1,11836_11,11836.0,1287973.0,145.7633
2,11836_11,11836.0,1868788.0,145.51804
3,11836_11,11836.0,3555220.0,141.64862
4,11836_11,11836.0,2060045.0,134.48859


In [11]:
results['job_id'] = results['job_id'].astype(int)
results['developer_id'] = results['dev_id'].astype(int)
results['score'] = results['score'].astype(float)

results.head()

Unnamed: 0,tupleId,job_id,dev_id,score,developer_id
0,11836_11,11836,3784659.0,146.63704,3784659
1,11836_11,11836,1287973.0,145.7633,1287973
2,11836_11,11836,1868788.0,145.51804,1868788
3,11836_11,11836,3555220.0,141.64862,3555220
4,11836_11,11836,2060045.0,134.48859,2060045


In [12]:
results.shape

(2598119, 5)

In [13]:
backfill_date = '2023-11-29'
results['Extraction_Date'] = backfill_date
# Need to add Extraction date column in results @Bhavesh

In [14]:
results.head(5)

Unnamed: 0,tupleId,job_id,dev_id,score,developer_id,Extraction_Date
0,11836_11,11836,3784659.0,146.63704,3784659,2023-11-29
1,11836_11,11836,1287973.0,145.7633,1287973,2023-11-29
2,11836_11,11836,1868788.0,145.51804,1868788,2023-11-29
3,11836_11,11836,3555220.0,141.64862,3555220,2023-11-29
4,11836_11,11836,2060045.0,134.48859,2060045,2023-11-29


In [15]:
results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2598119 entries, 0 to 2598118
Data columns (total 6 columns):
 #   Column           Dtype  
---  ------           -----  
 0   tupleId          object 
 1   job_id           int64  
 2   dev_id           float64
 3   score            float64
 4   developer_id     int64  
 5   Extraction_Date  object 
dtypes: float64(2), int64(2), object(2)
memory usage: 118.9+ MB


In [16]:
all_outcomes = results.copy()
all_outcomes = job_skills.merge(all_outcomes,on=['tupleId', 'job_id'],how='left').fillna(0)
all_outcomes = all_outcomes.drop_duplicates(subset=['Extraction_Date', 'tupleId', 'job_id','developer_id'])
results_f = all_outcomes[['Extraction_Date', 'tupleId', 'job_id','developer_id']]
results_f.shape

(2598122, 4)

In [17]:
all_outcomes = results_f
all_outcomes['developer_id'] = all_outcomes['developer_id'].astype(int)
all_outcomes.head()

Unnamed: 0,Extraction_Date,tupleId,job_id,developer_id
0,2023-11-29,11836_11,11836,3784659
1,2023-11-29,11836_11,11836,1287973
2,2023-11-29,11836_11,11836,1868788
3,2023-11-29,11836_11,11836,3555220
4,2023-11-29,11836_11,11836,2060045


In [18]:
all_outcomes.shape

(2598122, 4)

### Build Data-Set for the Offline Feature Store
Need to build the above data-set, which can be used to generate the required features using the Offline FS.

In [19]:
# Ensure we found skills for the jobs we are looking at here
all_outcomes = all_outcomes[all_outcomes.job_id.isin(job_skills.job_id.unique())]

In [20]:
all_outcomes['Extraction_Date'] = all_outcomes['Extraction_Date'].astype(str)

In [21]:
all_outcomes['developer_id'] = all_outcomes['developer_id'].astype(int)

In [22]:
def get_outcomes_dataset(jd_df):
    jd_dict = {'tupleId':[], 'job_id':[],'developer_id':[],'feature_lookup_date':[],'skill_ids':[],'challenge_ids':[],'keywords':[],'timestamp':[]}

    for index, jd in tqdm(jd_df.iterrows(),total=len(jd_df)):
        skills = [set(s) for s in job_skills_dict[jd['job_id']]]

        jd_dict['job_id'].append(jd['job_id'])
        jd_dict['tupleId'].append(jd['tupleId'])
        jd_dict['developer_id'].append(jd['developer_id'])
        
        # change the timestamp here
        jd_dict['feature_lookup_date'] = jd_df['Extraction_Date']
#        jd_dict['feature_lookup_date'].append(pd.to_datetime(datetime.datetime.strptime(jd_df['Extraction_Date'], "%Y-%m-%d")))
        
        jd_dict['skill_ids'].append(skills)
        
        challenges = [[c for s in skill for c in challenge_map[s]] for skill in skills]
        challenges = [set(c) for c in challenges if len(c) > 0]
        jd_dict['challenge_ids'].append(challenges)
        
        jd_dict['keywords'].append([[name_map[s] for s in skill] for skill in skills])
        
        # change the timestamp here
        jd_dict['timestamp'] = jd_df['Extraction_Date']
#        jd_dict['timestamp'].append(pd.to_datetime(datetime.datetime.strptime(jd_df['Extraction_Date'], "%Y-%m-%d")))

    jd_df = pd.DataFrame(jd_dict)
    return jd_df

In [23]:
outcome_pairs_df = get_outcomes_dataset(all_outcomes)

  0%|          | 0/2598122 [00:00<?, ?it/s]

In [24]:
outcome_pairs_df.shape

(2598122, 8)

In [25]:
outcome_pairs_df.head()

Unnamed: 0,tupleId,job_id,developer_id,feature_lookup_date,skill_ids,challenge_ids,keywords,timestamp
0,11836_11,11836,3784659,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-29
1,11836_11,11836,1287973,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-29
2,11836_11,11836,1868788,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-29
3,11836_11,11836,3555220,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-29
4,11836_11,11836,2060045,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-29


In [26]:
outcome_pairs_df.dtypes

tupleId                object
job_id                  int64
developer_id            int64
feature_lookup_date    object
skill_ids              object
challenge_ids          object
keywords               object
timestamp              object
dtype: object

In [27]:
import pandas_gbq
import time

fdf = None


class VAIFeatureGenerator:
    def __init__(self):
        self.accessor = HistoricalFeatureStoreAccessorUseCase()

    def __validate_df(self, df):
        for col in ["job_id", "developer_id", "timestamp", "skill_ids", "challenge_ids", "keywords"]:
            assert col in df.columns, f"{col} not present in df"

    def generate_features(self, df, features):

        self.__validate_df(df)


        start=time.time()
        
        response = self.accessor.fetch_features(
            pairs=df,
            features=features,
        )

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            feature_df = calculate_historical_features(
                data_df=df.copy(),
                features=features,
                feature_store_features=response,
            )

        print("Time taken", time.time() - start)

        return feature_df


    def generate_features_batch_ameer(self, df, features, batch_size=100000):
        num_batches = len(df) // batch_size + 1
        csv_file = 'features_hsitorical.csv'
        batch_file = 'last_batch.txt'
        if os.path.exists(batch_file):
            with open(batch_file, 'r') as f:
                start_batch = int(f.read())
        else:
            start_batch = 0
            
        for i in range(start_batch, num_batches):
            print("Current Batch: "+str(i+1)+'/'+ str(num_batches))
            start = i * batch_size
            end = (i + 1) * batch_size if (i + 1) * batch_size < len(df) else len(df)
            df_batch = df.iloc[start:end]

            result = df_batch.pipe(self.generate_features, features=features)
            result.to_csv(csv_file, mode='a', index=False, header=i==0)
            with open(batch_file, 'w') as f:
                f.write(str(i))
                
    
    def generate_features_batch(self, df, features, batch_size=100000):

        nbatches = int(np.ceil(df.shape[0] / batch_size))
        print("Number of batches :", nbatches)
        feature_df = pd.DataFrame()

        for i in tqdm(list(range(nbatches))):
            batch_df = df.head((i + 1) * batch_size).tail(batch_size)
            batch_feat = batch_df.pipe(self.generate_features, features=features)
            #batch_feat = feature_engineering.post_process(batch_feat)
            feature_df = pd.concat([feature_df, batch_feat])
            print("Batch ", i + 1, ":  Done")

        return feature_df
    
    
    def generate_features_batch_csv(self, df, features, batch_size=40000):

        nbatches = int(np.ceil(df.shape[0] / batch_size))
        print("Number of batches :", nbatches)
        feature_df = pd.DataFrame()

        for i in tqdm(list(range(nbatches))):
            start=time.time()
            batch_df = df.head((i + 1) * batch_size).tail(batch_size)
            batch_feat = batch_df.pipe(self.generate_features, features=features)
            batch_feat = feature_engineering.post_process(batch_feat)
            fdf = batch_feat[[
                        'feature_lookup_date', 'job_id','developer_id', 
                        'interview_score', 'atleast_1yoe_ratio', 'mean_relevant_mcq_pct',
                        'mean_mcq_pct', 'max_skill_yoe', 'max_relevant_mcq_pct',
                        'min_relevant_mcq_pct', 'sum_relevant_proj_words', 'atleast_2yoe_ratio',
                        'atleast_3yoe_ratio', 'starts_in_weeks', 'hourly_rate', 'seniority_avg',
                        'max_mcq_pct', 'ti_acc_score', 'ti_lci_score', 'ti_ooo_score',
                        'cosine_similarity_job', 'cosine_similarity_query',
                        'cosine_similarity_query_ft', 'cosine_similarity_job_ft'
            ]]
            #feature_df = pd.concat([feature_df, batch_feat])
            fdf.to_csv("/hfeatures/file_"+i+".csv")
            print("Batch ", i + 1, ":  Done | Time taken", time.time() - start)

        return feature_df


    def generate_features_batch_bq(self, df, features, batch_size=100000):

        project_id = 'turing-dev-337819'
        dataset_id = 'product_ds_supply'
        table_id = 'temp_' + datetime.strptime(backfill_date, "%Y-%m-%d").strftime("%d_%m_%Y") + '_forecasted_q4_v0_FS'

        nbatches = int(np.ceil(df.shape[0] / batch_size))
        print("Number of batches :", nbatches)
        feature_df = pd.DataFrame()

        for i in tqdm(list(range(nbatches))):
            start=time.time()
            batch_df = df.head((i + 1) * batch_size).tail(batch_size)
            batch_feat = batch_df.pipe(self.generate_features, features=features)
            print("Batch ", i + 1, ":  S-PP | Time taken", time.time() - start)
            #batch_feat = feature_engineering.post_process(batch_feat)
            fdf = batch_feat[[
                        'feature_lookup_date', 'tupleId','job_id','developer_id', 
                        'interview_score', 'atleast_1yoe_ratio', 'mean_relevant_mcq_pct',
                        'mean_mcq_pct', 'max_skill_yoe', 'max_relevant_mcq_pct',
                        'min_relevant_mcq_pct', 'sum_relevant_proj_words', 'atleast_2yoe_ratio',
                        'atleast_3yoe_ratio', 'starts_in_weeks', 'hourly_rate', 'seniority_avg',
                        'max_mcq_pct', 'ti_acc_score', 'ti_lci_score', 'ti_ooo_score',
                        'cosine_similarity_job', 'cosine_similarity_query',
                        'cosine_similarity_query_ft', 'cosine_similarity_job_ft'
            ]]
            #feature_df = pd.concat([feature_df, batch_feat])

            fdf['tupleId'] = fdf['tupleId'].astype(str)
            fdf['job_id'] = fdf['job_id'].astype(int)
            fdf['developer_id'] = fdf['developer_id'].astype(int)
            fdf['interview_score'] = fdf['interview_score'].astype(str)
            fdf['atleast_1yoe_ratio'] = fdf['atleast_1yoe_ratio'].astype(str)
            fdf['mean_relevant_mcq_pct'] = fdf['mean_relevant_mcq_pct'].astype(str)
            fdf['mean_mcq_pct'] = fdf['mean_mcq_pct'].astype(str)
            fdf['max_skill_yoe'] = fdf['max_skill_yoe'].astype(str)
            fdf['max_relevant_mcq_pct'] = fdf['max_relevant_mcq_pct'].astype(str)
            fdf['min_relevant_mcq_pct'] = fdf['min_relevant_mcq_pct'].astype(str)
            fdf['sum_relevant_proj_words'] = fdf['sum_relevant_proj_words'].astype(str)
            fdf['atleast_2yoe_ratio'] = fdf['atleast_2yoe_ratio'].astype(str)
            fdf['atleast_3yoe_ratio'] = fdf['atleast_3yoe_ratio'].astype(str)
            fdf['starts_in_weeks'] = fdf['starts_in_weeks'].astype(str)
            fdf['hourly_rate'] = fdf['hourly_rate'].astype(str)
            fdf['seniority_avg'] = fdf['seniority_avg'].astype(str)
            fdf['max_mcq_pct'] = fdf['max_mcq_pct'].astype(str)
            fdf['ti_acc_score'] = fdf['ti_acc_score'].astype(str)
            fdf['ti_lci_score'] = fdf['ti_lci_score'].astype(str)
            fdf['ti_ooo_score'] = fdf['ti_ooo_score'].astype(str)
            fdf['cosine_similarity_job'] = fdf['cosine_similarity_job'].astype(str)
            fdf['cosine_similarity_query'] = fdf['cosine_similarity_query'].astype(str)
            fdf['cosine_similarity_query_ft'] = fdf['cosine_similarity_query_ft'].astype(str)
            fdf['cosine_similarity_job_ft'] = fdf['cosine_similarity_job_ft'].astype(str)
            

            pandas_gbq.to_gbq(fdf, f'{dataset_id}.{table_id}', project_id=project_id, if_exists='append')
            print("Batch ", i + 1, ":  Done | Time taken", time.time() - start)

        return feature_df

In [28]:
outcome_pairs_df.head()

Unnamed: 0,tupleId,job_id,developer_id,feature_lookup_date,skill_ids,challenge_ids,keywords,timestamp
0,11836_11,11836,3784659,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-29
1,11836_11,11836,1287973,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-29
2,11836_11,11836,1868788,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-29
3,11836_11,11836,3555220,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-29
4,11836_11,11836,2060045,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-29


In [29]:
from datetime import datetime, timezone
outcome_pairs_df['timestamp'] = pd.to_datetime(datetime.now()).tz_localize(timezone.utc)

In [30]:
outcome_pairs_df.head()

Unnamed: 0,tupleId,job_id,developer_id,feature_lookup_date,skill_ids,challenge_ids,keywords,timestamp
0,11836_11,11836,3784659,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00
1,11836_11,11836,1287973,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00
2,11836_11,11836,1868788,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00
3,11836_11,11836,3555220,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00
4,11836_11,11836,2060045,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00


In [31]:
# No travel back, you can change timestamp to anything you want as long as you work within 2023

objectFeatures = VAIFeatureGenerator()
outcome_features = objectFeatures.generate_features_batch(outcome_pairs_df, features)

Number of batches : 26


  0%|          | 0/26 [00:00<?, ?it/s]

11-30-2023 12:22:47 - HISTORICAL_FEATURE_STORE_ACCESSOR - INFO :: Started fetching features for entity: developer
11-30-2023 12:22:47 - HISTORICAL_FEATURE_STORE_ACCESSOR - INFO :: Started fetching features for entity: job
11-30-2023 12:22:47 - HISTORICAL_FEATURE_STORE_ACCESSOR - INFO :: Started fetching features for entity: query
11-30-2023 12:22:56 - BQ_FEATURE_GENERATOR - INFO :: Total bytes to be processed: 261813034
11-30-2023 12:22:57 - BQ_FEATURE_GENERATOR - INFO :: Total bytes to be processed: 3416880826
11-30-2023 12:22:58 - BQ_FEATURE_GENERATOR - INFO :: Total bytes to be processed: 525206200723
11-30-2023 12:23:03 - HISTORICAL_FEATURE_STORE_ACCESSOR - INFO :: Finished fetching features for entity: job
11-30-2023 12:23:04 - HISTORICAL_FEATURE_STORE_ACCESSOR - INFO :: Finished fetching features for entity: query
11-30-2023 12:24:40 - HISTORICAL_FEATURE_STORE_ACCESSOR - INFO :: Finished fetching features for entity: developer
11-30-2023 12:24:42 - HISTORICAL_FEATURE_STORE_ACCESS

In [34]:
outcome_features

Unnamed: 0,tupleId,job_id,developer_id,feature_lookup_date,skill_ids,challenge_ids,keywords,timestamp,interview_score,atleast_1yoe_ratio,...,hourly_rate,seniority_avg,max_mcq_pct,ti_acc_score,ti_lci_score,ti_ooo_score,cosine_similarity_job,cosine_similarity_query,cosine_similarity_query_ft,cosine_similarity_job_ft
0,11836_11,11836,387,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,7.0,1.0,...,,2.856666,69.714711,7.0,,,-0.884535,-0.484625,-0.038759,0.090793
1,11836_11,11836,2499,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,5.0,1.0,...,6.24299,3.800834,100.000000,5.0,,,-0.902550,-0.523804,0.178803,0.280685
2,11836_11,11836,2883,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,5.0,1.0,...,19.11900,3.533334,78.188259,5.0,,,-0.908211,-0.545369,0.209851,0.326563
3,11836_11,11836,4389,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,10.0,1.0,...,12.40620,4.490000,100.000000,10.0,,,-0.800924,-0.406972,-0.021816,0.204157
4,11836_11,11836,5362,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,7.0,1.0,...,45.00000,4.057500,98.513149,,7.0,,-0.901845,-0.519246,0.011571,0.184234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,15609_314,15609,3825452,2023-11-29,[{16}],[{135}],[[iOS Development]],2023-11-30 12:22:38.916709+00:00,,0.0,...,,3.545000,87.892108,,,,,,,
99996,15609_314,15609,3827630,2023-11-29,[{16}],[{135}],[[iOS Development]],2023-11-30 12:22:38.916709+00:00,8.0,0.0,...,,4.345834,91.318336,8.0,,,,,,
99997,15609_314,15609,3827821,2023-11-29,[{16}],[{135}],[[iOS Development]],2023-11-30 12:22:38.916709+00:00,5.0,0.0,...,,3.522500,63.640480,5.0,,,,,,
99998,15609_314,15609,3829566,2023-11-29,[{16}],[{135}],[[iOS Development]],2023-11-30 12:22:38.916709+00:00,5.0,0.0,...,,4.300834,93.899486,5.0,,,,,,


In [35]:
outcome_pairs_df.head()

Unnamed: 0,tupleId,job_id,developer_id,feature_lookup_date,skill_ids,challenge_ids,keywords,timestamp
0,11836_11,11836,3784659,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00
1,11836_11,11836,1287973,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00
2,11836_11,11836,1868788,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00
3,11836_11,11836,3555220,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00
4,11836_11,11836,2060045,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00


In [36]:
outcome_features.dtypes

tupleId                                    object
job_id                                      int64
developer_id                                int64
feature_lookup_date                        object
skill_ids                                  object
challenge_ids                              object
keywords                                   object
timestamp                     datetime64[ns, UTC]
interview_score                           float64
atleast_1yoe_ratio                        float64
mean_relevant_mcq_pct                     float64
mean_mcq_pct                              float64
max_skill_yoe                             float64
max_relevant_mcq_pct                      float64
min_relevant_mcq_pct                      float64
sum_relevant_proj_words                     int32
atleast_2yoe_ratio                        float64
atleast_3yoe_ratio                        float64
starts_in_weeks                           float64
hourly_rate                               float64


In [37]:
fdf = outcome_features.copy()


fdf['tupleId'] = fdf['tupleId'].astype(str)
fdf['job_id'] = fdf['job_id'].astype(int)
fdf['developer_id'] = fdf['developer_id'].astype(int)
fdf['interview_score'] = fdf['interview_score'].astype(float)
fdf['atleast_1yoe_ratio'] = fdf['atleast_1yoe_ratio'].astype(float)
fdf['mean_relevant_mcq_pct'] = fdf['mean_relevant_mcq_pct'].astype(float)
fdf['mean_mcq_pct'] = fdf['mean_mcq_pct'].astype(float)
fdf['max_skill_yoe'] = fdf['max_skill_yoe'].astype(float)
fdf['max_relevant_mcq_pct'] = fdf['max_relevant_mcq_pct'].astype(float)
fdf['min_relevant_mcq_pct'] = fdf['min_relevant_mcq_pct'].astype(float)
fdf['sum_relevant_proj_words'] = fdf['sum_relevant_proj_words'].astype(float)
fdf['atleast_2yoe_ratio'] = fdf['atleast_2yoe_ratio'].astype(float)
fdf['atleast_3yoe_ratio'] = fdf['atleast_3yoe_ratio'].astype(float)
fdf['starts_in_weeks'] = pd.to_numeric(fdf['starts_in_weeks'], errors='coerce').astype(float)
fdf['hourly_rate'] = fdf['hourly_rate'].astype(float)
fdf['seniority_avg'] = fdf['seniority_avg'].astype(float)
fdf['max_mcq_pct'] = fdf['max_mcq_pct'].astype(float)
fdf['ti_acc_score'] = fdf['ti_acc_score'].astype(float)
fdf['ti_lci_score'] = fdf['ti_lci_score'].astype(float)
fdf['ti_ooo_score'] = fdf['ti_ooo_score'].astype(float)
fdf['cosine_similarity_job'] = fdf['cosine_similarity_job'].astype(float)
fdf['cosine_similarity_query'] = fdf['cosine_similarity_query'].astype(float)
fdf['cosine_similarity_query_ft'] = fdf['cosine_similarity_query_ft'].astype(float)
fdf['cosine_similarity_job_ft'] = fdf['cosine_similarity_job_ft'].astype(float)


outcome_features = fdf.copy()

In [38]:
outcome_features.columns

Index(['tupleId', 'job_id', 'developer_id', 'feature_lookup_date', 'skill_ids',
       'challenge_ids', 'keywords', 'timestamp', 'interview_score',
       'atleast_1yoe_ratio', 'mean_relevant_mcq_pct', 'mean_mcq_pct',
       'max_skill_yoe', 'max_relevant_mcq_pct', 'min_relevant_mcq_pct',
       'sum_relevant_proj_words', 'atleast_2yoe_ratio', 'atleast_3yoe_ratio',
       'starts_in_weeks', 'hourly_rate', 'seniority_avg', 'max_mcq_pct',
       'ti_acc_score', 'ti_lci_score', 'ti_ooo_score', 'cosine_similarity_job',
       'cosine_similarity_query', 'cosine_similarity_query_ft',
       'cosine_similarity_job_ft'],
      dtype='object')

In [39]:
outcome_features.head(5)

Unnamed: 0,tupleId,job_id,developer_id,feature_lookup_date,skill_ids,challenge_ids,keywords,timestamp,interview_score,atleast_1yoe_ratio,...,hourly_rate,seniority_avg,max_mcq_pct,ti_acc_score,ti_lci_score,ti_ooo_score,cosine_similarity_job,cosine_similarity_query,cosine_similarity_query_ft,cosine_similarity_job_ft
0,11836_11,11836,387,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,7.0,1.0,...,,2.856666,69.714711,7.0,,,-0.884535,-0.484625,-0.038759,0.090793
1,11836_11,11836,2499,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,5.0,1.0,...,6.24299,3.800834,100.0,5.0,,,-0.90255,-0.523804,0.178803,0.280685
2,11836_11,11836,2883,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,5.0,1.0,...,19.119,3.533334,78.188259,5.0,,,-0.908211,-0.545369,0.209851,0.326563
3,11836_11,11836,4389,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,10.0,1.0,...,12.4062,4.49,100.0,10.0,,,-0.800924,-0.406972,-0.021816,0.204157
4,11836_11,11836,5362,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,7.0,1.0,...,45.0,4.0575,98.513149,,7.0,,-0.901845,-0.519246,0.011571,0.184234


In [40]:
all_outcomes.rename(columns = {'Extraction_Date' : 'feature_lookup_date'}, inplace = True)
all_outcomes = all_outcomes.merge(outcome_features,on=['job_id','developer_id','feature_lookup_date'],how='left')
all_outcomes.shape

(2775758, 30)

In [41]:
all_outcomes = outcome_features.copy()

In [42]:
all_outcomes = all_outcomes.drop_duplicates(subset=["feature_lookup_date" ,"developer_id", "job_id", "tupleId"])
all_outcomes.shape

(2598122, 29)

### Post-Processing
Generate the ranking score for the respective TupleID & Developer ID pairs.

In [43]:
from ranking_utility.config import PredictionType
all_outcomes = feature_engineering.post_process(all_outcomes, prediction_type=PredictionType.OFFLINE)

In [44]:
outcome_features["sum_relevant_proj_words"] = outcome_features["sum_relevant_proj_words"].astype(float)

In [45]:
all_outcomes = outcome_features.copy()

### Model: HM Discernment

In [46]:
model_gbdtv23 = None
threshold = 0.000646
with open(
    "../learning_better_ranking/vertex_ai_models/models/gbdt_ranker_v23/search2start_gbdt_v23.pkl",
    "rb",
) as fp:
    model_gbdtv23 = pickle.load(fp)

In [47]:
features_columns = ['starts_in_weeks',
 'hourly_rate',
 'atleast_2yoe_ratio',
 'mean_mcq_pct',
 'max_skill_yoe',
 'atleast_3yoe_ratio',
 'seniority_avg',
 'min_relevant_mcq_pct',
 'max_relevant_mcq_pct',
 'sum_relevant_proj_words',
 'atleast_1yoe_ratio',
 'max_mcq_pct',
 'ti_ooo_score',
 'ti_lci_score',
 'ti_acc_score',
 'cosine_similarity_query_ft',
 'cosine_similarity_job_ft']

In [48]:
all_outcomes[f'model_ttf_score'] = model_gbdtv23.predict(all_outcomes[features_columns])

all_outcomes['model_restrict'] =(
        (all_outcomes.ti_ooo_score < 5)
        | (all_outcomes.mean_mcq_pct < 61)
        | (all_outcomes.min_relevant_mcq_pct < 40)
)

all_outcomes['model_backfill_score'] =  np.where(all_outcomes['model_restrict'] == False, all_outcomes['model_ttf_score'], -1)
all_outcomes['hm_satisfy'] =  np.where(all_outcomes['model_backfill_score'] >= threshold, 1, 0)
all_outcomes

Unnamed: 0,tupleId,job_id,developer_id,feature_lookup_date,skill_ids,challenge_ids,keywords,timestamp,interview_score,atleast_1yoe_ratio,...,ti_lci_score,ti_ooo_score,cosine_similarity_job,cosine_similarity_query,cosine_similarity_query_ft,cosine_similarity_job_ft,model_ttf_score,model_restrict,model_backfill_score,hm_satisfy
0,11836_11,11836,387,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,7.0,1.0,...,,,-0.884535,-0.484625,-0.038759,0.090793,0.000006,False,0.000006,0
1,11836_11,11836,2499,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,5.0,1.0,...,,,-0.902550,-0.523804,0.178803,0.280685,0.000513,True,-1.000000,0
2,11836_11,11836,2883,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,5.0,1.0,...,,,-0.908211,-0.545369,0.209851,0.326563,0.000387,True,-1.000000,0
3,11836_11,11836,4389,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,10.0,1.0,...,,,-0.800924,-0.406972,-0.021816,0.204157,0.000132,False,0.000132,0
4,11836_11,11836,5362,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,7.0,1.0,...,7.0,,-0.901845,-0.519246,0.011571,0.184234,0.000540,False,0.000540,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,15609_314,15609,3825452,2023-11-29,[{16}],[{135}],[[iOS Development]],2023-11-30 12:22:38.916709+00:00,,0.0,...,,,,,,,0.000085,False,0.000085,0
99996,15609_314,15609,3827630,2023-11-29,[{16}],[{135}],[[iOS Development]],2023-11-30 12:22:38.916709+00:00,8.0,0.0,...,,,,,,,0.000052,False,0.000052,0
99997,15609_314,15609,3827821,2023-11-29,[{16}],[{135}],[[iOS Development]],2023-11-30 12:22:38.916709+00:00,5.0,0.0,...,,,,,,,0.000043,True,-1.000000,0
99998,15609_314,15609,3829566,2023-11-29,[{16}],[{135}],[[iOS Development]],2023-11-30 12:22:38.916709+00:00,5.0,0.0,...,,,,,,,0.000117,False,0.000117,0


### Hourly Rate Imputation
Impute hourly rate for Non-HM developers who have 'NAN' as an input for this feature.

In [49]:
nhm = all_outcomes[all_outcomes.model_backfill_score < threshold]
nhm = nhm[nhm.hourly_rate.isna()]

In [50]:
nhm

Unnamed: 0,tupleId,job_id,developer_id,feature_lookup_date,skill_ids,challenge_ids,keywords,timestamp,interview_score,atleast_1yoe_ratio,...,ti_lci_score,ti_ooo_score,cosine_similarity_job,cosine_similarity_query,cosine_similarity_query_ft,cosine_similarity_job_ft,model_ttf_score,model_restrict,model_backfill_score,hm_satisfy
0,11836_11,11836,387,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,7.0,1.0,...,,,-0.884535,-0.484625,-0.038759,0.090793,0.000006,False,0.000006,0
7,11836_11,11836,11172,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,7.0,1.0,...,,,-0.887280,-0.484665,0.143015,0.423837,0.000057,True,-1.000000,0
22,11836_11,11836,63692,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,10.0,1.0,...,,,0.908356,0.590344,-0.045763,0.351010,0.000046,False,0.000046,0
29,11836_11,11836,109111,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,6.0,1.0,...,,,-0.893872,-0.498885,-0.231007,-0.266892,0.000017,False,0.000017,0
33,11836_11,11836,116386,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,8.0,0.5,...,8.0,,-0.895648,-0.511537,0.171444,0.322384,0.000040,False,0.000040,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,15609_314,15609,3825452,2023-11-29,[{16}],[{135}],[[iOS Development]],2023-11-30 12:22:38.916709+00:00,,0.0,...,,,,,,,0.000085,False,0.000085,0
99996,15609_314,15609,3827630,2023-11-29,[{16}],[{135}],[[iOS Development]],2023-11-30 12:22:38.916709+00:00,8.0,0.0,...,,,,,,,0.000052,False,0.000052,0
99997,15609_314,15609,3827821,2023-11-29,[{16}],[{135}],[[iOS Development]],2023-11-30 12:22:38.916709+00:00,5.0,0.0,...,,,,,,,0.000043,True,-1.000000,0
99998,15609_314,15609,3829566,2023-11-29,[{16}],[{135}],[[iOS Development]],2023-11-30 12:22:38.916709+00:00,5.0,0.0,...,,,,,,,0.000117,False,0.000117,0


In [51]:
all_outcomes = nhm.copy()
all_outcomes.rename(columns = {'hourly_rate' : 'old_hourly_rate'}, inplace = True)
all_outcomes.shape

(1080457, 33)

### Dev Rate Card
We will be using the dev rate for imputing the rate

In [52]:
dev_geo_yoe = pd.io.gbq.read_gbq(
f"""

  With data_final as (select a.user_id as developer_id, b.role_type, c.country_group,
  case when a.years_of_experience < 3 then '[0-3)'
      when a.years_of_experience < 5 then '[3-5)'
      when a.years_of_experience < 7 then '[5-7)'
      when a.years_of_experience < 10 then '[7-10)'
      when a.years_of_experience < 14 then '[10-14)'
      Else '[14,+)' end as yoe_bucket
  from `turing-230020.raw.developer_detail` a
  left join `turing-230020.raw.ms2_job_role_type` b on a.role_type_id = b.id
  left join `turing-230020.curated.country_information` c on a.country_id = c.country_id
  )

  select a.developer_id, b.ub as hourly_rate from data_final a
  left join `turing-dev-337819.data_science.rate_card_v3_role_p12` b USING(country_group,role_type,yoe_bucket)
  where a.developer_id in ({', '.join(map(str, list(all_outcomes.developer_id.unique())))})
"""
,project_id='turing-230020')
dev_geo_yoe

Unnamed: 0,developer_id,hourly_rate
0,1235461,28.516670
1,1322574,20.602424
2,1409184,43.256821
3,2241742,20.815724
4,2411708,45.436427
...,...,...
40456,2859459,25.594702
40457,2493648,25.594702
40458,2558574,25.594702
40459,3656789,25.594702


In [53]:
all_outcomes.head()

Unnamed: 0,tupleId,job_id,developer_id,feature_lookup_date,skill_ids,challenge_ids,keywords,timestamp,interview_score,atleast_1yoe_ratio,...,ti_lci_score,ti_ooo_score,cosine_similarity_job,cosine_similarity_query,cosine_similarity_query_ft,cosine_similarity_job_ft,model_ttf_score,model_restrict,model_backfill_score,hm_satisfy
0,11836_11,11836,387,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,7.0,1.0,...,,,-0.884535,-0.484625,-0.038759,0.090793,6e-06,False,6e-06,0
7,11836_11,11836,11172,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,7.0,1.0,...,,,-0.88728,-0.484665,0.143015,0.423837,5.7e-05,True,-1.0,0
22,11836_11,11836,63692,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,10.0,1.0,...,,,0.908356,0.590344,-0.045763,0.35101,4.6e-05,False,4.6e-05,0
29,11836_11,11836,109111,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,6.0,1.0,...,,,-0.893872,-0.498885,-0.231007,-0.266892,1.7e-05,False,1.7e-05,0
33,11836_11,11836,116386,2023-11-29,"[{114}, {189}]","[{128, 109, 432}, {565}]","[[SQL], [Tableau]]",2023-11-30 12:22:38.916709+00:00,8.0,0.5,...,8.0,,-0.895648,-0.511537,0.171444,0.322384,4e-05,False,4e-05,0


In [54]:
all_outcomes = all_outcomes.merge(dev_geo_yoe[['developer_id', 'hourly_rate']], on='developer_id', how='left')

In [55]:
all_outcomes['job_id'] = all_outcomes['job_id'].astype(int)
all_outcomes['developer_id'] = all_outcomes['developer_id'].astype(int)

all_outcomes['starts_in_weeks'] = all_outcomes['starts_in_weeks'].astype(float)
all_outcomes['hourly_rate'] = all_outcomes['hourly_rate'].astype(float)
all_outcomes['atleast_2yoe_ratio'] = all_outcomes['atleast_2yoe_ratio'].astype(float)
all_outcomes['mean_mcq_pct'] = all_outcomes['mean_mcq_pct'].astype(float)
all_outcomes['max_skill_yoe'] = all_outcomes['max_skill_yoe'].astype(float)
all_outcomes['atleast_3yoe_ratio'] = all_outcomes['atleast_3yoe_ratio'].astype(float)
all_outcomes['seniority_avg'] = all_outcomes['seniority_avg'].astype(float)
all_outcomes['min_relevant_mcq_pct'] = all_outcomes['min_relevant_mcq_pct'].astype(float)
all_outcomes['max_relevant_mcq_pct'] = all_outcomes['max_relevant_mcq_pct'].astype(float)
all_outcomes['sum_relevant_proj_words'] = all_outcomes['sum_relevant_proj_words'].astype(float)
all_outcomes['atleast_1yoe_ratio'] = all_outcomes['atleast_1yoe_ratio'].astype(float)
all_outcomes['max_mcq_pct'] = all_outcomes['max_mcq_pct'].astype(float)
all_outcomes['ti_ooo_score'] = all_outcomes['ti_ooo_score'].astype(float)
all_outcomes['ti_lci_score'] = all_outcomes['ti_lci_score'].astype(float)
all_outcomes['ti_acc_score'] = all_outcomes['ti_acc_score'].astype(float)
all_outcomes['cosine_similarity_query_ft'] = all_outcomes['cosine_similarity_query_ft'].astype(float)
all_outcomes['cosine_similarity_job_ft'] = all_outcomes['cosine_similarity_job_ft'].astype(float)

all_outcomes = feature_engineering.post_process(all_outcomes, prediction_type=PredictionType.OFFLINE)

In [56]:
all_outcomes.columns

Index(['tupleId', 'job_id', 'developer_id', 'feature_lookup_date', 'timestamp',
       'interview_score', 'atleast_1yoe_ratio', 'mean_relevant_mcq_pct',
       'mean_mcq_pct', 'max_skill_yoe', 'max_relevant_mcq_pct',
       'min_relevant_mcq_pct', 'sum_relevant_proj_words', 'atleast_2yoe_ratio',
       'atleast_3yoe_ratio', 'starts_in_weeks', 'old_hourly_rate',
       'seniority_avg', 'max_mcq_pct', 'ti_acc_score', 'ti_lci_score',
       'ti_ooo_score', 'cosine_similarity_job', 'cosine_similarity_query',
       'cosine_similarity_query_ft', 'cosine_similarity_job_ft',
       'model_ttf_score', 'model_restrict', 'model_backfill_score',
       'hm_satisfy', 'hourly_rate'],
      dtype='object')

In [57]:
all_outcomes[f'model_ttf_score'] = model_gbdtv23.predict(all_outcomes[features_columns])

all_outcomes['model_restrict'] =(
        (all_outcomes.ti_ooo_score < 5)
        | (all_outcomes.mean_mcq_pct < 61)
        | (all_outcomes.min_relevant_mcq_pct < 40)
)

all_outcomes['model_backfill_score'] =  np.where(all_outcomes['model_restrict'] == False, all_outcomes['model_ttf_score'], -1)
all_outcomes['hm_satisfy'] =  np.where(all_outcomes['model_backfill_score'] >= threshold, 1, 0)
all_outcomes[['tupleId' ,'job_id', 'developer_id', 'model_backfill_score','hm_satisfy', 'hourly_rate']]
all_outcomes

Unnamed: 0,tupleId,job_id,developer_id,feature_lookup_date,timestamp,interview_score,atleast_1yoe_ratio,mean_relevant_mcq_pct,mean_mcq_pct,max_skill_yoe,...,ti_ooo_score,cosine_similarity_job,cosine_similarity_query,cosine_similarity_query_ft,cosine_similarity_job_ft,model_ttf_score,model_restrict,model_backfill_score,hm_satisfy,hourly_rate
0,11836_11,11836,387,2023-11-29,2023-11-30 12:22:38.916709+00:00,7.0,1.0,,69.714711,1.0,...,,-0.884535,-0.484625,-0.038759,0.090793,0.000039,False,0.000039,0,23.370596
1,11836_11,11836,11172,2023-11-29,2023-11-30 12:22:38.916709+00:00,7.0,1.0,43.889588,34.770218,5.2,...,,-0.887280,-0.484665,0.143015,0.423837,0.000241,True,-1.000000,0,44.643757
2,11836_11,11836,63692,2023-11-29,2023-11-30 12:22:38.916709+00:00,10.0,1.0,96.576155,77.489194,2.6,...,,0.908356,0.590344,-0.045763,0.351010,0.000116,False,0.000116,0,56.017462
3,11836_11,11836,109111,2023-11-29,2023-11-30 12:22:38.916709+00:00,6.0,1.0,,68.556524,1.0,...,,-0.893872,-0.498885,-0.231007,-0.266892,0.000075,False,0.000075,0,40.082911
4,11836_11,11836,116386,2023-11-29,2023-11-30 12:22:38.916709+00:00,8.0,0.5,91.363064,74.687276,1.0,...,,-0.895648,-0.511537,0.171444,0.322384,0.000366,False,0.000366,0,24.173153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1080452,15609_314,15609,3825452,2023-11-29,2023-11-30 12:22:38.916709+00:00,,0.0,,87.892108,0.0,...,,,,,,0.000364,False,0.000364,0,21.203692
1080453,15609_314,15609,3827630,2023-11-29,2023-11-30 12:22:38.916709+00:00,8.0,0.0,,87.528491,0.0,...,,,,,,0.000154,False,0.000154,0,28.040044
1080454,15609_314,15609,3827821,2023-11-29,2023-11-30 12:22:38.916709+00:00,5.0,0.0,,56.953118,0.0,...,,,,,,0.000195,True,-1.000000,0,30.099260
1080455,15609_314,15609,3829566,2023-11-29,2023-11-30 12:22:38.916709+00:00,5.0,0.0,,93.899486,0.0,...,,,,,,0.000340,False,0.000340,0,35.868364


In [58]:
import pandas_gbq
# Replace 'your_project_id', 'your_dataset_id', and 'your_table_id' with your actual values
project_id = 'turing-dev-337819'
dataset_id = 'pdsa'
table_id = 'phm_30_11'

# Insert the DataFrame into the BigQuery table

pandas_gbq.to_gbq(all_outcomes, f'{dataset_id}.{table_id}', project_id=project_id, if_exists='replace')

100% 1/1 [00:00<00:00, 11491.24it/s]
