## Importing libraries and defining helper functions

In [8]:
import copy
import datetime as dt
import json
import os
import pickle
import re
from collections import Counter
from datetime import timedelta
from datetime import timezone

import numpy as np
import pandas as pd
import pandas_gbq
from google.cloud import bigquery
from pins import board_rsconnect
from sklearn.cluster import KMeans
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler


# Let pandas render up to 500 rows and 500 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)


def GBQ_data(query_string, project='turing-230020'):
    """Run a BigQuery SQL statement and return its rows as a DataFrame.

    Parameters
    ----------
    query_string : str
        SQL text handed to ``bigquery.Client.query``.
    project : str, optional
        Project the BigQuery client is created for. Defaults to
        'turing-230020', the project this helper was previously hard-wired to,
        so existing one-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The result of the query job, converted with ``to_dataframe()``.
    """
    client = bigquery.Client(project=project)
    # .result() waits for the query job before the rows are converted.
    return client.query(query_string).result().to_dataframe()


## Fetching data

In [9]:
# One SQL template serves both skill levels: rows with job_skill_level_id=1 are
# aggregated into the `must_have_*` columns and rows with job_skill_level_id=2
# into the `nice_have_*` columns. Each job gets one row holding the distinct,
# sorted skill ids and skill names.
JOB_SKILL_QUERY_TEMPLATE = """with tabular_data as
(
SELECT
    job.job_id,
    job.skill_id,
    skill.skill_name
from `turing-230020.devdb_mirror.ms2_job_skill` job
LEFT join `turing-230020.devdb_mirror.base_all_skills_v4` skill on job.skill_id=skill.id
where job.job_skill_level_id={level_id}
)

SELECT
    job_id,
    array_agg(distinct skill_id order by skill_id) as {prefix}_have_skill_ids,
    array_agg(distinct skill_name order by skill_name) as {prefix}_have_skill_names
from tabular_data
group by 1
"""


def _fetch_job_skills(level_id, prefix):
    """Fetch the per-job skill arrays for one skill level.

    `level_id` fills the job_skill_level_id filter and `prefix` ('must' or
    'nice') names the output columns `<prefix>_have_skill_ids` and
    `<prefix>_have_skill_names`.
    """
    return GBQ_data(JOB_SKILL_QUERY_TEMPLATE.format(level_id=level_id, prefix=prefix))


def _add_skill_summary(skills, prefix):
    """Return `skills` plus a tuple copy of the skill ids and the number of skills."""
    skill_ids = skills[f'{prefix}_have_skill_ids']
    return skills.assign(**{
        f'{prefix}_have_skill_ids_tuple': skill_ids.apply(tuple),
        f'num_{prefix}_have_skills': skill_ids.apply(len),
    })


job_mcq_must = _fetch_job_skills(1, 'must')
job_mcq_nice = _fetch_job_skills(2, 'nice')
print(job_mcq_must.shape)
print(job_mcq_nice.shape)

job_mcq_must = _add_skill_summary(job_mcq_must, 'must')
job_mcq_nice = _add_skill_summary(job_mcq_nice, 'nice')

# Outer join keeps jobs that have only must-have or only nice-to-have skills.
job_mcq = job_mcq_must.merge(job_mcq_nice, how='outer', on='job_id')
job_mcq.shape

(8107, 3)
(4157, 3)


(8115, 9)

In [10]:
# Keep a single row per job_id from matchingmetrics.opps_positions: rows are numbered
# within each job_id by opportunity_created_date (ascending) and only rn=1 is kept;
# the helper column rn is dropped from the result.
# NOTE(review): the all_turing_metrics query in this notebook orders the same column
# descending — confirm that picking opposite ends is intended.
bulk_query = """SELECT * except(rn) from(Select *, row_number() over (partition by job_id order by opportunity_created_date) as rn
from matchingmetrics.opps_positions) where rn=1"""
bulk_data = GBQ_data(bulk_query)

In [11]:
# Keep a single row per job_id from matchingmetrics.all_turing_metrics: rows are numbered
# within each job_id by opportunity_created_date descending and only rn=1 is kept;
# the helper column rn is dropped from the result.
all_turing_metrics = """SELECT * except(rn) from (
Select *, row_number() over(partition by job_id order by opportunity_created_date desc) as rn
from `turing-230020.matchingmetrics.all_turing_metrics`
) WHERE rn=1"""
matching_metrics = GBQ_data(all_turing_metrics)
# Printed rather than left as a bare expression because it is not the cell's last statement.
print(matching_metrics.shape)


# Every ms2_job column plus the status, role type and priority labels, which are
# looked up through left joins on the job's *_id columns.
ms2_job_query = """SELECT job.*, status.status, role.role_type, pr.priority 
from `turing-230020.devdb_mirror.ms2_job` job
left join `turing-230020.devdb_mirror.ms2_job_status` status on status.id = job.job_status_id
left join `turing-230020.devdb_mirror.ms2_job_role_type` role on job.role_type_id = role.id
left join `turing-230020.devdb_mirror.ms2_job_priority` pr on job.priority_id = pr.id;"""

# The job table's `id` becomes `job_id`, the key used to join the other frames.
ms2_job = GBQ_data(ms2_job_query).rename(columns={'id': 'job_id'})
ms2_job.shape

(8292, 71)


(9593, 58)

In [12]:
# Self-serve search activity: number of activity rows per payload `$.jobId`.
# The result has two columns, `jobId` (JSON text from JSON_QUERY) and `ss_queries`.
# Rows are kept when the user's email does not end in turing.com, or when the action is
# SIGN_IN_FAILED and the payload email does not end in turing.com, and when both
# `$.developers` and `$.skillIds` pass the ARRAY_LENGTH(SPLIT(...)) > 0 filters.
# NOTE(review): in the WHERE clause AND binds tighter than OR, so the parentheses around
# the action test do not change the grouping — confirm `A or (B and C)` is what is meant.
# The SQL text gets its own name so `ss_query` only ever refers to the DataFrame.
ss_query_sql = """
Select jobId, count(*) ss_queries from(
SELECT
  ssua.id,
  created_at as activity_date,
  ssu.customer_category,
  ARRAY_LENGTH(SPLIT(json_query(payload, '$.developers'),",")) as dev_count,
  ARRAY_LENGTH(SPLIT(json_query(payload, '$.skillIds'),",")) as skill_count,
  JSON_QUERY(payload, '$.developers') devs,
  JSON_QUERY(payload, '$.skillIds') skills,
  JSON_QUERY(payload, '$.role') role,
  JSON_QUERY(payload, '$.company') company,
  JSON_QUERY(payload, '$.roleTypeId') roleTypeId,
  JSON_QUERY(payload, '$.jobStatusId') jobStatusId,
  JSON_QUERY(payload, '$.jobId') jobId,
  JSON_QUERY(payload, '$.continentIds') continentIds,
  JSON_QUERY(payload, '$.seniorityLevel') seniorityLevel,
  JSON_QUERY(payload, '$.maxAcceptableRate') maxAcceptableRate,
  JSON_QUERY(payload, '$.mustHaveTotalYearsOfExperience') mustHaveTotalYearsOfExperience,
  JSON_QUERY(payload, '$.hourlyRate') hourlyRate,
  case when customer_category="Platinum" or customer_category is null then "Enterprise" else "FSS" end as category_type
FROM
  devdb_mirror.self_serve_user_activity ssua  
  left join devdb_mirror.self_serve_user ssu on ssua.user_id= ssu.id
  where (ssu.email not like "%turing.com" or (ssua.action="SIGN_IN_FAILED") and REPLACE(JSON_EXTRACT(payload, '$.email'), '"', '') not like "%turing.com")
-- and json_query(payload, '$.developers') is not null and json_query(payload, '$.skillIds') is not null
  and ARRAY_LENGTH(SPLIT(json_query(payload, '$.developers'),",")) >0
  and ARRAY_LENGTH(SPLIT(json_query(payload, '$.skillIds'),",")) >0 ) t1 group by 1
"""

ss_query = GBQ_data(ss_query_sql)


# Matching-side searches: number of ms2_developer_search_logs rows per job_id
# (columns `job_id`, `ms_num_queries`).
#   matcher_ids    - vertical_eng_leader_id of jobs whose customer_email is NULL or does
#                    not contain "turing.com", leaving out leader ids 32 and 110.
#   sl_creator_ids - distinct creator_id of job matches whose job's customer_email does
#                    not contain "turing.com".
#   all_users      - the two id lists stacked; only searches by these users are counted.
# Counted searches need an empty-or-NULL email and fullName, an empty-string phoneNumber,
# developerIds, githubUrl and linkedinUrl in matching_query, and page_number = 1.
# NOTE(review): no adsl-qualified column is referenced; the unqualified page_number may
# come from either joined table — verify whether the adsl join is still needed.
ms_query_sql = """
with 
matcher_ids as (
SELECT
  vertical_eng_leader_id as user_id,
  id as job_id
FROM
  `turing-230020.devdb_mirror.ms2_job`
WHERE
  ((customer_email IS NOT NULL AND customer_email NOT LIKE "%turing.com%")
  OR (customer_email IS NULL))
  AND vertical_eng_leader_id NOT IN (32, 110)
)
, sl_creator_ids as (
SELECT DISTINCT
  mjm.creator_id as user_id
FROM
  `turing-230020.devdb_mirror.ms2_job_match` mjm
LEFT JOIN
  `turing-230020.devdb_mirror.ms2_job` mj
ON
  mjm.job_id = mj.id
LEFT JOIN
  `turing-230020.devdb_mirror.cal_users` u
ON
  u.id = mjm.creator_id
WHERE
  mj.customer_email NOT LIKE "%turing.com%"
  AND mjm.creator_id IS NOT NULL
)
, all_users as (
select user_id as user_id from sl_creator_ids
union all
select user_id from matcher_ids
)
SELECT
  msdsl.job_id, count(*) as ms_num_queries
FROM
  `turing-230020.devdb_mirror.ms2_developer_search_logs` msdsl
LEFT JOIN
  `turing-230020.raw.advanced_developer_search_log_bq` adsl
ON
  msdsl.uuid=adsl.uuid
WHERE
  (JSON_EXTRACT_SCALAR(msdsl.matching_query,"$.email") = "" OR JSON_EXTRACT_SCALAR(msdsl.matching_query,"$.email") IS NULL)
  AND (JSON_EXTRACT_SCALAR(msdsl.matching_query,"$.fullName") = "" OR JSON_EXTRACT_SCALAR(msdsl.matching_query,"$.fullName") IS NULL)
  AND JSON_EXTRACT_SCALAR(msdsl.matching_query,"$.phoneNumber") = ""
  AND JSON_EXTRACT_SCALAR(msdsl.matching_query,"$.developerIds") = ""
  AND JSON_EXTRACT_SCALAR(msdsl.matching_query,"$.githubUrl") = ""
  AND JSON_EXTRACT_SCALAR(msdsl.matching_query,"$.linkedinUrl") = ""
  AND page_number = 1
  AND msdsl.user_id in (SELECT user_id from all_users)
  group by 1
"""

ms_query = GBQ_data(ms_query_sql)

# Give both frames a float `job_id` key; the self-serve frame carries it as `jobId`,
# which is replaced by the new column.
ms_query = ms_query.assign(job_id=ms_query['job_id'].astype(float))
ss_query = ss_query.assign(job_id=ss_query['jobId'].astype(float)).drop(columns=['jobId'])

# Outer join so a job searched through only one channel is still kept; the missing
# side's count becomes 0 before the two counts are added.
query_data = ms_query.merge(ss_query, how='outer', on='job_id')
count_cols = ['ms_num_queries', 'ss_queries']
query_data[count_cols] = query_data[count_cols].fillna(value=0)
query_data['queries'] = query_data['ms_num_queries'] + query_data['ss_queries']

# Rows without a job id cannot be joined to a job, so they are dropped before the
# key is cast to int.
query_data = query_data[query_data['job_id'].notna()].reset_index(drop=True)
query_data['job_id'] = query_data['job_id'].astype(int)
query_data.head()

Unnamed: 0,job_id,ms_num_queries,ss_queries,queries
0,5132,28,0,28
1,4978,4,0,4
2,5640,3,14,17
3,4361,6,0,6
4,5136,3,0,3


In [13]:
# Interview funnel counts per job_id: requested, scheduled, happened, passed, failed.
# In the inner query interview_result is 'failed' when the job-match status is one of
# 'Hard Interview Rejection', 'Soft Interview Rejection' or 'Rejected', or when
# kanban_last_interview_result is 'failed'; otherwise it is kanban_last_interview_result.
# Passed/failed are only counted for rows where interview_happened=1.
interview_query = """SELECT job_id, 
sum(case when interview_requested=1 then 1 else 0 end) as interviews_requested,
sum(case when interview_scheduled=1 then 1 else 0 end) as interviews_scheduled, 
sum(case when interview_happened=1 then 1 else 0 end) as interviews_happened,
sum(case when interview_happened=1 and interview_result='passed' then 1 else 0 end) as interviews_passed,
sum(case when interview_happened=1 and interview_result='failed' then 1 else 0 end) as interviews_failed
 from(
Select ir.job_id, ir.developer_id, ir.job_match_id, ir.job_status, ir.customer_category,
ir.si_date, ir.is_date, ir.ih_date, ir.interview_requested, ir.interview_scheduled, ir.interview_happened,
ir.kanban_last_interview_result, mjms.status,
CASE when mjms.status in ('Hard Interview Rejection','Soft Interview Rejection','Rejected') or 
     kanban_last_interview_result = 'failed' then 'failed' 
     else kanban_last_interview_result END as interview_result
 from 
matchingmetrics.interview_requests ir LEFT JOIN devdb_mirror.ms2_job_match mjm on ir.job_match_id = mjm.id
LEFT JOIN devdb_mirror.ms2_job_match_status mjms on mjm.job_match_status_id  = mjms.id
) t1 group by 1;"""
interview_data = GBQ_data(interview_query)

In [14]:
# Average engagement_days per job (rounded to 0 decimals) from
# analytics_views.opportunity_value, ignoring rows that have no job_id.
engagement_days_query = """SELECT job_id, round(avg(engagement_days),0) as engagement_days FROM `turing-230020.analytics_views.opportunity_value` 
where job_id is not null group by 1 order by 2 desc;"""

engagement_days = GBQ_data(engagement_days_query)

In [15]:
# Columns taken from each source frame when the job-level table is assembled.
# Every list includes 'job_id', the join key.
matching_metrics_cols = [
    'opportunityid',
    'job_id',
    'opportunity_created_date',
    'client_type',
    'leadsource',
    'client_category',
    'region',
    'chosen_type',
    'slp_date',
    'pcp_date',
    'psp_date',
    'ir_date',
    'is_date',
    'ih_date',
    'dc_date',
    'signed_date',
    'trial_date',
    'start_date',
    'chosen_dev_id',
    'dc14',
]
ms2_job_cols = [
    'job_id',
    'company',
    'role',
    'role_type',
    'is_deleted',
    'max_acceptable_rate',
    'job_value',
    'number_of_open_roles',
    'opportunity_status',
    'status',
    'must_have_total_years_of_experience',
    'nice_have_total_years_of_experience',
    'priority',
]
bulk_data_cols = [
    'job_id',
    'total_positions',
    'total_open_positions__c',
    'total_positions_by_account',
]
# No list is kept for job_mcq: that frame is merged with all of its columns.

In [None]:
# For each source frame print its row count next to its number of distinct job_id
# values; equal numbers mean job_id is unique in that frame.
# (For bulk_data, opportunityid is unique as well, same as in the matching_metrics data.)
frames_to_check = (
    ms2_job,
    matching_metrics,
    bulk_data,
    job_mcq,
    query_data,
    interview_data,
)
for frame in frames_to_check:
    print(frame.shape[0], len(frame['job_id'].unique()))

9593 9593
8292 8292
8297 8297
8115 8115
3815 3815
4451 4451


Unnamed: 0,job_id,company,role,role_type,is_deleted,max_acceptable_rate,job_value,number_of_open_roles,opportunity_status,status,must_have_total_years_of_experience,nice_have_total_years_of_experience,priority,opportunityid,opportunity_created_date,client_type,leadsource,client_category,region,chosen_type,slp_date,pcp_date,psp_date,ir_date,is_date,ih_date,dc_date,signed_date,trial_date,start_date,chosen_dev_id,dc14,total_positions,total_open_positions__c,total_positions_by_account,must_have_skill_ids,must_have_skill_names,must_have_skill_ids_tuple,num_must_have_skills,nice_have_skill_ids,nice_have_skill_names,nice_have_skill_ids_tuple,num_nice_have_skills,ms_num_queries,ss_queries,queries,interviews_requested,interviews_scheduled,interviews_happened,interviews_passed,interviews_failed,engagement_days
0,59,ELSA,Full-Stack Python / Vue OR React,,1,,,1,,Creating Shortlist,,,,,NaT,,,,,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,
1,55,Cloud One,Senior Full-Stack PHP + JS,,0,,3.9,1,,Trial Succeeded,,,,0061U00000DeXrCQAV,2019-11-28 18:58:20+00:00,existing,existing,4.Bronze,2.Rest of US,matching,2019-12-10 22:38:10+00:00,2019-12-10 22:38:10+00:00,2019-12-10 22:38:10+00:00,2019-12-10 22:38:10+00:00,2019-12-10 22:38:10+00:00,2019-12-10 22:38:10+00:00,2019-12-10 22:38:10+00:00,2019-12-10 22:38:10+00:00,2019-12-12 00:00:00+00:00,2019-12-12 00:00:00+00:00,136695,1,0.0,,3.0,"[93, 165]","[JavaScript, PHP]","(93, 165)",2.0,,,,,,,,,,,,,103.0
2,3,treehubapp,Full stack,,0,,7.4,1,,Trial Succeeded,,,,0061U00000E8sz5QAB,2019-12-11 02:37:57+00:00,new,new,4.Bronze,2.Rest of US,matching,2019-12-11 02:37:57+00:00,2019-12-11 02:37:57+00:00,2019-12-11 02:37:57+00:00,2019-12-11 02:37:57+00:00,2019-12-11 02:37:57+00:00,2019-12-11 02:37:57+00:00,2019-12-11 02:37:57+00:00,2019-12-19 18:10:21+00:00,2019-12-18 00:00:00+00:00,2019-12-18 00:00:00+00:00,249794,1,1.0,,1.0,"[120, 2031]","[Node.js, React]","(120, 2031)",2.0,,,,,,,,,,,,,469.0
3,7,Jisr.net,Full-Stack Rails Developer,,1,,,1,,Creating Shortlist,,,,,NaT,,,,,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,
4,112,Chad Connects,Front-End Vue.js Developer,,1,,,1,,Preparing Packet,,,,,NaT,,,,,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9588,7569,HigherEdLab.com,[Self-Serve test job - Please ignore] Backend ...,Web Backend,0,,,,,Soliciting Further Requirements,2,,,,NaT,,,,,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,[2036],[Angular],"(2036,)",1.0,,,,,,,,,,,,,
9589,6184,unboring company,[Test] Frontend - Lead Engineer,Web Frontend,0,,,1,,Paused,2,2,,,NaT,,,,,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,[2031],[React],"(2031,)",1.0,[2031],[React],"(2031,)",1.0,,,,,,,,,
9590,6188,HigherEdLab.com,[Test 3] Frontend - Senior Engineer,Web Frontend,0,,,,,Creating Workflow,2,,,,NaT,,,,,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,[2031],[React],"(2031,)",1.0,,,,,,,,,,,,,
9591,7989,Turing,Do not use Mansa Test Job Backend - Mid-level ...,Web Backend,0,,,,,Creating Shortlist,2,,,,NaT,,,,,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,[2036],[Angular],"(2036,)",1.0,,,,,,,,,,,,,


In [None]:
# Assemble the job-level table: start from the selected ms2_job columns and left-join
# every other source on job_id, so each ms2_job row is kept.
# validate='many_to_one' makes pandas raise MergeError when a right-hand frame repeats a
# job_id, instead of silently duplicating job rows in the published table.
frames_to_join = [
    matching_metrics[matching_metrics_cols],
    bulk_data[bulk_data_cols],
    job_mcq,
    query_data,
    interview_data,
    engagement_days,
]
global_data = ms2_job[ms2_job_cols]
for right_frame in frames_to_join:
    global_data = global_data.merge(right_frame, how='left', on='job_id', validate='many_to_one')

In [None]:
# The Connect API key is a secret, so it is read from the environment instead of being
# stored in the notebook.
# NOTE(review): a literal key used to be embedded in this cell; treat that key as exposed
# and rotate it.
API_KEY = os.environ.get('CONNECT_API_KEY')
if not API_KEY:
    raise RuntimeError('Set the CONNECT_API_KEY environment variable before publishing the pin.')
SERVER = 'https://rstudio-connect.turing.com/'
PIN_NAME = 'muhammad_usman/demand-basic-data'

# Publish the assembled table as a CSV pin and report the shape that was written.
board = board_rsconnect(server_url=SERVER, api_key=API_KEY)
board.pin_write(global_data, PIN_NAME, type="csv")
print(f'Pin {PIN_NAME} has been updated with shape {global_data.shape}')

Writing pin:
Name: 'muhammad_usman/demand-basic-data'
Version: 20220827T031550Z-29146


Pin muhammad_usman/demand-basic-data has been updated with shape (9593, 52)
