# Import data from Google BigQuery and store it locally
## Imports, global declarations, and functions

In [1]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import pandas_gbq
import copy
import copy
import pickle
import glob
import datetime as dt
from datetime import timezone
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans

# %history -f notebook_file.ipynb #store history in a file

# Raise pandas' display limits to 500 rows and 500 columns.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

def GBQ_data(query_string, project='turing-230020'):
    """Run a BigQuery SQL query and return its result as a pandas DataFrame.

    Args:
        query_string: SQL text to execute.
        project: ID of the Google Cloud project the BigQuery client is
            created for. Defaults to 'turing-230020', the value that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        The query result converted with ``to_dataframe()``.
    """
    client = bigquery.Client(project=project)
    # Submit the query, then fetch its result set before converting it.
    rows = client.query(query_string).result()
    return rows.to_dataframe()

## Download raw data

In [7]:
# Developer-level query, one CTE per group of columns. Every CTE except
# skill_with_most_exp is restricted to dev_ids found in
# `turing-dev-337819.pdsa.PDAS_P2_cluster` with a non-null `cluster`.
#   p1_data             - columns from phase1_dev_level_data, LEFT JOINed with
#                         developer_detail, country_information and
#                         ms2_job_role_type; adds num_chars_resume and
#                         time_to_upload_resume (days from signup to upload).
#   sn_score            - per user: AVG(avg_score) and MAX(submit_time) over
#                         dv2_challenge_submit rows whose avg_score is not null.
#   self_skill          - per developer: number of distinct skills and counts per
#                         skill_level 1..4, using only the most recently updated
#                         row per (developer_id, skill_id).
#   quiz_lan            - test_language of the most recently updated
#                         submit_list_v4 row per uid.
#   skill_with_most_exp - per developer: the skill ranked first by `score`
#                         descending (no tie-breaker is specified in the query).
#   cluster_label       - dev_id and cluster_label from PDAS_P2_cluster.
# The final SELECT LEFT JOINs every CTE onto p1_data by dev_id, drops
# resume_plain and sn_submit_time, and adds time_to_sn_test (days from
# signup_date to sn_submit_time).
data_query = """with p1_data as (Select p1.dev_id, p1.signup_date, p1.source_attribution_type, p1.user_os, p1.user_os_type,
p1.quiz_answer, p1.resume_flag, p1.resume_upload_date, dd.resume_plain, CHAR_LENGTH(dd.resume_plain) as num_chars_resume,
p1.years_of_experience, p1.years_of_remote_experience, p1.english_communication, dd.verbal_communication,
dd.hourly_rate, dd.linkedin_url, dd.github_url, mjrt.role_type,
dd.is_fast_tracked, ci.country_name, ci.country_group,
DATE_DIFF(p1.resume_upload_date , p1.signup_date, DAY) time_to_upload_resume
FROM `turing-230020.analytics_views.phase1_dev_level_data` p1
LEFT JOIN `turing-230020.devdb_mirror.developer_detail` dd on p1.dev_id=dd.user_id
LEFT JOIN `turing-230020.analytics_views.country_information` ci on dd.country_id=ci.country_id
left join `turing-230020.devdb_mirror.ms2_job_role_type` mjrt on dd.role_type_id=mjrt.id
WHERE p1.dev_id in (Select distinct dev_id from `turing-dev-337819.pdsa.PDAS_P2_cluster` where cluster is not null)
),

sn_score as(SELECT
 dcs.user_id AS dev_id, AVG(dweas.avg_score) AS sn_avg_score, max(submit_time) sn_submit_time
FROM
  devdb_mirror.dv2_challenge_submit AS dcs
  LEFT JOIN devdb_mirror.dv2_work_experience_avg_score AS dweas ON dcs.submit_id = dweas.submit_id
  where user_id in (Select distinct dev_id from `turing-dev-337819.pdsa.PDAS_P2_cluster` where cluster is not null)
  and dweas.avg_score is not null
GROUP BY 
  1),
  
self_skill as(SELECT developer_id as dev_id, count(distinct skill_id) num_self_skills,
sum(case when skill_level_int=1 then 1 else 0 end) as num_self_beginner_skills,
sum(case when skill_level_int=2 then 1 else 0 end) as num_self_intermediate_skills,
sum(case when skill_level_int=3 then 1 else 0 end) as num_self_advanced_skills,
sum(case when skill_level_int=4 then 1 else 0 end) as num_self_expert_skills
From(
SELECT developer_id, skill_id, skill_name, skill_level as skill_level_int, score as yoe_in_skill, is_ready_to_match as vetted_skill,
case when skill_level = 1 then 'beginner' when skill_level = 2 then 'intermediate' when skill_level = 3 then 'advanced' when skill_level = 4 then 'expert' END skill_level
from(
select tpm.developer_id, tpm.skill_id,base.skill_name, tpm.skill_level, tpm.score, tpm.is_ready_to_match,
row_number() over(partition by tpm.developer_id, tpm.skill_id order by tpm.updated_date desc) as rn
from devdb_mirror.tpm_developer_skill tpm
left join devdb_mirror.base_all_skills_v4 base on tpm.skill_id = base.id
) where rn=1 and skill_level is not null and developer_id in (Select distinct dev_id from `turing-dev-337819.pdsa.PDAS_P2_cluster` where cluster is not null)
) GROUP by 1
),

quiz_lan as(Select * except(rn) from(SELECT uid as dev_id, test_language as quiz_language, 
row_number() over(partition by uid order by updated_date desc) as rn
FROM `turing-230020.devdb_mirror.submit_list_v4` 
) where rn=1 and dev_id in (Select distinct dev_id from `turing-dev-337819.pdsa.PDAS_P2_cluster` where cluster is not null)

),

skill_with_most_exp as (SELECT developer_id as dev_id, skill_id as most_exp_skill_id, skill_name as most_exp_skill_name
from(
SELECT developer_id, skill_id, skill_name, skill_level as skill_level_int, score as yoe_in_skill, is_ready_to_match as vetted_skill,
case when skill_level = 1 then 'beginner' when skill_level = 2 then 'intermediate' when skill_level = 3 then 'advanced' when skill_level = 4 then 'expert' END skill_level,
row_number() over(partition by developer_id order by score desc) as n_top_skill
from(
select tpm.developer_id, tpm.skill_id,base.skill_name, tpm.skill_level, tpm.score, tpm.is_ready_to_match,
row_number() over(partition by tpm.developer_id, tpm.skill_id order by tpm.updated_date desc) as rn
from devdb_mirror.tpm_developer_skill tpm
left join devdb_mirror.base_all_skills_v4 base on tpm.skill_id = base.id
) where rn=1 and skill_level is not null
) where n_top_skill=1),

cluster_label as(
SELECT dev_id, cluster_label 
from `turing-dev-337819.pdsa.PDAS_P2_cluster` where cluster is not null)

SELECT * except(resume_plain,sn_submit_time),
DATE_DIFF(sn_submit_time , signup_date, DAY) time_to_sn_test
from p1_data
left join sn_score using(dev_id)
left join self_skill using(dev_id)
left join skill_with_most_exp using(dev_id)
left join quiz_lan using(dev_id)
LEFT JOIN cluster_label using(dev_id)
"""

# Execute the query through GBQ_data and keep the resulting DataFrame.
data = GBQ_data(data_query)
print(data.shape)
# Last expression of the cell: shows the first two rows in the notebook.
data.head(2)

(57894, 32)


Unnamed: 0,dev_id,signup_date,source_attribution_type,user_os,user_os_type,quiz_answer,resume_flag,resume_upload_date,num_chars_resume,years_of_experience,years_of_remote_experience,english_communication,verbal_communication,hourly_rate,linkedin_url,github_url,role_type,is_fast_tracked,country_name,country_group,time_to_upload_resume,sn_avg_score,num_self_skills,num_self_beginner_skills,num_self_intermediate_skills,num_self_advanced_skills,num_self_expert_skills,most_exp_skill_id,most_exp_skill_name,quiz_language,cluster_label,time_to_sn_test
0,910749,2021-02-24 09:49:52+00:00,Undefined,Mac OS,Mac OS,MAX_NUM = NUM,True,2021-02-24 10:00:25+00:00,2222,3,1,Average,1,,https://www.linkedin.com/in/bruno-alfred-a87ab...,https://github.com/brunoalfred/,Mobile,0,"Tanzania, United Republic of",Africa,0,3.595834,25,20,5,0,0,392,Flutter,,Low Quality,0
1,1294930,2021-06-04 23:16:43+00:00,Undefined,Linux,Linux,MAX_NUM = NUM,True,2021-06-04 23:17:29+00:00,2410,6,0,Great,0,0.0,,,Web Backend,0,Algeria,Africa,0,4.086666,13,4,1,8,0,165,PHP,,Average,0


In [2]:
# Self-reported skills, one row per (developer_id, skill_id): only the most
# recently updated tpm_developer_skill row is kept (rn=1), rows with a null
# skill_level are dropped, and developers are restricted to dev_ids in
# PDAS_P2_cluster with a non-null `cluster`. skill_level is returned both as
# the raw integer (skill_level_int) and as a text label for the values 1..4.
# NOTE: "slef" is a misspelling of "self"; the name is kept as-is because the
# cell that stores the raw data refers to `slef_skill_data`.
slef_skill_query = """
SELECT developer_id, skill_id, skill_name, skill_level as skill_level_int, score as yoe_in_skill, is_ready_to_match as vetted_skill,
case when skill_level = 1 then 'beginner' when skill_level = 2 then 'intermediate' when skill_level = 3 then 'advanced' when skill_level = 4 then 'expert' END skill_level
from(
select tpm.developer_id, tpm.skill_id,base.skill_name, tpm.skill_level, tpm.score, tpm.is_ready_to_match,
row_number() over(partition by tpm.developer_id, tpm.skill_id order by tpm.updated_date desc) as rn
from `devdb_mirror.tpm_developer_skill` tpm
left join `devdb_mirror.base_all_skills_v4` base on tpm.skill_id = base.id
) where rn=1 and skill_level is not null and developer_id in (Select distinct dev_id from `turing-dev-337819.pdsa.PDAS_P2_cluster` where cluster is not null)
"""

# Execute the query through GBQ_data and keep the resulting DataFrame.
slef_skill_data = GBQ_data(slef_skill_query)
print(slef_skill_data.shape)
# Last expression of the cell: shows the first two rows in the notebook.
slef_skill_data.head(2)

(1043771, 7)


Unnamed: 0,developer_id,skill_id,skill_name,skill_level_int,yoe_in_skill,vetted_skill,skill_level
0,1510,223,Tensorflow,1,1,0,beginner
1,1546,55,Docker,1,1,1,beginner


In [4]:
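# To inspect rows with a duplicated dev_id, uncomment the first line below; to drop them (keeping the first occurrence), uncomment the second.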
#data[data['dev_id'].isin(data.loc[data['dev_id'].duplicated()].dev_id)].sort_values(by='dev_id')
#data.drop_duplicates(subset='dev_id', inplace=True, keep='first')


## Store raw data

In [8]:
# Write both raw extracts under ../data/raw/. The developer-level frame is
# written only when every dev_id is unique; otherwise a warning is printed
# and that file is skipped.
raw_dir = '../data/raw/'
has_duplicate_ids = data['dev_id'].duplicated().any()

if not has_duplicate_ids:
    data.to_csv(raw_dir + '1.0-mu-devlopers-signup-data.csv')
    print(f'Global Data of shape {data.shape} stored in a csv successfully')
else:
    print('Data has duplicated dev_id')

# The self-skill frame is written unconditionally.
slef_skill_data.to_csv(raw_dir + '1.0-mu-devlopers-self_skill-data.csv')
print(f'Self SKill Data of shape {slef_skill_data.shape} stored in a csv successfully')

Global Data of shape (57894, 32) stored in a csv successfully
