# Import data from Google BigQuery and store it locally
## Imports, global declarations, and functions

In [13]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import pandas_gbq
import copy
import pickle
import re
import glob
import datetime as dt
from datetime import timezone
from sklearn.preprocessing import StandardScaler


# %history -f notebook_file.ipynb #store history in a file

# Widen the pandas display limits so wide/long frames render in full in cell outputs.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)


## Fetch raw data

In [14]:
# Load the developer sign-up export; the first CSV column is used as the row index.
signup_csv_path = '../data/raw/1.0-mu-devlopers-signup-data.csv'
signup_data = pd.read_csv(signup_csv_path, index_col=0)
print(signup_data.shape)
signup_data.head(5)

(57894, 32)


Unnamed: 0,dev_id,signup_date,source_attribution_type,user_os,user_os_type,quiz_answer,resume_flag,resume_upload_date,num_chars_resume,years_of_experience,years_of_remote_experience,english_communication,verbal_communication,hourly_rate,linkedin_url,github_url,role_type,is_fast_tracked,country_name,country_group,time_to_upload_resume,sn_avg_score,num_self_skills,num_self_beginner_skills,num_self_intermediate_skills,num_self_advanced_skills,num_self_expert_skills,most_exp_skill_id,most_exp_skill_name,quiz_language,cluster_label,time_to_sn_test
0,910749,2021-02-24 09:49:52+00:00,Undefined,Mac OS,Mac OS,MAX_NUM = NUM,True,2021-02-24 10:00:25+00:00,2222.0,3.0,1.0,Average,1.0,,https://www.linkedin.com/in/bruno-alfred-a87ab...,https://github.com/brunoalfred/,Mobile,0.0,"Tanzania, United Republic of",Africa,0.0,3.595834,25.0,20.0,5.0,0.0,0.0,392.0,Flutter,,Low Quality,0.0
1,1294930,2021-06-04 23:16:43+00:00,Undefined,Linux,Linux,MAX_NUM = NUM,True,2021-06-04 23:17:29+00:00,2410.0,6.0,0.0,Great,0.0,0.0,,,Web Backend,0.0,Algeria,Africa,0.0,4.086666,13.0,4.0,1.0,8.0,0.0,165.0,PHP,,Average,0.0
2,2368912,2022-05-26 07:51:48+00:00,Undefined,Mac OS,Mac OS,MAX_NUM = NUM,True,2022-05-26 07:52:01+00:00,1693.0,3.0,1.0,Great,0.0,30.0,,,Web Frontend,0.0,Kazakhstan,Rest of Asia,0.0,4.456666,14.0,1.0,8.0,3.0,2.0,2031.0,React,,Low Experience,0.0
3,2429835,2022-06-11 03:03:04+00:00,Facebook,Android OS,Android OS,MAX_NUM += NUM,True,2022-06-11 03:08:47+00:00,2552.0,4.0,2.0,Average,1.0,,https://www.linkedin.com/in/javier-richards-va...,https://github.com/jrussellrichards,Machine Learning,0.0,Chile,Latin and South America,0.0,3.454166,8.0,3.0,3.0,2.0,0.0,114.0,SQL,,Low Experience,5.0
4,2639530,2022-09-07 17:17:19+00:00,Undefined,,Undefined,,True,2022-09-07 17:19:57+00:00,1807.0,1.0,1.0,Average,1.0,,www.linkedin.com/in/efrainmejiar,,Fullstack (BE-heavy),0.0,Venezuela,Latin and South America,0.0,2.955834,13.0,0.0,13.0,0.0,0.0,400.0,Spring Boot,,Low Quality,0.0


In [15]:
# Load the self-reported skills export (one row per developer/skill pair);
# the first CSV column is used as the row index.
self_skill_csv_path = '../data/raw/1.0-mu-devlopers-self_skill-data.csv'
data_self_skill = pd.read_csv(self_skill_csv_path, index_col=0)
print(data_self_skill.shape)
data_self_skill.head()

(1043771, 7)


Unnamed: 0,developer_id,skill_id,skill_name,skill_level_int,yoe_in_skill,vetted_skill,skill_level
0,1510,223,Tensorflow,1,1.0,0,beginner
1,1546,55,Docker,1,1.0,1,beginner
2,2740,836,AWS DevOps,1,1.0,0,beginner
3,4161,433,AWS,1,1.0,0,beginner
4,4671,1397,AngularJS,1,1.0,1,beginner


In [16]:
def _unique_sanitized_names(raw_names):
    """Strip non-alphanumeric characters from each name and keep the results unique.

    Stripping alone maps distinct skills such as "C", "C++" and "C#" onto the
    same label, which leaves the frame with duplicate columns. The first name
    that produces a given label keeps it unchanged; later collisions get a
    numeric suffix ("C", "C_1", "C_2", ...).
    """
    used = set()
    unique_names = []
    for raw_name in raw_names:
        base = re.sub('[^A-Za-z0-9_]+', '', raw_name)
        candidate, suffix = base, 0
        while candidate in used:
            suffix += 1
            candidate = f'{base}_{suffix}'
        used.add(candidate)
        unique_names.append(candidate)
    return unique_names


# One row per developer, one column per skill, holding the self-assessed level.
# pivot_table aggregates repeated (developer, skill) pairs with its default (mean);
# skills a developer did not list become 0.
data_self_skill_piv = (
    data_self_skill
    .pivot_table(index='developer_id', columns='skill_name', values='skill_level_int')
    .reset_index()
    .fillna(0)
)
data_self_skill_piv.index.name = None
data_self_skill_piv.columns.name = None
print(data_self_skill_piv.shape)

# The long-format frame is no longer needed; release it to free memory.
del data_self_skill

# Sanitize the column labels without letting different skills share one label.
data_self_skill_piv.columns = _unique_sanitized_names(data_self_skill_piv.columns)
data_self_skill_piv = data_self_skill_piv.rename(columns={'developer_id': 'dev_id'})

# Attach each developer's cluster label; developers absent from signup_data get NaN.
data_self_skill_piv = data_self_skill_piv.merge(signup_data[['dev_id', 'cluster_label']], how='left', on='dev_id')


(57975, 823)


In [5]:
# Preview the first rows of the pivoted skill matrix.
data_self_skill_piv.head(5)

Unnamed: 0,dev_id,NET,NETCore,NETMVC,3DGraphics,3PAPIsLibraries,ABTesting,ADONET,AEM,AESRSAEncryption,AJAX,ALProgramming,AMPAcceleratedMobilePages,APIDesign,APIDocumentation,APIIntegrations,AS3,ASPNET,ASPNETBoilerplate,ASPNETCore,ASPNETMVC,ASPNETWebForms,ASPNETC,AWS,AWSAdministration,AWSAmplify,AWSAurora,AWSCLI,AWSCognito,AWSDevOps,AWSEMR,AWSGlue,AWSIAM,AWSIoT,AWSLambda,AWSOperations,AWSRDS,AWSSolutionsArchitecture,AcceptanceTesting,Access,Administration,AdobeAir,AdobeIllustrator,AdobeInDesign,AdobePhotoshop,AdobeXD,AdonisJS,Agile,Airflow,Akka,AlgorithmsandDataStructures,Alteryx,AmazonFirehose,AmazonKinesis,AmazonRedshift,Android,AndroidJava,AndroidHybridAppDevelopment,AndroidSDK,AndroidStudio,AndroidTesting,AndroidJava.1,AndroidKotlin,Angular,Angular2,AngularCLI,AngularJS,Ansible,AntDesign,Apache,ApacheBeam,ApacheFlume,ApacheKafka,ApacheMesos,ApacheNiFi,ApachePulsar,ApacheSolr,ApacheSpark,ApacheStorm,ApexProgramming,Apollo,ApolloClient,ApolloServer,AppMesh,ArcGISJavaScriptAPI,Arduino,Arista,ArtificialIntelligence,ArtificialNeuralNetworks,Assembly,Atlassian,Atom,Auth0,AutoIT,Automatics,AutomationTestToolkit,AutomationTesting,Axio,Axure,Azure,AzureCloudSQL,AzureCosmosDB,AzureDataFactory,AzureDataLake,AzureDatabricks,AzureDevOps,AzureEventhub,AzureFunctionApp,AzureModernData,BDDBehaviorDrivenDevelopment,BIAdmin,Babel,BackEndDevelopment,Backbone,Bash,Benchmarks,BigData,BigQuery,Bitbucket,BizTalkServer,Blender,Blockchain,BluePrint,Bootstrap,BorlandDelphi,Braze,BusinessAnalysis,BusinessApplicationDevelopment,BusinessIntelligence,BusinessIntelligenceEngineering,C,C.1,C.2,CALDevelopment,CICD,COBOL,CORBA,CSS,CSSModules,CSS3,CSSAnimation,CSSFlex,CSSGrid,CSSMediaQueries,Caffe,CakePHP,CalDavCardDav,Cassandra,Celery,Chai,Charlesproxy,Chatbot,Chef,ChromeExtensions,CircleCI,Cisco,Citrix,ClassicASP,ClassicVB,CleanArchitecture,Clojure,ClojureScript,Cloud,CloudMigration,CloudSecurity,CocosCreator,CodeAnalysis,CodeReviews,CodeIgniter,CoffeeScript,Cognos,ColdFusion,ColdfusionCFML,Comm
unication,Compass,Composer,ComputerScienceFundamentals,ComputerVision,Confluence,Consul,Cordova,CoreJava,CoreOS,CouchDB,CouchbaseDB,Cplex,Crawlers,CriticalThinking,Crypto,CrystalReports,CustomerInteraction,CyberSecurity,Cypress,D3js,DAC,DB2UniversalDatabaseUDB,DNS,DOM,Dagger,Dart,DataAnalysis,DataAnalyst,DataCleansing,DataEngineer,DataEngineering,DataExtraction,DataMigration,DataModeling,DataPipelines,DataScience,DataSpooling,DataWarehousing,DatabaseAdministration,DecisionTrees,DeepLearning,DeepLearningAlgorithms,Delphi,DesignPatterns,DesignThinking,Designer,DevExpress,DevOps,DigitalMarketing,DisasterRecovery,DiscriminantAnalysis,DistributedFileSystems,DistributedProgramming,DistributedStorage,DistributedSystems,Divi,Django,DjangoRestFramework,Docker,DomainDrivenDesign,DomainSpecificLanguages,Dplyr,Drupal,DuckDuckGo,Durandaljs,Dynamics365BusinessCentral,DynamoDB,ECommerce,E2ETesting,EC2,EDA,EJBComponentProgramming,ELKstack,ELM,EMV,EOIR,...,RAID,RDBMS,RDLCReporting,RESTAPIsTesting,RESTRESTfulAPIs,RMI,RPA,RSpec,RTOS,RabbitMQ,Rake,RationalClearcase,Ray,React,ReactHooks,ReactNative,ReactiveX,RealTimeSystems,Realm,Recoil,Redis,Redux,ReduxSaga,RefinementofModels,RegularExpressions,ReinforcementLearning,RemoteSensing,ReportingServices,ReportingandTrackingSoftwareDefects,RequirementGathering,Retesting,Robotics,Roku,Routing,Ruby,RubyonRails,Rust,RxJS,RxJava,RxSwift,SAML,SAP,SAPABAP,SAPABAPHR,SAPB1SAPBusinessOne,SAPBODSSAPBODataServices,SAPBusinessObjects,SAPHANASDI,SAPProcessIntegration,SAPUI5,SAR,SAS,SASS,SCSS,SDLC,SEO,SFDC,SMTP,SOAP,SPA,SQL,SQLTuning,SQLalchemy,SQLite,SSAS,SSH,SSIS,SSO,SSRS,STM32,SVG,SaaS,Sailsjs,Salesforce,SalesforceAdmin,SalesforceDevelopment,SalesforceLightning,SalesforceLightningAuraComponents,SaltStack,Scala,SceneKit,ScikitLearn,Scipy,Scrum,Security,Selenide,Selenium,SeleniumWebDriver,Sequelize,ServerSentEvents,Serverless,ServiceWorkers,ServiceNow,Servlets,ShellScripting,Shopify,SignalProcessingandTransmission,Sinatra,Sitecore,Sketch,Slick,SmartCommun
ication,SocialMediaMarketing,SocketProgramming,Socketio,SoftwareArchitecture,SoftwareDevelopment,SoftwarePackagingandMaintenance,SoftwareQualityAssurance,SoftwareTesting,Solidity,SolutionArchitecture,SoundandAudio,SpaCy,Spanner,Spark,Spinnaker,Splunk,Spring,SpringBoot,SpringFramework,SpringMVC,SpringBoot.1,SpriteKit,StackKnowledge,StanfordCoreNLP,Stash,Statistics,StatsD,Stripe,Struts2,StyledComponents,Stylus,Subversion,SuiteCRM,Svelte,Swift,SwiftUI,Switching,Symfony,SystemDesign,SystemIntegrationTesting,SystemMonitoringTools,TSQL,TCPIP,TDDTestDrivenDevelopment,TFS,Tableau,TailwindCSS,TalendOpenStudio,TeamManagement,TeamPlayer,Teamcenter,TechLead,Telerik,Tensorflow,Tensorflow1,Terraform,TestDrivenDesign,TestPlanning,Theano,ThirdpartyAPIs,Thrift,Thunderhead,TicketingTools,Torch,Traefik,TravisCI,Troubleshooting,TwitterAPI,Typescript,UIDesign,UIDevelopment,UIPath,UXDesign,Umbraco,UnitTesting,Unity,Unity3D,Unix,UnixDevelopmentEnvironments,UserResearch,UserCenteredDesign,VBA,VIPER,VMware,VPN,VSS,Vaadin,Vagrant,Vault,Video,VirtualReality,VisualBasic,VisualBasicNET,VisualC,VisualStudio,Visualforce,VoIP,VoiceRecognition,Vuejs,Vuex,WPF,WSO2,WebAPI,WebAccessibility,WebArchitecture,WebComponents,WebDesign,WebDevelopment,WebPenetrationTesting,WebServices,WebSockets,WebWorkers,Web3,WebRTC,WebScrape,Webpack,WebsiteDevelopment,WebsitesPerformanceOptimization,WhiteHatLinkBuilding,Windchill,Windows,WindowsApplications,WindowsSDK,WindowsServer2012,WindowsServices,WireframingPrototyping,WooCommerce,WordPress,XHTML,XML,XSLT,XUL,Xamarin,Xcode,YAML,Yarn,Yii2Framework,YiiFramework,ZendFramework,Zeplin,Zookeeper,iMacros,iOSSwift,iOSDevelopment,iSCSICIFS,jQuery,mikroC,cluster_label
0,121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.
0,Low Quality
1,126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.
0,High Quality
2,138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.
0,Low Quality
3,312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.
0,Low Quality
4,361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.
0,High Quality


In [6]:
# Remove every row that still has a missing value in any column
# (for example, rows the left merge could not match to a cluster_label).
data_self_skill_piv = data_self_skill_piv.dropna()

In [17]:
# Binary target: 1 for developers in the 'High Quality' or 'Elite' clusters, 0 otherwise.
top_tier_clusters = ['High Quality', 'Elite']
in_top_tier = data_self_skill_piv['cluster_label'].isin(top_tier_clusters)
data_self_skill_piv['label'] = np.where(in_top_tier, 1, 0)

# Show the class balance of the target.
data_self_skill_piv['label'].value_counts()

0    45279
1    12696
Name: label, dtype: int64

In [8]:
from sklearn.feature_selection import mutual_info_classif as MIC

# mutual_info_classif adds small random noise to continuous features, so the
# scores differ between runs unless the seed is pinned. Fix it so the skills
# selected from mi_score downstream are reproducible.
MI_RANDOM_STATE = 42

# Score every skill column against the binary label; identifier and target
# columns are excluded from the feature matrix.
skill_feature_mask = ~data_self_skill_piv.columns.isin(['dev_id', 'cluster_label', 'label'])
mi_score = MIC(
    data_self_skill_piv.loc[:, skill_feature_mask],
    data_self_skill_piv['label'],
    random_state=MI_RANDOM_STATE,
)


In [18]:
# Skill columns in the same order that was scored by mutual information.
non_feature_columns = ('dev_id', 'cluster_label', 'label')
col_list = [name for name in data_self_skill_piv.columns if name not in non_feature_columns]

# Keep the skills whose mutual-information score reaches 0.01.
informative_positions = np.where(mi_score >= 0.01)[0]
best_skills = [col_list[position] for position in informative_positions]

# dev_id is appended so the selected skill columns can be joined onto signup_data.
best_skills.append('dev_id')
signup_data = signup_data.merge(data_self_skill_piv[best_skills], how='left', on='dev_id')

In [19]:
# The pivoted skill matrix has been merged into signup_data; release it.
del data_self_skill_piv

# Columns carried into the modelling table: engineered/raw sign-up features,
# the cluster label, and the selected skill columns (best_skills also holds dev_id).
predictors = [
    'source_attribution_type', 'user_os_type', 'role_type', 'english_communication',
    'resume_flag', 'num_chars_resume_thousands', 'years_of_experience', 'years_of_remote_experience',
    'sn_avg_score', 'num_self_skills', 'num_self_beginner_skills', 'num_self_intermediate_skills',
    'num_self_advanced_skills', 'num_self_expert_skills', 'is_github', 'is_linkedIn', 'time_to_upload_resume',
    'time_to_sn_test', 'is_quiz_answer_correct', 'most_exp_skill_name',
]
predictors.extend(['cluster_label'])
predictors.extend(best_skills)

# Derived features on the sign-up frame.
signup_data['resume_flag'] = signup_data['resume_flag'].astype(int)
signup_data['num_chars_resume_thousands'] = signup_data['num_chars_resume'] / 1000
signup_data['is_github'] = np.where(signup_data['github_url'].isna(), 0, 1)      # 1 when a GitHub URL is present
signup_data['is_linkedIn'] = np.where(signup_data['linkedin_url'].isna(), 0, 1)  # 1 when a LinkedIn URL is present
signup_data['is_quiz_answer_correct'] = np.where(signup_data['quiz_answer'] == 'MAX_NUM = NUM', 1, 0)

ml_data = signup_data[predictors].copy()

# Resume-dependent features: developers without a resume get a sentinel value;
# developers with a resume but a missing value get the mean.
# The mean is taken over developers WITH a resume only. Computing it after the
# sentinel has been written (as before) pulls the imputed value toward -10.
NO_RESUME_SENTINEL = -10
has_resume = ml_data['resume_flag'] != 0
for resume_col in ('num_chars_resume_thousands', 'time_to_upload_resume'):
    observed_mean = ml_data.loc[has_resume, resume_col].mean()
    ml_data[resume_col] = (
        ml_data[resume_col]
        .where(has_resume, NO_RESUME_SENTINEL)
        .fillna(observed_mean)
    )

# Drop rows missing any of the core features.
# NOTE(review): the original note read "after dropping developers who have taken
# seniority test we left with less than 100 NAs only" -- presumably it meant
# developers WITHOUT a seniority-test score (sn_avg_score); confirm.
ml_data = ml_data.dropna(
    subset=['years_of_experience', 'sn_avg_score', 'english_communication', 'role_type', 'num_self_skills']
)

# Remaining missing values per column.
print(ml_data.isnull().sum())

## Store processed data

In [39]:
# Persist the modelling table only when every developer appears exactly once;
# otherwise report the duplication and write nothing.
has_duplicate_ids = ml_data['dev_id'].duplicated().any()
if not has_duplicate_ids:
    processed_csv_path = '../data/processed/' + '1.1-mu-devlopers-processed-data.csv'
    ml_data.to_csv(processed_csv_path)
    print(f'Global Data of shape {ml_data.shape} stored in a csv successfully')
else:
    print('Data has duplicated dev_id')
    

Global Data of shape (52665, 27) stored in a csv successfully


In [40]:
# Release the large frames now that the processed data has been written.
del signup_data
del ml_data