#### Analytics:
##### I should ask myself some questions:
##### General :
##### 1- What is the total number of answers?
#####            2- Geographical distribution?
#####            3- What about missing answers?

##### Skills (input) :
##### 1- What is the frequency of each skill?
#####                   2- How are the skills correlated with each other?

##### Jobs (output) :
##### 1- What is the frequency of each Job?
#####                  2- How are the jobs correlated with each other?

##### Relations between input & output :
##### 1- How are the skills correlated with the jobs?
#####                                     2- What is the specificity of each skill to each job?

In [1]:
# Relative path of the pickled, preprocessed data
DF_PATH = "../data/processed/preprocessed_df.pkl"

# Column(s) describing the jobs (the "output" side of the analysis)
ROLE_COLS = ['DevType']

# Columns describing the skills (the "input" side of the analysis)
TECH_COLS = [
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'NEWCollabToolsHaveWorkedWith',
]

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
import plotly.express as px
import plotly.graph_objects as go
from plotly.figure_factory import create_dendrogram
from sklearn.preprocessing import StandardScaler

### Read data and preprocess

In [3]:
# Load the preprocessed data stored at DF_PATH.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# when it comes from a trusted source.
raw_df = pd.read_pickle(DF_PATH)

In [4]:
# One-hot encoding: build one indicator frame per role/tech column.
# assumes every entry of these columns is a collection of labels - TODO confirm
df = raw_df.copy()
encoded_dfs = {}
for col in [*ROLE_COLS, *TECH_COLS]:
    # a fresh binarizer per column, so each column keeps its own label set
    binarizer = MultiLabelBinarizer()
    indicators = binarizer.fit_transform(df[col])
    encoded_df = pd.DataFrame(indicators, index=df[col].index, columns=binarizer.classes_)
    encoded_dfs[col] = encoded_df

In [5]:
# Join the per-column indicator frames side by side; the dict keys become the
# outer level of the resulting column MultiIndex.
df = pd.concat(encoded_dfs, axis='columns')
df

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,Designer,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices",...,RStudio,Rider,RubyMine,Sublime Text,TextMate,Vim,Visual Studio,Visual Studio Code,Webstorm,Xcode
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83434,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,0,1,0,0
83435,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,1,1,1,0,0
83436,0,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,1,0,1,0,1
83437,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [6]:
# Job frequency: number of rows tagged with each role, rarest first
jobs_freq = (
    df['DevType']
    .sum()
    .sort_values()
    .reset_index()
)

In [7]:
# Show the per-role counts, ordered from least to most frequent
jobs_freq

Unnamed: 0,index,0
0,Marketing or sales professional,638
1,Scientist,2015
2,"Senior Executive (C-Suite, VP, etc.)",2103
3,"Developer, game or graphics",2112
4,Educator,2369
5,"Engineer, site reliability",2448
6,Academic researcher,2899
7,Product manager,3074
8,Other (please specify):,3545
9,"Developer, QA or test",3611


In [8]:
# Skill frequency: number of rows tagged with each skill, keeping track of the
# tech column ("group") every skill comes from
skills_freq = (
    df.drop('DevType', axis=1)
    .sum()
    .reset_index()
)
skills_freq.columns = ['group', 'skill', 'freq']

In [9]:
# Most frequent skills first
skills_freq.sort_values(by='freq', ascending=False)

Unnamed: 0,group,skill,freq
95,ToolsTechHaveWorkedWith,Git,68171
122,NEWCollabToolsHaveWorkedWith,Visual Studio Code,58026
19,LanguageHaveWorkedWith,JavaScript,53587
16,LanguageHaveWorkedWith,HTML/CSS,46259
29,LanguageHaveWorkedWith,Python,39792
...,...,...,...
12,LanguageHaveWorkedWith,Erlang,651
0,LanguageHaveWorkedWith,APL,536
8,LanguageHaveWorkedWith,Crystal,466
6,LanguageHaveWorkedWith,COBOL,437


In [10]:
# Treemap of skill frequency: tiles are nested group -> skill, and both the
# tile size and the tile colour are driven by 'freq'
fig = px.treemap(
    skills_freq,
    path=['group', 'skill'],
    values='freq',
    color='freq',
    color_continuous_scale='deep',
)
fig.update_layout(width=1400, height=700)
fig.show()

In [11]:
#Jobs & skills heatmap
# Roles ordered from least to most frequent
role_totals = df['DevType'].sum()
sorted_roles = role_totals.sort_values().index.tolist()

# Skill names (group level removed) ordered from most to least frequent
skill_totals = df.drop('DevType', axis=1).sum()
sorted_skills = skill_totals.sort_values(ascending=False).index.droplevel(0).tolist()

In [12]:
# For every role, compute the percentage of its rows that are tagged with each skill
role_profiles = []
for role in sorted_roles:
    has_role = df[('DevType', role)].eq(1)

    # the mean of a 0/1 indicator column is the share of rows holding the label
    profile = pd.concat(
        {tech_col: df.loc[has_role, tech_col].mean() * 100 for tech_col in TECH_COLS}
    )
    role_profiles.append(profile)

# One column per role, then: drop the group level, order the skills by overall
# frequency and flip so that roles become rows and skills become columns
skills = (
    pd.concat(role_profiles, axis=1, keys=sorted_roles)
    .reset_index(level=0, drop=True)
    .loc[sorted_skills]
    .T
)

In [13]:
# Rows are roles, columns are skills (most frequent first); each value is the
# percentage of the role's rows tagged with that skill
skills

Unnamed: 0,Git,Visual Studio Code,JavaScript,HTML/CSS,Python,SQL,Docker,MySQL,Java,AWS,...,Oracle Cloud Infrastructure,Chef,Flow,F#,TextMate,Erlang,APL,Crystal,COBOL,Pulumi
Marketing or sales professional,70.062696,61.285266,71.943574,71.316614,40.909091,57.053292,32.601881,54.075235,27.586207,43.260188,...,7.366771,5.642633,4.545455,4.388715,5.172414,3.761755,5.642633,4.231975,3.761755,3.761755
Scientist,81.687345,52.109181,42.431762,40.049628,76.079404,34.937965,43.672457,31.91067,25.60794,32.158809,...,2.382134,1.935484,2.034739,2.133995,2.630273,1.935484,2.431762,1.588089,1.637717,0.942928
"Senior Executive (C-Suite, VP, etc.)",84.68854,66.096053,77.223015,63.195435,43.747028,59.771755,58.29767,46.742748,30.147408,59.486448,...,2.56776,2.99572,2.092249,2.948169,3.185925,2.900618,1.997147,1.854494,2.092249,2.234903
"Developer, game or graphics",84.375,72.206439,64.914773,55.160985,46.543561,41.145833,39.299242,43.844697,36.789773,34.753788,...,2.556818,1.846591,2.414773,2.793561,2.367424,2.035985,1.893939,1.373106,1.609848,1.136364
Educator,82.566484,65.512875,66.483748,60.869565,51.87843,50.189954,45.46222,47.868299,35.500211,36.513297,...,2.65935,2.279443,2.532714,2.321655,2.44829,1.857324,2.237231,1.477417,1.7729,1.308569
"Engineer, site reliability",89.011438,67.606209,64.174837,52.205882,60.294118,57.843137,73.202614,49.550654,35.334967,60.825163,...,3.472222,6.699346,2.410131,1.919935,2.165033,3.145425,1.511438,1.633987,1.552288,2.900327
Academic researcher,80.13108,54.294584,44.394619,40.669196,72.093825,34.356675,43.118317,36.081407,28.285616,28.941014,...,2.311142,1.552259,1.759227,1.621249,1.897206,1.552259,2.690583,1.310797,1.552259,0.758882
Product manager,83.702017,70.364346,75.829538,66.070267,41.184125,59.694209,49.414444,50.390371,31.067014,47.462589,...,2.439818,2.407287,2.179571,2.016916,2.179571,1.821731,1.626545,1.919323,1.626545,1.07352
Other (please specify):,75.994358,61.100141,57.122708,50.126939,49.957687,43.638928,39.77433,35.430183,28.716502,32.863188,...,1.720733,1.861777,1.100141,1.607898,1.35402,1.071932,1.664316,1.015515,1.156559,0.818054
"Developer, QA or test",86.901135,69.426752,72.445306,62.088064,47.576849,60.814179,52.783163,49.598449,40.847411,41.678205,...,2.215453,2.326225,2.076987,1.855442,1.66159,1.633896,1.301579,1.384658,1.301579,1.024647


In [14]:
# Heatmap of the role x skill percentage matrix
percentage_heatmap = go.Heatmap(
    z=skills,
    x=skills.columns,
    y=skills.index,
    colorscale='magma',
    ygap=1,
)
fig = go.Figure(data=percentage_heatmap)
fig.update_layout(width=2500, height=700)
fig.show()

In [15]:
#Jobs dendrogram
# Built from the role x skill percentage matrix, one leaf per role
fig = create_dendrogram(
    skills,
    orientation="left",
    labels=sorted_roles,
    color_threshold=0,
)
fig.update_layout(showlegend=False, width=800, height=700)
fig.show()

In [16]:
# Normalizing features: standardize every skill column across the roles,
# keeping the role/skill labels of `skills`
scaler = StandardScaler()
std_skills = pd.DataFrame(
    scaler.fit_transform(skills),
    index=skills.index,
    columns=skills.columns,
)

In [17]:
# Same heatmap layout as for the raw percentages, now showing the standardized values
standardized_heatmap = go.Heatmap(
    z=std_skills,
    x=skills.columns,
    y=skills.index,
    colorscale='magma',
    ygap=1,
)
fig = go.Figure(data=standardized_heatmap)
fig.update_layout(width=2500, height=700)
fig.show()

In [18]:
#Job profiles
# Pick the role to profile at random, but from a seeded generator so that a
# "Restart & Run All" always profiles the same role (and therefore renders the
# same chart). Change ROLE_SEED, or assign `role` directly, to inspect another job.
ROLE_SEED = 42
role = np.random.default_rng(ROLE_SEED).choice(sorted_roles)
role

'Academic researcher'

In [19]:
# Profile of the selected role: raw usage percentage next to the standardized
# value of every skill, ordered by ascending percentage
single_role_skills = pd.concat(
    [skills.loc[role], std_skills.loc[role]],
    axis=1,
    keys=['percentage', 'specificity'],
).sort_values('percentage')

In [20]:
# Only skills whose usage percentage for the role exceeds this value are plotted
threshold = 25

single_role_skills = single_role_skills[single_role_skills['percentage'] > threshold]

# Plot from the frame that actually holds the bar data. The one-hot frame `df`
# was previously passed as data_frame although none of its columns were used;
# referring to the columns by name also labels the axis and the colour bar
# after them.
fig = px.bar(single_role_skills,
             y=single_role_skills.index,
             x='percentage',
             color='specificity',
             color_continuous_scale='orrd',
             # one colour range shared by every role, so charts stay comparable
             range_color=[std_skills.values.min(), std_skills.values.max()],
             orientation='h')

fig.update_layout(width=800, height=700, title=role)
fig.show()