# **Analytics**

----------

In [15]:
# --- Input / output locations (relative to the notebook's working directory) ---
DF_PATH = "../data/processed/1_preprocessed_df.pkl"
ROLES_PATH = "../data/raw/roles_short_names.csv"
FIG_DIR = "../reports/figures"

# Presumably the label substituted for missing answers; it is not referenced
# in the cells visible here -- TODO confirm usage.
NA_STRING = "Not Specified"
# Fully transparent colour in CSS functional notation. The function name is
# `rgba` (red, green, blue, alpha); the previous spelling 'rbga' is not a
# valid colour string.
TRANSPARENT_STRING = 'rgba(0,0,0,0)'

# Multi-label survey columns that get one-hot encoded further down.
ROLES_COLS = ['DevType']
# NOTE(review): every technology family below has both a "HaveWorkedWith" and
# a "WantToWorkWith" column except Database ('DatabaseWantToWorkWith' is
# absent) -- confirm whether that omission is intentional.
TECH_COLS = ['LanguageHaveWorkedWith',
             'LanguageWantToWorkWith',
             'DatabaseHaveWorkedWith',
             'PlatformHaveWorkedWith',
             'PlatformWantToWorkWith',
             'WebframeHaveWorkedWith',
             'WebframeWantToWorkWith',
             'MiscTechHaveWorkedWith',
             'MiscTechWantToWorkWith',
             'ToolsTechHaveWorkedWith',
             'ToolsTechWantToWorkWith',
             "NEWCollabToolsHaveWorkedWith",
             "NEWCollabToolsWantToWorkWith"]

------------

## Import Libraries

In [16]:
# Libraries 
import pandas as pd 
import numpy as np 
import logging
import pickle 
import os 

# Visualisation Libraries
import plotly.graph_objects as go 
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.figure_factory as ff
from matplotlib import pyplot as plt

# Analysis & preprocessing libraries
from sklearn.manifold import TSNE
from sklearn.preprocessing import MultiLabelBinarizer , StandardScaler , RobustScaler , MinMaxScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.pipeline import make_pipeline
from scipy.cluster.hierarchy import dendrogram , fcluster , linkage

------------

## Reading Data

In [17]:
# Load the two inputs of this notebook:
#   * the pickled DataFrame at DF_PATH (unpickling can execute arbitrary code,
#     so only point DF_PATH at files produced by this project);
#   * the semicolon-delimited roles lookup table at ROLES_PATH.
raw_df = pd.read_pickle(DF_PATH)
roles_names = pd.read_csv(ROLES_PATH, delimiter=";")

In [18]:
# Work on a copy so that `raw_df` keeps the data exactly as loaded.
df = raw_df.copy()
# Preview a few random rows. The fixed `random_state` makes the preview
# identical on every Restart & Run All instead of changing with each run.
df.sample(4, random_state=42)

Unnamed: 0,ResponseId,MainBranch,Employment,Country,US_State,UK_Country,EdLevel,Age1stCode,LearnCode,YearsCode,...,Age,Gender,Trans,Sexuality,Ethnicity,Accessibility,MentalHealth,SurveyLength,SurveyEase,ConvertedCompYearly
14723,14724,I am a developer by profession,Employed full-time,India,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",18 - 24 years,"[Other online resources (ex: videos, blogs, et...",4.0,...,18-24 years old,[Man],No,[],[I don't know],[None of the above],[None of the above],Too long,Easy,134028.0
19963,19964,I am a developer by profession,Employed full-time,Canada,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",18 - 24 years,[School],10.0,...,25-34 years old,[Man],No,[Straight / Heterosexual],[White or of European descent],[None of the above],[None of the above],Appropriate in length,Easy,60505.0
43259,43260,I am a developer by profession,"Student, part-time",Italy,,,Something else,11 - 17 years,[School],7.0,...,18-24 years old,[Man],No,[Straight / Heterosexual],[White or of European descent],[None of the above],[None of the above],Appropriate in length,Easy,
75656,75657,I am a student who is learning to code,"Student, full-time",China,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",18 - 24 years,[School],3.0,...,18-24 years old,[Man],No,[Bisexual],[East Asian],[None of the above],[None of the above],Too long,Neither easy nor difficult,


In [19]:
# Display the roles lookup table loaded from ROLES_PATH; per the rendered
# output it pairs each "Original name" with an abbreviated "Short name".
roles_names

Unnamed: 0,Original name,Short name
0,"Developer, back-end",Back-end dev
1,"Developer, full-stack",Full-stack dev
2,"Developer, front-end",Front-end dev
3,"Developer, desktop or enterprise applications",Desktop dev
4,"Developer, mobile",Mobile dev
5,DevOps specialist,DevOps
6,Database administrator,Database admin
7,Designer,Designer
8,System administrator,System admin
9,"Developer, embedded applications or devices",Embedded dev


------------

## One-hot encoding

In [20]:
# One-hot encode every multi-label column (roles + technologies).
# MultiLabelBinarizer expands each entry (assumed to be a list of labels --
# TODO confirm against the preprocessing notebook) into one 0/1 indicator
# column per distinct label found in that column.
#
# The source is `raw_df` rather than `df`: a later cell rebinds `df` to the
# encoded frame, so reading `df` here would make this cell fail when it is
# re-run. `raw_df` is never reassigned, which keeps the cell idempotent.
encoded_dfs = {}
for col in ROLES_COLS + TECH_COLS:
    binarizer = MultiLabelBinarizer()
    encoded_df = pd.DataFrame(
        binarizer.fit_transform(raw_df[col]),
        columns=binarizer.classes_,  # one column per label, in sorted order
        index=raw_df.index,          # keep rows aligned with the source frame
    )
    encoded_dfs[col] = encoded_df

In [26]:
# Stitch the per-column indicator frames together side by side. Because a
# dict is passed, its keys (the survey column names) become the outer level
# of the resulting two-level column index, with the labels underneath.
# Note: from here on `df` refers to the encoded frame, not the raw copy.
df = pd.concat(encoded_dfs, axis="columns")
df

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,NEWCollabToolsWantToWorkWith,NEWCollabToolsWantToWorkWith,NEWCollabToolsWantToWorkWith,NEWCollabToolsWantToWorkWith,NEWCollabToolsWantToWorkWith,NEWCollabToolsWantToWorkWith,NEWCollabToolsWantToWorkWith,NEWCollabToolsWantToWorkWith,NEWCollabToolsWantToWorkWith,NEWCollabToolsWantToWorkWith
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,Designer,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices",...,RStudio,Rider,RubyMine,Sublime Text,TextMate,Vim,Visual Studio,Visual Studio Code,Webstorm,Xcode
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83434,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,0
83435,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,1,0,1,0,0
83436,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
83437,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Sanity-check the encoding on one group: select the indicator columns under
# 'LanguageHaveWorkedWith' and flip them so that languages become the rows.
df['LanguageHaveWorkedWith'].transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,83429,83430,83431,83432,83433,83434,83435,83436,83437,83438
APL,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
Assembly,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bash/Shell,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
C,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C#,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
C++,1,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
COBOL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Clojure,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
Crystal,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dart,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


-------

## Display Jobs Frequency  

In [28]:
# Count the respondents per role by summing each 0/1 'DevType' indicator
# column, then order the roles from least to most frequent.
# `.sum()` does not modify its input, so no defensive `.copy()` is needed.
# After `reset_index()` the columns are 'index' (role name) and 0 (count).
jobs_freq = df['DevType'].sum().sort_values().reset_index()
jobs_freq

Unnamed: 0,index,0
0,Marketing or sales professional,638
1,Scientist,2015
2,"Senior Executive (C-Suite, VP, etc.)",2103
3,"Developer, game or graphics",2112
4,Educator,2369
5,"Engineer, site reliability",2448
6,Academic researcher,2899
7,Product manager,3074
8,Other (please specify):,3545
9,"Developer, QA or test",3611


--------

## Display Skills Frequency

In [39]:
# Frequency of every skill: column-wise sum over all groups except 'DevType'.
# `drop` already returns a new frame, so the previous full-frame `.copy()`
# only duplicated the data without changing the result.
skills_freq = df.drop('DevType', axis=1).sum().reset_index()
# The two index levels (survey column, label) plus the summed value.
skills_freq.columns = ['group', 'skill', 'freq']
# Most frequently mentioned skills first.
sorted_skills = skills_freq.sort_values('freq', ascending=False)
sorted_skills

Unnamed: 0,group,skill,freq
171,ToolsTechHaveWorkedWith,Git,68171
185,ToolsTechWantToWorkWith,Git,58308
212,NEWCollabToolsHaveWorkedWith,Visual Studio Code,58026
19,LanguageHaveWorkedWith,JavaScript,53587
233,NEWCollabToolsWantToWorkWith,Visual Studio Code,48360
...,...,...,...
8,LanguageHaveWorkedWith,Crystal,466
6,LanguageHaveWorkedWith,COBOL,437
230,NEWCollabToolsWantToWorkWith,TextMate,396
173,ToolsTechHaveWorkedWith,Pulumi,368


In [40]:
# Treemap of skill frequencies: the outer rectangles are the survey groups,
# the inner ones the individual skills; both area and colour encode 'freq'.
fig = px.treemap(skills_freq,
                 path=['group', 'skill'],
                 values='freq',
                 color='freq',
                 color_continuous_scale='deep')
fig.update_layout(width=1400, height=700)
fig.show()

# Create the output folder when it is missing so that the export does not
# fail on a fresh checkout; `exist_ok=True` makes this a no-op otherwise.
os.makedirs(FIG_DIR, exist_ok=True)
fig.write_html(os.path.join(FIG_DIR, 'treemap_skills_freq.html'))

- `From this treemap we can see that the most popular skills are in the data science field.`

- `The treemap is interactive: click on a group to drill down and learn more about its skills.`

--------

## Create Jobs & Skills Heatmap