In [53]:
# Constants
DATA_PATH   = "../Data/Processed/1_preprocessed_df.pkl"


TECH_JOBS = ['Cloud infrastructure engineer',
             'Developer, embedded applications or devices',
             'Data or business analyst',
             'System administrator',
             'Developer, front-end',
             'Scientist',
             'DevOps specialist',
             'Developer, game or graphics',
             'Academic researcher',
             'Security professional',
             'Developer, QA or test',
             'Blockchain',
             'Developer, full-stack',
             'Data scientist or machine learning specialist',
             'Developer, mobile',
             'Developer, desktop or enterprise applications',
             'Developer, back-end',
             'Database administrator',
             'Engineer, data']

CORE_COLS = ['VersionControlSystem',
             'Languages',
             'Databases',
             'Platforms',
             'WebFrameworks',
             'MiscTech',
             'ToolsTech',
             'CollabTools'
]

USEFUL_COLS= ['Employment','RemoteWork',
              'MainBranch','CodingActivities','ProfessionalTech',
              'LearnCode', 'LearnCodeOnline', 'LearnCodeCoursesCert',
              'WorkExp', 'YearsCode', 'YearsCodePro', 'EdLevel',
              'OrgSize', 'Country',
              'ConvertedCompYearly', 'Currency', 'CompTotal', 'CompFreq']

In [54]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle

from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

_______

### Functions

In [55]:
# Create a Folder named Images to save figures in.
IMAGES_PATH = Path.cwd().parent / "Images"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """
    This functions will save the current figure shown below.
    
    Args:
        fig_id: String Containing the name of the figure.
        tight_layout: Boolean to decide whether you want a tight layout or not.
        fig_extension: String to decide the type of the figure.
        resoultion: Int to decide the resolution of the figure.
        
    Returns:
        None
    """
    path = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    
    if tight_layout:
        plt.tight_layout()
        
    plt.savefig(path, format=fig_extension, dpi=resolution)

_____

In [111]:
df['MainBranch'].to_string()


'2        [I am not primarily a developer, but I write c...\n3                         [I am a developer by profession]\n4                         [I am a developer by profession]\n7                         [I am a developer by profession]\n8                         [I am a developer by profession]\n9                         [I am a developer by profession]\n10                        [I am a developer by profession]\n12                        [I am a developer by profession]\n13                        [I am a developer by profession]\n14                        [I am a developer by profession]\n15                        [I am a developer by profession]\n16                        [I am a developer by profession]\n17                        [I am a developer by profession]\n18                        [I am a developer by profession]\n21                        [I am a developer by profession]\n22                        [I am a developer by profession]\n24                        [I am a devel

### Questions we need to answer

- **Jobs:**
    - **What is the frequency of each job?
    - **How are the Jobs correlated with each other?
- **Skills:**
    - **What is the frequency of each skill?
    - **How are skills correlated with each other?
- **Jobs & Skills relationship:**
    - **How are skills correlated to Jobs?
    - **What is the specificity of each skill to a job?





General:
• Total number of answers
• Geographical distributions
• Missing answers
Jobs:
• Frequency of each job
• How are the jobs correlated with each others
Relation:
Frequency of each job
• How are the jobs correlated with each others
Skills:• How are the skills correlated with the jobs
• What is the specificity of each skill to a job




In [56]:
# Load dataset and make a copy
prep_df = pd.read_pickle(DATA_PATH)
df = prep_df.copy()

In [57]:
df.columns

Index(['Cloud infrastructure engineer',
       'Developer, embedded applications or devices',
       'Data or business analyst', 'System administrator',
       'Developer, front-end', 'Scientist', 'DevOps specialist',
       'Developer, game or graphics', 'Academic researcher',
       'Security professional', 'Developer, QA or test', 'Blockchain',
       'Developer, full-stack',
       'Data scientist or machine learning specialist', 'Developer, mobile',
       'Developer, desktop or enterprise applications', 'Developer, back-end',
       'Database administrator', 'Engineer, data', 'VersionControlSystem',
       'Languages', 'Databases', 'Platforms', 'WebFrameworks', 'MiscTech',
       'ToolsTech', 'CollabTools', 'Employment', 'RemoteWork', 'MainBranch',
       'CodingActivities', 'ProfessionalTech', 'LearnCode', 'LearnCodeOnline',
       'LearnCodeCoursesCert', 'WorkExp', 'YearsCode', 'YearsCodePro',
       'EdLevel', 'OrgSize', 'Country', 'ConvertedCompYearly', 'Currency',
       'Co

In [23]:
# create list for numerical and categorical columns 
cat_cols = list(df.select_dtypes(include=['object']).columns)
num_cols = list(df.select_dtypes(exclude=['object']).columns)

In [61]:
df[categorical_features] = df[categorical_features].applymap(lambda x: x if isinstance(x, list) else [])

In [73]:
from sklearn.preprocessing import MultiLabelBinarizer
encoded_dfs = {}
for col in categorical_features:
    binarizer = MultiLabelBinarizer()
    encoded_df = pd.DataFrame(binarizer.fit_transform(df[col]),
                               columns=binarizer.classes_,
                               index=df[col].index)
    encoded_dfs[col] = encoded_df
len(encoded_df)

57284

In [74]:
encoded_dfs['Tech_job'] = df[TECH_JOBS]

In [76]:
encoded_dfs = pd.concat(encoded_dfs, axis=1)

In [79]:
encoded_dfs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57284 entries, 2 to 73267
Columns: 185 entries, ('VersionControlSystem', 'Git') to ('Tech_job', 'Engineer, data')
dtypes: int32(185)
memory usage: 40.9 MB
