In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### This notebook is still a work in progress

## Introduction

I previously analyzed the dataset of the 2021 survey. Here is the [report](https://medium.com/@tianmin/are-you-an-explorer-a-climber-or-an-expert-in-the-data-science-world-e6c574937f30), if you want to read it. That analysis identifies three clusters of 2021 Kaggle survey participants using the **k-means clustering** method. After digging further into each cluster, we name them explorers, climbers and experts, depending on how they respond to questions regarding demographics, profession, their skills and knowledge in data science, the tools they use frequently and the tools they plan to become more familiar with in the next two years. In this notebook, I want to reproduce the methodology on the **2022** dataset and see whether it still fits.

## Motivation

This analysis tries to answer -

1. How many types of professionals are there in the data science field?
2. How does each segment of data science professionals differ in demographics, profession, data science skills and knowledge, the tools they use frequently, and the tools they plan to become more familiar with in the next two years?

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pyarrow
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# functions
def rename_columns(df):
    """
    Fold the first data row (the question text) into the column names.

    input:
    df - the dataset whose first row holds the question text of every column

    output:
    a new dataframe without that first row, whose columns are named
    '<original column name>_<first-row value>'. The row index of the remaining
    rows is kept as is, and the input dataframe is not modified.
    """
    first_row = df.iloc[0]  # question text, one entry per column
    # Use .iloc so the lookup is explicitly positional: first_row[i] is label-based
    # and its integer-position fallback is deprecated in recent pandas releases.
    new_cols = [
        original + '_' + first_row.iloc[position]
        for position, original in enumerate(df.columns)
    ]
    # Copy the slice so that renaming never writes through to the caller's frame.
    renamed = df.iloc[1:].copy()
    renamed.columns = new_cols
    return renamed

def replace_nan(df):
    """
    Encode answered / unanswered cells as a 1 / 0 indicator matrix.

    input:
    df - the target dataset

    output:
    a new dataset with the same columns where every non-null cell is 1 and
    every null cell is 0; its row index is rebuilt to run from 1 upwards
    """
    answered_flags = np.where(df.notnull(), 1, 0)
    indicator = pd.DataFrame(data=answered_flags, columns=df.columns)
    # The fresh frame starts with a default 0-based index; shift it to start at 1.
    indicator.index = indicator.index + 1
    return indicator

def split_cols(df):
    """
    Split the column names into single-answer and multiple-answer questions.

    input:
    df - target dataframe

    output:
    single_questions - list of column names whose lower-cased name contains
                       neither 'part' nor 'other'
    multiple_questions - list of column names whose lower-cased name contains
                         'part' or 'other'
    Both lists keep the column order of df.
    """
    def is_multiple(name):
        lowered = name.lower()
        return 'part' in lowered or 'other' in lowered

    multiple_questions = [name for name in df.columns if is_multiple(name)]
    single_questions = [name for name in df.columns if not is_multiple(name)]
    return single_questions, multiple_questions

def pivot_col(df, col):
    """
    Spread the values of one column into one column per distinct value.

    input:
    df - target dataset
    col - the column whose values become the new columns

    output:
    a pivoted dataframe with one column per distinct value of col; a cell holds
    the original value where the row had that value and NaN elsewhere. Rows are
    ordered by the input index and re-labelled 1, 2, 3, ...
    The input dataframe is not modified.
    """
    # Build the helper key on a copy: assigning df['participant_id'] directly would
    # add a column to the caller's frame (and warns when df is a slice).
    keyed = df.assign(participant_id=df.index)
    pivoted_df = (
        keyed.pivot(index='participant_id', columns=col, values=col)
        .reset_index()
        .iloc[:, 1:]  # drop the helper participant_id column again
    )
    pivoted_df.index = pivoted_df.index + 1
    return pivoted_df

def pivot_df(df, single_cols=None):
    """
    Pivot every single-answer column into one column per answer option.

    input:
    df - target dataframe
    single_cols - optional collection of column names to treat as single-answer
                  questions; when omitted, the notebook-level `single_questions`
                  list is used

    output:
    a dataframe built by concatenating, column-wise and in the original column
    order, the pivoted version of every single-answer column and the untouched
    version of every other column. Values are not converted here; NaN stays NaN.
    """
    if single_cols is None:
        # Fall back to the global list so existing pivot_df(df) calls keep working.
        single_cols = single_questions
    single_lookup = set(single_cols)

    pieces = []
    for col in df.columns:
        if col in single_lookup:
            pieces.append(pivot_col(df[[col]], col))
        else:
            pieces.append(df[[col]])
    return pd.concat(pieces, axis=1, ignore_index=False)

def closest_participant(participant_id, participant_matrix):
    """
    Rank the other participants by their similarity to one participant.

    input:
    participant_id - target participant (a column label of participant_matrix)
    participant_matrix - matrix holding the similarity between each pair of participants

    output:
    the row labels of participant_matrix sorted by descending similarity to the
    target, with the first (highest-ranked) entry dropped
    """
    ranked = participant_matrix[[participant_id]].sort_values(
        by=participant_id, ascending=False
    )
    ranked_ids = ranked.index
    # The top entry is skipped; presumably it is the participant itself.
    return ranked_ids[1:]

def compensation(df , participant):
    """
    input -
    df - target dataset, indexed by participant id
    participant - the id of the participant

    output -
    the yearly compensation answer of that participant (NaN when unanswered)
    """
    column = 'Q25_What is your current yearly compensation (approximate $USD)?'
    return df.loc[df.index == participant, column].iloc[0]

def similar_user_compensation(df, participant_ids):
    """
    input:
    df - target dataset, indexed by participant id
    participant_ids - a list of participant ids, in priority order

    output:
    the compensation answer of the first participant in the list that has a
    non-null answer, or None when none of them answered
    """
    for participant in participant_ids:
        value = compensation(df, participant)
        # Missing answers are NaN, not None, so an `is not None` check lets them
        # through; pd.notnull rejects both.
        if pd.notnull(value):
            return value
    return None

def same_answers(df, user_1, user_2):
    """
    Compare the answers of two participants column by column.

    input
    df - target dataset
    user_1 - index label of user 1
    user_2 - index label of user 2

    output
    same_cols - column names where the two answers compare equal
    different_cols - column names where they do not (NaN never compares equal,
                     so a question both users skipped lands here)
    """
    same_cols, different_cols = [], []
    # Rows come back in df order, regardless of the order of the two arguments.
    pair = df.loc[df.index.isin([user_1, user_2])]
    for col in pair.columns:
        first, second = pair[col].iloc[0], pair[col].iloc[1]
        bucket = same_cols if first == second else different_cols
        bucket.append(col)
    return same_cols, different_cols

def compute_correlation(df, user1, user2):
    '''
    INPUT
    user1 - int user_id
    user2 - int user_id
    df - matrix of users (rows) and their pivoted answer columns
    OUTPUT
    the dot product of the two users' answer rows (np.vdot); note this is an
    un-normalised similarity score rather than a correlation coefficient
    '''
    def answers_of(user):
        # First row whose index label equals the user id, as a plain list.
        return list(df.loc[df.index == user].iloc[0])

    return np.vdot(answers_of(user1), answers_of(user2))

def subset_data(df, col, criteria):
    """
    Keep only the rows whose value in one column equals a given value.

    input:
    df: the dataset we want to subset from
    col: column used as the filter
    criteria: value the column must equal

    output:
    a new dataset holding the matching rows of the original one
    """
    matches = df[col] == criteria
    return df.loc[matches]

def question_columns(df, query, method = 'strict'):
    """
    Select the columns whose name mentions a query string (case-insensitive).

    input:
    df - target dataset
    query - str, what to look for in the column names, e.g. 'Q7' or 'machine learning'
    method - 'strict' or 'loose' (see below); any other value selects no column

    output:
    the subset of df made of the matching columns, in their original order

    method:
    strict - the column name is split on whitespace and the query must EQUAL one of
    the resulting tokens, e.g. searching 'age' does not match a column containing
    only 'language';

    loose - the query only has to be CONTAINED in the column name, e.g. searching
    'age' also matches a column containing 'language'.
    """
    def is_match(col):
        if method == 'strict':
            return query.lower() in col.lower().split()
        if method == 'loose':
            return query.lower() in col.lower()
        return False

    selected = [col for col in df.columns if is_match(col)]
    return df[selected]

def kmeans_cluster_opt(df, init = 'k-means++', max_num_cluster = 9):
    """
    Draw an elbow chart to help choose the number of k-means clusters.

    input:
    df - the dataset we want to segment into clusters
    init - how the starting centroids are initialised
    max_num_cluster - exclusive upper bound on k; models are fitted for
                      k = 1 .. max_num_cluster - 1

    output:
    nothing is returned; a line chart of inertia against k is shown
    """
    candidate_ks = list(range(1, max_num_cluster))
    # One fit per candidate k, with a fixed seed so the curve is reproducible.
    inertias = [
        KMeans(init=init, n_clusters=k, random_state = 42).fit(df).inertia_
        for k in candidate_ks
    ]

    plt.plot(candidate_ks, inertias, '-o')
    plt.xlabel('number of clusters (k)')
    plt.ylabel('inertia')
    plt.show()

def kmeans_predict(df, init = 'k-means++', n_clusters = 4):
    """
    Fit k-means on a dataset and label every row with its cluster.

    input:
    df - dataset we want to segment into clusters
    init - how the starting centroids are initialised
    n_clusters - the number of clusters

    output:
    labels - array with the predicted cluster label of every row of df
    centers - array with the centroid of each cluster
    """
    # Fixed random_state keeps the clustering reproducible between runs.
    fitted = KMeans(init=init, n_clusters = n_clusters, random_state = 42).fit(df)
    labels = fitted.predict(df)
    centers = np.array(fitted.cluster_centers_)
    return labels, centers

def percentage_row(df):
    """
    Convert every row of counts into percentages of that row's total.

    input:
    df - target dataframe

    output - a new dataframe in which each cell is the row percentage (rounded to
    one decimal) of the corresponding cell of df; the original index is moved into
    the first column by reset_index
    """
    row_totals = df.sum(axis=1)
    shares = df.div(row_totals, axis=0)
    return (shares * 100).round(1).reset_index()

def cluster_aggr(df, cols):
    """
    Count, per cluster, how many participants picked each option of a question.

    input
    df: target dataset; must contain a 'cluster' column
    cols: positions (after grouping by cluster) of the columns of the question
          of interest

    output:
    a new dataframe with one row per cluster plus a final "Total" row, holding
    the column sums for the selected options; options nobody picked are dropped
    """
    per_cluster_counts = df.groupby(['cluster']).sum()
    selected = per_cluster_counts.iloc[:, cols]
    # Keep only the options that were chosen in at least one cluster.
    chosen_somewhere = (selected != 0).any(axis=0)
    selected = selected.loc[:, chosen_somewhere]
    selected.loc["Total"] = selected.sum()
    return selected

def plot_bar_perc(df, cols):
    """
    Draw one bar subplot per cluster showing the share of every answer option.

    input:
    df - target dataframe: first column 'cluster', remaining columns one per option;
         the row index labels are compared against the 'cluster' values
    cols - accepted for compatibility with existing calls; not used by the body

    output:
    nothing is returned; a figure with a row of three subplots (titled from the
    notebook-level `cluster_title`) is shown
    """
    fig = make_subplots(rows=1, cols=3,
                        start_cell="bottom-left",
                        shared_yaxes=True,
                        subplot_titles=(cluster_title))

    clusters = df.index.tolist()
    options = df.columns[1:]
    # single_blue is a notebook-level colour list, repeated once per option.
    colors = single_blue * len(options)

    for subplot_col, cluster_label in enumerate(clusters, start=1):
        data = df.loc[df['cluster'] == cluster_label]
        titles = list(options)
        for position, option in enumerate(options):
            # The text after the "- Selected Choice -" marker is the option label.
            option_label = option.split("- Selected Choice -")[-1].strip()
            bar = go.Bar(x=[option_label],
                         y=data[option],
                         marker_color = colors[position],
                         name = "")
            fig.add_trace(bar, row=1, col=subplot_col)

    # The text before the marker in the first option column is used as the figure title.
    fig.update_layout(
        title=titles[0].split("- Selected Choice -")[0],
        yaxis_title="% of participants",
        showlegend=False)
    #fig.write_image("visualizations/" + str(cols[1])[:8] + ".jpeg")
    fig.show()
    
def cluster_question_plot(df, question):
    """
    Tabulate and plot how each cluster answered one question.

    input:
    df - target dataset
    question - key of the notebook-level `qs_num` dict naming the question

    output:
    aggr_perc - table of row percentages per cluster (including the Total row)
    plot_chart - whatever plot_bar_perc returns for clusters 0, 1 and 2
    """
    bounds = qs_num[question]
    counts = cluster_aggr(df, range(bounds[0], bounds[1]))
    aggr_perc = percentage_row(counts)
    # Only the three real clusters are plotted; the Total row is left out.
    plot_data = aggr_perc.loc[aggr_perc['cluster'].isin([0,1,2])]
    plot_chart = plot_bar_perc(plot_data, plot_data.columns[1:])
    return aggr_perc, plot_chart

def plot_bar_rank(df, cols, num_col = 10):
    """
    Plot the most-chosen options of a question, one subplot per cluster.

    input
    df: target dataframe
    cols : key of the notebook-level `qs_num` dict naming the question to aggregate
    num_col: maximum number of options shown in the chart

    output:
    nothing is returned; a bar chart is shown where the options with the highest
    Total share are placed at the left side
    """
    start, stop = qs_num[cols][0], qs_num[cols][1]
    data_aggr = percentage_row(cluster_aggr(df, range(start, stop)))
    ranked_options = list(rank_total(data_aggr).columns)

    # 'cluster' has to stay in front for plot_bar_perc, so it is added on top of the
    # num_col options; slicing the combined list would show one option too few.
    top_cols = ['cluster'] + ranked_options[:num_col]

    aggr_cols = data_aggr.columns

    # Plot the three real clusters only; the Total row was needed just for ranking.
    data_aggr = data_aggr.loc[data_aggr['cluster'].isin([0,1,2])]

    plot_bar_perc(data_aggr[top_cols], aggr_cols)

def std_cluster(df):
    """
    Measure how spread out the values of each row are.

    input:
    df - target dataframe; its first column is ignored
    output:
    std - standard deviation of each row, computed across the remaining columns
    """
    value_columns = df.iloc[:, 1:]
    return value_columns.std(axis=1)

def rank_total(df):
    """
    Order the option columns by their value in the Total row.

    input: target dataframe whose first column holds the cluster labels,
    including one row labelled "Total"

    output: a new dataframe without the first column, with the remaining columns
    sorted by their value in the Total row so higher values sit at the left side
    """
    # Find the Total row by its label instead of assuming it sits at row label 3,
    # which is only true when there are exactly three clusters.
    label_col = df.iloc[:, 0]
    total_rows = df.index[(label_col == "Total").to_numpy()]
    # Fall back to the historical label 3 when no row is marked "Total".
    total_label = total_rows[0] if len(total_rows) > 0 else 3

    shares = df.iloc[:, 1:]  # remove cluster column
    return shares.sort_values(by = total_label, axis=1, ascending = False)

def find_correlation_rank(df,col,ascending = False):
    """
    Rank the rows of a dataframe by the value of one column.

    input:
    df - target dataframe
    col - column of interest
    ascending - sort direction; the default False puts the highest value first

    output:
    a one-column dataframe (just col) with its rows sorted by that column
    """
    only_col = df.loc[:, [col]]
    return only_col.sort_values(by = col, ascending = ascending)

def remove_col_zero(df):
    """
    Drop everything that carries no answers at all.

    input:
    df - target dataframe

    output:
    a new dataframe without the columns whose sum is 0 and also without the rows
    whose sum is 0
    """
    row_has_data = df.sum(axis=1) != 0
    col_has_data = df.sum(axis=0) != 0
    return df.loc[row_has_data, col_has_data]

In [None]:
# Color palette for the visualizations.
# shades_blue: a list of hex colours (not referenced by the plotting helpers shown in this notebook).
shades_blue = ['#90EE90','#00FF7F','#00FFFF','#89CFF0','#1434A4','#0096FF',
               '#6495ED','#1F51FF','#2F4F4F','#A7C7E7','#00008B']

# single_blue: one-element list; plot_bar_perc repeats it so every bar gets the same colour.
single_blue = ['#89CFF0']

In [None]:
# Load the 2022 survey responses and drop the first column, which the author
# identifies as "Time from Start to Finish (seconds)".
data = pd.read_csv("/kaggle/input/kaggle-survey-2022/kaggle_survey_2022_responses.csv").iloc[: , 1:]

# Quick inspection helpers, left disabled by the author:
# data.head()
# data.shape # 23,998 rows, 295 columns (author's note)

In [None]:
# Report the size of the loaded dataset.
n_rows, n_cols = data.shape
print("The dataset has " + str(n_rows) + " rows.")

print("The dataset has " + str(n_cols) + " columns.")

## Data Cleaning

The dataset is from the Kaggle Machine Learning & Data Science Survey. According to the competition host, it collected 23,998 valid answers from Kaggle users. Kaggle is a free online data science community where participants can take part in data science competitions. Its annual survey is a representative source for understanding professionals in the data science world.

In [None]:
# Fold the question text (first row) into the column names.
renamed_data = rename_columns(data)

# Group the questions into two categories:
# single_questions - single-answer questions
# multiple_questions - multiple-answer questions
single_questions, multiple_questions = split_cols(renamed_data)

# Shorten a few verbose answer values.
renamed_data = (
    renamed_data
    .replace("Prefer to self-describe", "self describe")
    .replace("United Kingdom of Great Britain and Northern Ireland", "UK")
    .replace("United States of America", "the U.S.")
)

# Prefix every single-answer value with its question, so the values follow the
# same "<question>- Selected Choice -<answer>" format as the multiple-answer columns.
for col in renamed_data.columns:
    if col not in single_questions:
        continue
    renamed_data[col] = col + "- Selected Choice -" + renamed_data[col]

# Pivot the dataset so that every answer option gets its own column.
pivoted_data = pivot_df(renamed_data)

# Turn the answers into binary data: chosen is 1, not chosen is 0.
replace_nan_data = replace_nan(pivoted_data)

# Drop the columns (and rows) whose sum is 0.
binary_data = remove_col_zero(replace_nan_data)


In [None]:
# Column position range of every question.
# Each entry is [start, end]; the helpers turn it into range(start, end), so `end`
# is exclusive. The positions index the columns of the frame produced by grouping
# binary_data by 'cluster' (see cluster_aggr).
# NOTE(review): the numbers are hard-coded for the current column layout — re-check
# them whenever the cleaning steps or the input file change.
qs_num = {
    "Age" : [0,11], #
    "Gender" : [11,16], #
    "Country" : [16,74], #
    "IsStudent" : [74,76], #  
    "LearnPlt" : [76,88], #
    "FirstLearn" : [88,95], #
    "HighEdu" : [95,102], #
    "IsPublish" : [102,104], #  
    "ResML" : [104,107], #  
    "CodeExp" : [107,114],#
    "ProgLangReg" : [114,129], #
    "IDE" : [129,143], #
    "HostNotebook" : [143, 159], #
    "VisualLib" : [159,174], #
    "MLmethd" : [174,183], #
    "MLframe" : [183,198], #
    "MLalgorithm" : [198,212], #
    "CompVis" : [212,220], #
    "NLP" : [220,226], #
    "PreTrainWgt" : [226,236], #
    "MLhub" : [236,245], #
    "Employment" : [245,260], #
    "Industry" : [260,275], #
    "SizeEmployer" : [275,280], #
    "SizeDS" : [280,287], #?
    "DSBusiness" : [287,293], #
    "WorkAct" : [293,301], #
    "Compensation" : [301,327], #
    "InvestDS" : [327, 333], #
    "CldCompPltReg" : [333,345], #
    "CldCompPltBstExp" : [345,358], #
    "CldCompProdReg" : [358,363], #
    "DataStoreProdReg" : [363,371], #
    "BigDataProdReg" : [371,387], #
    "IntegenceReg" : [387,402], #
    "ManageMLProdReg" : [402,415], #
    "AutoMLReg" : [415,423], #
    "MLModelProd" : [423,435], #
    "MLMonitor" : [435,450], #
    "AIEthic" : [450,459], #
    "HardwareReg" : [459,468], #
    "TPUtimes" : [468,473], #
    "FavMedia" : [473,485] #
}

## Method

This analysis adopts k-means to find participant clusters based on the pattern of how they respond to the survey. K-means aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean (the cluster center, or centroid), which serves as a prototype of the cluster ([link](https://en.wikipedia.org/wiki/K-means_clustering)).

In [None]:
# Elbow check: plot the k-means inertia for k = 1..8 (the helper's default
# max_num_cluster=9 is an exclusive bound) and look for the smallest k that
# already gives a small enough inertia.
kmeans_cluster_opt(binary_data)

In [None]:
# Create a new column 'cluster' that segments the participants.
# As the chart above indicates, we choose to make 3 clusters.
# Any existing 'cluster' column is dropped from the features first, so re-running
# this cell does not feed the previous labels back into k-means.
cluster_features = binary_data.drop(columns=['cluster'], errors='ignore')
binary_data['cluster'] = kmeans_predict(cluster_features , n_clusters = 3)[0]

In [None]:
# Display names of the clusters, used as subplot titles by plot_bar_perc.
# cluster 0 are explorers
# cluster 1 are climbers
# cluster 2 are experts
# NOTE(review): this mapping relies on the label numbers k-means assigned in the
# author's run — confirm it still holds after re-fitting.
cluster_title = ["explorers", "climbers", "experts"]

## Analysis
First of all, let's take a look at how many participants per each cluster.

In [None]:
# Number of participants assigned to each cluster.
binary_data.groupby(['cluster']).size()

We have 10,633 explorers, 8,723 climbers and 4,641 experts.

### Demographics

Climbers and explorers have higher shares in the student age range (18-24 years old) than expert participants. For experts, the largest age group is 25-29, ~18% of the total, which might imply they have more working experience or higher educational levels.

In [None]:
# Age distribution per cluster, as row percentages.
age_aggr = percentage_row(
    cluster_aggr(binary_data, range(qs_num["Age"][0], qs_num["Age"][1]))
)

# Age brackets in ascending order (plot_bar_perc accepts this argument but does not use it).
cols = ['18-21','22-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59','60-69','70+']

# Keep the three clusters only; the "Total" row is not plotted.
age_aggr = age_aggr.loc[age_aggr['cluster'].isin([0,1,2])]

plot_bar_perc(age_aggr, cols)

Regardless of the cluster, male practitioners are dominant in the industry. The percentage gap between men and the other categories grows even wider in the experts group.

In [None]:
# Per-cluster share of each "Gender" option, ordered by overall (Total) share; num_col=10.
plot_bar_rank(binary_data, "Gender",10)

India has the highest share of participants in each cluster, followed by the United States. The gap between India and the U.S. is smallest in the experts cluster.

In [None]:
# Per-cluster share of each "Country" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "Country",15)

No experts are students, while most of the climbers are. Among explorers, students have a slightly higher share than non-students.

In [None]:
# Per-cluster share of each "IsStudent" option, ordered by overall (Total) share; num_col=10.
plot_bar_rank(binary_data, "IsStudent",10)

Experts have a higher share of doctoral and master's degrees than the other two clusters.

In [None]:
# Highest education per cluster, with the options arranged from lowest to highest
# degree instead of the column order found in the data.

edu_aggr = cluster_aggr(binary_data, range(qs_num["HighEdu"][0], qs_num["HighEdu"][1]))
edu_aggr = percentage_row(edu_aggr)

cols = edu_aggr.columns.tolist()

# Desired left-to-right order; each label is matched against the full column names below.
cols_order = [
 'cluster',
 'No formal education past high school',
 'Some college/university study without earning a bachelor’s degree',
 'Bachelor’s degree',
 'Master’s degree',
 'Professional doctorate',
 'Doctoral degree',
 'I prefer not to answer'
 ]

# Substring match (case-sensitive): a column is placed at the position of the label it contains.
# NOTE(review): this assumes every label matches exactly one column — confirm if labels change.
cols_final = []
for col_1 in cols_order:
    col_1_index = cols_order.index(col_1)
    for col_2 in cols:
        if col_1 in col_2:
            cols_final.insert(col_1_index, col_2)

edu_aggr = edu_aggr[cols_final]

# Keep the three clusters only; the "Total" row is not plotted.
edu_aggr = edu_aggr.loc[edu_aggr['cluster'].isin([0,1,2])]

plot_bar_perc(edu_aggr, cols)

Without a doubt, experts have the highest share of participants who have published a research paper.

In [None]:
# Per-cluster share of each "IsPublish" option, ordered by overall (Total) share; num_col=10.
plot_bar_rank(binary_data, "IsPublish",10)

Explorers have the largest share reporting their published research paper has nothing to do with machine learning.
The choice proportion of climbers and experts are relatively close in this question.

In [None]:
# Use of machine learning in published research, per cluster (row percentages).
res_paper = cluster_aggr(binary_data, range(qs_num["ResML"][0], qs_num["ResML"][1]))
res_paper = percentage_row(res_paper)

cols = res_paper.columns.tolist()

# Desired left-to-right order; each label is matched against the full column names below.
cols_order = [
 'cluster',
 'No',
 'Yes, the research made use of machine learning as a tool (applied research)',
 'Yes, the research made advances related to some novel machine learning method (theoretical research)'
 ]

# Substring match (case-sensitive): a column is placed at the position of the label it contains.
# NOTE(review): the short label 'No' matches any column name containing "No" — confirm it hits only one column.
cols_final = []
for col_1 in cols_order:
    col_1_index = cols_order.index(col_1)
    for col_2 in cols:
        if col_1 in col_2:
            cols_final.insert(col_1_index, col_2)

res_paper = res_paper[cols_final]

# Keep the three clusters only; the "Total" row is not plotted.
res_paper = res_paper.loc[res_paper['cluster'].isin([0,1,2])]

plot_bar_perc(res_paper, cols)

It's interesting to see university courses are never in top 3 for any cluster as their first helpful learning source into data science. Experts and explorers have online courses as the top 1 while climbers see Kaggle as the top priority.

### Skillset and knowledge

Nearly all climbers and experts have at least some coding experience, while ~20% of explorers have never coded before. Experts have a larger share of veterans than climbers.

In [None]:
# Coding experience per cluster (row percentages), ordered from least to most experience.
codeExp_aggr = cluster_aggr(binary_data, range(qs_num["CodeExp"][0], qs_num["CodeExp"][1]))
codeExp_aggr = percentage_row(codeExp_aggr)

cols = codeExp_aggr.columns.tolist()

# Desired left-to-right order; each label is matched against the full column names below.
cols_order = ['cluster',
 'I have never written code',
 '< 1 years',
 '1-3 years',
 '3-5 years',
 '5-10 years',
 '10-20 years',
 '20+ years']

# Substring match: a column is placed at the position of the label it contains.
# NOTE(review): this assumes every label matches exactly one column — confirm if labels change.
cols_final = []
for col_1 in cols_order:
    col_1_index = cols_order.index(col_1)
    for col_2 in cols:
        if col_1 in col_2:
            cols_final.insert(col_1_index, col_2)
            
codeExp_aggr = codeExp_aggr[cols_final]

# Keep the three clusters only; the "Total" row is not plotted.
codeExp_aggr = codeExp_aggr.loc[codeExp_aggr['cluster'].isin([0,1,2])]

plot_bar_perc(codeExp_aggr, cols)

The programming languages that experts report with a higher share than any other cluster are SQL, R, JavaScript and Bash. For SQL and R, the reason could be that they are the most frequently used tools for data manipulation and statistics, so experts have more experience with them. For JavaScript, the share is very close among the three clusters and I doubt there is a significant difference. Bash is one of the most common languages when you want to take something into production.

In [None]:
# Per-cluster share of each "ProgLangReg" option, ordered by overall (Total) share; num_col=10.
plot_bar_rank(binary_data, "ProgLangReg",10)

Explorers have the highest share reporting that they use no visualization library, probably because they also have the highest percentage of participants with no coding experience. Compared with climbers, experts have a more evenly distributed usage of visualization libraries.

In [None]:
# Per-cluster share of each "VisualLib" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "VisualLib",15)

In [None]:
# Years of machine learning experience per cluster (row percentages), shortest first.
MLmethd_aggr = cluster_aggr(binary_data, range(qs_num["MLmethd"][0], qs_num["MLmethd"][1]))
MLmethd_aggr = percentage_row(MLmethd_aggr)

cols = MLmethd_aggr.columns.tolist()
# Desired left-to-right order; each label is matched against the full column names below.
cols_order = ['cluster',
 'I do not use machine learning methods',
 'Under 1 year',
 '1-2 years',
 '2-3 years',
 '3-4 years',
 '4-5 years',
 '5-10 years',
 '10-20 years',
 '20 or more years']

# Substring match: a column is placed at the position of the label it contains.
# NOTE(review): this assumes every label matches exactly one column — confirm if labels change.
cols_final = []
for col_1 in cols_order:
    col_1_index = cols_order.index(col_1)
    for col_2 in cols:
        if col_1 in col_2:
            cols_final.insert(col_1_index, col_2)

MLmethd_aggr = MLmethd_aggr[cols_final]

# Keep the three clusters only; the "Total" row is not plotted.
MLmethd_aggr = MLmethd_aggr.loc[MLmethd_aggr['cluster'].isin([0,1,2])]

plot_bar_perc(MLmethd_aggr, cols)

Experts tend to use less popular Machine Learning frameworks, compared with climbers.

In [None]:
# Per-cluster share of each "MLframe" option, ordered by overall (Total) share; num_col=18.
plot_bar_rank(binary_data, "MLframe",18)

In [None]:
# Per-cluster share of each "MLalgorithm" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "MLalgorithm",15)

In [None]:
# Per-cluster share of each "CompVis" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "CompVis",15)

In [None]:
# Per-cluster share of each "NLP" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "NLP",15)

### Profession

Climbers have the highest share of participants who are not currently employed, because they also have the highest share of students. For experts, the largest share goes to data scientists, and they also have higher shares in two closely related roles, machine learning engineer and research scientist. Experts also have a higher share of teachers/professors, which is probably why they have the highest share with publishing experience.

In [None]:
# Per-cluster share of each "Employment" option, ordered by overall (Total) share; num_col=10.
plot_bar_rank(binary_data, "Employment",10)

In [None]:
# plot_bar_rank(binary_data, "WorkAct",10)
# TODO: need to clean the x axis

For experts, the highest share goes to Computers and Tech, which is also the top industry for the other two clusters.

Other than that, Accounting and Finance and Medical are two more industries where experts are more likely to work.

In [None]:
# Per-cluster share of each "Industry" option, ordered by overall (Total) share; num_col=25.
plot_bar_rank(binary_data, "Industry",25)

In [None]:
# Employer size per cluster (row percentages), ordered from smallest to largest company.
size_emp_aggr = cluster_aggr(binary_data, range(qs_num["SizeEmployer"][0], qs_num["SizeEmployer"][1]))
size_emp_aggr = percentage_row(size_emp_aggr)

cols = size_emp_aggr.columns.tolist()


# Desired left-to-right order; each label is matched against the full column names below.
cols_order = ['cluster',
 '0-49 employees',
 '50-249 employees',
 '250-999 employees',
 '1000-9,999 employees',
 '10,000 or more employees']


# Substring match: a column is placed at the position of the label it contains.
# NOTE(review): this assumes every label matches exactly one column — confirm if labels change.
cols_final = []
for col_1 in cols_order:
    col_1_index = cols_order.index(col_1)
    for col_2 in cols:
        if col_1 in col_2:
            cols_final.insert(col_1_index, col_2)

size_emp_aggr = size_emp_aggr[cols_final]

# Keep the three clusters only; the "Total" row is not plotted.
size_emp_aggr = size_emp_aggr.loc[size_emp_aggr['cluster'].isin([0,1,2])]

plot_bar_perc(size_emp_aggr, cols)

Experts are more likely to work for employers that have put machine learning into production.

In [None]:
# Maturity of machine learning at the employer, per cluster (row percentages).
ds_business_aggr = cluster_aggr(binary_data, range(qs_num["DSBusiness"][0], qs_num["DSBusiness"][1]))
ds_business_aggr = percentage_row(ds_business_aggr)

cols = ds_business_aggr.columns.tolist()

# Desired left-to-right order; each label is matched against the full column names below.
cols_order = ['cluster',
 'No (we do not use ML methods)',
 'We use ML methods for generating insights (but do not put working models into production)',
 'We are exploring ML methods (and may one day put a model into production)',
 'We recently started using ML methods (i.e., models in production for less than 2 years)',
 'We have well established ML methods (i.e., models in production for more than 2 years)',
 'I do not know']

# Substring match: a column is placed at the position of the label it contains.
# NOTE(review): this assumes every label matches exactly one column — confirm if labels change.
cols_final = []
for col_1 in cols_order:
    col_1_index = cols_order.index(col_1)
    for col_2 in cols:
        if col_1 in col_2:
            cols_final.insert(col_1_index, col_2)

ds_business_aggr = ds_business_aggr[cols_final]

# Keep the three clusters only; the "Total" row is not plotted.
ds_business_aggr = ds_business_aggr.loc[ds_business_aggr['cluster'].isin([0,1,2])]

plot_bar_perc(ds_business_aggr, cols)

Experts earn more.

In [None]:
# Yearly compensation per cluster (row percentages), ordered from lowest to highest bracket.
compensation_aggr = cluster_aggr(binary_data, range(qs_num["Compensation"][0], qs_num["Compensation"][1]))
compensation_aggr = percentage_row(compensation_aggr)

cols = compensation_aggr.columns.tolist()
# Desired left-to-right order; each label is matched against the full column names below.
cols_order = ['cluster',
 '$0-999',
 '1,000-1,999',
 '2,000-2,999',
 '3,000-3,999',
 '4,000-4,999',
 '5,000-7,499',
 '7,500-9,999',
 '10,000-14,999',
 '15,000-19,999',
 '20,000-24,999',
 '25,000-29,999',
 '30,000-39,999',
 '40,000-49,999',
 '50,000-59,999',
 '60,000-69,999',
 '70,000-79,999',
 '80,000-89,999',
 '90,000-99,999',
 '100,000-124,999',
 '125,000-149,999',
 '150,000-199,999',
 '200,000-249,999',
 '250,000-299,999',
 '300,000-499,999',
 '$500,000-999,999',
 '>$1,000,000']

# Substring match: a column is placed at the position of the label it contains.
# NOTE(review): this assumes every label matches exactly one column — confirm if labels change.
cols_final = []
for col_1 in cols_order:
    col_1_index = cols_order.index(col_1)
    for col_2 in cols:
        if col_1 in col_2:
            cols_final.insert(col_1_index, col_2)

compensation_aggr = compensation_aggr[cols_final]

# Keep the three clusters only; the "Total" row is not plotted.
compensation_aggr = compensation_aggr.loc[compensation_aggr['cluster'].isin([0,1,2])]

plot_bar_perc(compensation_aggr, cols)

### Tools

Experts have a high share reporting that they use JupyterLab (TODO: clarify how this option differs from Jupyter Notebook) and Vim / Emacs. This is related to the programming languages they use regularly.

In [None]:
# Per-cluster share of each "IDE" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "IDE",15)

In [None]:
# Per-cluster share of each "HostNotebook" option, ordered by overall (Total) share; num_col=20.
plot_bar_rank(binary_data, "HostNotebook",20)

In [None]:
# Answers to the "InvestDS" question per cluster (row percentages), ordered from lowest to highest amount.
invest_ds_aggr = cluster_aggr(binary_data, range(qs_num["InvestDS"][0], qs_num["InvestDS"][1]))
invest_ds_aggr = percentage_row(invest_ds_aggr)

cols = invest_ds_aggr.columns.tolist()
# Desired left-to-right order; each label is matched against the full column names below.
cols_order = ['cluster',
 '$0 ($USD)',
 '$1-$99',
 '$100-$999',
 '$1000-$9,999',
 '$10,000-$99,999',
 '$100,000 or more ($USD)']

# Substring match: a column is placed at the position of the label it contains.
# NOTE(review): this assumes every label matches exactly one column — confirm if labels change.
cols_final = []
for col_1 in cols_order:
    col_1_index = cols_order.index(col_1)
    for col_2 in cols:
        if col_1 in col_2:
            cols_final.insert(col_1_index, col_2)
            
invest_ds_aggr = invest_ds_aggr[cols_final]

# Keep the three clusters only; the "Total" row is not plotted.
invest_ds_aggr = invest_ds_aggr.loc[invest_ds_aggr['cluster'].isin([0,1,2])]

plot_bar_perc(invest_ds_aggr, cols)

In [None]:
# Per-cluster share of each "HardwareReg" option, ordered by overall (Total) share; num_col=10.
plot_bar_rank(binary_data, "HardwareReg",10)

In [None]:
# Answers to the "TPUtimes" question per cluster (row percentages), ordered from never to most often.
TPUtimes_aggr = cluster_aggr(binary_data, range(qs_num["TPUtimes"][0], qs_num["TPUtimes"][1]))
TPUtimes_aggr = percentage_row(TPUtimes_aggr)

cols = TPUtimes_aggr.columns.tolist()
# Desired left-to-right order; each label is matched against the full column names below.
cols_order = ['cluster',
        'Never',
        'Once',
        '2-5 times',
        '6-25 times',
        'More than 25 times']

# Substring match: a column is placed at the position of the label it contains.
# NOTE(review): this assumes every label matches exactly one column — confirm if labels change.
cols_final = []
for col_1 in cols_order:
    col_1_index = cols_order.index(col_1)
    for col_2 in cols:
        if col_1 in col_2:
            cols_final.insert(col_1_index, col_2)

TPUtimes_aggr = TPUtimes_aggr[cols_final]

# Keep the three clusters only; the "Total" row is not plotted.
TPUtimes_aggr = TPUtimes_aggr.loc[TPUtimes_aggr['cluster'].isin([0,1,2])]

plot_bar_perc(TPUtimes_aggr, cols)

In [None]:
# Per-cluster share of each "PreTrainWgt" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "PreTrainWgt",15)

In [None]:
# Per-cluster share of each "MLhub" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "MLhub",15)

In [None]:
# Per-cluster share of each "CldCompPltReg" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "CldCompPltReg",15)

In [None]:
# Per-cluster share of each "CldCompPltBstExp" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "CldCompPltBstExp",15)

In [None]:
# Per-cluster share of each "CldCompProdReg" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "CldCompProdReg",15)

In [None]:
# Per-cluster share of each "DataStoreProdReg" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "DataStoreProdReg",15)

In [None]:
# Per-cluster share of each "BigDataProdReg" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "BigDataProdReg",15)

In [None]:
# Per-cluster share of each "IntegenceReg" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "IntegenceReg",15)

In [None]:
# Per-cluster share of each "ManageMLProdReg" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "ManageMLProdReg",15)

In [None]:
# Per-cluster share of each "AutoMLReg" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "AutoMLReg",15)

In [None]:
# Per-cluster share of each "MLModelProd" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "MLModelProd",15)

In [None]:
# Per-cluster share of each "MLMonitor" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "MLMonitor",15)

In [None]:
# Per-cluster share of each "AIEthic" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "AIEthic",15)

### Learning and development

In [None]:
# Per-cluster share of each "FirstLearn" option, ordered by overall (Total) share; num_col=10.
plot_bar_rank(binary_data, "FirstLearn",10)

Coursera is the most popular platform in all three clusters. Explorers have the highest share answering None, ~10%, which is almost 5 times the share among climbers and experts. Climbers prefer university courses as their second option, since they have the highest share of students; experts, on the other hand, also take online courses on Udemy.

In [None]:
# Per-cluster share of each "LearnPlt" option, ordered by overall (Total) share; num_col=10.
plot_bar_rank(binary_data, "LearnPlt",10)

Explorers prefer watching YouTube videos to teach themselves, while climbers and experts love spending time on Kaggle. Compared with climbers, experts are more likely to read blogs or published journals.

In [None]:
# Per-cluster share of each "FavMedia" option, ordered by overall (Total) share; num_col=15.
plot_bar_rank(binary_data, "FavMedia",15)

I will try to analyze the segments based on these results in the upcoming days. Stay tuned!

Good luck, everyone!