In [1]:
!pip install numpy pandas plotly colorlover nb_black sklearn

Looking in indexes: https://pypi.org/simple, https://alexandr.onbysh%40ring.com:****@artifactory.svc.ring.com/api/pypi/pypi-local/simple
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


# Import libs and load data

In [2]:
%load_ext nb_black

import numpy as np
import pandas as pd
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
import colorlover as cl
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import plotly.figure_factory as ff


init_notebook_mode(connected=True)
multiple_choice = pd.read_csv("data/multipleChoiceResponses.csv")


Columns (0,2,8,10,21,23,24,25,26,27,28,44,56,64,83,85,87,107,109,123,125,150,157,172,174,194,210,218,219,223,246,249,262,264,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,304,306,325,326,329,341,368,371,384,385,389,390,391,393,394) have mixed types. Specify dtype option on import or set low_memory=False.



<IPython.core.display.Javascript object>

# Prepearing data

In [3]:
majors_dict = {
    "Computer science (software engineering, etc.)": "Computer Sci & Eng",
    "Engineering (non-computer focused)": "Engineering",
    "Mathematics or statistics": "Mathematics",
    "A business discipline (accounting, economics, finance, etc.)": "Business",
    "Physics or astronomy": "Physics",
    "Information technology, networking, or system administration": "Info Tech",
    "Medical or life sciences (biology, chemistry, medicine, etc.)": "Life Sciences",
    "Social sciences (anthropology, psychology, sociology, etc.)": "Social Sciences",
    "Humanities (history, literature, philosophy, etc.)": "Humanities",
    "Environmental science or geology": "Earth Sciences",
    "Fine arts or performing arts": "Arts",
    "I never declared a major": "Undeclared",
}

degree_dict = {
    "Some college/university study without earning a bachelor’s degree": "Some college/university",
    "No formal education past high school": "High school",
}
industry_dict = {"I am a student": "Student"}
education = (
    multiple_choice[
        (multiple_choice.Q4.notnull())
        & (multiple_choice.Q4 != "I prefer not to answer")
    ]
    .rename(
        columns={
            "Q2": "age",
            "Q3": "country",
            "Q4": "degree",
            "Q5": "major",
            "Q6": "job",
            "Q7": "industry",
            "Q8": "experience",
            "Q9": "salary",
            "Q12_MULTIPLE_CHOICE": "software",
            "Q17": "language",
        }
    )
    .replace({"major": majors_dict, "degree": degree_dict, "industry": industry_dict})
    .loc[
        :,
        [
            "age",
            "country",
            "degree",
            "major",
            "job",
            "industry",
            "experience",
            "salary",
            "software",
            "language",
        ],
    ]
)[1:]

majors = education.major.value_counts()
degrees = education.degree.value_counts()
age = education.age.value_counts()
jobs = education.job.value_counts()
industries = education.industry.value_counts()
experience = education.experience.value_counts()
softwares = education.software.value_counts()
languages = education.language.value_counts()
salaries = education.salary.value_counts()

<IPython.core.display.Javascript object>

# Analyse degrees

In [4]:
degrees_layout = go.Layout(title="Degrees", font=dict(size=15))
data = [go.Pie(labels=degrees.index, values=degrees.values)]
figure = go.Figure(data=data, layout=degrees_layout)
iplot(figure)

<IPython.core.display.Javascript object>

In [5]:
jobs_layout = go.Layout(title="Jobs", font=dict(size=15))
data = [go.Pie(labels=jobs.index, values=jobs.values)]
figure = go.Figure(data=data, layout=jobs_layout)
iplot(figure)

<IPython.core.display.Javascript object>

In [6]:
industries_layout = go.Layout(title="Industries", font=dict(size=15))
data = [go.Pie(labels=industries.index, values=industries.values)]
figure = go.Figure(data=data, layout=industries_layout)
iplot(figure)

<IPython.core.display.Javascript object>

In [7]:
lang = languages / languages.sum() * 100

languages_layout = go.Layout(
    title="Languages",
    font=dict(size=15),
    xaxis=dict(tickangle=45),
    yaxis=dict(title="Percent of Total"),
    margin=dict(b=160),
)

data = [
    go.Bar(
        x=lang.index,
        y=lang.values,
        marker=dict(
            color=lang.values, colorscale="Jet", showscale=True, reversescale=True
        ),
    )
]

figure = go.Figure(data=data, layout=languages_layout)

iplot(figure)

<IPython.core.display.Javascript object>

In [8]:
yrs_dict = {
    "0-1": "0-2",
    "1-2": "0-2",
    "2-3": "2-5",
    "3-4": "2-5",
    "4-5": "2-5",
    "5-10": "5+",
    "10-15": "5+",
    "15-20": "5+",
    "20-25": "5+",
    "25-30": "5+",
    "30 +": "5+",
}
deg_dict = {
    "High school": "No College Degree",
    "Some college/university": "No College Degree",
    "Professional degree": "Undergraduate Degree",
    "Bachelor’s degree": "Undergraduate Degree",
    "Master’s degree": "Graduate Degree",
    "Doctoral degree": "Graduate Degree",
}
sal_dict = {
    "0-10,000": "0-20,000",
    "10-20,000": "0-20,000",
    "20-30,000": "20-40,000",
    "30-40,000": "20-40,000",
    "40-50,000": "40-60,000",
    "50-60,000": "40-60,000",
    "60-70,000": "60-80,000",
    "70-80,000": "60-80,000",
    "80-90,000": "80-100,000",
    "90-100,000": "80-100,000",
    "200-250,000": "200,000+",
    "250-300,000": "200,000+",
    "300-400,000": "200,000+",
    "400-500,000": "200,000+",
    "500,000+": "200,000+",
}

reported_salaries = (
    education[
        (
            education.salary
            != "I do not wish to disclose my approximate yearly compensation"
        )
        & (education.salary.notnull())
        & (education.job != "Student")
        & (education.job != "Not employed")
        & (education.job.notnull())
        & (education.industry != "Student")
        & (education.country == "United States of America")
    ]
    .loc[:, ["degree", "experience", "salary", "job", "industry"]]
    .replace({"experience": yrs_dict, "degree": deg_dict})
)

reported_employment = education[
    (education.job != "Student")
    & (education.job.notnull())
    & (education.industry != "Student")
    & (education.country == "United States of America")
].loc[:, ["degree", "experience", "salary", "job", "industry"]]


reported = pd.Series([reported_salaries.salary.value_counts().sum()]).rename(
    index={0: "Used"}
)
all_salaries_sum = education.salary.fillna("nada").value_counts().sum()
unreported = pd.Series(
    [all_salaries_sum - reported_salaries.salary.value_counts().sum()]
).rename(index={0: "Unused"})

rep = pd.concat([reported, unreported])

sal_idx = [
    "0-10,000",
    "10-20,000",
    "20-30,000",
    "30-40,000",
    "40-50,000",
    "50-60,000",
    "60-70,000",
    "70-80,000",
    "80-90,000",
    "90-100,000",
    "100-125,000",
    "125-150,000",
    "150-200,000",
    "200-250,000",
    "250-300,000",
    "300-400,000",
    "400-500,000",
    "500,000+",
]

yrs_idx = ["0-2", "2-5", "5+"]
deg_idx = ["No College Degree", "Undergraduate Degree", "Graduate Degree"]
color_idx = ["gray", "goldenrod", "firebrick"]

grouped_salaries = reported_salaries.replace({"salary": sal_dict})

group_jobs = grouped_salaries.job.value_counts()
group_inds = grouped_salaries.industry.value_counts()

group_idx = [
    "0-20,000",
    "20-40,000",
    "40-60,000",
    "60-80,000",
    "80-100,000",
    "100-125,000",
    "125-150,000",
    "150-200,000",
    "200,000+",
]

<IPython.core.display.Javascript object>

In [9]:
color_idx = cl.scales["9"]["div"]["Spectral"]
group_idx = [
    "0-20,000",
    "20-40,000",
    "40-60,000",
    "60-80,000",
    "80-100,000",
    "100-125,000",
    "125-150,000",
    "150-200,000",
    "200,000+",
]

count = 0
data = []

for i in group_idx:
    sals = (
        grouped_salaries[grouped_salaries.salary == i]
        .job.value_counts()
        .reindex(group_jobs.index)
    )
    norm = sals / group_jobs.reindex(group_jobs.index) * 100
    trace = go.Bar(
        x=norm.index, y=norm.values, name=i, marker=dict(color=color_idx[count])
    )
    count = count + 1
    data.append(trace)

layout = go.Layout(
    title="Yeary Income [$] Distribution by Job",
    font=dict(size=15),
    xaxis=dict(tickangle=45),
    yaxis=dict(title="Percent of Year Range"),
    barmode="stack",
    margin=dict(b=160),
)
figure = go.Figure(data=data, layout=layout)

iplot(figure)

<IPython.core.display.Javascript object>

# Analyse ML frameworks

In [10]:
question_id = "Q19"
short_question_description = "ML Frameworks"

question_aggregate_col = question_id + "_Total"

full_col_lst = list(multiple_choice.columns)

short_col_lst = [q for q in full_col_lst if (question_id in q) and ("OTHER" not in q)]
short_col_lst_minus_none = short_col_lst[:-2] + short_col_lst[-1:]
short_col_lst_minus_free = short_col_lst[:-1]

df_question = multiple_choice[short_col_lst_minus_none]

df_plus = multiple_choice.copy()
df_plus[question_aggregate_col] = (
    ((df_plus[short_col_lst_minus_none].isnull() == False) * 1).sum(axis=1).astype(int)
)


answered_q = "Answered " + question_id + " Question"
df_plus[answered_q] = (
    ((df_plus[short_col_lst].isnull() == False) * 1).sum(axis=1).astype(int)
)
df_plus[answered_q] = df_plus[answered_q] > 0

df_plus_answered = df_plus[df_plus[answered_q] == True]

<IPython.core.display.Javascript object>

# Popular ML frameworks

In [11]:
d = multiple_choice[short_col_lst]

column_totals = ((d[short_col_lst].isnull() == False) * 1).sum(axis=0)
all_choice_options = [d[~d[col].isnull()][col].max() for col in short_col_lst]
all_choice_options = [
    x.split(
        "What machine learning frameworks have you used in the past 5 years? (Select all that apply) - Selected Choice - "
    )[-1]
    for x in all_choice_options
]
frameworks_layout = go.Layout(
    title="Most popular ML frameworks",
    font=dict(size=15),
    xaxis=dict(tickangle=45),
    yaxis=dict(title="Number of responses"),
)

data = [go.Bar(x=all_choice_options, y=column_totals)]
figure = go.Figure(data=data, layout=frameworks_layout)
iplot(figure)

<IPython.core.display.Javascript object>

# Number of ML frameworks by years of experience

In [12]:
profession = [
    "Student",
    "Data Scientist",
    "Software Engineer",
    "Data Analyst",
    "Research Scientist",
    "Business Analyst",
    "Data Engineer",
    "Research Assistant",
]

years_of_experience = [
    "0-1",
    "1-2",
    "2-3",
    "3-4",
    "4-5",
]

x_col = "Q8"  # Years of experience
y_col = question_aggregate_col
category = "Q6"  # Profession

d = df_plus_answered[
    (df_plus_answered[x_col].isin(years_of_experience[:5]))
    & (df_plus_answered[category].isin(profession))
]


x_label = "Years of Experience"
y_label = "# of " + short_question_description
colors = [
    "rgba(93, 164, 214, 0.5)",
    "rgba(255, 144, 14, 0.5)",
    "rgba(44, 160, 101, 0.5)",
    "rgba(255, 65, 54, 0.5)",
    "rgba(207, 114, 255, 0.5)",
    "rgba(127, 96, 0, 0.5)",
]

fig = go.Figure()

for xd, cls in zip(years_of_experience, colors):
    yd = d[d[x_col] == xd][y_col]
    fig.add_trace(
        go.Box(
            y=yd,
            name=xd,
            jitter=0.5,
            whiskerwidth=0.2,
            fillcolor=cls,
            marker_size=2,
            line_width=1,
        )
    )

fig.update_layout(
    title="Number of ML frameworks by years of experience",
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        dtick=5,
        gridcolor="rgb(255, 255, 255)",
        gridwidth=1,
        zerolinecolor="rgb(255, 255, 255)",
        zerolinewidth=2,
        title="Number of frameworks",
    ),
    xaxis=dict(title="Years of experience"),
    margin=dict(l=40, r=30, b=80, t=100,),
    paper_bgcolor="rgb(243, 243, 243)",
    plot_bgcolor="rgb(243, 243, 243)",
)

fig.show()

<IPython.core.display.Javascript object>

# Job description analysis

In [13]:
# data from https://github.com/chuachinhon/ga_project4_cch/blob/master/data/jobs_clean.csv
df = pd.read_csv("data/jobs_clean.csv")
df.head()

Unnamed: 0,Company,Title,Responsibilities,Requirements,Seniority,Employment_Type,Category,Address,Salary_Min,Salary_Max,Salary_Average
0,Hitachi Consulting Singapore,Data Scientist,Problem solver with curious mindset with a hi...,Sound knowledge of machine learning concepts ...,Senior Executive,Permanent,Information Technology,Plaza 8 Cbp 1 Changi Business Park Crescent 4...,6000.0,9000.0,7500.0
1,Traveloka Services,Data Science Lead,As a Data scientist you play a key role to sol...,We are looking for someone with Passion in bi...,Manager,Permanent,Information Technology,Undisclosed,10000.0,15000.0,12500.0
2,Traveloka Services,Senior Data Scientist,Job Description As a Data scientist you will b...,Solid programming skills and understanding o...,Executive,Permanent,Information Technology,Undisclosed,8300.0,15000.0,11650.0
3,Traveloka Services,Mid - Senior Level Data Scientist,As a Data scientist you will be applying machi...,Solid programming skills and understanding o...,Executive,Permanent,Information Technology,Undisclosed,6100.0,10700.0,8400.0
4,Traveloka Services,Mid Level Data Scientist,As a Data scientist you will be applying machi...,Solid programming skills and understanding o...,Executive,Permanent,Information Technology,Undisclosed,4300.0,7600.0,5950.0


<IPython.core.display.Javascript object>

In [14]:
cv = CountVectorizer()
word_count_vector = cv.fit_transform(df["Responsibilities"].tolist())

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)
df_idf = pd.DataFrame(
    tfidf_transformer.idf_, index=cv.get_feature_names(), columns=["idf_weights"]
).sort_values(by="idf_weights", ascending=False)

<IPython.core.display.Javascript object>

In [15]:
vect = CountVectorizer(stop_words="english", min_df=10, ngram_range=(1, 2))
vect.fit(df["Responsibilities"])

responsibility = vect.get_feature_names()
resp = pd.DataFrame(
    vect.transform(df["Responsibilities"]).todense(), columns=responsibility
)

<IPython.core.display.Javascript object>

In [16]:
responsibility_count = resp.sum(axis=0)
ordered = responsibility_count.sort_values(ascending=False).head(30)

layout = go.Layout(
    title="Most common responsibilities from ML job postings",
    font=dict(size=15),
    yaxis=dict(title="number of occurrences"),
    xaxis=dict(title="most common word"),
)

data = [go.Bar(y=ordered.values, x=ordered.index)]
figure = go.Figure(data=data, layout=layout)
iplot(figure)

<IPython.core.display.Javascript object>

In [17]:
cvec = CountVectorizer(stop_words="english", min_df=10, ngram_range=(1, 2))
cvec.fit(df["Requirements"])

requirements = cvec.get_feature_names()
req = pd.DataFrame(cvec.transform(df["Requirements"]).todense(), columns=requirements)
requirement_count = req.sum(axis=0)
ordered = requirement_count.sort_values(ascending=False).head(50)

<IPython.core.display.Javascript object>

In [18]:
layout = go.Layout(
    title="Most common requirements from job postings",
    font=dict(size=15),
    yaxis=dict(title="number of occurrences"),
    xaxis=dict(title="most common requirement"),
)

data = [go.Bar(y=ordered.values, x=ordered.index)]
figure = go.Figure(data=data, layout=layout)
iplot(figure)

<IPython.core.display.Javascript object>

# Coursera Skills

all data scraped using *Open Web Scraper*

In [19]:
flatten = lambda l: [item for sublist in l for item in sublist]

<IPython.core.display.Javascript object>

In [20]:
df = pd.read_csv("data/coursera.csv")[["skills", "title"]].dropna()
raw_skills = df["skills"].dropna().tolist()

skills = []
for raw_skill in raw_skills:
    t = []
    for item in eval(raw_skill):
        if item["skills"] != "" and "Reviews" not in item["skills"]:
            t.append(item["skills"])
    skills.append(t)

top_skills = Counter(flatten(skills)).most_common(20)

<IPython.core.display.Javascript object>

In [21]:
layout = go.Layout(
    title="Most common skills",
    font=dict(size=15),
    yaxis=dict(title="number of occurrences"),
)

data = [go.Bar(x=[i[0] for i in top_skills], y=[i[1] for i in top_skills])]
figure = go.Figure(data=data, layout=layout)
iplot(figure)

<IPython.core.display.Javascript object>

In [22]:
from collections import OrderedDict


def cooccurrence(data):
    names = [x for x in set(flatten(data))]

    occurrences = OrderedDict(
        (name, OrderedDict((name, 0) for name in names)) for name in names
    )

    # Find the co-occurrences:
    for l in data:
        for i in range(len(l)):
            for item in l[:i] + l[i + 1 :]:
                occurrences[l[i]][item] += 1
    names = list(occurrences.keys())

    matrix = []
    for row in occurrences.values():
        matrix.append(list(row.values()))

    return names, matrix

<IPython.core.display.Javascript object>

In [23]:
names, matrix = cooccurrence(skills)
indexies = []
for i, row in enumerate(matrix):
    if sum(row) > 20:
        indexies.append(i)


reduced_m = np.asarray(matrix)[indexies, ...][..., indexies]
np.fill_diagonal(reduced_m, 20)
reduced_n = [n for i, n in enumerate(names) if i in indexies]
reduced_m.shape

(15, 15)

<IPython.core.display.Javascript object>

In [24]:
layout = go.Layout(title="Skills correlation", font=dict(size=15))

data = [go.Heatmap(z=reduced_m, x=reduced_n, y=reduced_n)]
figure = go.Figure(data=data, layout=layout)
iplot(figure)

<IPython.core.display.Javascript object>

In [25]:
layout = go.Layout(title="Clustered ML skills", font=dict(size=15))
fig = ff.create_dendrogram(reduced_m, labels=reduced_n, orientation="left")
fig.update_layout(layout, height=800, width=1000)
iplot(fig)

<IPython.core.display.Javascript object>

## Intro to DS

In [26]:
df = pd.read_csv("data/intro.csv")[["skills", "title"]].dropna()
raw_skills = df["skills"].dropna().tolist()

skills = []
for raw_skill in raw_skills:
    t = []
    for item in eval(raw_skill):
        if (
            item["skills"] != ""
            and "Reviews" not in item["skills"]
            and "JavaScript" not in item["skills"]
        ):
            t.append(item["skills"])
    skills.append(t)

top_skills = Counter(flatten(skills)).most_common(20)

<IPython.core.display.Javascript object>

In [27]:
layout = go.Layout(
    title="Most common skills",
    font=dict(size=15),
    yaxis=dict(title="number of occurrences"),
)

data = [go.Bar(x=[i[0] for i in top_skills], y=[i[1] for i in top_skills])]
figure = go.Figure(data=data, layout=layout)
iplot(figure)

<IPython.core.display.Javascript object>

In [28]:
names, matrix = cooccurrence(skills)
indexies = []
for i, row in enumerate(matrix):
    if sum(row) > 10:
        indexies.append(i)


reduced_m = np.asarray(matrix)[indexies, ...][..., indexies]
np.fill_diagonal(reduced_m, 5)
reduced_n = [n for i, n in enumerate(names) if i in indexies]
reduced_m.shape

(17, 17)

<IPython.core.display.Javascript object>

In [29]:
layout = go.Layout(title="Intro to DS skills correlation", font=dict(size=15))

data = [go.Heatmap(z=reduced_m, x=reduced_n, y=reduced_n)]
figure = go.Figure(data=data, layout=layout)
iplot(figure)

<IPython.core.display.Javascript object>

In [30]:
layout = go.Layout(title="Intro to DS skills", font=dict(size=15))
fig = ff.create_dendrogram(reduced_m, labels=reduced_n, orientation="left")
fig.update_layout(layout, height=800, width=1000)
iplot(fig)

<IPython.core.display.Javascript object>