# Kaggle Survey Dataset

## Dataset

In [2]:
import pandas as pd
import numpy as np
import zipfile
import os

In [3]:
folder = r"C:\Users\akhil\Personal\Projects\repo\datasets\data"
zipname = "kaggle-survey-2018.zip"
file = "multipleChoiceResponses.csv"

In [4]:
with zipfile.ZipFile(os.path.join(folder, zipname)) as z_:
    df = pd.read_csv(z_.open(file), low_memory=False)

In [5]:
df.shape

(23860, 395)

## EDA

In [5]:
dict(zip(df.head().columns, df.head().iloc[0].values))

{'Time from Start to Finish (seconds)': 'Duration (in seconds)',
 'Q1': 'What is your gender? - Selected Choice',
 'Q1_OTHER_TEXT': 'What is your gender? - Prefer to self-describe - Text',
 'Q2': 'What is your age (# years)?',
 'Q3': 'In which country do you currently reside?',
 'Q4': 'What is the highest level of formal education that you have attained or plan to attain within the next 2 years?',
 'Q5': 'Which best describes your undergraduate major? - Selected Choice',
 'Q6': 'Select the title most similar to your current role (or most recent title if retired): - Selected Choice',
 'Q6_OTHER_TEXT': 'Select the title most similar to your current role (or most recent title if retired): - Other - Text',
 'Q7': 'In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice',
 'Q7_OTHER_TEXT': 'In what industry is your current employer/contract (or your most recent employer if retired)? - Other - Text',
 'Q8': 'How many years of experience

In [6]:
df_ = df.copy(deep=True)

In [20]:
df['Q4'].value_counts()

Master’s degree                                                                                                    10855
Bachelor’s degree                                                                                                   7083
Doctoral degree                                                                                                     3357
Some college/university study without earning a bachelor’s degree                                                    967
Professional degree                                                                                                  599
I prefer not to answer                                                                                               345
No formal education past high school                                                                                 232
What is the highest level of formal education that you have attained or plan to attain within the next 2 years?        1
Name: Q4, dtype: int64

In [15]:
df['Q5'].value_counts()

Computer science (software engineering, etc.)                       9430
Engineering (non-computer focused)                                  3705
Mathematics or statistics                                           2950
A business discipline (accounting, economics, finance, etc.)        1791
Physics or astronomy                                                1110
Information technology, networking, or system administration        1029
Medical or life sciences (biology, chemistry, medicine, etc.)        871
Other                                                                770
Social sciences (anthropology, psychology, sociology, etc.)          554
Humanities (history, literature, philosophy, etc.)                   269
Environmental science or geology                                     253
I never declared a major                                             128
Fine arts or performing arts                                          87
Which best describes your undergraduate major? - Se

In [16]:
df['Q26'].value_counts()

Probably yes                                        4893
Definitely yes                                      4684
Maybe                                               4184
Probably not                                        3162
Definitely not                                      1557
Do you consider yourself to be a data scientist?       1
Name: Q26, dtype: int64

In [17]:
df['Q6'].value_counts()

Student                                                                                                    5253
Data Scientist                                                                                             4137
Software Engineer                                                                                          3130
Data Analyst                                                                                               1922
Other                                                                                                      1322
Research Scientist                                                                                         1189
Not employed                                                                                                842
Consultant                                                                                                  785
Business Analyst                                                                                        

In [19]:
df['Q17'].value_counts()

Python                                                                         8180
R                                                                              2046
SQL                                                                            1211
Java                                                                            903
C/C++                                                                           739
C#/.NET                                                                         432
Javascript/Typescript                                                           408
MATLAB                                                                          355
SAS/STATA                                                                       228
PHP                                                                             191
Visual Basic/VBA                                                                135
Other                                                                       

## Cleanup

In [298]:
df = df_.copy(deep=True)

In [299]:
columns = {
    'Q1': 'gender',
    'Q2': 'age',
    'Q3': 'country',
    'Q4': 'education',
    'Q5': 'major',
    'Q8': 'years_exp',
    'Q9': 'compensation',
    'Q10': 'ml_used',
    'Q16_Part_1': 'python',
    'Q16_Part_2': 'r',
    'Q16_Part_3': 'sql',    
    # 'Q17': 'pref_lang',    
    'Q6': 'target'
}

In [300]:
df = df.iloc[1:].filter(columns).rename(columns=columns).reset_index(drop=True)

In [301]:
df.head()

Unnamed: 0,gender,age,country,education,major,years_exp,compensation,ml_used,python,r,sql,target
0,Female,45-49,United States of America,Doctoral degree,Other,,,I do not know,,,,Consultant
1,Male,30-34,Indonesia,Bachelor’s degree,Engineering (non-computer focused),5-10,"10-20,000",No (we do not use ML methods),,,SQL,Other
2,Female,30-34,United States of America,Master’s degree,"Computer science (software engineering, etc.)",0-1,"0-10,000",I do not know,,R,,Data Scientist
3,Male,35-39,United States of America,Master’s degree,"Social sciences (anthropology, psychology, soc...",,,,Python,R,SQL,Not employed
4,Male,22-24,India,Master’s degree,Mathematics or statistics,0-1,"0-10,000",I do not know,,,SQL,Data Analyst


In [312]:
df.gender = df.gender.apply(str.lower)

In [302]:
df.age = df.age.apply(lambda x: x.replace("+", '').split('-')[0]).astype(int)

In [303]:
df.years_exp = df.years_exp.fillna('0').apply(lambda x: x.replace("+", '').split('-')[0]).astype(float)

In [304]:
df.compensation = df\
                    .compensation\
                    .fillna('0')\
                    .apply(lambda x: x.replace('+', '').replace(',', '').replace("500000","500").split('-')[0])\
                    .replace("I do not wish to disclose my approximate yearly compensation", 0)\
                    .astype(int)\
                    .mul(1_000)

In [305]:
ctry = [
    df.country == 'United States of America',
    df.country == 'India'
]

c_values = [
    "usa", "india"
]

df.country = np.select(ctry, c_values, 'other')

In [306]:
df.major = df.major.where(df.major.isin(df.major.value_counts().index[:3]), 'Other')

major = [
    df.major == 'Computer science (software engineering, etc.)',
    df.major == 'Engineering (non-computer focused)',
    df.major == 'Mathematics or statistics'
]

m_values = [
    "cs", "eng", "stat"
]

df.major = np.select(major, m_values, 'other')

In [307]:
edu_conditions = [
    df.education == 'Master’s degree',
    df.education == 'Bachelor’s degree',
    df.education == 'Doctoral degree',
    df.education == 'Some college/university study without earning a bachelor’s degree',
    df.education == 'Professional degree',
    df.education == 'I prefer not to answer',
    df.education == 'No formal education past high school'
]

edu_values = [
    '30',
    '20',
    '40',
    '10',
    '35',
    None,
    '5'
]

df.education = np.select(edu_conditions, edu_values, None)
df.education = df.education.fillna(0).astype(int)

In [308]:
ml_conditions = [
    df.ml_used == 'I do not know',
    df.ml_used == 'No (we do not use ML methods)',
    df.ml_used == 'We are exploring ML methods (and may one day put a model into production)',
    df.ml_used == 'We have well established ML methods (i.e., models in production for more than 2 years)',
    df.ml_used == 'We recently started using ML methods (i.e., models in production for less than 2 years)',
    df.ml_used == 'We use ML methods for generating insights (but do not put working models into production)',
]

ml_values = [
    'No',
    'No',
    'Yes',
    'Yes',
    'Yes',
    'Yes',
]

df.ml_used = np.select(ml_conditions, ml_values, 'No')

In [309]:
df.python = df.python.fillna(0).replace("Python", 1).astype(int)
df.r = df.r.fillna(0).replace("R", 1).astype(int)
df.sql = df.sql.fillna(0).replace("SQL", 1).astype(int)

In [313]:
data = df\
    .query("country.isin(['usa', 'india']) and target.isin(['Data Scientist', 'Software Engineer'])")\
    .reset_index(drop=True).copy(deep=True)

In [314]:
data

Unnamed: 0,gender,age,country,education,major,years_exp,compensation,ml_used,python,r,sql,target
0,female,30,usa,30,cs,0.0,0,No,0,1,0,Data Scientist
1,male,40,usa,30,eng,5.0,125000,Yes,1,0,1,Data Scientist
2,male,40,india,20,eng,10.0,0,Yes,1,1,0,Software Engineer
3,male,40,usa,40,cs,10.0,400000,Yes,1,0,0,Software Engineer
4,male,18,india,20,cs,0.0,0,No,1,0,0,Data Scientist
...,...,...,...,...,...,...,...,...,...,...,...,...
2738,male,18,india,30,cs,0.0,0,No,0,0,0,Software Engineer
2739,female,25,india,30,cs,3.0,0,Yes,1,0,1,Software Engineer
2740,female,25,india,20,cs,0.0,0,No,0,0,0,Software Engineer
2741,male,22,india,30,eng,1.0,0,Yes,0,0,0,Software Engineer
