### Read the dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython import display
%matplotlib inline

In [12]:
# Load the survey schema, which pairs each survey column name with its question text
schema = pd.read_csv('./survey_results_schema.csv')

In [2]:
# Load the public survey results that the rest of the notebook analyses
results = pd.read_csv("survey_results_public.csv")

In [11]:
# Preview the first rows of the survey results
results.head()

Unnamed: 0,Respondent,Professional,ProgramHobby,Country,University,EmploymentStatus,FormalEducation,MajorUndergrad,HomeRemote,CompanySize,...,StackOverflowMakeMoney,Gender,HighestEducationParents,Race,SurveyLong,QuestionsInteresting,QuestionsConfusing,InterestedAnswers,Salary,ExpectedSalary
0,1,Student,"Yes, both",United States,No,"Not employed, and not looking for work",Secondary school,,,,...,Strongly disagree,Male,High school,White or of European descent,Strongly disagree,Strongly agree,Disagree,Strongly agree,,
1,2,Student,"Yes, both",United Kingdom,"Yes, full-time",Employed part-time,Some college/university study without earning ...,Computer science or software engineering,"More than half, but not all, the time",20 to 99 employees,...,Strongly disagree,Male,A master's degree,White or of European descent,Somewhat agree,Somewhat agree,Disagree,Strongly agree,,37500.0
2,3,Professional developer,"Yes, both",United Kingdom,No,Employed full-time,Bachelor's degree,Computer science or software engineering,"Less than half the time, but at least one day ...","10,000 or more employees",...,Disagree,Male,A professional degree,White or of European descent,Somewhat agree,Agree,Disagree,Agree,113750.0,
3,4,Professional non-developer who sometimes write...,"Yes, both",United States,No,Employed full-time,Doctoral degree,A non-computer-focused engineering discipline,"Less than half the time, but at least one day ...","10,000 or more employees",...,Disagree,Male,A doctoral degree,White or of European descent,Agree,Agree,Somewhat agree,Strongly agree,,
4,5,Professional developer,"Yes, I program as a hobby",Switzerland,No,Employed full-time,Master's degree,Computer science or software engineering,Never,10 to 19 employees,...,,,,,,,,,,


### Get the number of rows and columns in the dataset

In [3]:
# DataFrame.shape is a (number of rows, number of columns) pair
num_rows, num_cols = results.shape
num_rows, num_cols

(51392, 154)

### Which columns have no missing values?

In [4]:
# results.isnull() flags every missing cell; taking the mean per column turns
# those flags into the fraction of missing values in that column.
# A fraction of exactly 0 means the column has no missing values at all.
results.columns[results.isnull().mean() == 0]

Index(['Respondent', 'Professional', 'ProgramHobby', 'Country', 'University',
       'EmploymentStatus', 'FormalEducation'],
      dtype='object')

### Columns that have more than 75% of values missing

In [5]:
# Per-column share of missing values; keep the columns where it exceeds 75%
results.columns[results.isna().mean() > 0.75]

Index(['YearsCodedJobPast', 'WebDeveloperType', 'MobileDeveloperType',
       'NonDeveloperType', 'ExCoderReturn', 'ExCoderNotForMe',
       'ExCoderBalance', 'ExCoder10Years', 'ExCoderBelonged', 'ExCoderSkills',
       'ExCoderWillNotCode', 'ExCoderActive', 'TimeAfterBootcamp',
       'ExpectedSalary'],
      dtype='object')

### Proportion of individuals in each professional category

In [10]:
# value_counts() gives the number of respondents in each profession;
# dividing by the total number of rows turns those counts into proportions
results.Professional.value_counts() / results.shape[0]

Professional developer                                  0.703047
Student                                                 0.160025
Professional non-developer who sometimes writes code    0.100016
Used to be a professional developer                     0.019127
None of these                                           0.017785
Name: Professional, dtype: float64

### Filtering data

In [14]:
# Preview the schema: each row maps a column name to the question that was asked
schema.head()

Unnamed: 0,Column,Question
0,Respondent,Respondent ID number
1,Professional,Which of the following best describes you?
2,ProgramHobby,Do you program as a hobby or contribute to ope...
3,Country,In which country do you currently live?
4,University,"Are you currently enrolled in a formal, degree..."


In [17]:
# Select the schema row that describes the 'Professional' column
schema.loc[schema['Column'] == 'Professional']

Unnamed: 0,Column,Question
1,Professional,Which of the following best describes you?


In [19]:
# Question text of the matching row(s), returned as an array
schema.loc[schema['Column'] == 'Professional', 'Question'].values

array(['Which of the following best describes you?'], dtype=object)

In [20]:
# Take the first (here: only) match to get the question as a plain string
schema.loc[schema['Column'] == 'Professional', 'Question'].values[0]

'Which of the following best describes you?'

In [22]:
def get_description(column_name, schema=None):
    '''
    Look up the survey question that describes a column.

    INPUT - column_name - string - the name of the column you would like to know about
            schema - pandas dataframe with the schema of the developers survey
                     (must have `Column` and `Question` columns); when omitted,
                     the notebook-level `schema` dataframe is used, looked up
                     at call time
    OUTPUT -
            desc - string - the description of the column
    RAISES -
            IndexError - if no row of the schema has `Column` equal to column_name
    '''
    if schema is None:
        # Resolve the default when the function is called, not when it is
        # defined, so a re-loaded `schema` is picked up on the next call.
        schema = globals()['schema']

    # Single .loc selection (row mask + column label) instead of chained indexing
    matches = schema.loc[schema['Column'] == column_name, 'Question']
    if len(matches) == 0:
        # Same exception type the bare `.values[0]` lookup produced, but the
        # message now names the column that could not be found.
        raise IndexError('no schema entry for column {!r}'.format(column_name))

    desc = matches.values[0]
    return desc

get_description(results.columns[0]) # Should return the description string for the first column of results

'Respondent ID number'

### Apply a custom function

In [23]:
def higher_ed(formal_ed_str):
    '''
    Flag whether a Formal Education value counts as higher education.

    INPUT
        formal_ed_str - a string of one of the values from the Formal Education column

    OUTPUT
        1 when formal_ed_str is "Master's degree" or "Professional degree",
        0 for every other value
    '''
    higher_levels = ("Master's degree", "Professional degree")
    # The membership test yields a bool; int() turns it into the 1/0 flag
    return int(formal_ed_str in higher_levels)
           
    
   

# Apply higher_ed to every Formal Education value and preview the first five flags
results["FormalEducation"].apply(higher_ed).head(5)

0    0
1    0
2    0
3    0
4    1
Name: FormalEducation, dtype: int64

### Grouping

In [25]:
# Mean job satisfaction for each remote-work arrangement, lowest first
(
    results
    .groupby('HomeRemote')['JobSatisfaction']
    .mean()
    .sort_values()
)

HomeRemote
Never                                                      6.632502
It's complicated                                           6.840830
More than half, but not all, the time                      6.953184
About half the time                                        6.973702
A few days each month                                      7.077509
Less than half the time, but at least one day each week    7.120125
All or almost all the time (I'm full-time remote)          7.388430
Name: JobSatisfaction, dtype: float64