In [26]:
# Toy dataset: every keyword is a column name and every list holds that column's values.
people = dict(
    first=['Corey', 'Jane', 'John'],
    last=['Schafer', 'Doe', 'Doe'],
    email=['Corey@gmail.com', 'Jane@gmail.com', 'John@gmail.com'],
)

In [19]:
import pandas as pd

In [20]:
# Each dictionary key becomes a column and each list supplies that column's values
df = pd.DataFrame(people)

In [21]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,Corey@gmail.com
1,Jane,Doe,Jane@gmail.com
2,John,Doe,John@gmail.com


In [22]:
# Comparing a column to a value is done element-wise, so this returns a boolean Series:
# True for the rows whose last name is 'Doe', False otherwise
df['last'] == 'Doe'

0    False
1     True
2     True
Name: last, dtype: bool

In [23]:
# The parentheses are not required here; we add them only so the comparison is easier to read
filt = (df['last'] == 'Doe')

In [25]:
# Indexing the DataFrame with filt keeps only the rows where filt is True
df[filt]

Unnamed: 0,first,last,email
1,Jane,Doe,Jane@gmail.com
2,John,Doe,John@gmail.com


In [27]:
# We can also skip assigning the filt variable, but this will be harder to read

In [28]:
df[df['last'] == 'Doe']

Unnamed: 0,first,last,email
1,Jane,Doe,Jane@gmail.com
2,John,Doe,John@gmail.com


In [30]:
# We can also use loc. This is helpful since we can also get the specific columns that we would like.
df.loc[filt]

Unnamed: 0,first,last,email
1,Jane,Doe,Jane@gmail.com
2,John,Doe,John@gmail.com


In [33]:
# This will give you a DataFrame object.
df.loc[filt, ['first', 'email']]

Unnamed: 0,first,email
1,Jane,Jane@gmail.com
2,John,John@gmail.com


In [34]:
# Selecting a single column label (not a list) gives you a Series object.
df.loc[filt, 'email']

1    Jane@gmail.com
2    John@gmail.com
Name: email, dtype: object

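We use ```&``` for the *AND* operation and ```|``` for the *OR* operation. Each condition must be wrapped in parentheses, because ```&``` and ```|``` bind more tightly than comparison operators such as ```==```.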

In [36]:
# Rows where the last name is Doe AND the first name is John
filt = (df['last'] == 'Doe') & (df['first'] == 'John')

In [38]:
df.loc[filt, 'email']

2    John@gmail.com
Name: email, dtype: object

In [40]:
# Rows where the last name is Schafer OR the first name is John
filt = (df['last'] == 'Schafer') | (df['first'] == 'John')
df.loc[filt]

Unnamed: 0,first,last,email
0,Corey,Schafer,Corey@gmail.com
2,John,Doe,John@gmail.com


We also have a *NOT* operation: prefix a filter with ```~``` to invert it.

In [41]:
# ~ inverts the filter, so we get the rows that match NEITHER condition:
# the last name is NOT Schafer AND the first name is NOT John  (~(A | B) is the same as ~A & ~B)
df.loc[~filt]

Unnamed: 0,first,last,email
1,Jane,Doe,Jane@gmail.com


## Now let us move into the Big Survey Data

In [47]:
# Raise the display limits so wide/long frames are truncated less aggressively.
pd.set_option('display.max_rows', 85)
pd.set_option('display.max_columns', 85)

# The schema file is indexed by its 'Column' column; the survey results by 'Respondent'.
schema_df = pd.read_csv('data/survey_results_schema.csv', index_col='Column')
df = pd.read_csv('data/survey_results_public.csv', index_col='Respondent')

In [51]:
# The schema is indexed by column name, so sorting the index lists the columns alphabetically
schema_df.sort_index(ascending = True)

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
Age,What is your age (in years)? If you prefer not...
Age1stCode,At what age did you write your first line of c...
BetterLife,Do you think people born today will have a bet...
BlockchainIs,Blockchain / cryptocurrency technology is prim...
BlockchainOrg,How is your organization thinking about or imp...
CareerSat,"Overall, how satisfied are you with your caree..."
CodeRev,Do you review code as part of your work?
CodeRevHrs,"On average, how many hours per week do you spe..."
CompFreq,"Is that compensation weekly, monthly, or yearly?"
CompTotal,What is your current total compensation (salar...


In [53]:
df['ConvertedComp']

Respondent
1            NaN
2            NaN
3         8820.0
4        61000.0
5            NaN
          ...   
88377        NaN
88601        NaN
88802        NaN
88816        NaN
88863        NaN
Name: ConvertedComp, Length: 88883, dtype: float64

In [58]:
# 70_000 is the same number as 70000; the underscore is only a digit separator.
# Rows whose ConvertedComp is NaN compare as False and are therefore filtered out.
high_salary = (df['ConvertedComp'] > 70_000)
df.loc[high_salary, ['Country', 'LanguageWorkedWith', 'ConvertedComp']]

Unnamed: 0_level_0,Country,LanguageWorkedWith,ConvertedComp
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,Canada,Java;R;SQL,366420.0
9,New Zealand,Bash/Shell/PowerShell;C#;HTML/CSS;JavaScript;P...,95179.0
13,United States,Bash/Shell/PowerShell;HTML/CSS;JavaScript;PHP;...,90000.0
16,United Kingdom,Bash/Shell/PowerShell;C#;HTML/CSS;JavaScript;T...,455352.0
22,United States,Bash/Shell/PowerShell;C++;HTML/CSS;JavaScript;...,103000.0
...,...,...,...
88876,United States,Bash/Shell/PowerShell;C#;HTML/CSS;Java;Python;...,180000.0
88877,United States,Bash/Shell/PowerShell;C;Clojure;HTML/CSS;Java;...,2000000.0
88878,United States,HTML/CSS;JavaScript;Scala;TypeScript,130000.0
88879,Finland,Bash/Shell/PowerShell;C++;Python,82488.0


In [60]:
# What if we want the rows from any of several countries?
# isin() checks each value against the list, which is shorter than chaining many | comparisons.
countries = ['United States', 'India', 'United Kingdom', 'Germany', 'Canada']
filt = df['Country'].isin(countries)

In [62]:
df.loc[filt, 'Country']

Respondent
1        United Kingdom
4         United States
6                Canada
8                 India
10                India
              ...      
85642     United States
85961    United Kingdom
86012             India
88282     United States
88377            Canada
Name: Country, Length: 45008, dtype: object

In [64]:
# We can also use string methods to help build a conditional.
# Perhaps we want the people who listed Python among their languages. Notice that the
# languages are stored in one string separated by semicolons, so == would only match
# respondents whose entire answer is exactly 'Python'.
# As such, we use a string method.

df['LanguageWorkedWith']

Respondent
1                          HTML/CSS;Java;JavaScript;Python
2                                      C++;HTML/CSS;Python
3                                                 HTML/CSS
4                                      C;C++;C#;Python;SQL
5              C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA
                               ...                        
88377                        HTML/CSS;JavaScript;Other(s):
88601                                                  NaN
88802                                                  NaN
88816                                                  NaN
88863    Bash/Shell/PowerShell;HTML/CSS;Java;JavaScript...
Name: LanguageWorkedWith, Length: 88883, dtype: object

In [70]:
# Some respondents left this column empty (NaN). na = False treats those rows as
# non-matches, so the result is a purely boolean mask.
# regex = False makes 'Python' a literal substring instead of a regular expression;
# this matters for names containing regex metacharacters, such as 'C++'.
filt = df['LanguageWorkedWith'].str.contains('Python', na = False, regex = False)
df.loc[filt, 'LanguageWorkedWith']

Respondent
1                          HTML/CSS;Java;JavaScript;Python
2                                      C++;HTML/CSS;Python
4                                      C;C++;C#;Python;SQL
5              C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA
8        Bash/Shell/PowerShell;C;C++;HTML/CSS;Java;Java...
                               ...                        
84539    Bash/Shell/PowerShell;C;C++;HTML/CSS;Java;Java...
85738      Bash/Shell/PowerShell;C++;Python;Ruby;Other(s):
86566      Bash/Shell/PowerShell;HTML/CSS;Python;Other(s):
87739             C;C++;HTML/CSS;JavaScript;PHP;Python;SQL
88212                           HTML/CSS;JavaScript;Python
Name: LanguageWorkedWith, Length: 36443, dtype: object

## As a summary, the filt returns a Series of True or False values. When you do a df.loc[filt], it basically applies that Series as a mask to the dataframe, keeping only the rows where it is True.

In [72]:
filt

Respondent
1         True
2         True
3        False
4         True
5         True
         ...  
88377    False
88601    False
88802    False
88816    False
88863    False
Name: LanguageWorkedWith, Length: 88883, dtype: bool