In [27]:
import pandas as pd
import numpy as np

In [28]:
# Toy contact table. It deliberately mixes real missing markers (np.nan, None)
# with placeholder strings ('NA', 'Missing') that pandas does not treat as missing.
people = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    email=[
        'CoreyMSchafer@gmail.com',
        'JaneDoe@email.com',
        'JohnDoe@email.com',
        None,
        np.nan,
        'Anonymous@email.com',
        'NA',
    ],
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

In [29]:
# Build the frame from the dict: each key becomes a column, each list its values.
df = pd.DataFrame(data=people)
df  # bare last expression -> rich display of this small frame

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [30]:
# Suppose we want to analyse this frame, but rows with missing information
# (first, last, email or age) are of no use to us.
# With no arguments, dropna() removes every row that holds at least one NaN/None.
# Placeholder strings such as 'NA' and 'Missing' are ordinary values, which is
# why row 6 is still present in the result.
df.dropna()


Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [31]:
# Same result as the bare dropna() call, with the arguments spelled out:
#   axis="index" -> drop ROWS that have NaN values
#                   (axis="columns" would drop COLUMNS that have NaN values).
#   how="any"    -> a single NaN in the row is enough to drop it.
df.dropna(axis="index",how="any")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [32]:
# how="all" drops a row only when every value in it is missing (row 4 here).
# None counts as a missing value in pandas, just like np.nan.
df.dropna(axis="index",how="all")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [33]:
# Drop only those columns in which every value is missing.
# No column here is entirely empty, so the frame comes back unchanged.
df.dropna(axis="columns",how="all")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [34]:
# Drop every column that contains at least one missing value.
# Row 4 is missing in all four columns, so every column qualifies and the
# result is an empty frame with only the index left.
df.dropna(axis="columns",how="any")

0
1
2
3
4
5
6


##### Can we drop only the rows that have a missing value in a specific column?

In [35]:
# Drop the rows whose email value is NaN/None; the other columns are not inspected.
# With a single column in subset, how="any" and how="all" give the same result.
# `how` only makes a difference once subset lists several columns.
df.dropna(axis="index",how="any",subset=["email"])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [36]:
# Passing several columns to subset:
# with how="all" a row is dropped only when BOTH last and email are missing.
# Only the row with index label 4 meets that condition here, so it is removed.
df.dropna(axis="index",how="all",subset=["last","email"])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


##### Some missing values were entered as placeholder strings ("NA", "Missing"). To fix this:
- We use the replace() method to turn them into real NaN values

In [37]:
# The frame as it stands: row 6 still holds the 'NA' / 'Missing' placeholder strings.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [38]:
# Turn the custom placeholder strings into real missing values so that
# dropna() / isna() / fillna() recognise them.
# One replace() call handles both placeholders, and assigning the result back
# avoids inplace=True (no speed benefit, and it cannot be chained).
df = df.replace(["Missing", "NA"], np.nan)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [39]:
# With the placeholders converted to NaN, row 6 is now dropped as well.
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [40]:
# isna() returns a frame of the same shape holding True wherever a value is missing.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


##### What if I want to replace my NA values with a specific value?

In [41]:
# fillna(value) shows the frame with every missing value replaced by `value`.
# The result is not assigned, so df itself keeps its NaN values.
df.fillna(0)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,0,0,0,0


### Casting Datatypes

In [42]:
# The dtypes attribute lists the data type of every column.
# "object" means strings, or a mix of different kinds of values.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [None]:
# age is still dtype object (strings plus NaN), so an average cannot be
# computed directly; the column has to be cast to a numeric type first.
# The failure is the point of this cell, so it is caught and shown instead of
# being left to abort a "Restart & Run All".
try:
    print(df["age"].mean())
except TypeError as exc:
    print(f"TypeError: {exc}")

In [43]:
# np.nan is a float. That is why a column that contains NaN cannot be cast
# to int: the conversion raises an error.
type(np.nan)

float

In [47]:
# Cast to float rather than int: the column contains NaN, which is itself a
# float, so astype(int) would raise an error.
df["age"]=df["age"].astype(float)
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [48]:
# With a numeric dtype the average works; the NaN rows are left out of it
# (46.75 is the mean of the four known ages).
df["age"].mean()

46.75

##### Lets do some analysis on our survey data:

##### Custom missing-value markers are easy to handle when loading a CSV file:
- First, create a list with all the custom missing-value strings (`na_vals`)
- Then pass it to `read_csv` through the `na_values=na_vals` argument


In [54]:
# Strings to be read as missing values; read_csv turns them into NaN while loading.
na_vals = ["Missing", "NA"]

# Forward slashes instead of "data\survey...": "\s" is an invalid escape
# sequence (Python warns about it), and a backslash separator only works on
# Windows, whereas "/" is accepted on every platform.
df = pd.read_csv("data/survey_results_public.csv", index_col="Respondent", na_values=na_vals)
df_schema = pd.read_csv("data/survey_results_schema.csv", index_col="Column")

# Allow up to 85 columns / rows to be displayed before pandas truncates the output.
pd.set_option("display.max_columns", 85)
pd.set_option("display.max_rows", 85)

  df= pd.read_csv("data\survey_results_public.csv",index_col="Respondent",na_values=na_vals)
  df_schema= pd.read_csv("data\survey_results_schema.csv",index_col="Column")


##### Let's cast some data types in the survey dataframe:

In [56]:
# Show the dtype of (up to) the first 85 columns of the survey frame.
df.dtypes.head(85)

MainBranch                 object
Hobbyist                   object
OpenSourcer                object
OpenSource                 object
Employment                 object
Country                    object
Student                    object
EdLevel                    object
UndergradMajor             object
EduOther                   object
OrgSize                    object
DevType                    object
YearsCode                  object
Age1stCode                 object
YearsCodePro               object
CareerSat                  object
JobSat                     object
MgrIdiot                   object
MgrMoney                   object
MgrWant                    object
JobSeek                    object
LastHireDate               object
LastInt                    object
FizzBuzz                   object
JobFactors                 object
ResumeUpdate               object
CurrencySymbol             object
CurrencyDesc               object
CompTotal                 float64
CompFreq      

In [58]:
# Goal: the average number of years the respondents have been coding.
# That information is stored in the YearsCode column; preview it first.
df["YearsCode"].head(10)

Respondent
1       4
2     NaN
3       3
4       3
5      16
6      13
7       6
8       8
9      12
10     12
Name: YearsCode, dtype: object

In [None]:
# Averaging the column right now fails: YearsCode is dtype object (strings),
# so it has to be cast to a numeric type before any calculation.
# mean() is the call that demonstrates the problem (head(10) runs without
# error). The exception is caught and displayed so the notebook still runs
# from top to bottom.
try:
    print(df["YearsCode"].mean())
except (TypeError, ValueError) as exc:
    print(f"{type(exc).__name__}: {exc}")

In [None]:
# A direct cast fails as well: the column contains text that is not a number
# ("Less than 1 year"), and astype(float) raises ValueError for it.
# The error is caught and shown; df stays unchanged because the assignment
# never happens.
try:
    df["YearsCode"] = df["YearsCode"].astype(float)
except ValueError as exc:
    print(f"ValueError: {exc}")

In [63]:
# Count how often each answer occurs (50 most frequent) to spot non-numeric entries.
df["YearsCode"].value_counts().head(50)

YearsCode
5                     7047
10                    6777
6                     6179
4                     5729
8                     5361
7                     5320
3                     5179
2                     3974
15                    3942
20                    3636
12                    3530
9                     3360
11                    2265
14                    2126
13                    2036
18                    1900
1                     1814
25                    1657
16                    1593
30                    1532
Less than 1 year      1367
17                    1349
19                    1018
22                    1016
35                     873
23                     745
21                     715
24                     693
40                     497
28                     465
32                     420
26                     409
27                     408
33                     353
38                     340
34                     327
37                

In [66]:
# Alternatively, unique() lists every distinct value in the column.
# It reveals two strings that cannot be converted to a number:
# "Less than 1 year" and "More than 50 years".
df["YearsCode"].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 'Less than 1 year', '30', '9', '26', '40', '19',
       '15', '20', '28', '25', '1', '22', '11', '33', '50', '41', '18',
       '34', '24', '23', '42', '27', '21', '36', '32', '39', '38', '31',
       '37', 'More than 50 years', '29', '44', '45', '48', '46', '43',
       '47', '49'], dtype=object)

In [70]:
# Map the two textual answers onto numbers so the column can be cast:
#   "Less than 1 year"   -> 0
#   "More than 50 years" -> 51
# The result is assigned back to the column instead of calling
# df["YearsCode"].replace(..., inplace=True): that form mutates a temporary
# Series, triggers a FutureWarning in pandas 2.x and leaves df untouched under
# copy-on-write (the default from pandas 3.0).
df["YearsCode"] = df["YearsCode"].replace(
    {"Less than 1 year": 0, "More than 50 years": 51}
)

In [71]:
# Check the result: both strings are gone, replaced by the integers 0 and 51.
df["YearsCode"].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 0, '30', '9', '26', '40', '19', '15', '20', '28',
       '25', '1', '22', '11', '33', '50', '41', '18', '34', '24', '23',
       '42', '27', '21', '36', '32', '39', '38', '31', '37', 51, '29',
       '44', '45', '48', '46', '43', '47', '49'], dtype=object)

In [72]:
# Every remaining value is numeric (or NaN), so the cast to float now succeeds.
df["YearsCode"]=df["YearsCode"].astype(float)

In [73]:
# Average number of years of coding across the respondents.
df["YearsCode"].mean()

11.662114216834588

In [74]:
# Median of the same column.
df["YearsCode"].median()

9.0