Handling Missing Values

In [2]:
import pandas as pd
import numpy as np

In [3]:
float_data = pd.Series([1.2, -3.5, np.nan, 0])

In [4]:
float_data.isna()#python's None is also treated as NaN

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
float_data.notna()

0     True
1     True
2    False
3     True
dtype: bool

In [6]:
float_data.fillna(0)

0    1.2
1   -3.5
2    0.0
3    0.0
dtype: float64

Filtering Out Missing Data

In [7]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])

In [8]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [9]:
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
            [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])

In [11]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
data.dropna()#drops any row containing NaN values

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [13]:
data.dropna(how="all")
#drop rows if the entire rwo contains NaN's

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [14]:
data.dropna(axis="columns")#drop column if the column contains even a single NaN value
#columns dropping

0
1
2
3


Filling In Missing Data

In [15]:
data.fillna(0)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


In [16]:
data.fillna({1: 0.5, 2: 0})
#calling fillna() with a dictionary means we can fill each column with different values

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.5,0.0
2,,0.5,0.0
3,,6.5,3.0


Removing Duplicates

In [17]:
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"],
                      "k2": [1, 1, 2, 3, 3, 4, 4]})

In [18]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [19]:
data.duplicated()
#returns a boolean series indicating whether is  duplicated or not
#5th and 6th rows are duplicates

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [20]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


 Transforming Data Using a Function or Mapping


In [21]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon",                               
"pastrami", "corned beef", "bacon",
     
"pastrami", "honey ham", "nova lox"],
 "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [22]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [23]:
meat_to_animal = {
 "bacon": "pig",
 "pulled pork": "pig",
 "pastrami": "cow",
 "corned beef": "cow",
 "honey ham": "pig",
 "nova lox": "salmon"
 }

In [24]:
data["animal"] = data["food"].map(meat_to_animal)
#map() accepts a function or dict

In [25]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


Replacing Values

In [26]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])

In [27]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [28]:
data.replace([-999,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [29]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [30]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

Discretization and Binning

In [31]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

In [32]:
age_categories=pd.cut(ages,bins=bins)

In [33]:
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [34]:
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [35]:
pd.value_counts(age_categories)

  pd.value_counts(age_categories)


(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

In [36]:
pd.cut(ages,bins,right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [37]:
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [38]:
pd.cut(ages,bins,labels= ["Youth", "YoungAdult", "MiddleAged", "Senior"])

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [39]:
pd.cut(ages,bins,labels= ["Youth", "YoungAdult", "MiddleAged", "Senior"]).value_counts()

Youth         5
YoungAdult    3
MiddleAged    3
Senior        1
Name: count, dtype: int64

In [40]:
data = np.random.uniform(size=20)
data,data.max(),data.min()

(array([0.66143317, 0.90626426, 0.63005968, 0.85991801, 0.68831165,
        0.56380843, 0.56636016, 0.84345366, 0.10137213, 0.23216966,
        0.26329165, 0.3180835 , 0.97775864, 0.56525769, 0.46886986,
        0.76720369, 0.84942163, 0.44772669, 0.55011078, 0.29022255]),
 0.9777586399699281,
 0.1013721290893368)

In [41]:
pd.cut(data,4).value_counts()
#4 indicates to compute 4 no.of equal bins

(0.1, 0.32]       5
(0.32, 0.54]      2
(0.54, 0.759]     7
(0.759, 0.978]    6
Name: count, dtype: int64

In [42]:
data = np.random.standard_normal(1000)
data

array([ 6.94468054e-01, -2.74234686e-01,  1.72113314e+00, -1.06132085e+00,
       -1.93517179e-01,  2.81820073e-01, -3.94587997e-01, -6.16756126e-01,
        1.67622340e+00,  8.60048384e-01,  1.56415314e-02, -3.91483767e-01,
        1.46959530e+00, -1.38588171e+00, -1.04422012e+00, -9.34403290e-01,
       -1.77867059e+00,  1.38629160e-01,  3.90975301e-01, -1.79292796e+00,
       -1.56821326e+00,  2.02872690e+00, -4.50314478e-01, -5.71672430e-01,
       -1.28062802e+00,  8.19769108e-01,  4.07291786e-01, -3.37005907e-01,
       -2.45812328e-01,  4.42142935e-01,  2.87192320e-01,  7.69149631e-01,
        2.10249126e-01, -1.30316519e+00, -2.22702158e-02, -3.02375240e-01,
        5.98614753e-01, -1.75689702e-02,  8.39515529e-01,  1.37465153e+00,
        7.74182851e-01, -1.73506460e+00, -1.62356907e+00,  5.54343277e-01,
       -1.53381227e+00, -1.57858979e+00,  8.17626257e-01,  4.35396113e-03,
        2.05468228e-01,  9.64401064e-01, -6.11055594e-02, -1.47536352e+00,
       -1.95988628e-01, -

In [43]:
pd.qcut(data, 4, precision=2)

[(0.65, 2.76], (-0.73, -0.011], (0.65, 2.76], (-3.09, -0.73], (-0.73, -0.011], ..., (-0.73, -0.011], (0.65, 2.76], (0.65, 2.76], (-0.73, -0.011], (-0.73, -0.011]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.09, -0.73] < (-0.73, -0.011] < (-0.011, 0.65] < (0.65, 2.76]]

In [44]:
pd.qcut(data, 4, precision=2).value_counts()#pandas qcut() gives us equal bins base don quartiles

(-3.09, -0.73]     250
(-0.73, -0.011]    250
(-0.011, 0.65]     250
(0.65, 2.76]       250
Name: count, dtype: int64

In [45]:
df = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"],                   
"data1": range(6)})

In [46]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [48]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False
