## Data Filtering & Selection

In [2]:
import numpy as np
import pandas as pd

from pandas import DataFrame

In [10]:
# Demo frame: 10 rows x 3 columns holding the multiples of 3 from 0 to 87,
# labelled 'row 1'..'row 10' and 'C1'..'C3'.
row_labels = [f'row {n}' for n in range(1, 11)]
col_labels = [f'C{n}' for n in range(1, 4)]
values = np.arange(0, 90, 3).reshape(10, 3)
df = pd.DataFrame(values, index=row_labels, columns=col_labels)
df

Unnamed: 0,C1,C2,C3
row 1,0,3,6
row 2,9,12,15
row 3,18,21,24
row 4,27,30,33
row 5,36,39,42
row 6,45,48,51
row 7,54,57,60
row 8,63,66,69
row 9,72,75,78
row 10,81,84,87


### Indexing

Indexing can be used to read values or to assign new ones.  

**Methods of indexing:**
- **Normal**  
- **Boolean**


#### Normal  

For a single value: `dataframe.iloc[row, column]`  

For multiple values: `dataframe.iloc[[row1, row2, ...], [col1, col2, col3, ...]]` (rows and columns are given as zero-based integer positions, not labels)

In [15]:
# value at the first row and third column (iloc positions are zero-based)
df.iloc[0, 2]

np.int64(6)

In [16]:
# iloc also works on the left-hand side: overwrite the first row / third column in place
df.iloc[0, 2] = 55
df

Unnamed: 0,C1,C2,C3
row 1,0,3,55
row 2,9,12,15
row 3,18,21,24
row 4,27,30,33
row 5,36,39,42
row 6,45,48,51
row 7,54,57,60
row 8,63,66,69
row 9,72,75,78
row 10,81,84,87


In [18]:
# list-based selection: rows at positions 2, 6, 8 and columns at positions 0, 1
df.iloc[[2,6,8],[0,1]]

Unnamed: 0,C1,C2
row 3,18,21
row 7,54,57
row 9,72,75


In [36]:
# whole row: ':' selects every column; the row is returned as a Series
df.iloc[0, :]

C1     0
C2     3
C3    55
Name: row 1, dtype: int64

In [38]:
# whole column: ':' selects every row; the column is returned as a Series
df.iloc[:,0]

row 1      0
row 2      9
row 3     18
row 4     27
row 5     36
row 6     45
row 7     55
row 8     63
row 9     72
row 10    81
Name: C1, dtype: int64

#### Boolean

Boolean indexing uses comparison operators and masking to filter the DataFrame.

Applying a comparison operator to a DataFrame returns a boolean DataFrame of the same shape, which is then used as a mask.



In [19]:
# element-wise comparison: yields a boolean DataFrame with the same shape as df
df>20

Unnamed: 0,C1,C2,C3
row 1,False,False,True
row 2,False,False,False
row 3,False,True,True
row 4,True,True,True
row 5,True,True,True
row 6,True,True,True
row 7,True,True,True
row 8,True,True,True
row 9,True,True,True
row 10,True,True,True


In [25]:
# apply the mask: entries where the condition is False come back as NaN
df[df>20]

Unnamed: 0,C1,C2,C3
row 1,,,55.0
row 2,,,
row 3,,21.0,24.0
row 4,27.0,30.0,33.0
row 5,36.0,39.0,42.0
row 6,45.0,48.0,51.0
row 7,54.0,57.0,60.0
row 8,63.0,66.0,69.0
row 9,72.0,75.0,78.0
row 10,81.0,84.0,87.0


In [31]:
# combine conditions with & (element-wise AND); each comparison needs its own
# parentheses because & binds tighter than > and <
df[(df>50) & (df<60)]

Unnamed: 0,C1,C2,C3
row 1,,,55.0
row 2,,,
row 3,,,
row 4,,,
row 5,,,
row 6,,,51.0
row 7,54.0,57.0,
row 8,,,
row 9,,,
row 10,,,


In [32]:
# a mask can also be assigned through: every value strictly between 50 and 60 becomes 55
df[(df>50) & (df<60)] = 55
df

Unnamed: 0,C1,C2,C3
row 1,0,3,55
row 2,9,12,15
row 3,18,21,24
row 4,27,30,33
row 5,36,39,42
row 6,45,48,55
row 7,55,55,60
row 8,63,66,69
row 9,72,75,78
row 10,81,84,87


# Data Preparation

## Missing Values

In [59]:
# Small demo table: one row per person with Name, Age, Gender and Rank columns.
# Built in a single step so `data` is only ever bound to the DataFrame.
data = DataFrame({
    'Name': ['A', 'AA', 'B', 'D', 'R', 'MT', 'MA'],
    'Age': [20, 20, 19, 23, 26, 22, 19],
    'Gender': ['M', 'F', 'F', 'M', 'M', 'F', 'M'],
    'Rank': [5, 8, 7, 9, 7, 8, 7],
})
data

Unnamed: 0,Name,Age,Gender,Rank
0,A,20,M,5
1,AA,20,F,8
2,B,19,F,7
3,D,23,M,9
4,R,26,M,7
5,MT,22,F,8
6,MA,19,M,7


In [60]:
# Punch holes into the table so there is something to clean up:
#   - Rank (column position 3) for the rows at positions 1 and 2
#   - Age  (column position 1) for the rows at positions 2 and 3
#   - a brand-new row labelled 7 in which every column is missing
data.iloc[1:3, 3] = np.nan
data.iloc[2:4, 1] = np.nan
data.loc[7] = [np.nan] * len(data.columns)

data

Unnamed: 0,Name,Age,Gender,Rank
0,A,20.0,M,5.0
1,AA,20.0,F,
2,B,,F,
3,D,,M,9.0
4,R,26.0,M,7.0
5,MT,22.0,F,8.0
6,MA,19.0,M,7.0
7,,,,


### Viewing nulls

In [61]:
# Checking for missing values: True marks an entry that is missing (NaN)

data.isnull()

Unnamed: 0,Name,Age,Gender,Rank
0,False,False,False,False
1,False,False,False,True
2,False,True,False,True
3,False,True,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,False,False
7,True,True,True,True


In [62]:
# number of missing values per column (each True counts as 1)
data.isnull().sum()

Name      1
Age       3
Gender    1
Rank      3
dtype: int64

In [63]:
# rows that contain at least one null: any(axis=1) checks across the columns of each row
data[data.isnull().any(axis=1)]

Unnamed: 0,Name,Age,Gender,Rank
1,AA,20.0,F,
2,B,,F,
3,D,,M,9.0
7,,,,


In [64]:
# rows where one specific column (here 'Age') is null
data[data['Age'].isnull()]

Unnamed: 0,Name,Age,Gender,Rank
2,B,,F,
3,D,,M,9.0
7,,,,


### Some useful operations:

In [65]:
# summary statistics for the numeric columns; missing values are left out of the counts
data.describe()

Unnamed: 0,Age,Rank
count,5.0,5.0
mean,21.4,7.2
std,2.792848,1.48324
min,19.0,5.0
25%,20.0,7.0
50%,20.0,7.0
75%,22.0,8.0
max,26.0,9.0


For numerical features, summary values such as the mean, std and max help in exploring the data and in deciding how to fill the NaNs.

In [None]:
# counts each distinct full row (all columns together);
# rows that contain a NaN are excluded by default (dropna=True)
data.value_counts()

Name  Age   Gender  Rank
A     20.0  M       5.0     1
MA    19.0  M       7.0     1
MT    22.0  F       8.0     1
R     26.0  M       7.0     1
Name: count, dtype: int64

In [67]:
# frequency of each value in a single column; NaN is not counted by default
data['Gender'].value_counts()

Gender
M    4
F    3
Name: count, dtype: int64

In [68]:
# most frequent value(s) of every column; columns with fewer modes are padded with NaN
data.mode()

Unnamed: 0,Name,Age,Gender,Rank
0,A,20.0,M,7.0
1,AA,,,
2,B,,,
3,D,,,
4,MA,,,
5,MT,,,
6,R,,,


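When every value in a column occurs equally often (so there is no single mode), all of those values are returned as modes; columns with fewer modes are padded with NaN.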

In [69]:
# mode of a single column; the result is a Series because several values can tie
data['Age'].mode()

0    20.0
Name: Age, dtype: float64

In [70]:
# print the mode(s) of each column separately, so no NaN padding is needed
for col_name, column in data.items():
    print(col_name, ':\n', column.mode())

Name :
 0     A
1    AA
2     B
3     D
4    MA
5    MT
6     R
Name: Name, dtype: object
Age :
 0    20.0
Name: Age, dtype: float64
Gender :
 0    M
Name: Gender, dtype: object
Rank :
 0    7.0
Name: Rank, dtype: float64


In [71]:
# distinct values in the column; unlike value_counts(), NaN is included
data['Gender'].unique()

array(['M', 'F', nan], dtype=object)

### Filling

In [73]:
# current state of the frame before any filling
data

Unnamed: 0,Name,Age,Gender,Rank
0,A,20.0,M,5.0
1,AA,20.0,F,
2,B,,F,
3,D,,M,9.0
4,R,26.0,M,7.0
5,MT,22.0,F,8.0
6,MA,19.0,M,7.0
7,,,,


In [79]:
# fill missing ages with the column mean truncated to a whole number by int();
# fillna returns a new Series and does not modify data
data['Age'].fillna(int(data['Age'].mean()))

0    20.0
1    20.0
2    21.0
3    21.0
4    26.0
5    22.0
6    19.0
7    21.0
Name: Age, dtype: float64

In [81]:
# Notice the change was not saved: the fillna result above was never assigned back to data
data

Unnamed: 0,Name,Age,Gender,Rank
0,A,20.0,M,5.0
1,AA,20.0,F,
2,B,,F,
3,D,,M,9.0
4,R,26.0,M,7.0
5,MT,22.0,F,8.0
6,MA,19.0,M,7.0
7,,,,


In [83]:
# Persist the fill by assigning the result back to the column.
# Calling fillna(..., inplace=True) on data['Age'] operates on an intermediate
# object: pandas 2.2 warns about this pattern, and with Copy-on-Write
# (the default from pandas 3.0) it no longer updates data at all.
data['Age'] = data['Age'].fillna(int(data['Age'].mean()))
data

Unnamed: 0,Name,Age,Gender,Rank
0,A,20.0,M,5.0
1,AA,20.0,F,
2,B,21.0,F,
3,D,21.0,M,9.0
4,R,26.0,M,7.0
5,MT,22.0,F,8.0
6,MA,19.0,M,7.0
7,,21.0,,


In [85]:
# fills each gap with the prior valid value in the same column (forward fill)
# ffill() replaces fillna(method='pad'), whose `method` keyword is deprecated
# and triggered the FutureWarning this cell used to print.
data.ffill()


Unnamed: 0,Name,Age,Gender,Rank
0,A,20.0,M,5.0
1,AA,20.0,F,5.0
2,B,21.0,F,5.0
3,D,21.0,M,9.0
4,R,26.0,M,7.0
5,MT,22.0,F,8.0
6,MA,19.0,M,7.0
7,MA,21.0,M,7.0


In [87]:
# fills each gap with the next valid value in the same column (backward fill);
# trailing gaps stay NaN because there is no later value to copy from
# bfill() replaces fillna(method='bfill'), whose `method` keyword is deprecated
# and triggered the FutureWarning this cell used to print.
data.bfill()


Unnamed: 0,Name,Age,Gender,Rank
0,A,20.0,M,5.0
1,AA,20.0,F,9.0
2,B,21.0,F,9.0
3,D,21.0,M,9.0
4,R,26.0,M,7.0
5,MT,22.0,F,8.0
6,MA,19.0,M,7.0
7,,21.0,,


In [88]:
# interpolation
# Linear interpolation is only defined for numeric data. Calling interpolate()
# on the whole frame (which also has non-numeric columns) is deprecated and
# triggered the FutureWarning this cell used to print, so interpolate the
# numeric columns only and carry the remaining columns over unchanged.
numeric_cols = data.select_dtypes(include='number').columns
data_interpolated = data.copy()
data_interpolated[numeric_cols] = data[numeric_cols].interpolate(method='linear')
data_interpolated


Unnamed: 0,Name,Age,Gender,Rank
0,A,20.0,M,5.0
1,AA,20.0,F,6.333333
2,B,21.0,F,7.666667
3,D,21.0,M,9.0
4,R,26.0,M,7.0
5,MT,22.0,F,8.0
6,MA,19.0,M,7.0
7,,21.0,,7.0


In [90]:
# dropping: dropna() removes every row that has at least one null
# and returns a new frame (data itself is not modified)

data.dropna()

Unnamed: 0,Name,Age,Gender,Rank
0,A,20.0,M,5.0
3,D,21.0,M,9.0
4,R,26.0,M,7.0
5,MT,22.0,F,8.0
6,MA,19.0,M,7.0


In [111]:
# dropping rows in which every value is null
# Row 7's Age was filled earlier, so reset it to NaN to make the row entirely empty again.
data.iloc[7,1] = np.nan

# how='all' removes a row only when all of its columns are null
data.dropna(how='all')

Unnamed: 0,Name,Age,Gender,Rank
0,A,20.0,M,5.0
1,AA,20.0,F,
2,B,21.0,F,
3,D,21.0,M,9.0
4,R,26.0,M,7.0
5,MT,22.0,F,8.0
6,MA,19.0,M,7.0


In [112]:
# selecting the rows that have more than 2 nulls (the candidates for dropping):
# isna().sum(axis=1) counts the nulls in each row

data.loc[data.isna().sum(axis=1)>2]

Unnamed: 0,Name,Age,Gender,Rank
7,,,,


In [113]:
# drop every row that has more than 2 nulls: build the row mask first,
# then drop the index labels of the rows it selects
too_sparse = data.isna().sum(axis=1) > 2
data.drop(index=data.loc[too_sparse].index)

Unnamed: 0,Name,Age,Gender,Rank
0,A,20.0,M,5.0
1,AA,20.0,F,
2,B,21.0,F,
3,D,21.0,M,9.0
4,R,26.0,M,7.0
5,MT,22.0,F,8.0
6,MA,19.0,M,7.0
