## 9.10 - Preparando os dados

#### Alguns métodos para DataFrames


| Métodos aplicados ao DataFrame | Descrição |
| :-- | :-- |
| .drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise') | Remove as linhas (index) ou colunas (columns) especificadas |
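| .isnull()/.notnull() | Retorna um DataFrame de booleanos que indica os valores nulos (isnull) ou não nulos (notnull); equivale às funções pd.isnull(obj)/pd.notnull(obj) |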
| .dropna(axis=0, how='any', thresh=None, subset=None, inplace=False) | Deleta linha (axis=0) ou coluna (axis=1) com célula(s) nula(s) |
| .fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None) | Substitui o valor nulo por um valor determinado |
| .duplicated(subset=None, keep='first') | Retorna uma série de booleanos que marca as linhas duplicadas |
| .drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False) | deleta linhas com valores duplicados. Pode selecionar uma determinada coluna usando subset |


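Atenção: por padrão, esses métodos não modificam o DataFrame original; eles retornam um novo objeto. Para manter a alteração, atribua o resultado a uma variável (ou use inplace=True nos métodos que aceitam esse parâmetro).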

In [2]:
import numpy as np
import pandas as pd

# Build a 4x5 integer matrix holding 0..19, filled row by row.
a_arr = np.arange(4 * 5).reshape((4, 5))
a_arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])

In [4]:
# Label the five columns 'A'..'E' and wrap the array in a DataFrame.
df_columns = list('ABCDE')
a_df = pd.DataFrame(data=a_arr, columns=df_columns)
a_df

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [5]:
# Drop the row labelled 2; the result is a new DataFrame.
a_df.drop(index=[2])

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
1,5,6,7,8,9
3,15,16,17,18,19


In [7]:
# Drop column 'C'; the result is a new DataFrame.
a_df.drop(columns=['C'])

Unnamed: 0,A,B,D,E
0,0,1,3,4
1,5,6,8,9
2,10,11,13,14
3,15,16,18,19


In [8]:
# Rows and columns can be removed in a single call.
a_df.drop(columns=['C'], index=[2])

Unnamed: 0,A,B,D,E
0,0,1,3,4
1,5,6,8,9
3,15,16,18,19


In [9]:
# A list of labels removes several rows at once (here rows 0 and 2).
a_df.drop(index=[0, 2], columns=['C'])

Unnamed: 0,A,B,D,E
1,5,6,8,9
3,15,16,18,19


In [12]:
# Same result as drop(columns='C', index=[0, 2]), written with the
# labels/axis form of the signature: axis=1 targets columns, axis=0 targets rows.
a_df.drop(labels='C', axis=1).drop(labels=[0, 2], axis=0)

Unnamed: 0,A,B,D,E
1,5,6,8,9
3,15,16,18,19


In [15]:
# Append a row made entirely of NaN; the stacked array is upcast to float.
b_arr = np.vstack((a_arr, np.array([np.nan] * 5)))
b_arr

array([[ 0.,  1.,  2.,  3.,  4.],
       [ 5.,  6.,  7.,  8.,  9.],
       [10., 11., 12., 13., 14.],
       [15., 16., 17., 18., 19.],
       [nan, nan, nan, nan, nan]])

In [18]:
# DataFrame whose last row (index 4) contains only missing values.
b_df = pd.DataFrame(data=b_arr, columns=df_columns)
b_df

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0
4,,,,,


In [19]:
# Boolean mask: True where a cell is missing. Function form of b_df.isnull().
pd.isnull(b_df)

Unnamed: 0,A,B,C,D,E
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,True,True,True,True,True


In [20]:
# Boolean mask: True where a cell holds a value. Function form of b_df.notnull().
pd.notnull(b_df)

Unnamed: 0,A,B,C,D,E
0,True,True,True,True,True
1,True,True,True,True,True
2,True,True,True,True,True
3,True,True,True,True,True
4,False,False,False,False,False


In [24]:
# Keep only the rows whose 'A' value is present (boolean-mask row selection).
b_df.loc[b_df['A'].notnull()]

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [27]:
# Four complete rows followed by one all-NaN row.
c_arr = np.vstack((a_arr, np.array([np.nan] * 5)))
c_df = pd.DataFrame(data=c_arr, columns=df_columns)
c_df

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0
4,,,,,


In [28]:
# Defaults spelled out: drop every row (axis=0) that has at least one NaN.
c_df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [None]:
# The last row is only partially missing, so how='all' keeps it:
# a row is dropped only when every one of its cells is NaN.
c_arr = np.vstack((a_arr, np.array([1, np.nan, 3, np.nan, np.nan])))
c_df = pd.DataFrame(data=c_arr, columns=df_columns)
c_df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0
4,1.0,,3.0,,


In [37]:
# Column-wise drop: B, C and D each hold a NaN in the last row, so they are removed.
c_arr = np.vstack((a_arr, np.array([1, np.nan, np.nan, np.nan, 5])))
c_df = pd.DataFrame(data=c_arr, columns=df_columns)
c_df.dropna(axis='columns', how='any')

Unnamed: 0,A,E
0,0.0,4.0
1,5.0,9.0
2,10.0,14.0
3,15.0,19.0
4,1.0,5.0


In [40]:
# Replace every missing cell with 99; the result is a new DataFrame.
c_df.fillna(99)

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0
4,1.0,99.0,99.0,99.0,5.0


In [44]:
# Append a second copy of the row 15..19 so the frame contains one duplicated row.
d_arr = np.vstack((a_arr, np.arange(15, 20)))
d_df = pd.DataFrame(data=d_arr, columns=df_columns)
d_df

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,15,16,17,18,19


In [47]:
# keep='first' (the default): the first occurrence is not flagged, later repeats are.
d_df.duplicated(keep='first')

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [54]:
# keep='last': the last occurrence is not flagged, earlier repeats are.
d_df.duplicated(subset=None, keep='last')

0    False
1    False
2    False
3     True
4    False
dtype: bool

In [53]:
# keep=False: every row that has a duplicate is flagged, including the first occurrence.
d_df.duplicated(subset=None, keep=False)

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [64]:
# Remove repeated rows, keeping the first occurrence of each.
d_df.drop_duplicates(subset=None, keep='first')

Unnamed: 0,A,B,C,D,E
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [60]:
# Rows 3 and 4 share A == 15 but differ in the other columns; restricting
# the comparison to column 'A' flags row 4 as a duplicate.
e_arr = np.vstack((a_arr, np.array([15, np.nan, np.nan, np.nan, np.nan])))
e_df = pd.DataFrame(data=e_arr, columns=df_columns)
print(e_df)
e_df.duplicated(subset=['A'])

      A     B     C     D     E
0   0.0   1.0   2.0   3.0   4.0
1   5.0   6.0   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0
4  15.0   NaN   NaN   NaN   NaN


0    False
1    False
2    False
3    False
4     True
dtype: bool