## Data Wrangling: Limpeza e estruturação

- Big Data: 4 V's -> Volume, Velocidade, Variedade, Veracidade
- Data wrangling costuma consumir cerca de 80% do trabalho em um projeto de dados

Entenda bem os dados

 - Saber o objetivo
 - Entra lixo, sai lixo
 - Transformar os dados em estruturas (listas, tuplas, dicionários, vetores, matrizes)
 - Salvar em arquivos
 - Separar em diretórios

In [1]:
import pandas as pd
import numpy as np

### Sumarizando dados

In [2]:
# Row index: six consecutive daily timestamps starting on 2020-01-01.
datas = pd.date_range(start='20200101', periods=6)

# 6x4 frame of standard-normal samples; values differ on every run because
# no random seed is set.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=datas,
    columns=['Var_' + letra for letra in 'ABCD'],
)
df

Unnamed: 0,Var_A,Var_B,Var_C,Var_D
2020-01-01,-1.037504,-0.590256,0.934352,0.189626
2020-01-02,0.288811,-1.243527,-0.857967,-0.035165
2020-01-03,0.554605,-0.596916,0.177961,0.942604
2020-01-04,1.728995,0.667706,-0.518769,-1.428309
2020-01-05,-0.984008,-0.605627,0.08411,0.135754
2020-01-06,0.66846,-1.568347,0.791204,-0.683256


In [3]:
# Small frame whose columns are each built from a different kind of value
# (scalar float, timestamp, float32 Series, int32 array, categorical, string)
# so that the dtype inspection below has something to show.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(['test', 'train'] * 2),
        'F': 'Python',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,Python
1,1.0,2013-01-02,1.0,3,train,Python
2,1.0,2013-01-02,1.0,3,test,Python
3,1.0,2013-01-02,1.0,3,train,Python


In [6]:
# Dimensions of df as a (rows, columns) tuple.
df.shape

(6, 4)

In [9]:
# Data type of each column of df.
df.dtypes

Var_A    float64
Var_B    float64
Var_C    float64
Var_D    float64
dtype: object

In [10]:
# Data type of each column of df2, whose columns were built from different kinds of values.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [14]:
# Summary statistics for df2; in the output recorded below only columns A, C and D are summarized.
df2.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [15]:
# Keep only the first four dates and add a new column 'Var_E'; reindex fills
# labels that did not exist in df (here the whole new column) with NaN.
df1 = df.reindex(
    index=datas[:4],
    columns=[*df.columns, 'Var_E'],
)
df1

Unnamed: 0,Var_A,Var_B,Var_C,Var_D,Var_E
2020-01-01,-1.037504,-0.590256,0.934352,0.189626,
2020-01-02,0.288811,-1.243527,-0.857967,-0.035165,
2020-01-03,0.554605,-0.596916,0.177961,0.942604,
2020-01-04,1.728995,0.667706,-0.518769,-1.428309,


In [26]:
# Fill 'Var_E' for the first two dates only (label slices in .loc include
# both endpoints); the remaining rows keep their NaN.
df1.loc[datas[0]:datas[1], 'Var_E'] = 77
df1

Unnamed: 0,Var_A,Var_B,Var_C,Var_D,Var_E
2020-01-01,-1.037504,-0.590256,0.934352,0.189626,77.0
2020-01-02,0.288811,-1.243527,-0.857967,-0.035165,77.0
2020-01-03,0.554605,-0.596916,0.177961,0.942604,
2020-01-04,1.728995,0.667706,-0.518769,-1.428309,


In [27]:
# Column dtypes of df1 after the partial assignment to 'Var_E'.
df1.dtypes

Var_A    float64
Var_B    float64
Var_C    float64
Var_D    float64
Var_E    float64
dtype: object

In [28]:
# Summary statistics for df1; the recorded output shows count 2 for 'Var_E' (only the two filled rows).
df1.describe()

Unnamed: 0,Var_A,Var_B,Var_C,Var_D,Var_E
count,4.0,4.0,4.0,4.0,2.0
mean,0.383727,-0.440748,-0.066106,-0.082811,77.0
std,1.135466,0.799972,0.794236,0.989669,0.0
min,-1.037504,-1.243527,-0.857967,-1.428309,77.0
25%,-0.042768,-0.758569,-0.603568,-0.383451,77.0
50%,0.421708,-0.593586,-0.170404,0.077231,77.0
75%,0.848203,-0.275765,0.367059,0.377871,77.0
max,1.728995,0.667706,0.934352,0.942604,77.0


### Missing values

In [32]:
# Sixty consecutive days starting on 2020-01-01.
datas = pd.date_range(start='20200101', periods=60, freq='D')

# 60x5 frame of standard-normal samples (unseeded, so values change per run).
df = pd.DataFrame(
    data=np.random.randn(60, 5),
    index=datas,
    columns=['A', 'B', 'C', 'D', 'E'],
)
df.head()

Unnamed: 0,A,B,C,D,E
2020-01-01,1.709261,-1.194166,0.35524,-1.087329,-1.384278
2020-01-02,1.391887,0.192838,0.628536,0.02043,-0.98372
2020-01-03,-0.070004,0.412436,-0.464181,-0.707202,1.033826
2020-01-04,0.068984,-0.666654,1.179099,-0.361858,-1.554808
2020-01-05,0.712543,-0.154484,-0.301737,1.404807,0.100812


In [35]:
# New column F: copy of A where A is positive, NaN everywhere else.
# This manufactures missing values for the examples that follow.
df['F'] = df['A'].where(df['A'] > 0)
df.head(10)

Unnamed: 0,A,B,C,D,E,F
2020-01-01,1.709261,-1.194166,0.35524,-1.087329,-1.384278,1.709261
2020-01-02,1.391887,0.192838,0.628536,0.02043,-0.98372,1.391887
2020-01-03,-0.070004,0.412436,-0.464181,-0.707202,1.033826,
2020-01-04,0.068984,-0.666654,1.179099,-0.361858,-1.554808,0.068984
2020-01-05,0.712543,-0.154484,-0.301737,1.404807,0.100812,0.712543
2020-01-06,-1.497392,-0.817076,-0.608515,-0.096152,-0.334847,
2020-01-07,0.444002,0.724324,-0.361807,-0.5715,-0.190617,0.444002
2020-01-08,0.694439,0.629403,0.551028,-0.132003,-1.757395,0.694439
2020-01-09,1.233563,0.607915,0.4454,0.616988,-0.98883,1.233563
2020-01-10,1.41833,-1.106241,0.562153,-0.794657,-0.58666,1.41833


In [37]:
# Two independent copies of df, one per missing-value strategy shown below.
df2, df3 = df.copy(), df.copy()

In [38]:
# Drop every row that contains at least one NA value; the result is only
# displayed, df2 itself is not modified.
df2.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D,E,F
2020-01-01,1.709261,-1.194166,0.35524,-1.087329,-1.384278,1.709261
2020-01-02,1.391887,0.192838,0.628536,0.02043,-0.98372,1.391887
2020-01-04,0.068984,-0.666654,1.179099,-0.361858,-1.554808,0.068984
2020-01-05,0.712543,-0.154484,-0.301737,1.404807,0.100812,0.712543
2020-01-07,0.444002,0.724324,-0.361807,-0.5715,-0.190617,0.444002
2020-01-08,0.694439,0.629403,0.551028,-0.132003,-1.757395,0.694439
2020-01-09,1.233563,0.607915,0.4454,0.616988,-0.98883,1.233563
2020-01-10,1.41833,-1.106241,0.562153,-0.794657,-0.58666,1.41833
2020-01-12,0.798607,-2.510363,0.076534,0.678938,-1.313956,0.798607
2020-01-13,0.148942,-0.893335,0.236449,-1.077324,0.310834,0.148942


In [42]:
# Shape of the frame that remains after dropping rows with NA values.
df2.dropna().shape

(36, 6)

In [43]:
# First rows of the other copy, with the missing values in F still present.
df3.head()

Unnamed: 0,A,B,C,D,E,F
2020-01-01,1.709261,-1.194166,0.35524,-1.087329,-1.384278,1.709261
2020-01-02,1.391887,0.192838,0.628536,0.02043,-0.98372,1.391887
2020-01-03,-0.070004,0.412436,-0.464181,-0.707202,1.033826,
2020-01-04,0.068984,-0.666654,1.179099,-0.361858,-1.554808,0.068984
2020-01-05,0.712543,-0.154484,-0.301737,1.404807,0.100812,0.712543


In [48]:
# Replace the missing values in F with the mean of column A (computed over
# all of A, not over F's non-missing values). Returns a new Series; df3 is
# left unchanged.
df3['F'].fillna(df3['A'].mean()).head()

2020-01-01    1.709261
2020-01-02    1.391887
2020-01-03    0.062779
2020-01-04    0.068984
2020-01-05    0.712543
Freq: D, Name: F, dtype: float64

In [52]:
# Fresh copy of df, then show F with its missing values replaced by a
# sentinel constant. fillna returns a new Series, so df4 keeps its NaN.
df4 = df.copy()
df4['F'].fillna(value=77777)

2020-01-01        1.709261
2020-01-02        1.391887
2020-01-03    77777.000000
2020-01-04        0.068984
2020-01-05        0.712543
2020-01-06    77777.000000
2020-01-07        0.444002
2020-01-08        0.694439
2020-01-09        1.233563
2020-01-10        1.418330
2020-01-11    77777.000000
2020-01-12        0.798607
2020-01-13        0.148942
2020-01-14    77777.000000
2020-01-15    77777.000000
2020-01-16        0.497845
2020-01-17    77777.000000
2020-01-18        0.603806
2020-01-19    77777.000000
2020-01-20        1.884377
2020-01-21    77777.000000
2020-01-22        2.176623
2020-01-23        0.852652
2020-01-24    77777.000000
2020-01-25    77777.000000
2020-01-26        0.446338
2020-01-27        0.405992
2020-01-28        0.023176
2020-01-29    77777.000000
2020-01-30        0.088470
2020-01-31        0.082965
2020-02-01    77777.000000
2020-02-02    77777.000000
2020-02-03        1.801402
2020-02-04    77777.000000
2020-02-05        0.678659
2020-02-06    77777.000000
2

### Dados únicos

In [4]:
# Mixed-dtype frame extended with G (repeated values) and H (contains NaN),
# used by the unique-value and duplicate-removal examples below.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(['test', 'train'] * 2),
        'F': 'Python',
        'G': [2, 2, 4, 4],
        'H': [np.nan, 2, 4, np.nan],
    }
)
df2

Unnamed: 0,A,B,C,D,E,F,G,H
0,1.0,2013-01-02,1.0,3,test,Python,2,
1,1.0,2013-01-02,1.0,3,train,Python,2,2.0
2,1.0,2013-01-02,1.0,3,test,Python,4,4.0
3,1.0,2013-01-02,1.0,3,train,Python,4,


In [16]:
# Number of distinct values in each row (counted across the columns);
# dropna=False makes NaN count as a value of its own.
df2.nunique(axis='columns', dropna=False)

0    7
1    6
2    6
3    7
dtype: int64

### Removendo duplicatas

In [17]:
# Show df2 again as the starting point for the duplicate-removal example.
df2

Unnamed: 0,A,B,C,D,E,F,G,H
0,1.0,2013-01-02,1.0,3,test,Python,2,
1,1.0,2013-01-02,1.0,3,train,Python,2,2.0
2,1.0,2013-01-02,1.0,3,test,Python,4,4.0
3,1.0,2013-01-02,1.0,3,train,Python,4,


In [25]:
# Treat rows as duplicates when they share the same value in G and keep only
# the last occurrence of each; df2 itself is not modified.
df2.drop_duplicates(subset=['G'], keep='last')

Unnamed: 0,A,B,C,D,E,F,G,H
1,1.0,2013-01-02,1.0,3,train,Python,2,2.0
3,1.0,2013-01-02,1.0,3,train,Python,4,


### Ordenação dos dados

In [27]:
# Small frame for the sorting example: Col1 has a repeated key ('A') and a
# missing value, Col2 breaks the tie between the repeated keys.
df = pd.DataFrame(
    data={
        'Col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
        'Col2': [2, 1, 9, 8, 7, 4],
        'Col3': [0, 1, 9, 4, 2, 3],
    }
)

In [28]:
# Show the unsorted frame for comparison with the sorted result.
df

Unnamed: 0,Col1,Col2,Col3
0,A,2,0
1,A,1,1
2,B,9,9
3,,8,4
4,D,7,2
5,C,4,3


In [39]:
# Sort by Col1, using Col2 to order rows that share the same Col1 value.
# Rows with a missing Col1 go last, and the result gets a fresh 0..n-1 index.
df.sort_values(
    by=['Col1', 'Col2'],
    ascending=True,
    na_position='last',
    ignore_index=True,
)

Unnamed: 0,Col1,Col2,Col3
0,A,1,1
1,A,2,0
2,B,9,9
3,C,4,3
4,D,7,2
5,,8,4
