# Criando e Manipulando Dataframes

## Criando um DataFrame

In [1]:
# Importando as bibliotecas necessarias
import pandas as pd
import numpy as np

In [2]:
# Criando uma Serie de Dados
series = pd.Series([17, 8, 4, np.nan, 12, 18 ])
series

0    17.0
1     8.0
2     4.0
3     NaN
4    12.0
5    18.0
dtype: float64

In [3]:
# Tipo da Serie
type(series)

pandas.core.series.Series

In [4]:
# Criando Datas
data = pd.date_range('20220101', periods=12)
data

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09', '2022-01-10', '2022-01-11', '2022-01-12'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Criando Datas 2
data = pd.date_range('20220101', periods=250, freq='D')
data

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09', '2022-01-10',
               ...
               '2022-08-29', '2022-08-30', '2022-08-31', '2022-09-01',
               '2022-09-02', '2022-09-03', '2022-09-04', '2022-09-05',
               '2022-09-06', '2022-09-07'],
              dtype='datetime64[ns]', length=250, freq='D')

In [6]:
# Tipo de Datas
type(data)

pandas.core.indexes.datetimes.DatetimeIndex

In [7]:
# Criando um DataFrame
df = pd.DataFrame(np.random.randn(250, 8), index=data, columns=list('ABCDEFGH'))
df.head()

Unnamed: 0,A,B,C,D,E,F,G,H
2022-01-01,-1.288102,-0.605454,1.532895,1.180629,-0.313371,-2.609133,-0.442819,-1.62365
2022-01-02,-0.774717,0.857642,0.530612,-0.65706,-0.325171,0.944283,-0.414413,1.573437
2022-01-03,-0.996009,-1.536338,0.385478,2.205813,0.41915,1.289886,0.976211,-1.070623
2022-01-04,0.948573,3.096092,-0.530087,-0.718402,2.597795,-0.45538,-2.441391,-1.708683
2022-01-05,1.025555,-0.115709,0.248677,0.490034,1.00815,0.39322,-0.010303,0.544392


In [8]:
# Visualizando as Colunas
df.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'], dtype='object')

In [9]:
# Tipo do df
type(df)

pandas.core.frame.DataFrame

In [10]:
# Gerando Valores Aleatorios
np.random.randn(12, 6)

array([[ 0.04530778,  0.08829919,  1.54627586, -0.09446522,  0.56526782,
        -0.04008145],
       [ 0.28319835, -0.26593094, -1.12828171, -1.15261168, -1.13639256,
         0.83857993],
       [-0.18450691,  0.29611503, -0.41147263,  0.20834327, -0.85043381,
         0.31863493],
       [-0.91668575,  0.63975628, -1.12780655, -1.46629283,  0.37020958,
        -0.25253914],
       [ 2.63785542,  1.31560986,  0.6218784 ,  0.69395819,  0.16700915,
         0.05132215],
       [ 1.01672503,  0.83704971,  2.24962487, -0.3510711 ,  1.14541995,
        -1.67474662],
       [ 0.46564518,  0.6274766 , -0.96705093, -0.43693009,  0.29381191,
        -0.84298762],
       [-1.084346  , -0.43434336,  0.92336016, -0.23686482,  0.25450273,
        -1.16703912],
       [-0.69274255, -0.53218453,  0.69588506, -0.59543556,  0.71124299,
         0.08438548],
       [ 0.73542721, -0.23201488, -0.45763765,  1.01240535,  0.01365082,
        -0.00303484],
       [-0.76732236,  1.24280198,  1.25452545,  0.

In [11]:
# Outra Maneira de Criar um DataFrame com Valores Aleatorios

df2 = pd.DataFrame({"A": 7,
                    "B": pd.Timestamp('20190101'),
                    "C": pd.Series(1, index=list(range(4)), dtype='float32'),
                    "D": np.array([3] * 4, dtype='int32'),
                    "E": pd.Categorical(['test', 'train', 'test', 'train']),
                    "F": 'Python'})
df2

Unnamed: 0,A,B,C,D,E,F
0,7,2019-01-01,1.0,3,test,Python
1,7,2019-01-01,1.0,3,train,Python
2,7,2019-01-01,1.0,3,test,Python
3,7,2019-01-01,1.0,3,train,Python


### Adicionando Colunas ao DataFrame df

In [12]:
# Criando o DataFrame
datas = pd.date_range('20220101', periods=300, freq='D')
df = pd.DataFrame(np.random.randn(300, 10), index=datas, columns = list('ABCDEFGHIJ'))
df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
2022-01-01,-1.414894,-0.153939,-0.208304,0.356156,0.428807,2.27967,1.305456,-0.380123,-1.364331,0.656336
2022-01-02,-0.697939,-0.153702,1.683182,2.061319,-0.864732,-1.621414,-0.090424,1.90527,-0.321073,-0.250649
2022-01-03,1.893449,-1.234055,1.880055,-0.509722,0.393312,1.228265,0.573248,0.883835,-0.58433,0.483971
2022-01-04,-0.400636,-0.611024,-1.519322,-1.680901,0.535945,-1.337839,-0.502828,2.466184,-0.200961,-0.000624
2022-01-05,0.33149,0.173565,-1.565295,0.179646,-0.358064,-0.118923,1.596337,-1.461874,0.832708,1.303989


In [13]:
# Shape Linhas e Colunas do Dataframe
df.shape

(300, 10)

In [14]:
# Adicionando uma Nova Coluna com valores de 0 a 300
df['K'] = range(300)
df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K
2022-01-01,-1.414894,-0.153939,-0.208304,0.356156,0.428807,2.27967,1.305456,-0.380123,-1.364331,0.656336,0
2022-01-02,-0.697939,-0.153702,1.683182,2.061319,-0.864732,-1.621414,-0.090424,1.90527,-0.321073,-0.250649,1
2022-01-03,1.893449,-1.234055,1.880055,-0.509722,0.393312,1.228265,0.573248,0.883835,-0.58433,0.483971,2
2022-01-04,-0.400636,-0.611024,-1.519322,-1.680901,0.535945,-1.337839,-0.502828,2.466184,-0.200961,-0.000624,3
2022-01-05,0.33149,0.173565,-1.565295,0.179646,-0.358064,-0.118923,1.596337,-1.461874,0.832708,1.303989,4


In [15]:
# Criando uma Coluna com o Nome produto
df['Produto'] = (df['A'] + df['B']) * df['D']
df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,Produto
2022-01-01,-1.414894,-0.153939,-0.208304,0.356156,0.428807,2.27967,1.305456,-0.380123,-1.364331,0.656336,0,-0.558749
2022-01-02,-0.697939,-0.153702,1.683182,2.061319,-0.864732,-1.621414,-0.090424,1.90527,-0.321073,-0.250649,1,-1.755504
2022-01-03,1.893449,-1.234055,1.880055,-0.509722,0.393312,1.228265,0.573248,0.883835,-0.58433,0.483971,2,-0.336108
2022-01-04,-0.400636,-0.611024,-1.519322,-1.680901,0.535945,-1.337839,-0.502828,2.466184,-0.200961,-0.000624,3,1.700502
2022-01-05,0.33149,0.173565,-1.565295,0.179646,-0.358064,-0.118923,1.596337,-1.461874,0.832708,1.303989,4,0.090731


### Visualizando Dados

In [16]:
# 5 Primeiras Linhas
df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,Produto
2022-01-01,-1.414894,-0.153939,-0.208304,0.356156,0.428807,2.27967,1.305456,-0.380123,-1.364331,0.656336,0,-0.558749
2022-01-02,-0.697939,-0.153702,1.683182,2.061319,-0.864732,-1.621414,-0.090424,1.90527,-0.321073,-0.250649,1,-1.755504
2022-01-03,1.893449,-1.234055,1.880055,-0.509722,0.393312,1.228265,0.573248,0.883835,-0.58433,0.483971,2,-0.336108
2022-01-04,-0.400636,-0.611024,-1.519322,-1.680901,0.535945,-1.337839,-0.502828,2.466184,-0.200961,-0.000624,3,1.700502
2022-01-05,0.33149,0.173565,-1.565295,0.179646,-0.358064,-0.118923,1.596337,-1.461874,0.832708,1.303989,4,0.090731


In [17]:
# 5 Ultimas Linhas
df.tail()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,Produto
2022-10-23,0.621972,-1.04148,-0.299891,1.151728,1.284171,-0.768093,1.244783,1.063403,0.359874,0.771688,295,-0.483159
2022-10-24,-0.731002,-1.052399,-0.948555,0.522176,0.16755,0.688108,-0.543695,-0.507368,0.632448,0.392022,296,-0.93125
2022-10-25,-0.045475,-0.761251,-0.5831,0.241638,-0.80869,2.323649,2.201525,-0.372758,-1.039367,-1.687838,297,-0.194936
2022-10-26,1.000453,0.412166,-1.271135,-0.277727,-0.041235,0.098616,0.112347,0.865689,0.99767,0.09147,298,-0.392322
2022-10-27,-0.381791,1.580897,0.743602,0.165332,0.143303,-0.17608,-0.075361,-0.305197,0.304247,-0.559596,299,0.19825


In [18]:
# Amostra Aleatoria dos dados
df.sample(7)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,Produto
2022-04-18,0.895008,-1.177493,-0.126046,-0.9369,1.551867,0.854198,-0.023215,1.366953,-1.045225,0.7536,107,0.264661
2022-04-15,0.548516,0.3628,-0.782427,-0.3336,-1.148613,0.661922,-1.914314,-0.358627,0.584606,-0.092947,104,-0.304015
2022-06-27,-1.313991,-2.186226,1.199938,-0.133837,-0.053006,-0.889224,1.932333,0.002909,-0.050629,-0.534425,177,0.468458
2022-08-19,0.801072,-1.441401,0.755852,0.30654,1.076593,1.062797,-1.899416,0.34028,0.689859,0.127055,230,-0.196286
2022-07-13,1.926697,0.695364,0.01475,-1.091664,-1.694845,1.14869,-2.192704,-1.190448,1.516132,0.041488,193,-2.862408
2022-01-27,-1.659657,0.314504,1.29819,-0.064007,0.159246,-1.784705,-0.637882,-0.966039,0.818221,0.991497,26,0.086099
2022-10-20,-1.288041,0.979974,-1.149389,0.759374,-0.191317,0.669872,-0.511514,-0.673811,0.906885,0.463457,292,-0.233937


In [19]:
# Index dos dados
df.index

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09', '2022-01-10',
               ...
               '2022-10-18', '2022-10-19', '2022-10-20', '2022-10-21',
               '2022-10-22', '2022-10-23', '2022-10-24', '2022-10-25',
               '2022-10-26', '2022-10-27'],
              dtype='datetime64[ns]', length=300, freq='D')

In [20]:
# Colunas
df.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'Produto'], dtype='object')

In [21]:
# Transformando em Uma Matriz do Numpy
df.to_numpy

<bound method DataFrame.to_numpy of                    A         B         C         D         E         F  \
2022-01-01 -1.414894 -0.153939 -0.208304  0.356156  0.428807  2.279670   
2022-01-02 -0.697939 -0.153702  1.683182  2.061319 -0.864732 -1.621414   
2022-01-03  1.893449 -1.234055  1.880055 -0.509722  0.393312  1.228265   
2022-01-04 -0.400636 -0.611024 -1.519322 -1.680901  0.535945 -1.337839   
2022-01-05  0.331490  0.173565 -1.565295  0.179646 -0.358064 -0.118923   
...              ...       ...       ...       ...       ...       ...   
2022-10-23  0.621972 -1.041480 -0.299891  1.151728  1.284171 -0.768093   
2022-10-24 -0.731002 -1.052399 -0.948555  0.522176  0.167550  0.688108   
2022-10-25 -0.045475 -0.761251 -0.583100  0.241638 -0.808690  2.323649   
2022-10-26  1.000453  0.412166 -1.271135 -0.277727 -0.041235  0.098616   
2022-10-27 -0.381791  1.580897  0.743602  0.165332  0.143303 -0.176080   

                   G         H         I         J    K   Produto  
2022-01

In [22]:
# Obtendo o Transposta dos dados
df.T.head()

Unnamed: 0,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,2022-01-08,2022-01-09,2022-01-10,...,2022-10-18,2022-10-19,2022-10-20,2022-10-21,2022-10-22,2022-10-23,2022-10-24,2022-10-25,2022-10-26,2022-10-27
A,-1.414894,-0.697939,1.893449,-0.400636,0.33149,-0.309926,-0.100317,-0.905163,1.049571,-0.358475,...,0.214543,-1.533893,-1.288041,0.575102,-0.717341,0.621972,-0.731002,-0.045475,1.000453,-0.381791
B,-0.153939,-0.153702,-1.234055,-0.611024,0.173565,-0.026774,-2.064939,1.32407,1.216012,-1.168304,...,0.979814,0.953272,0.979974,1.419073,2.195141,-1.04148,-1.052399,-0.761251,0.412166,1.580897
C,-0.208304,1.683182,1.880055,-1.519322,-1.565295,0.526458,1.642005,0.052088,0.859409,-1.417691,...,-1.736158,-1.78365,-1.149389,-0.636278,0.634269,-0.299891,-0.948555,-0.5831,-1.271135,0.743602
D,0.356156,2.061319,-0.509722,-1.680901,0.179646,-0.681065,1.244724,1.345595,-0.621352,-0.632441,...,0.291301,-0.486942,0.759374,-0.388057,1.345284,1.151728,0.522176,0.241638,-0.277727,0.165332
E,0.428807,-0.864732,0.393312,0.535945,-0.358064,0.123591,-0.762418,0.624632,2.247712,0.244691,...,-0.466435,0.09251,-0.191317,0.650984,-0.498375,1.284171,0.16755,-0.80869,-0.041235,0.143303


# Combinando DataFrames

Para Isso sera criado 3 dataframes

In [23]:
# Criando o DataFrame 1
datas = pd.date_range('20200101', periods=300, freq='D')
df1 = pd.DataFrame(np.random.randn(300, 10), index=datas, columns = list('ABCDEFGHIJ'))
df1.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
2020-01-01,-1.218921,0.008672,-1.219583,-0.262957,0.256465,1.184367,-1.609538,-0.379489,1.372938,-1.443251
2020-01-02,0.782475,-0.094786,-0.856439,0.59664,-1.074851,0.398698,0.680793,-1.67253,0.923871,-0.250312
2020-01-03,-2.296243,-1.19693,0.493022,0.71908,1.318135,-0.741803,-0.328477,-0.80444,1.527189,-1.512345
2020-01-04,-0.935291,-0.708775,1.115882,-1.522468,0.257834,1.016594,1.822904,-1.520263,-0.396119,-1.197587
2020-01-05,0.187001,-2.213498,0.836827,0.175345,0.116648,-0.172943,0.431202,0.862387,0.420144,-1.440161


In [24]:
# Criando o DataFrame 2
datas = pd.date_range('20210101', periods=300, freq='D')
df2 = pd.DataFrame(np.random.randn(300, 10), index=datas, columns = list('ABCDEFGHIJ'))
df2.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
2021-01-01,-0.700865,0.634711,-0.196576,0.072364,1.305165,-1.185168,0.312035,0.706081,-0.154085,-0.266035
2021-01-02,0.646324,-0.691425,1.074004,1.47978,-0.549748,-0.179604,-0.404233,0.014322,0.615396,-0.858934
2021-01-03,-0.23705,0.481127,0.062169,0.828642,-0.697133,-0.496512,-0.728756,-1.2014,1.372556,-0.360558
2021-01-04,0.494788,-1.386113,-0.644169,0.442643,-2.086575,-0.891989,0.802265,0.559884,0.969044,-1.449127
2021-01-05,0.862661,-0.435525,0.844589,-0.020121,-0.67591,0.086889,0.032627,-1.635472,-1.109482,0.825556


In [25]:
# Criando o DataFrame
datas = pd.date_range('20220101', periods=300, freq='D')
df3 = pd.DataFrame(np.random.randn(300, 10), index=datas, columns = list('ABCDEFGHIJ'))
df3.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
2022-01-01,1.197229,-0.65937,-1.042087,1.368818,1.719742,0.810595,1.432293,-0.656863,-0.536806,1.729994
2022-01-02,-0.523115,-0.283119,0.528647,0.484404,1.416283,0.28535,-1.108798,0.005643,0.129862,-1.034128
2022-01-03,-0.228945,0.409184,-1.527681,-0.214064,1.006307,0.564553,0.220534,-2.046643,0.930644,0.395963
2022-01-04,0.172895,-1.433666,-2.597991,-0.910847,-0.807042,0.755046,1.1792,-1.448124,-0.344511,-0.619264
2022-01-05,-1.868336,1.454835,2.634352,-0.821549,0.376833,-0.456812,-1.18198,2.426356,-0.666418,0.147993


In [26]:
# Junção dos DataFrames Concatenação
# A Forma mais correta de se fazer e

frames = pd.concat([df1, df2, df3])
frames.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
2020-01-01,-1.218921,0.008672,-1.219583,-0.262957,0.256465,1.184367,-1.609538,-0.379489,1.372938,-1.443251
2020-01-02,0.782475,-0.094786,-0.856439,0.59664,-1.074851,0.398698,0.680793,-1.67253,0.923871,-0.250312
2020-01-03,-2.296243,-1.19693,0.493022,0.71908,1.318135,-0.741803,-0.328477,-0.80444,1.527189,-1.512345
2020-01-04,-0.935291,-0.708775,1.115882,-1.522468,0.257834,1.016594,1.822904,-1.520263,-0.396119,-1.197587
2020-01-05,0.187001,-2.213498,0.836827,0.175345,0.116648,-0.172943,0.431202,0.862387,0.420144,-1.440161


In [27]:
# Shape
frames.shape

(900, 10)

In [28]:
# Concatenando os Dataframes e Identificando cada pela chave keys
dfs = pd.concat([df1, df2, df3], keys=['Dados1', 'Dados2', 'Dados3'])
dfs.head()

Unnamed: 0,Unnamed: 1,A,B,C,D,E,F,G,H,I,J
Dados1,2020-01-01,-1.218921,0.008672,-1.219583,-0.262957,0.256465,1.184367,-1.609538,-0.379489,1.372938,-1.443251
Dados1,2020-01-02,0.782475,-0.094786,-0.856439,0.59664,-1.074851,0.398698,0.680793,-1.67253,0.923871,-0.250312
Dados1,2020-01-03,-2.296243,-1.19693,0.493022,0.71908,1.318135,-0.741803,-0.328477,-0.80444,1.527189,-1.512345
Dados1,2020-01-04,-0.935291,-0.708775,1.115882,-1.522468,0.257834,1.016594,1.822904,-1.520263,-0.396119,-1.197587
Dados1,2020-01-05,0.187001,-2.213498,0.836827,0.175345,0.116648,-0.172943,0.431202,0.862387,0.420144,-1.440161


In [29]:
# Usando o Loc para visualiza os valores do df2
dfs.loc['Dados2'].head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
2021-01-01,-0.700865,0.634711,-0.196576,0.072364,1.305165,-1.185168,0.312035,0.706081,-0.154085,-0.266035
2021-01-02,0.646324,-0.691425,1.074004,1.47978,-0.549748,-0.179604,-0.404233,0.014322,0.615396,-0.858934
2021-01-03,-0.23705,0.481127,0.062169,0.828642,-0.697133,-0.496512,-0.728756,-1.2014,1.372556,-0.360558
2021-01-04,0.494788,-1.386113,-0.644169,0.442643,-2.086575,-0.891989,0.802265,0.559884,0.969044,-1.449127
2021-01-05,0.862661,-0.435525,0.844589,-0.020121,-0.67591,0.086889,0.032627,-1.635472,-1.109482,0.825556


## Merge e GroupBy

Para representação do Merge, usaremos Dados Ficticios de 2 Lojas

In [30]:
# Cadastro da Loja A

cadastro_a = {'Id': ['AA2930', 'BB4563', 'CC2139', 'DE2521', 'GT3462', 'HH1158'],
            "Nome": ['Victor', 'Amanda', 'Bruna', 'Carlos', 'Ricardo', 'Maria'],
            "Idade": [20, 35, 40, 54, 30, 27],
            "CEP": ['00092-029', '11111-111', '00000-999', '22222-888', '88888-111', '77777-666']}

cadastro_a = pd.DataFrame(cadastro_a, columns=['Id', 'Nome', 'Idade', 'CEP'])
cadastro_a

Unnamed: 0,Id,Nome,Idade,CEP
0,AA2930,Victor,20,00092-029
1,BB4563,Amanda,35,11111-111
2,CC2139,Bruna,40,00000-999
3,DE2521,Carlos,54,22222-888
4,GT3462,Ricardo,30,88888-111
5,HH1158,Maria,27,77777-666


In [31]:
# Cadastro da Loja B

cadastro_b = {'Id': ['CC9999', 'EF4488', 'DD9999', 'GT3462', 'HH1158'],
            "Nome": ['Marcos', 'Patricia', 'Ericka', 'Ricardo', 'Maria'],
            "Idade": [19, 30, 22, 30, 27],
            "CEP": ['00092-029', '11111-111', '11111-888', '88888-111', '77777-666']}

cadastro_b = pd.DataFrame(cadastro_b, columns=['Id', 'Nome', 'Idade', 'CEP'])
cadastro_b

Unnamed: 0,Id,Nome,Idade,CEP
0,CC9999,Marcos,19,00092-029
1,EF4488,Patricia,30,11111-111
2,DD9999,Ericka,22,11111-888
3,GT3462,Ricardo,30,88888-111
4,HH1158,Maria,27,77777-666


In [32]:
# Registros de Compras de Todas Unidades

compras = {'Id': ['AA2930', 'EF4488', 'CC2139', 'EF4488', 'CC9999', 'AA2930', 'HH1158', 'HH1158'],
        "Data": ['2019-01-01', '2019-01-30', '2019-02-01', '2019-02-20', 
        '2019-03-15', '2019-04-25', '2019-04-30', '2019-05-22'],
        'Valor': [200, 100, 45, 150, 300, 35, 50, 450]}

compras = pd.DataFrame(compras, columns=['Id', 'Data', 'Valor'])
compras

Unnamed: 0,Id,Data,Valor
0,AA2930,2019-01-01,200
1,EF4488,2019-01-30,100
2,CC2139,2019-02-01,45
3,EF4488,2019-02-20,150
4,CC9999,2019-03-15,300
5,AA2930,2019-04-25,35
6,HH1158,2019-04-30,50
7,HH1158,2019-05-22,450


In [33]:
# Interseção da loja-a e loja-b
from IPython.display import Image
Image("dados/img.png")

FileNotFoundError: ignored

FileNotFoundError: ignored

<IPython.core.display.Image object>

## Inner Join

In [34]:
# Interseção de Id, Idade e CEP
pd.merge(cadastro_a, cadastro_b, on=['Id'], how='inner')

Unnamed: 0,Id,Nome_x,Idade_x,CEP_x,Nome_y,Idade_y,CEP_y
0,GT3462,Ricardo,30,88888-111,Ricardo,30,88888-111
1,HH1158,Maria,27,77777-666,Maria,27,77777-666


In [35]:
pd.merge(cadastro_a, cadastro_b[['Id', 'Idade', 'CEP']], on=['Id'], how='inner')

Unnamed: 0,Id,Nome,Idade_x,CEP_x,Idade_y,CEP_y
0,GT3462,Ricardo,30,88888-111,30,88888-111
1,HH1158,Maria,27,77777-666,27,77777-666


In [36]:
# Mudança do Sufixo
pd.merge(cadastro_a, cadastro_b[['Id','Idade','CEP']], on=['Id'], how='inner', suffixes=('_A', '_B'))

Unnamed: 0,Id,Nome,Idade_A,CEP_A,Idade_B,CEP_B
0,GT3462,Ricardo,30,88888-111,30,88888-111
1,HH1158,Maria,27,77777-666,27,77777-666


## Full Join

In [37]:
lojas = pd.concat([cadastro_a, cadastro_b], ignore_index=True)
lojas

Unnamed: 0,Id,Nome,Idade,CEP
0,AA2930,Victor,20,00092-029
1,BB4563,Amanda,35,11111-111
2,CC2139,Bruna,40,00000-999
3,DE2521,Carlos,54,22222-888
4,GT3462,Ricardo,30,88888-111
5,HH1158,Maria,27,77777-666
6,CC9999,Marcos,19,00092-029
7,EF4488,Patricia,30,11111-111
8,DD9999,Ericka,22,11111-888
9,GT3462,Ricardo,30,88888-111


In [38]:
# Removendo Dados Duplicados
# subset = Mostrarei por onde quero remover
clientes_unicos = lojas.drop_duplicates(subset='Id')
clientes_unicos

Unnamed: 0,Id,Nome,Idade,CEP
0,AA2930,Victor,20,00092-029
1,BB4563,Amanda,35,11111-111
2,CC2139,Bruna,40,00000-999
3,DE2521,Carlos,54,22222-888
4,GT3462,Ricardo,30,88888-111
5,HH1158,Maria,27,77777-666
6,CC9999,Marcos,19,00092-029
7,EF4488,Patricia,30,11111-111
8,DD9999,Ericka,22,11111-888


## Left Join

In [39]:
esquerda = pd.merge(cadastro_a, compras, how='left', on=['Id'])
esquerda

Unnamed: 0,Id,Nome,Idade,CEP,Data,Valor
0,AA2930,Victor,20,00092-029,2019-01-01,200.0
1,AA2930,Victor,20,00092-029,2019-04-25,35.0
2,BB4563,Amanda,35,11111-111,,
3,CC2139,Bruna,40,00000-999,2019-02-01,45.0
4,DE2521,Carlos,54,22222-888,,
5,GT3462,Ricardo,30,88888-111,,
6,HH1158,Maria,27,77777-666,2019-04-30,50.0
7,HH1158,Maria,27,77777-666,2019-05-22,450.0


## Usando o GroupBy

In [40]:
esquerda.groupby(['Id', 'Nome'])['Valor'].sum()

Id      Nome   
AA2930  Victor     235.0
BB4563  Amanda       0.0
CC2139  Bruna       45.0
DE2521  Carlos       0.0
GT3462  Ricardo      0.0
HH1158  Maria      500.0
Name: Valor, dtype: float64

## Outer Join

In [41]:
pd.merge(cadastro_a, cadastro_b, how='outer', on=['Id'], indicator=True)

Unnamed: 0,Id,Nome_x,Idade_x,CEP_x,Nome_y,Idade_y,CEP_y,_merge
0,AA2930,Victor,20.0,00092-029,,,,left_only
1,BB4563,Amanda,35.0,11111-111,,,,left_only
2,CC2139,Bruna,40.0,00000-999,,,,left_only
3,DE2521,Carlos,54.0,22222-888,,,,left_only
4,GT3462,Ricardo,30.0,88888-111,Ricardo,30.0,88888-111,both
5,HH1158,Maria,27.0,77777-666,Maria,27.0,77777-666,both
6,CC9999,,,,Marcos,19.0,00092-029,right_only
7,EF4488,,,,Patricia,30.0,11111-111,right_only
8,DD9999,,,,Ericka,22.0,11111-888,right_only


In [42]:
# Usando o GroupBy
# GroupBy - Agrupa por

df = pd.DataFrame({'A': ['verdadeiro', 'falso', 'verdadeiro', 'falso',
                        'verdadeiro', 'falso', 'verdadeiro', 'falso'],
                  'B': ['um', 'um', 'dois', 'tres', 'dois', 'dois', 'um', 'tres'],
                  'C': np.random.randn(8),
                  'D': np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,verdadeiro,um,1.584656,0.24671
1,falso,um,-0.49208,-0.624676
2,verdadeiro,dois,-0.976236,0.823638
3,falso,tres,0.93332,-0.019727
4,verdadeiro,dois,1.026666,-0.346277
5,falso,dois,1.895035,2.33288
6,verdadeiro,um,-1.361846,1.270858
7,falso,tres,0.199744,0.649295


In [43]:
# Agrupando pela soma da coluna A
df.groupby(['A']).sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
falso,2.536019,2.337773
verdadeiro,0.27324,1.994928


In [44]:
# Agrupando pela soma da coluna B
df.groupby(['B']).sum()

Unnamed: 0_level_0,C,D
B,Unnamed: 1_level_1,Unnamed: 2_level_1
dois,1.945466,2.810241
tres,1.133064,0.629569
um,-0.269271,0.892892


In [45]:
# Agrupando Pela soma das colunas A e B

df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
falso,dois,1.895035,2.33288
falso,tres,1.133064,0.629569
falso,um,-0.49208,-0.624676
verdadeiro,dois,0.05043,0.477361
verdadeiro,um,0.222809,1.517568


### Rehaping de Dados - Reformula/Reorganiza os Dados

In [46]:
# Criando dados
datas = pd.date_range('20190101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=datas, columns=['Var_A', 'Var_B', 'Var_C', 'Var_D'])
df

Unnamed: 0,Var_A,Var_B,Var_C,Var_D
2019-01-01,0.111398,1.215392,-0.128746,-0.550928
2019-01-02,0.257291,-0.03599,0.290385,-0.462518
2019-01-03,0.757475,-0.795998,0.382636,1.175991
2019-01-04,1.669055,-1.059944,1.220916,1.261468
2019-01-05,0.425425,-0.296495,-0.596737,-1.022078
2019-01-06,0.298128,1.239022,-0.841147,-0.247939


In [47]:
# Transposta dos Dados
dft = df.T
dft

Unnamed: 0,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,2019-01-06
Var_A,0.111398,0.257291,0.757475,1.669055,0.425425,0.298128
Var_B,1.215392,-0.03599,-0.795998,-1.059944,-0.296495,1.239022
Var_C,-0.128746,0.290385,0.382636,1.220916,-0.596737,-0.841147
Var_D,-0.550928,-0.462518,1.175991,1.261468,-1.022078,-0.247939


In [48]:
# Dimensões dos Dados
df.shape, dft.shape

((6, 4), (4, 6))

In [49]:
# Extraindo apenas os dados numericos, valores
df.values

array([[ 0.11139761,  1.21539176, -0.12874637, -0.55092833],
       [ 0.25729144, -0.03598994,  0.29038496, -0.46251785],
       [ 0.75747467, -0.79599817,  0.38263602,  1.1759906 ],
       [ 1.66905543, -1.0599436 ,  1.22091561,  1.26146761],
       [ 0.42542454, -0.29649529, -0.59673663, -1.02207805],
       [ 0.29812804,  1.23902199, -0.84114732, -0.24793863]])

In [50]:
# Extraindo os Valores
dft.values

array([[ 0.11139761,  0.25729144,  0.75747467,  1.66905543,  0.42542454,
         0.29812804],
       [ 1.21539176, -0.03598994, -0.79599817, -1.0599436 , -0.29649529,
         1.23902199],
       [-0.12874637,  0.29038496,  0.38263602,  1.22091561, -0.59673663,
        -0.84114732],
       [-0.55092833, -0.46251785,  1.1759906 ,  1.26146761, -1.02207805,
        -0.24793863]])

In [51]:
# Reshape dos Dados
v = dft.values
v.reshape((2, 12))

array([[ 0.11139761,  0.25729144,  0.75747467,  1.66905543,  0.42542454,
         0.29812804,  1.21539176, -0.03598994, -0.79599817, -1.0599436 ,
        -0.29649529,  1.23902199],
       [-0.12874637,  0.29038496,  0.38263602,  1.22091561, -0.59673663,
        -0.84114732, -0.55092833, -0.46251785,  1.1759906 ,  1.26146761,
        -1.02207805, -0.24793863]])

In [52]:
# Função Pivot
dias = pd.date_range(start='20220101', periods=12)
dias

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09', '2022-01-10', '2022-01-11', '2022-01-12'],
              dtype='datetime64[ns]', freq='D')

In [53]:
# Criando lista
pessoa = ['George', 'Victor', 'Lucas']
pessoa

['George', 'Victor', 'Lucas']

In [54]:
# Escolha Aleatorios
np.random.choice(pessoa)

'George'

In [55]:
nome = []
gasto = []
for i in range(12):
    nome.append(np.random.choice(pessoa))
    gasto.append(np.round(np.random.rand()*150))
nome

['Lucas',
 'Lucas',
 'George',
 'Victor',
 'Lucas',
 'Victor',
 'Victor',
 'Lucas',
 'George',
 'George',
 'Victor',
 'Victor']

In [56]:
gasto

[15.0, 13.0, 49.0, 117.0, 84.0, 4.0, 37.0, 18.0, 126.0, 112.0, 100.0, 77.0]

In [57]:
# Criando o DataFrame
df = pd.DataFrame({'Dia': dias, 
                   'Nome': nome, 
                   'Gasto': gasto})
df

Unnamed: 0,Dia,Nome,Gasto
0,2022-01-01,Lucas,15.0
1,2022-01-02,Lucas,13.0
2,2022-01-03,George,49.0
3,2022-01-04,Victor,117.0
4,2022-01-05,Lucas,84.0
5,2022-01-06,Victor,4.0
6,2022-01-07,Victor,37.0
7,2022-01-08,Lucas,18.0
8,2022-01-09,George,126.0
9,2022-01-10,George,112.0


In [58]:
# Função Pivot
df.pivot(index='Dia', columns='Nome', values='Gasto')

Nome,George,Lucas,Victor
Dia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,,15.0,
2022-01-02,,13.0,
2022-01-03,49.0,,
2022-01-04,,,117.0
2022-01-05,,84.0,
2022-01-06,,,4.0
2022-01-07,,,37.0
2022-01-08,,18.0,
2022-01-09,126.0,,
2022-01-10,112.0,,


In [59]:
# Função Pivot_table
carros = [7, 4, 3, 2, 8]
dias = pd.date_range('20190101', '20190101', periods=5)
vendedor = ['George', 'Vagner', 'Pedro', 'Vagner', 'George']
df = pd.DataFrame({'Vendas':carros, 'Data':dias, 'Vendedor':vendedor})
df

Unnamed: 0,Vendas,Data,Vendedor
0,7,2019-01-01,George
1,4,2019-01-01,Vagner
2,3,2019-01-01,Pedro
3,2,2019-01-01,Vagner
4,8,2019-01-01,George


In [60]:
# Usando Pivot
'''pd.pivot(df,index='Data', columns='Vendedor', values='Vendas')
'''
# Ao Usa o Pivot dara erro, pois o index possui valores duplicados

"pd.pivot(df,index='Data', columns='Vendedor', values='Vendas')\n"

In [61]:
# Usando Pivot_table
pd.pivot_table(df, index='Data', columns='Vendedor', values='Vendas')

Vendedor,George,Pedro,Vagner
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,7.5,3.0,3.0


In [62]:
# Stack e Unstack
df = pd.read_csv('dados/nba.csv')
df.head()

FileNotFoundError: ignored

In [None]:
# Usando o stack
stack_df = df.stack()
stack_df

In [None]:
# Usando o Unstack, para retorna a forma padrao dos dados
udf = stack_df.unstack()
udf.head()

In [None]:
# Usando o Melt
df = pd.DataFrame({'A': {0: 'a', 1:'b', 2:'c'},
                  'B': {0:1, 1:3, 2:5},
                  'C': {0:2, 1:4, 2:6}})
df

In [None]:
pd.melt(df, id_vars=['A'], value_vars=['B'])

In [None]:
pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])

### Filtros e Manipulações de Dados

In [None]:
# Criando os Dados
datas = pd.date_range('20180101', periods=600, freq='D')
df = pd.DataFrame(np.random.randn(600,5), index=datas, columns=list('ABCDE'))
df

In [None]:
# Obtendo dados de determinada coluna
df['D'] # Dessa forma retornaram como dataseries

In [None]:
# Obtendo dados de determinada coluna
df[['D']] # Dessa forma retornaram como DataFrame

In [None]:
# Usando o loc
df.loc[:, ['A', 'B']]

In [None]:
# Usando o iloc
df.iloc[1:, 0:3]

In [None]:
# Usando o Iloc
# Primeiro linha depois coluna
df.iloc[1:, 0:3]

In [None]:
# Usando o Iloc
# Primeiro linha depois coluna
df.iloc[[1, 5, 6], [0, 3]]

In [None]:
# Usando o Iloc
# Primeiro linha depois coluna
df.head()

In [None]:
# Selecionando valores mairoes que 0
df[df.A > 0].head()

### Limpeza e Estruturação dos Dados

In [None]:
# Criando um Novo DataFrame
datas = pd.date_range('20200101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=datas, columns=['Var_A', 'Var_B', 'Var_C', 'Var_D'])
df

In [None]:
df2 = pd.DataFrame({"A": 1, 
                    "B": pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    "D": np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(['test', 'train', 'test', 'train']),
                    'F': 'Python'})
df2

In [None]:
df.shape, df2.shape

In [None]:
df.dtypes, df2.dtypes

In [None]:
# Descrição dos Dados
df.describe()

In [None]:
# Criando um novo dataframe a partir do primeiro

df1 = df.reindex(index=datas[0:4], columns=list(df.columns) + ['Var_E'])
df1

In [None]:
# Adicionando Valores a coluna Var_E

df1.loc[datas[0]:datas[1], 'Var_E'] = 1
df1

In [None]:
df.dtypes

In [None]:
df1.describe()

In [None]:
# Dados Faltantes 
# Criando um novo DataFrame
datas = pd.date_range('20190101', periods=60, freq='D')
df = pd.DataFrame(np.random.randn(60, 10), index=datas, columns=list('ABCDEFGHIJ'))
df.head()

In [None]:
# Gerando uma logica com valores NaN
df['F'] = df.F[df.F > 0]
df['G'] = df.G[df.G > 0]
df['J'] = df.J[df.J > 0]
df.head()

In [None]:
# Visualizando as informações do df
df.info()

In [None]:
# Visualizando Dados Nulos
df.isnull().sum()

In [None]:
# Criando Copias para manipulação
df2 = df.copy()
df3 = df.copy()

In [None]:
df2.head()

In [None]:
df3.head()

In [None]:
# O Metodo Fillna Preenche os valores faltantes
df2.F.fillna(np.mean(df2.F))
df2.head()

In [None]:
# Limpeza
df2 = pd.DataFrame({"A": 1, 
                    "B": pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    "D": np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(['test', 'train', 'test', 'train']),
                    'F': 'Python',
                    'G': [2,2,4,4],
                    'H': [np.nan,2,4,np.nan]})
df2

In [None]:
# Verificando se Existe dados repetidos
df2.nunique()

In [None]:
df2.nunique(axis=1, dropna=False)
df2

In [None]:
# Removera os dados duplicados
df2.drop_duplicates(subset='G', keep='last')

In [None]:
# Ordenação dos Dados
df.sort_values(by='A', ascending=True).head()

In [None]:
# Ordenando da coluna b
df.sort_values(by='B', ascending=False).head()

### Importando e Exportando

In [None]:
# Importando as Bibliotecas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [None]:
# Importando os Dados
df = pd.read_csv('dados/iris.csv')
df.head()

In [None]:
df.dtypes

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Stilos de Graficos
plt.style.available

In [None]:
# Especificando qual stilo de grafico quero utiliza
plt.style.use('seaborn-paper')

In [None]:
df = pd.read_csv('dados/train.csv')
df.head()

In [None]:
plt.plot(df.Age, '-')