# Series - array unidimensional

In [9]:
import pandas as pd

In [2]:
obj = pd.Series([1,2,5,-7])

In [4]:
print(obj)

0    1
1    2
2    5
3   -7
dtype: int64


In [5]:
obj2 = pd.Series([4,7,-5,0], index = ['a','c','e','z'])

In [6]:
obj2

a    4
c    7
e   -5
z    0
dtype: int64

In [7]:
obj2['a']

4

In [9]:
obj2[obj2>2]

a    4
c    7
dtype: int64

In [10]:
obj2*2

a     8
c    14
e   -10
z     0
dtype: int64

In [11]:
'z' in obj2

True

In [12]:
sdata = (3500,500,1200,500)

In [13]:
states = ['EUA','Brasil','Argentina','Uruguai']

In [14]:
obj3 = pd.Series(sdata, index=states)

In [15]:
obj3

EUA          3500
Brasil        500
Argentina    1200
Uruguai       500
dtype: int64

In [17]:
pd.isnull(obj3) # True wherever a value is null (missing)

EUA          False
Brasil       False
Argentina    False
Uruguai      False
dtype: bool

In [18]:
obj3.isnull()

EUA          False
Brasil       False
Argentina    False
Uruguai      False
dtype: bool

In [19]:
obj3.name = 'Dados por Pais'

In [20]:
obj3.index.name = 'Paises'

In [21]:
obj3

Paises
EUA          3500
Brasil        500
Argentina    1200
Uruguai       500
Name: Dados por Pais, dtype: int64

# DataFrame - Tabela de Dados

In [22]:
# Column-oriented sample records: each key becomes one DataFrame column.
data = dict(
    state=['Brasil', 'EUA', 'Chile'],
    year=[2012, 2014, 2015],
    pop=[1225, 2000, 4500],
)

In [23]:
df = pd.DataFrame(data)

In [24]:
df

Unnamed: 0,state,year,pop
0,Brasil,2012,1225
1,EUA,2014,2000
2,Chile,2015,4500


In [25]:
#Ordenar as colunas

In [26]:
pd.DataFrame(data,columns=['year','pop','state'])

Unnamed: 0,year,pop,state
0,2012,1225,Brasil
1,2014,2000,EUA
2,2015,4500,Chile


In [38]:
frame2 = pd.DataFrame(data,columns=['year','pop','state','rfx'],index=['a','b','c'])

In [39]:
frame2

Unnamed: 0,year,pop,state,rfx
a,2012,1225,Brasil,
b,2014,2000,EUA,
c,2015,4500,Chile,


In [40]:
frame2.columns

Index(['year', 'pop', 'state', 'rfx'], dtype='object')

In [41]:
frame2['pop']

a    1225
b    2000
c    4500
Name: pop, dtype: int64

In [42]:
frame2.loc['b']

year     2014
pop      2000
state     EUA
rfx       NaN
Name: b, dtype: object

In [43]:
val = pd.Series([-1.2,3],index=['a','c'])

In [47]:
frame2['rfx'] = val # values are assigned by matching index labels; rows absent from val (here 'b') stay NaN

In [48]:
frame2

Unnamed: 0,year,pop,state,rfx
a,2012,1225,Brasil,-1.2
b,2014,2000,EUA,
c,2015,4500,Chile,3.0


In [51]:
# Adds a boolean column that is True only where state equals 'Brasil'.
# NOTE(review): 'Chile' is also in South America but ends up False here - confirm the intended rule.
frame2['america do sul'] = frame2.state == 'Brasil' # assigning to a new label creates the column

In [52]:
frame2

Unnamed: 0,year,pop,state,rfx,america do sul
a,2012,1225,Brasil,-1.2,True
b,2014,2000,EUA,,False
c,2015,4500,Chile,3.0,False


In [56]:
frame3 = frame2.T # transpose: rows become columns and columns become rows

In [55]:
frame3

Unnamed: 0,a,b,c
year,2012,2014,2015
pop,1225,2000,4500
state,Brasil,EUA,Chile
rfx,-1.2,,3
america do sul,True,False,False


In [57]:
# Name both axes: the row axis holds the observations, the column axis the variables.
frame2.index.name = 'observacoes'
frame2.columns.name = 'Variáveis'

In [58]:
frame2

Variáveis,year,pop,state,rfx,america do sul
observacoes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,2012,1225,Brasil,-1.2,True
b,2014,2000,EUA,,False
c,2015,4500,Chile,3.0,False


In [66]:
frame2.values # 2-D array of the data; the columns have mixed types, so the array dtype is object

array([[2012, 1225, 'Brasil', -1.2, True],
       [2014, 2000, 'EUA', nan, False],
       [2015, 4500, 'Chile', 3.0, False]], dtype=object)

# Objetos Index - armazenam os rótulos dos eixos e outros metadados

In [83]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])

In [84]:
obj.index # Index objects are immutable

Index(['a', 'b', 'c'], dtype='object')

In [85]:
obj.index[1:]

Index(['b', 'c'], dtype='object')

In [86]:
obj.index[obj.values>=1] # select index labels with a boolean condition on the values

Index(['b', 'c'], dtype='object')

In [87]:
labels = obj.index

In [88]:
labels

Index(['a', 'b', 'c'], dtype='object')

In [90]:
#Observar os métodos e propriedades do Index

# Funcionalidades Essenciais

# REINDEXAÇÃO - Novo objeto com dados de acordo com um novo índice

In [91]:
obj = pd.Series([4.5,3.2,5.6], index=['a','b','c'])

In [96]:
obj2 = obj.reindex(['d','b','z','a']) # 'd' and 'z' did not exist in obj, so they come back as NaN

In [100]:
obj2

d    NaN
b    3.2
z    NaN
a    4.5
dtype: float64

In [12]:
import numpy as np

In [103]:
# 3x3 frame holding the integers 0..8, filled row by row.
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)

In [104]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [105]:
states =['Texas','Utah','California']

In [106]:
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


# Dropagem

In [109]:
obj = pd.Series(np.arange(5.), index=['a','b','c','d','e'])

In [110]:
new_obj = obj.drop('c')

In [111]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [135]:
# Same 0..8 grid, this time with the states as row labels and letters as columns.
data = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=['Ohio', 'Texas', 'California'],
    columns=list('acd'),
)

In [136]:
data

Unnamed: 0,a,c,d
Ohio,0,1,2
Texas,3,4,5
California,6,7,8


In [124]:
data.drop(['California','Ohio'])

Unnamed: 0,a,c,d
Texas,3,4,5


In [126]:
data.drop('c',axis=1)

Unnamed: 0,a,d
Ohio,0,2
Texas,3,5
California,6,8


In [128]:
data.drop(['a','d'],axis='columns')

Unnamed: 0,c
Ohio,1
Texas,4
California,7


In [137]:
data.drop('Texas', inplace=True) # inplace=True modifies data itself instead of returning a new object

In [138]:
data

Unnamed: 0,a,c,d
Ohio,0,1,2
California,6,7,8


# Indexação, seleção e filtragem

In [18]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

In [19]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [16]:
obj['b']

1.0

In [20]:
# Value at integer position 1. Use .iloc for positional access: obj[1] on a
# string-labelled index relies on a positional fallback that pandas has deprecated.
obj.iloc[1]

1.0

In [22]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [23]:
# Values at integer positions 1 and 3. .iloc makes the positional intent explicit;
# obj[[1, 3]] on a string-labelled index uses the deprecated positional fallback.
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [24]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [27]:
obj['b':'d'] = 5 # label slices include the end label, so 'b', 'c' and 'd' are all overwritten

In [28]:
obj

a    0.0
b    5.0
c    5.0
d    5.0
dtype: float64

In [30]:
# 4x4 frame holding the integers 0..15, filled row by row.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

In [31]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [32]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [33]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [34]:
data[2:]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [39]:
data['three']>5 # boolean Series: True where 'three' is greater than 5

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool

In [41]:
data[data['three']>5] # returns only the rows for which the filter is True

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [42]:
#Indexação por "loc" e "iloc" permite construir subsets: loc usa os rótulos do eixo e iloc usa números inteiros (posições)

In [48]:
data.loc['Colorado', ['two','three']] #linha e colunas

two      5
three    6
Name: Colorado, dtype: int32

In [51]:
 data.iloc[2, [3, 0, 1]] # the column positions may be listed in any order; the result follows that order

four    11
one      8
two      9
Name: Utah, dtype: int32

In [53]:
# Rows where 'three' exceeds 5, restricted to the first three columns, in one .loc call.
data.loc[data.three > 5, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


# Aritmética e alinhamento de dados

In [61]:
# 3x3 float frame (0.0..8.0) with columns b, c, d.
df1 = pd.DataFrame(
    data=np.arange(9.).reshape(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)

In [62]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [73]:
# 4x3 float frame (0.0..11.0) with columns b, d, e; it only partly overlaps df1's labels.
df2 = pd.DataFrame(
    data=np.arange(12.).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

In [74]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [71]:
df1 + df2 # row/column labels that are not present in both frames produce NaN

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [76]:
df1.add(df2,fill_value=0) # a value missing from only one frame is treated as 0; cells missing from both stay NaN

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


# Operações entre DataFrame e Series

In [77]:
arr = np.arange(12.).reshape((3, 4))

In [78]:
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [79]:
arr[0]

array([0., 1., 2., 3.])

In [80]:
arr[2]

array([ 8.,  9., 10., 11.])

In [90]:
arr - arr[0] # broadcasting: arr[0] is subtracted from every row, element by element

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [83]:
# 4x3 float frame (0.0..11.0) used for the DataFrame-vs-Series arithmetic below.
frame = pd.DataFrame(
    data=np.arange(12.).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

In [86]:
series = frame.iloc[0]

In [87]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [88]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [92]:
frame - series # the Series index is matched against the frame's columns and subtracted from every row

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [93]:
serie3 = frame['d']

In [94]:
serie3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [96]:
frame - serie3 # serie3 is indexed by row labels, which match none of frame's columns, so every cell is NaN

Unnamed: 0,Ohio,Oregon,Texas,Utah,b,d,e
Utah,,,,,,,
Ohio,,,,,,,
Texas,,,,,,,
Oregon,,,,,,,


In [101]:
frame.sub(serie3,axis='index') # subtracts serie3 from every column, matching on the row index (axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


# Aplicação de funções e mapeamento

In [105]:
 # Standard-normal sample data drawn from a seeded generator, so the frame (and
 # every result derived from it) is identical on each Restart & Run All.
 frame = pd.DataFrame(np.random.RandomState(42).randn(4, 3), columns=list('bde'),
                      index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [106]:
frame

Unnamed: 0,b,d,e
Utah,-1.150518,0.837539,-0.349835
Ohio,0.654215,-0.585187,0.754872
Texas,-0.089441,1.863711,-0.737169
Oregon,1.020107,-1.490989,-1.031817


In [107]:
np.abs(frame) # element-wise absolute value

Unnamed: 0,b,d,e
Utah,1.150518,0.837539,0.349835
Ohio,0.654215,0.585187,0.754872
Texas,0.089441,1.863711,0.737169
Oregon,1.020107,1.490989,1.031817


In [108]:
 def f(x):
     """Return the spread of ``x``: its maximum minus its minimum."""
     return x.max() - x.min()

In [111]:
frame.apply(f) # apply calls f once per column by default

b    2.170625
d    3.354700
e    1.786689
dtype: float64

In [113]:
frame.apply(f, axis='columns') # axis='columns' calls f once per row

Utah      1.988058
Ohio      1.340059
Texas     2.600879
Oregon    2.511096
dtype: float64

# Ordenação e classificação

In [115]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])

In [117]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [121]:
# 2x4 frame (0..7) whose row and column labels are deliberately out of order.
frame = pd.DataFrame(
    data=np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=list('dabc'),
)

In [122]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [123]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [124]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [126]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})

In [132]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


# Índices de eixos com rótulos duplicados

In [134]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])

In [135]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [136]:
obj['a']

a    0
a    1
dtype: int64

In [137]:
obj.index.is_unique # checks whether all index labels are unique

False

# Estatísticas Descritivas

In [141]:
# Small frame with missing values, used to show how the reductions treat NaN.
df = pd.DataFrame(
    data=[
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=list('abcd'),
    columns=['one', 'two'],
)

In [142]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [140]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [143]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [144]:
df.mean(axis=1, skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [146]:
df.idxmax() # index label of the maximum value in each column

one    b
two    d
dtype: object

In [147]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3
