# Pandas Series 
## Documentação: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html

In [1]:
import pandas as pd
import numpy as np

In [2]:
print(pd.__version__) 
print(np.__version__)

2.2.2
1.26.4


## Maneiras de criar uma série

### Método pandas séries

In [3]:
serie_dados = pd.Series([10,20,30,40,50]) 
print(serie_dados)

0    10
1    20
2    30
3    40
4    50
dtype: int64


In [4]:
type(serie_dados)

pandas.core.series.Series

### Listas built in python

In [5]:
array_inteiros = [10,20,30,40,50]  
indices = ['A','B','C','D','E']

series_dados = pd.Series(array_inteiros, index = indices) 
print(series_dados['A'])

10


In [6]:
np_array_inteiros = np.array([10,20,30,40,50]) 
print(np_array_inteiros)

[10 20 30 40 50]


In [7]:
print(serie_dados.shape, serie_dados.ndim, serie_dados.size)

(5,) 1 5


In [8]:
serie_dados.index = ['Z','X','Y','W','V'] 
print(serie_dados) 

Z    10
X    20
Y    30
W    40
V    50
dtype: int64


### Numpy arange e random

In [9]:
random_numbers = np.random.random(10) 
indices = np.arange(0,10)
print(indices, random_numbers)

[0 1 2 3 4 5 6 7 8 9] [0.96051199 0.07371191 0.3464587  0.01079443 0.84368455 0.51543139
 0.08864871 0.40841895 0.18539532 0.25538123]


In [10]:
serie_dados = pd.Series(random_numbers, indices) 
print(serie_dados) 

0    0.960512
1    0.073712
2    0.346459
3    0.010794
4    0.843685
5    0.515431
6    0.088649
7    0.408419
8    0.185395
9    0.255381
dtype: float64


### Dicionários

In [11]:
dicionario = {
    'ALICE': 10, 
    'BRUNA': 20, 
    'CARLA': 30, 
    'DANIELA': 40, 
    'ERICA':50
} 

serie_dados  = pd.Series(dicionario) 
print(serie_dados, serie_dados.dtype) 

ALICE      10
BRUNA      20
CARLA      30
DANIELA    40
ERICA      50
dtype: int64 int64


# Fatiamento slicing 

## Cópia, conversão e concatenção

In [12]:
serie_dados = pd.Series(random_numbers, indices) 
print(serie_dados) 

0    0.960512
1    0.073712
2    0.346459
3    0.010794
4    0.843685
5    0.515431
6    0.088649
7    0.408419
8    0.185395
9    0.255381
dtype: float64


In [13]:
serie_dados[0:3]

0    0.960512
1    0.073712
2    0.346459
dtype: float64

In [14]:
print(serie_dados[:-1],' ********  ', serie_dados[-1:])

0    0.960512
1    0.073712
2    0.346459
3    0.010794
4    0.843685
5    0.515431
6    0.088649
7    0.408419
8    0.185395
dtype: float64  ********   9    0.255381
dtype: float64


In [15]:
s2 = serie_dados[:3] 
print(s2)

0    0.960512
1    0.073712
2    0.346459
dtype: float64


## Conversão, cópia e concatenação

In [16]:
serie_dados_dois = serie_dados.copy()  #melhor maneira de se copiar um dataframe pandas 
print(serie_dados_dois)

0    0.960512
1    0.073712
2    0.346459
3    0.010794
4    0.843685
5    0.515431
6    0.088649
7    0.408419
8    0.185395
9    0.255381
dtype: float64


In [17]:
serie_dados_dois = serie_dados_dois.astype(int) #conversão de tipos de dados 
print(serie_dados_dois.dtype)

int64


In [18]:
dados_novos = {
    'GUSTAVO': 20, 
    'ALANA' : 30
} 


dict_dados = {
    'ALBERTO': 15, 
    'BRUNO': 18, 
    'CARLOS' : 20
} 
series_dados_tres = pd.Series(dados_novos) 
series_dados_dict = pd.Series(dict_dados)

serie_concatenada = pd.concat([series_dados_tres, series_dados_dict]) 
print(serie_concatenada)

GUSTAVO    20
ALANA      30
ALBERTO    15
BRUNO      18
CARLOS     20
dtype: int64


## Acesso aos dados com iloc

In [19]:
dataset = pd.read_csv('SOURCE/census.csv') 
dataset.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [21]:
age = dataset['age'] 
print(age, type(age))

0        39
1        50
2        38
3        53
4        28
         ..
32556    27
32557    40
32558    58
32559    22
32560    52
Name: age, Length: 32561, dtype: int64 <class 'pandas.core.series.Series'>


In [22]:
print(age.head(), age.tail())

0    39
1    50
2    38
3    53
4    28
Name: age, dtype: int64 32556    27
32557    40
32558    58
32559    22
32560    52
Name: age, dtype: int64


##  iloc -- acessar elementos utilizando os índices  -- Busca indexada

In [23]:
print(age.iloc[0], age.iloc[2], age.iloc[-1])  # obtendo registros únicos

39 38 52


In [24]:
print(age.iloc[0:3]) # intearvalo

0    39
1    50
2    38
Name: age, dtype: int64


In [25]:
age.iloc[[0,2,-1]] # múltiplos valores

0        39
2        38
32560    52
Name: age, dtype: int64

## loc -- acessar elementos por valor 

In [26]:
!pip install Faker



In [27]:
from faker import Faker 
fake = Faker()

In [28]:
indices_nome  = []
for _ in range(len(dataset)):
    name = fake.name()
    indices_nome.append(name)

In [29]:
dataset['NOME'] = indices_nome

In [30]:
print(dataset['NOME'].head(), dataset['NOME'].tail())

0        Cameron Paul
1    Kristina Hensley
2     Michelle Flores
3     Jennifer Burton
4      Michael Meyers
Name: NOME, dtype: object 32556       Wanda Simmons
32557    Bruce Richardson
32558         Sarah Lewis
32559     Christopher Lee
32560      Gordon Johnson
Name: NOME, dtype: object


In [31]:
dataset.loc[dataset['NOME'] == 'Anthony Sherman']

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,NOME


In [32]:
dataset.loc[dataset['NOME'] == 'Kendra Cobb']

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,NOME


In [33]:
dataset.loc[dataset['NOME'] == 'Paul Small']

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,NOME


## Ordenação 

In [34]:
dataset.sort_values('NOME', ascending=True)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,NOME
22344,26,State-gov,175044,Some-college,10,Never-married,Protective-serv,Own-child,White,Male,0,0,40,United-States,<=50K,Aaron Allen
26022,44,Private,113324,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K,Aaron Allison
21966,35,Private,388252,Assoc-voc,11,Married-civ-spouse,Adm-clerical,Husband,Black,Male,0,0,40,United-States,<=50K,Aaron Anderson
3294,43,Private,220589,HS-grad,9,Divorced,Exec-managerial,Unmarried,White,Female,0,0,40,United-States,<=50K,Aaron Andrews
17876,64,?,45817,9th,5,Married-civ-spouse,?,Husband,White,Male,0,0,50,United-States,<=50K,Aaron Ball
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30668,39,Private,176279,Bachelors,13,Divorced,Sales,Not-in-family,White,Female,0,0,40,United-States,<=50K,Zoe Ball
11635,39,Private,120985,HS-grad,9,Divorced,Other-service,Own-child,White,Male,0,0,40,United-States,<=50K,Zoe Bernard
20145,55,Self-emp-not-inc,396878,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,25,United-States,<=50K,Zoe Reyes
14964,23,Private,413345,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,60,United-States,<=50K,Zoe Reynolds


## Contagem

In [35]:
dupp = dataset.duplicated()  

True in dupp.values

False

In [36]:
dataset['age'].value_counts()

age
36    898
31    888
34    886
23    877
35    876
     ... 
83      6
88      3
85      3
86      1
87      1
Name: count, Length: 73, dtype: int64

In [37]:
dataset['age'].value_counts(normalize = True, sort = False)

age
39    0.025061
50    0.018488
38    0.025398
53    0.014250
28    0.026627
        ...   
83    0.000184
84    0.000307
85    0.000092
86    0.000031
87    0.000031
Name: proportion, Length: 73, dtype: float64

In [39]:
dataset['age'].value_counts(sort = True, bins = 10 )

(38.9, 46.2]      6163
(31.6, 38.9]      6048
(24.3, 31.6]      5890
(16.926, 24.3]    5570
(46.2, 53.5]      3967
(53.5, 60.8]      2591
(60.8, 68.1]      1595
(68.1, 75.4]       496
(75.4, 82.7]       174
(82.7, 90.0]        67
Name: count, dtype: int64

## Filtros 

In [40]:
paises = [] 

for _ in range(len(dataset)): 
    paises.append(fake.country()) 

print(paises)

['French Southern Territories', 'Maldives', 'Guatemala', 'Somalia', 'Vanuatu', 'Serbia', 'Mexico', 'Cyprus', 'Cameroon', 'Antarctica (the territory South of 60 deg S)', 'Congo', 'Swaziland', 'Cocos (Keeling) Islands', 'Tonga', 'Azerbaijan', 'Chad', 'Japan', 'Maldives', 'Colombia', 'Czech Republic', 'Congo', 'Poland', 'United States Minor Outlying Islands', 'San Marino', 'Paraguay', 'Cambodia', 'Botswana', 'Ecuador', 'Comoros', 'Micronesia', 'Cook Islands', 'Belize', 'Vanuatu', 'Mozambique', 'Iran', 'Kazakhstan', 'Papua New Guinea', 'Rwanda', 'Moldova', 'Madagascar', 'Slovakia (Slovak Republic)', 'Portugal', 'Samoa', 'Saint Lucia', 'Portugal', 'Cameroon', 'Malta', 'Morocco', 'Puerto Rico', 'Jersey', 'Chile', 'Lebanon', 'Vanuatu', 'Gibraltar', 'Greenland', 'New Zealand', 'Micronesia', 'Macao', 'Liechtenstein', 'Benin', 'Iceland', 'Liberia', 'Cook Islands', 'Vanuatu', 'Georgia', 'Georgia', 'Austria', 'Armenia', 'Wallis and Futuna', 'El Salvador', 'Montserrat', 'Oman', 'Algeria', 'Guinea-B

In [41]:
dataset['paises'] = paises

In [43]:
print(dataset['paises'].head(), dataset['paises'].tail())

0    French Southern Territories
1                       Maldives
2                      Guatemala
3                        Somalia
4                        Vanuatu
Name: paises, dtype: object 32556               Brazil
32557       Western Sahara
32558             Barbados
32559               Turkey
32560    Wallis and Futuna
Name: paises, dtype: object


In [53]:
dataset.loc[(dataset['age'] > 50 ) & (dataset['paises'] == 'Brazil')]

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,NOME,paises
622,65,Private,109351,9th,5,Widowed,Priv-house-serv,Unmarried,Black,Female,0,0,24,United-States,<=50K,Brian Brown,Brazil
1441,61,?,347089,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,16,United-States,<=50K,Donald Walters,Brazil
3017,55,Federal-gov,270859,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,>50K,Alyssa Simpson,Brazil
4970,51,?,203953,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,40,United-States,>50K,Jessica Vance,Brazil
6948,60,?,124487,Some-college,10,Divorced,?,Not-in-family,White,Female,0,0,40,United-States,>50K,Donald Gross,Brazil
6967,60,Private,230545,7th-8th,4,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,35,Cuba,<=50K,Michael Riley,Brazil
9303,51,Self-emp-not-inc,186845,Bachelors,13,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,8,United-States,<=50K,Mark Peterson,Brazil
10780,55,Local-gov,212448,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K,Rachel Diaz,Brazil
11137,51,Private,174754,HS-grad,9,Never-married,Other-service,Not-in-family,Black,Female,0,0,38,United-States,<=50K,Sherri West,Brazil
11840,54,Private,138026,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States,<=50K,Dr. William Butler,Brazil


## Operações matemáticas

In [64]:
dataset['age'] 
idade_mais_dois = dataset['age'] + 2 

print(dataset['age']) 
print(idade_mais_dois)

0        39
1        50
2        38
3        53
4        28
         ..
32556    27
32557    40
32558    58
32559    22
32560    52
Name: age, Length: 32561, dtype: int64
0        41
1        52
2        40
3        55
4        30
         ..
32556    29
32557    42
32558    60
32559    24
32560    54
Name: age, Length: 32561, dtype: int64


## Operações com Strings

In [73]:
dataset['paises'].str.contains('Fre')

0         True
1        False
2        False
3        False
4        False
         ...  
32556    False
32557    False
32558    False
32559    False
32560    False
Name: paises, Length: 32561, dtype: bool

In [79]:
dataset['paises'].str.split(' ')

0        [French, Southern, Territories]
1                             [Maldives]
2                            [Guatemala]
3                              [Somalia]
4                              [Vanuatu]
                      ...               
32556                           [Brazil]
32557                  [Western, Sahara]
32558                         [Barbados]
32559                           [Turkey]
32560              [Wallis, and, Futuna]
Name: paises, Length: 32561, dtype: object

In [81]:
dataset['paises'].str.split(' ', expand = True)

Unnamed: 0,0,1,2,3,4,5,6,7
0,French,Southern,Territories,,,,,
1,Maldives,,,,,,,
2,Guatemala,,,,,,,
3,Somalia,,,,,,,
4,Vanuatu,,,,,,,
...,...,...,...,...,...,...,...,...
32556,Brazil,,,,,,,
32557,Western,Sahara,,,,,,
32558,Barbados,,,,,,,
32559,Turkey,,,,,,,


In [80]:
dataset['paises'].str.split('')

0        [, F, r, e, n, c, h,  , S, o, u, t, h, e, r, n...
1                             [, M, a, l, d, i, v, e, s, ]
2                          [, G, u, a, t, e, m, a, l, a, ]
3                                [, S, o, m, a, l, i, a, ]
4                                [, V, a, n, u, a, t, u, ]
                               ...                        
32556                               [, B, r, a, z, i, l, ]
32557       [, W, e, s, t, e, r, n,  , S, a, h, a, r, a, ]
32558                         [, B, a, r, b, a, d, o, s, ]
32559                               [, T, u, r, k, e, y, ]
32560    [, W, a, l, l, i, s,  , a, n, d,  , F, u, t, u...
Name: paises, Length: 32561, dtype: object

## Agrupamentos 

In [84]:
dataset['age'].sum()

1256257

In [85]:
dataset['age'].mean()

38.58164675532078

In [86]:
dataset['age'].median()

37.0

In [88]:
dataset['age'].std()

13.640432553581146

In [87]:
dataset['age'].var()

186.06140024879625

In [103]:
dataset['age'].quantile([0.25, 0.50, 0.75])

0.25    28.0
0.50    37.0
0.75    48.0
Name: age, dtype: float64

## Agrupmento categórico

In [104]:
dataset['paises'].value_counts() 

paises
Congo               292
Korea               267
Paraguay            175
Montserrat          167
Iran                165
                   ... 
Saint Barthelemy    109
Benin               109
Bulgaria            108
Seychelles          105
Honduras            103
Name: count, Length: 243, dtype: int64

In [107]:
dataset['paises'].value_counts(normalize  = True)

paises
Congo               0.008968
Korea               0.008200
Paraguay            0.005375
Montserrat          0.005129
Iran                0.005067
                      ...   
Saint Barthelemy    0.003348
Benin               0.003348
Bulgaria            0.003317
Seychelles          0.003225
Honduras            0.003163
Name: proportion, Length: 243, dtype: float64

In [108]:
dataset['paises'].unique()

array(['French Southern Territories', 'Maldives', 'Guatemala', 'Somalia',
       'Vanuatu', 'Serbia', 'Mexico', 'Cyprus', 'Cameroon',
       'Antarctica (the territory South of 60 deg S)', 'Congo',
       'Swaziland', 'Cocos (Keeling) Islands', 'Tonga', 'Azerbaijan',
       'Chad', 'Japan', 'Colombia', 'Czech Republic', 'Poland',
       'United States Minor Outlying Islands', 'San Marino', 'Paraguay',
       'Cambodia', 'Botswana', 'Ecuador', 'Comoros', 'Micronesia',
       'Cook Islands', 'Belize', 'Mozambique', 'Iran', 'Kazakhstan',
       'Papua New Guinea', 'Rwanda', 'Moldova', 'Madagascar',
       'Slovakia (Slovak Republic)', 'Portugal', 'Samoa', 'Saint Lucia',
       'Malta', 'Morocco', 'Puerto Rico', 'Jersey', 'Chile', 'Lebanon',
       'Gibraltar', 'Greenland', 'New Zealand', 'Macao', 'Liechtenstein',
       'Benin', 'Iceland', 'Liberia', 'Georgia', 'Austria', 'Armenia',
       'Wallis and Futuna', 'El Salvador', 'Montserrat', 'Oman',
       'Algeria', 'Guinea-Bissau', 'Bolivi

In [110]:
dataset['paises'].nunique()

243

## Valores Faltantes

In [124]:
serie_faltante = pd.Series([1,2,3,np.nan,4,5, np.nan])

In [125]:
serie_faltante.isna()

0    False
1    False
2    False
3     True
4    False
5    False
6     True
dtype: bool

In [126]:
serie_faltante.isna().sum()

2

In [127]:
serie_faltante.value_counts(dropna=False)

NaN    2
1.0    1
2.0    1
3.0    1
4.0    1
5.0    1
Name: count, dtype: int64

### preenchimento

In [128]:
serie_faltante.fillna(serie_faltante.mean()) ## método mais adequado

0    1.0
1    2.0
2    3.0
3    3.0
4    4.0
5    5.0
6    3.0
dtype: float64

In [119]:
serie_faltante.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    4.0
5    5.0
6    0.0
dtype: float64

In [122]:
serie_faltante.dropna()

0    1.0
1    2.0
2    3.0
4    4.0
5    5.0
dtype: float64

In [136]:
serie_categorica = pd.Series(['Uva','Banana','Uva',np.nan,'Melancia',np.nan])

In [141]:
serie_categorica.fillna(serie_categorica.mode().iloc[0])

0         Uva
1      Banana
2         Uva
3         Uva
4    Melancia
5         Uva
dtype: object

## Funções

In [143]:
dataset['age'].loc[dataset['age'] < 18]

106      17
209      17
262      17
271      17
335      17
         ..
31772    17
31864    17
31959    17
32282    17
32447    17
Name: age, Length: 395, dtype: int64

In [165]:
def corrige_idade(idade):
    if idade < 18:
        idade = 18 
    return idade

In [166]:
dataset['age'] = dataset['age'].apply(corrige_idade)

In [167]:
dataset['age'].loc[dataset['age'] < 18]

Series([], Name: age, dtype: int64)

### lambda

In [163]:
dataset['age'] = dataset['age'].apply(lambda x: 17 if x == 18 else x)

In [164]:
dataset['age'].loc[dataset['age'] < 18]

51       17
78       17
80       17
106      17
168      17
         ..
32345    17
32392    17
32443    17
32447    17
32496    17
Name: age, Length: 945, dtype: int64

In [185]:
dataset['age'] = dataset['age'].apply(lambda x: 0 if x <= 40 else x)

TypeError: '<=' not supported between instances of 'str' and 'int'

In [172]:
dataset['age'].loc[dataset['age'] > 80]

TypeError: '>' not supported between instances of 'str' and 'int'

In [181]:
dataset['age'].loc[dataset['age']==0]

14       0
20       0
76       0
121      0
183      0
        ..
32376    0
32388    0
32451    0
32514    0
32557    0
Name: age, Length: 794, dtype: object

In [194]:
dataset['age'].loc[dataset['age'] == 'dinossauro']

222      dinossauro
918      dinossauro
1040     dinossauro
1168     dinossauro
1935     dinossauro
            ...    
32277    dinossauro
32367    dinossauro
32459    dinossauro
32494    dinossauro
32525    dinossauro
Name: age, Length: 99, dtype: object

In [195]:
dataset['age'].where(dataset['age'] == 'dinossauro',0)  

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    0
32558    0
32559    0
32560    0
Name: age, Length: 32561, dtype: object