<a href="https://colab.research.google.com/github/FGalvao77/-DICAS-Uso-de-ferramentas-para-Data-Science-e-afins/blob/main/%5BDICAS%5D_Como_iterar_linhas_no_dataframe_com_uso_do_PANDAS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**[DICAS] - Como `iterar` linhas no dataframe com uso do PANDAS**
---


Vamos conhecer algumas formas de _iterar_ sobre as linhas de um conjunto de dados utilizando o **PANDAS** e, dentre as possibilidades, avaliar o desempenho de cada técnica utilizada.

In [1]:
# import the required libraries
import pandas as pd
import seaborn as sns
import warnings

# NOTE(review): this silences every Python warning for the rest of the session,
# which also hides any warnings the cells below might trigger — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
# load the "diamonds" dataset through seaborn's "load_dataset" function
df = sns.load_dataset('diamonds')

In [3]:
# show the first 5 rows
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# general information: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


**iterrows():**

In [5]:
# loop with ".iterrows()" to iterate over the rows of the dataset;
# each yielded item is printed as-is so its structure can be inspected
for i in df.iterrows():
    print(i)
    break  # stop after the first iteration; `i` stays bound and is inspected in the next cells

(0, carat       0.23
cut        Ideal
color          E
clarity      SI2
depth       61.5
table       55.0
price        326
x           3.95
y           3.98
z           2.43
Name: 0, dtype: object)


In [6]:
# first element of the yielded item: the row's index label
i[0]

0

In [7]:
# second element of the yielded item: the row's columns and their values
i[1]

carat       0.23
cut        Ideal
color          E
clarity      SI2
depth       61.5
table       55.0
price        326
x           3.95
y           3.98
z           2.43
Name: 0, dtype: object

In [8]:
# unpack each yielded item into two separate names: the index label and the row values
for idx, row in df.iterrows():
    break  # stop immediately so idx/row keep the values of the first row

In [9]:
# index label of the first row
idx

0

In [10]:
# columns/values of the first row
row

carat       0.23
cut        Ideal
color          E
clarity      SI2
depth       61.5
table       55.0
price        326
x           3.95
y           3.98
z           2.43
Name: 0, dtype: object

Dentre as formas possíveis de iteração, agora iremos avaliar a performance de cada uma.

In [23]:
# count how many rows fall into each distinct value of the "cut" column
df['cut'].value_counts()

Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: cut, dtype: int64

In [22]:
# realizando um "for" e o método ".iterrows()" para iterar cada linha e, visualizando o seu tempo de execução
%time

condition = []
for idx, row in df.iterrows():
    condition.append('cheap' if row['price'] <= 1_000 else 'expensive')

df['evaluation'] = condition
df

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,evaluation,test
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,cheap,good
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,cheap,good
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,cheap,medium
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,cheap,good
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,cheap,medium
...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,expensive,good
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,expensive,medium
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,expensive,medium
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,expensive,good


In [13]:
# create an empty (blank string) "test" column that the following cells fill in
df['test'] = ''
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,evaluation,test
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,cheap,
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,cheap,
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,cheap,
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,cheap,
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,cheap,
...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,expensive,
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,expensive,
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,expensive,
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,expensive,


In [14]:
# realizando um "for" e o método ".iloc[]"
%time

for row in range(len(df)):
    if df['cut'].iloc[row] in ['Ideal', 'Premium']:
        df['test'].iloc[row] = 'great'
    elif df['cut'].iloc[row] in ['Very Good', 'Good']:
        df['test'].iloc[row] = 'medium'
    else:
        df['test'].iloc[row] = 'poor'

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.68 µs


In [15]:
# display the frame to inspect the values written into the "test" column
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,evaluation,test
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,cheap,great
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,cheap,great
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,cheap,medium
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,cheap,great
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,cheap,medium
...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,expensive,great
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,expensive,medium
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,expensive,medium
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,expensive,great


In [16]:
# esse código se assemelha com o anterior, porém estamos instanciando cada linha e atributos/valores em objetos distintos
# e salvando o resultado em uma lista para no final realizar a inserção dos dados na coluna "test"
df['test'] = ''

%time
l = []

for idx, row in df.iterrows():
    if row['cut'] in ['Ideal', 'Premium']:
        l.append('good')
    elif row['cut'] in ['Very Good', 'Good']:
        l.append('medium')
    else:
        l.append('poor')

df['test'] = l
df

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.11 µs


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,evaluation,test
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,cheap,good
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,cheap,good
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,cheap,medium
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,cheap,good
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,cheap,medium
...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,expensive,good
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,expensive,medium
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,expensive,medium
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,expensive,good


In [17]:
# reset the "test" column to blank strings; the function defined next in this cell
# classifies a single row
df['test'] = ''

def iter_rows(df):
    """Classify one record by its 'cut' value.

    Despite its name, the `df` argument is used as a single record: the only
    access made is `df['cut']`.

    Returns:
        'good' for 'Ideal' or 'Premium', 'medium' for 'Very Good' or 'Good',
        and 'poor' for any other value.
    """
    cut_grade = df['cut']
    if cut_grade in ('Ideal', 'Premium'):
        return 'good'
    if cut_grade in ('Very Good', 'Good'):
        return 'medium'
    return 'poor'

In [18]:
# utilizando o método ".apply()" e passando como argumento a função acima e "axis=1"
%time
df['test'] = df.apply(iter_rows, axis=1)
df

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,evaluation,test
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,cheap,good
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,cheap,good
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,cheap,medium
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,cheap,good
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,cheap,medium
...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,expensive,good
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,expensive,medium
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,expensive,medium
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,expensive,good


In [19]:
# utilizando o método ".itertuples()" para realizarmos a iteração e avaliar a sua performance
df['test'] = ''

%time
l = []
for row in df.itertuples():
    if row.cut in ['Ideal', 'Premium']:
        l.append('good')
    elif row.cut in ['Very Good', 'Good']:
        l.append('medium')
    else:
        l.append('poor')

df['test'] = l
df

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,evaluation,test
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,cheap,good
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,cheap,good
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,cheap,medium
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,cheap,good
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,cheap,medium
...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,expensive,good
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,expensive,medium
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,expensive,medium
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,expensive,good


In [20]:
# fazendo o uso do método ".loc[]" e juntamente com ".isin()"
df['test'] = ''

%time
df['test'] = 'poor'
df.loc[df['cut'].isin(['Ideal', 'Premium']), 'test'] = 'good'
df.loc[df['cut'].isin(['Very Good', 'Good']), 'test'] = 'medium'

df

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,evaluation,test
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,cheap,good
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,cheap,good
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,cheap,medium
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,cheap,good
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,cheap,medium
...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,expensive,good
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,expensive,medium
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,expensive,medium
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,expensive,good


In [21]:
# fazendo o uso dos métodos: ".loc[]", "values" e ".isin()"
df['test'] = ''

%time
df['test'] = 'poor'
df.loc[df['cut'].values.isin(['Ideal', 'Premium']), 'test'] = 'good'
df.loc[df['cut'].values.isin(['Very Good', 'Good']), 'test'] = 'medium'

df

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.39 µs


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,evaluation,test
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,cheap,good
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,cheap,good
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,cheap,medium
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,cheap,good
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,cheap,medium
...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,expensive,good
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,expensive,medium
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,expensive,medium
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,expensive,good
