# Limpeza e preparação de dados para análise

Este notebook tem a por finalidade o treinamento e aprendizado da preparação de um DataSet para posterior análise de dados.

In [198]:
# importando bibliotecas
import pandas as pd
import numpy as np

## Carregando os dados

In [199]:
# Lendo os dados
open_masc = pd.read_csv('data/men_main_21.1.csv', sep=',', low_memory=False)

## Visualizando dados
Essa visualização é importante para compreender o que precisa ser feito no dataset

In [200]:
# Primeira visualização do Dataframe
open_masc.head()

Unnamed: 0.1,Unnamed: 0,first_name,lastname,Placing,country,continent,Age,affiliate,Height and Weight,points,21.1,21.1 time,21.2,21.2 time,21.3,21.3 time,21.4,weight,Unnamed: 18
0,0,JEFFRE,ADLER,1,Canada,North America,Age 27,CrossFit Wonderland,69 in | 197 lb,101.0,20th (11:55),605 reps,8th (9:14),225 reps,27th (8:15),180 reps,46th (317 lbs),Weight lifted: 317 lbs.,
1,1,SCOTT,PANCHIK,2,United States,North America,Age 33,CrossFit Mentality,69 in | 187 lb,141.0,33rd (12:25),605 reps,47th (9:52),225 reps,5th (7:48),180 reps,56th (316 lbs),Weight lifted: 316 lbs.,
2,2,TRAVIS,MEAD,3,United States,North America,Age 34,Iron Valley CrossFit,73 in | 205 lb,165.0,87th (13:02),605 reps,24th (9:38),225 reps,48th (8:26),180 reps,6th (345 lbs),Weight lifted: 345 lbs.,
3,3,SAXON,PANCHIK,4,United States,North America,Age 25,CrossFit Cliffside,69 in | 180 lb,217.0,5th (11:25),605 reps,68th (9:59),225 reps,87th (8:41),180 reps,57th (316 lbs),Weight lifted: 316 lbs.,
4,4,RICHARD,FRONING JR.,5,United States,North America,Age 33,CrossFit Mayhem,69 in | 194 lb,254.0,58th (12:45),605 reps,91st (10:04),225 reps,5th (7:48),180 reps,100th (312 lbs),Weight lifted: 312 lbs.,


In [201]:
# Verificando o "tamanho" do df
open_masc.shape

(137464, 19)

In [202]:
open_masc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137464 entries, 0 to 137463
Data columns (total 19 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         137464 non-null  int64  
 1   first_name         137463 non-null  object 
 2   lastname           137456 non-null  object 
 3   Placing            137464 non-null  object 
 4   country            137464 non-null  object 
 5   continent          137464 non-null  object 
 6   Age                137464 non-null  object 
 7   affiliate          112681 non-null  object 
 8   Height and Weight  87327 non-null   object 
 9   points             137463 non-null  float64
 10  21.1               137464 non-null  object 
 11  21.1 time          8238 non-null    object 
 12  21.2               137462 non-null  object 
 13  21.2 time          65789 non-null   object 
 14  21.3               137461 non-null  object 
 15  21.3 time          15202 non-null   object 
 16  21

In [203]:
# Primeira descrição dos dados numéricos, 
# verifica-se que algumas colunas de valores estão em string e 
# precisam ser convertidos para possibilitar que os calculos sejam feitos
open_masc.describe()

Unnamed: 0.1,Unnamed: 0,points
count,137464.0,137463.0
mean,68731.5,266044.715029
std,39682.583039,132747.169208
min,0.0,101.0
25%,34365.75,154847.0
50%,68731.5,274493.0
75%,103097.25,386686.5
max,137463.0,467118.0


In [204]:
# Verificando quais são as colunas originais do df
open_masc.columns

Index(['Unnamed: 0', 'first_name', 'lastname', 'Placing', 'country',
       'continent', 'Age', 'affiliate', 'Height and Weight', 'points', '21.1',
       '21.1 time', '21.2', '21.2 time', '21.3', '21.3 time', '21.4', 'weight',
       'Unnamed: 18'],
      dtype='object')

## Tratando colunas

In [205]:
open_masc =  open_masc.rename(columns={'weight': 'weight_lifted', 'Age': 'age', '21.2 time': '21.2 reps', '21.1 time': '21.1 reps', 'Placing':'placing'})
open_masc.columns

Index(['Unnamed: 0', 'first_name', 'lastname', 'placing', 'country',
       'continent', 'age', 'affiliate', 'Height and Weight', 'points', '21.1',
       '21.1 reps', '21.2', '21.2 reps', '21.3', '21.3 time', '21.4',
       'weight_lifted', 'Unnamed: 18'],
      dtype='object')

### Separando as colunas de altura e peso do atleta

#### Altura

In [206]:
open_masc['height'] = open_masc['Height and Weight']

In [207]:
def define_height(value):
    x = None
    if type(value) == str:
        a = value.split()
        if a[1] == 'in':
            x = int(a[0]) * 2.54
        elif a[1] == 'cm':
            x = int(a[0])
    return x

In [208]:
open_masc['height'] = open_masc['height'].apply(define_height)

#### Peso do atleta

In [209]:
open_masc['weight'] = open_masc['Height and Weight']

In [210]:
def define_weight(value):
    x = None
    if type(value) == str:
        a = value.split()
        if a[-1] == 'lb':
            x = int(a[-2]) / 2.2
        elif a[-1] == 'kg':
            x = int(a[-2])
    return x

In [211]:
open_masc['weight'] = open_masc['weight'].apply(define_weight)

Removendo a coluna de peso e altura 

In [212]:
open_masc.drop(['Height and Weight'], axis=1, inplace=True)

### Transformando a coluna de idade em numérico

Durante o processo para realizar a transformação da coluna idade (de string para numérico) observou-se inconsistência nos dados da linha 57645, optando-se pela remoção da mesma

In [213]:
open_masc.iloc[57645]

Unnamed: 0                         57645
first_name                         NICOL
lastname                             NaN
placing                             MARC
country                            57646
continent                 United Kingdom
age                               Europe
affiliate                         Age 41
points                               NaN
21.1                              237470
21.1 reps             48384th (236 reps)
21.2                                 NaN
21.2 reps             63386th (214 reps)
21.3                                 NaN
21.3 time             68919th (126 reps)
21.4                                 NaN
weight_lifted          56781st (176 lbs)
Unnamed: 18      Weight lifted: 176 lbs.
height                               NaN
weight                               NaN
Name: 57645, dtype: object

In [214]:
open_masc.drop(labels=57645, axis=0, inplace=True, errors='raise')

In [215]:
def transforma_idade(texto):
    if texto == "":
        x = 0
    else:
        texto.split(" ")
        x = texto[1]
    return(texto[-2:])

In [216]:
open_masc['age'] = open_masc['age'].apply(transforma_idade)


In [217]:
# Convertendo a coluna age (idade) de 'string' para numérico
open_masc['age'] = pd.to_numeric(open_masc['age'])
open_masc.head(10)

Unnamed: 0.1,Unnamed: 0,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.1 reps,21.2,21.2 reps,21.3,21.3 time,21.4,weight_lifted,Unnamed: 18,height,weight
0,0,JEFFRE,ADLER,1,Canada,North America,27,CrossFit Wonderland,101.0,20th (11:55),605 reps,8th (9:14),225 reps,27th (8:15),180 reps,46th (317 lbs),Weight lifted: 317 lbs.,,175.26,89.545455
1,1,SCOTT,PANCHIK,2,United States,North America,33,CrossFit Mentality,141.0,33rd (12:25),605 reps,47th (9:52),225 reps,5th (7:48),180 reps,56th (316 lbs),Weight lifted: 316 lbs.,,175.26,85.0
2,2,TRAVIS,MEAD,3,United States,North America,34,Iron Valley CrossFit,165.0,87th (13:02),605 reps,24th (9:38),225 reps,48th (8:26),180 reps,6th (345 lbs),Weight lifted: 345 lbs.,,185.42,93.181818
3,3,SAXON,PANCHIK,4,United States,North America,25,CrossFit Cliffside,217.0,5th (11:25),605 reps,68th (9:59),225 reps,87th (8:41),180 reps,57th (316 lbs),Weight lifted: 316 lbs.,,175.26,81.818182
4,4,RICHARD,FRONING JR.,5,United States,North America,33,CrossFit Mayhem,254.0,58th (12:45),605 reps,91st (10:04),225 reps,5th (7:48),180 reps,100th (312 lbs),Weight lifted: 312 lbs.,,175.26,88.181818
5,5,NOAH,OHLSEN,6,United States,North America,30,Peak 360 CrossFit,272.0,11th (11:41),605 reps,21st (9:33),225 reps,2nd (7:44),180 reps,238th (301 lbs),Weight lifted: 301 lbs.,,170.18,86.363636
6,6,SAMUEL,COURNOYER,7,Canada,North America,25,CrossFit Mayhem,276.0,158th (13:27),605 reps,79th (10:02),225 reps,20th (8:09),180 reps,19th (328 lbs),Weight lifted: 328 lbs.,,180.0,93.181818
7,7,COLE,GREASHABER,8,United States,North America,21,CrossFit Lee's Summit (LS),403.0,82nd (13:01),605 reps,68th (9:59),225 reps,46th (8:25),180 reps,207th (303 lbs),Weight lifted: 303 lbs.,,182.88,93.636364
8,8,AGUSTIN,RICHELME,9,Argentina,South America,24,,417.0,30th (12:19),605 reps,137th (10:15),225 reps,1st (7:40),180 reps,249th (300 lbs),Weight lifted: 300 lbs.,,173.0,83.0
9,9,OLEG,LASCENKO,10,United Kingdom,Europe,23,CrossFit Abu Dhabi,428.0,53rd (12:43),605 reps,22nd (9:34),225 reps,102nd (8:46),180 reps,251st (300 lbs),Weight lifted: 300 lbs.,,175.0,92.0


##### Transformando em numérico as colunas com números de reps 

In [220]:
def remove_reps(t):
    t.replace(" reps", "")
    print(t)
    return t

In [221]:
open_masc['21.2 reps'] = open_masc['21.2 reps'].apply(remove_reps)

225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
225 reps
2

AttributeError: 'float' object has no attribute 'replace'

In [None]:
open_masc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137463 entries, 0 to 137463
Data columns (total 20 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     137463 non-null  int64  
 1   first_name     137462 non-null  object 
 2   lastname       137456 non-null  object 
 3   placing        137463 non-null  object 
 4   country        137463 non-null  object 
 5   continent      137463 non-null  object 
 6   age            137463 non-null  int64  
 7   affiliate      112680 non-null  object 
 8   points         137463 non-null  float64
 9   21.1           137463 non-null  object 
 10  21.1 reps      8237 non-null    object 
 11  21.2           137462 non-null  object 
 12  21.2 reps      65788 non-null   object 
 13  21.3           137461 non-null  object 
 14  21.3 time      15201 non-null   object 
 15  21.4           137461 non-null  object 
 16  weight_lifted  100593 non-null  object 
 17  Unnamed: 18    2 non-null    

### Limpeza os dados

In [None]:
# Conferindo os continentes listados, 
# verificou-se a necessidade de remover dados inconsistentes
continent = np.array(open_masc['continent'])
unique_continent = np.unique(continent)
print(unique_continent)

['Africa' 'Asia' 'Europe' 'North America' 'Oceania' 'South America']


In [None]:
# Visualizando todos os países listados
# Verificou-se a necessidade de remover uma linha com dados inconsistentes
country = np.array(open_masc['country'])
unique_country = np.unique(country)
print("Total de países participantes: ", len(unique_country))
print(unique_country)

Total de países participantes:  156
['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola' 'Argentina'
 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain'
 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bolivia'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Brunei Darussalam'
 'Bulgaria' 'Cambodia' 'Cameroon' 'Canada' 'Cape Verde' 'Chile' 'China'
 'Colombia' 'Congo, The Democratic Republic of the' 'Costa Rica' 'Croatia'
 'Cyprus' 'Czech Republic' "Côte d'Ivoire" 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Estonia' 'Ethiopia'
 'Fiji' 'Finland' 'France' 'Georgia' 'Germany' 'Ghana' 'Greece'
 'Guatemala' 'Guinea' 'Guyana' 'Haiti' 'Honduras' 'Hungary' 'Iceland'
 'India' 'Indonesia' 'Iran' 'Iraq' 'Ireland' 'Israel' 'Italy' 'Jamaica'
 'Japan' 'Jordan' 'Kazakhstan' 'Kenya' 'Korea, Republic of' 'Kosovo'
 'Kuwait' 'Kyrgyzstan' 'Laos' 'Latvia' 'Lebanon' 'Libya' 'Liechtenstein'
 'Lithuania' 'Luxembourg' 'Macedonia' 'Madagascar

In [None]:
open_masc.isnull().sum()

Unnamed: 0            0
first_name            1
lastname              7
placing               0
country               0
continent             0
age                   0
affiliate         24783
points                0
21.1                  0
21.1 reps        129226
21.2                  1
21.2 reps         71675
21.3                  2
21.3 time        122262
21.4                  2
weight_lifted     36870
Unnamed: 18      137461
height            55349
weight            50962
dtype: int64

#### Seleção de colunas

In [None]:
# Novo Dataframe apenas com as colunas que serão analisadas
df_masc = open_masc[['id', 'first_name', 'lastname', 'height', 'weight', 'country', 'continent',
       'age', 'affiliate', 'points', '21.1',
       '21.2', '21.3', '21.4', 'weight_lifted']]
df_masc[100500:100510]

KeyError: "['id'] not in index"

#### Países e continentes

In [None]:
# removendo inconsistencias de paises e continentes
filtro1 = df_masc['country'] != '57646'
df_masc = df_masc[filtro1]

#### Idades

Unnamed: 0,id,first_name,lastname,placing,country,continent,age,affiliate,height,weight,points,21.1,21.2,21.3,21.4,weight_lifted
0,0,JEFFRE,ADLER,1,Canada,North America,27,CrossFit Wonderland,69 in,197 lb,101,20th (11:55),8th (9:14),27th (8:15),46th (317 lbs),317
1,1,SCOTT,PANCHIK,2,United States,North America,33,CrossFit Mentality,69 in,187 lb,141,33rd (12:25),47th (9:52),5th (7:48),56th (316 lbs),316
2,2,TRAVIS,MEAD,3,United States,North America,34,Iron Valley CrossFit,73 in,205 lb,165,87th (13:02),24th (9:38),48th (8:26),6th (345 lbs),345
3,3,SAXON,PANCHIK,4,United States,North America,25,CrossFit Cliffside,69 in,180 lb,217,5th (11:25),68th (9:59),87th (8:41),57th (316 lbs),316
4,4,RICHARD,FRONING JR.,5,United States,North America,33,CrossFit Mayhem,69 in,194 lb,254,58th (12:45),91st (10:04),5th (7:48),100th (312 lbs),312
5,5,NOAH,OHLSEN,6,United States,North America,30,Peak 360 CrossFit,67 in,190 lb,272,11th (11:41),21st (9:33),2nd (7:44),238th (301 lbs),301
6,6,SAMUEL,COURNOYER,7,Canada,North America,25,CrossFit Mayhem,180 cm,205 lb,276,158th (13:27),79th (10:02),20th (8:09),19th (328 lbs),328
7,7,COLE,GREASHABER,8,United States,North America,21,CrossFit Lee's Summit (LS),72 in,206 lb,403,82nd (13:01),68th (9:59),46th (8:25),207th (303 lbs),303
8,8,AGUSTIN,RICHELME,9,Argentina,South America,24,,173 cm,83 kg,417,30th (12:19),137th (10:15),1st (7:40),249th (300 lbs),300
9,9,OLEG,LASCENKO,10,United Kingdom,Europe,23,CrossFit Abu Dhabi,175 cm,92 kg,428,53rd (12:43),22nd (9:34),102nd (8:46),251st (300 lbs),300


#### peso levantado

In [None]:
# Preenchendo pesos nullos com valor zero
df_masc['weight_lifted'] = df_masc['weight_lifted'].fillna('0')
df_masc.head(10)

Unnamed: 0,id,first_name,lastname,placing,country,continent,age,affiliate,height,weight,points,21.1,21.2,21.3,21.4,weight_lifted
0,0,JEFFRE,ADLER,1,Canada,North America,27,CrossFit Wonderland,69 in,197 lb,101,20th (11:55),8th (9:14),27th (8:15),46th (317 lbs),317
1,1,SCOTT,PANCHIK,2,United States,North America,33,CrossFit Mentality,69 in,187 lb,141,33rd (12:25),47th (9:52),5th (7:48),56th (316 lbs),316
2,2,TRAVIS,MEAD,3,United States,North America,34,Iron Valley CrossFit,73 in,205 lb,165,87th (13:02),24th (9:38),48th (8:26),6th (345 lbs),345
3,3,SAXON,PANCHIK,4,United States,North America,25,CrossFit Cliffside,69 in,180 lb,217,5th (11:25),68th (9:59),87th (8:41),57th (316 lbs),316
4,4,RICHARD,FRONING JR.,5,United States,North America,33,CrossFit Mayhem,69 in,194 lb,254,58th (12:45),91st (10:04),5th (7:48),100th (312 lbs),312
5,5,NOAH,OHLSEN,6,United States,North America,30,Peak 360 CrossFit,67 in,190 lb,272,11th (11:41),21st (9:33),2nd (7:44),238th (301 lbs),301
6,6,SAMUEL,COURNOYER,7,Canada,North America,25,CrossFit Mayhem,180 cm,205 lb,276,158th (13:27),79th (10:02),20th (8:09),19th (328 lbs),328
7,7,COLE,GREASHABER,8,United States,North America,21,CrossFit Lee's Summit (LS),72 in,206 lb,403,82nd (13:01),68th (9:59),46th (8:25),207th (303 lbs),303
8,8,AGUSTIN,RICHELME,9,Argentina,South America,24,,173 cm,83 kg,417,30th (12:19),137th (10:15),1st (7:40),249th (300 lbs),300
9,9,OLEG,LASCENKO,10,United Kingdom,Europe,23,CrossFit Abu Dhabi,175 cm,92 kg,428,53rd (12:43),22nd (9:34),102nd (8:46),251st (300 lbs),300


In [None]:
# Iniciando o processo para converter weight_lift para numerico
# Função para remover a string 'lbs' e converter peso para quilos
def substitui_lbs(value):
    x = value.replace(' lbs', '')
    x = float(x) / 2.205
    return(x)

In [None]:
# Função para remover a string 'kg'
def substitui_kg(value):
    x = value.replace(' kg', '')
    return(x)

In [None]:
df_masc['weight_lifted'] = df_masc['weight_lifted'].apply(substitui_kg)


In [None]:
#Funaçao para substituir valores inválidos por peso zero
def remove_peso_invalido(value):
    if ('(--)' in value):
        value = '0'
    return value

In [None]:
df_masc['weight_lifted'] = df_masc['weight_lifted'].apply(remove_peso_invalido)

In [None]:
df_masc['weight_lifted'] = df_masc['weight_lifted'].apply(substitui_lbs)


In [None]:
# Convertendo a coluna weight lifted de 'string' para numérico
df_masc['weight_lifted'] = pd.to_numeric(df_masc['weight_lifted'])

In [None]:
df_masc['weight_lifted']

0         143.764172
1         143.310658
2         156.462585
3         143.310658
4         141.496599
             ...    
137459      0.000000
137460      0.000000
137461      0.000000
137462      0.000000
137463      0.000000
Name: weight_lifted, Length: 137463, dtype: float64

#### pontos

In [None]:
df_masc.head(5)

Unnamed: 0,id,first_name,lastname,placing,country,continent,age,affiliate,height,weight,points,21.1,21.2,21.3,21.4,weight_lifted
0,0,JEFFRE,ADLER,1,Canada,North America,27,CrossFit Wonderland,69 in,197 lb,101,20th (11:55),8th (9:14),27th (8:15),46th (317 lbs),143.764172
1,1,SCOTT,PANCHIK,2,United States,North America,33,CrossFit Mentality,69 in,187 lb,141,33rd (12:25),47th (9:52),5th (7:48),56th (316 lbs),143.310658
2,2,TRAVIS,MEAD,3,United States,North America,34,Iron Valley CrossFit,73 in,205 lb,165,87th (13:02),24th (9:38),48th (8:26),6th (345 lbs),156.462585
3,3,SAXON,PANCHIK,4,United States,North America,25,CrossFit Cliffside,69 in,180 lb,217,5th (11:25),68th (9:59),87th (8:41),57th (316 lbs),143.310658
4,4,RICHARD,FRONING JR.,5,United States,North America,33,CrossFit Mayhem,69 in,194 lb,254,58th (12:45),91st (10:04),5th (7:48),100th (312 lbs),141.496599


In [None]:
df_masc['points']

0                   101
1                   141
2                   165
3                   217
4                   254
              ...      
137459    117347th (--)
137460    117347th (--)
137461    117347th (--)
137462          467.118
137463    117347th (--)
Name: points, Length: 137463, dtype: object

In [None]:
# Inicio do processo para converter points para numérico
def remove_pontos_invalidos(value):
    if ('lb' in value):
        x = "0"
    elif ('kg' in value):
        x = '0'
    elif ('t' in value):
        x = '0'
    elif ('d' in value):
        x = '0'
    else:
        x = value
    return x

In [None]:
df_masc.head(2)

Unnamed: 0,id,first_name,lastname,placing,country,continent,age,affiliate,height,weight,points,21.1,21.2,21.3,21.4,weight_lifted
0,0,JEFFRE,ADLER,1,Canada,North America,27,CrossFit Wonderland,69 in,197 lb,101,20th (11:55),8th (9:14),27th (8:15),46th (317 lbs),143.764172
1,1,SCOTT,PANCHIK,2,United States,North America,33,CrossFit Mentality,69 in,187 lb,141,33rd (12:25),47th (9:52),5th (7:48),56th (316 lbs),143.310658


In [None]:
df_masc['points'] = df_masc['points'].apply(remove_pontos_invalidos)
# df_masc['points']

In [None]:
# Convertendo a coluna points lifted de 'string' para numérico
df_masc['points'] = pd.to_numeric(df_masc['points'])
df_masc['points']

0         101.000
1         141.000
2         165.000
3         217.000
4         254.000
           ...   
137459      0.000
137460      0.000
137461      0.000
137462    467.118
137463      0.000
Name: points, Length: 137463, dtype: float64

In [None]:
df_masc['points'] = df_masc['points'].astype(float)
df_masc['points']

0         101.000
1         141.000
2         165.000
3         217.000
4         254.000
           ...   
137459      0.000
137460      0.000
137461      0.000
137462    467.118
137463      0.000
Name: points, Length: 137463, dtype: float64

In [None]:
df_masc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137463 entries, 0 to 137463
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             137463 non-null  int64  
 1   first_name     137462 non-null  object 
 2   lastname       137456 non-null  object 
 3   placing        137463 non-null  object 
 4   country        137463 non-null  object 
 5   continent      137463 non-null  object 
 6   age            137463 non-null  int64  
 7   affiliate      112680 non-null  object 
 8   height         87194 non-null   object 
 9    weight        119239 non-null  object 
 10  points         137463 non-null  float64
 11  21.1           103061 non-null  object 
 12  21.2           106707 non-null  object 
 13  21.3           101173 non-null  object 
 14  21.4           117636 non-null  object 
 15  weight_lifted  137463 non-null  float64
dtypes: float64(2), int64(2), object(12)
memory usage: 17.8+ MB


#### Removendo linhas nulas

In [None]:
df_masc.isna().sum()

id                   0
first_name           1
lastname             7
placing              0
country              0
continent            0
age                  0
affiliate        24783
height           50269
 weight          18224
points               0
21.1             34402
21.2             30756
21.3             36290
21.4             19827
weight_lifted        0
dtype: int64

In [None]:
# Encontrando quem tem first_name nulo
df_masc[pd.isna(df_masc['first_name'])]

Unnamed: 0,id,first_name,lastname,placing,country,continent,age,affiliate,height,weight,points,21.1,21.2,21.3,21.4,weight_lifted
93138,93138,,YEONGDON,93.139,"Korea, Republic of",Asia,28,CrossFit 660,,359096,0.0,,225,,,0.0


In [None]:
# Encontrando quem tem lastname nulo
df_masc[pd.isna(df_masc['lastname'])]

Unnamed: 0,id,first_name,lastname,placing,country,continent,age,affiliate,height,weight,points,21.1,21.2,21.3,21.4,weight_lifted
2040,2040,HYEONJONG,,2.041,"Korea, Republic of",Asia,31,CrossFit Zest,172 lb,,17.676,3890th (441),3593rd (12:29),5047th (12:56),5146th (252 lbs),114.285714
7732,7732,YOUNGSEON,,7.733,"Korea, Republic of",Asia,23,Golden Crown CrossFit Seodaemun,,,48.734,5280th (395),10317th (13:59),10886th (14:54),22251st (215 lbs),97.505669
19017,19017,TAKSEOK,,19.018,"Korea, Republic of",Asia,41,CrossFit Geumchon,170 cm,75 kg,96.625,11332nd (387),40439th (18:10),22176th (155),22678th (215 lbs),97.505669
73598,73598,MICHAEL,,73.599,Canada,North America,51,CrossFit Calgary,,290309,0.0,,,,175,0.0
81080,81080,JONGMYUNG,,81.081,"Korea, Republic of",Asia,28,CrossFit Lagom,,315173,0.0,,,180,155,0.0
86324,86324,DAVID,,86.325,United States,North America,37,CrossFit East Oahu,70 in,195 lb,333.85,87458th (115),92161st (145),78825th (98),75406th (147 lbs),66.666667
133060,133060,BRIAN,,132.987,United States,North America,33,,69 in,165 lb,459.276,117347th (--),111374th (--),101151st (--),129404th (--),0.0


In [None]:
# Incluindo texto nos campos nulos de acordo com a coluna
values = {'lastname': '-', 'first_name': '-', 'affiliate': 'not_affiliate'}
df_masc.fillna(value = values, inplace=True) 

In [None]:
df_masc.isna().sum()
# Nenhum valor null

id                   0
first_name           0
lastname             0
placing              0
country              0
continent            0
age                  0
affiliate            0
height           50269
 weight          18224
points               0
21.1             34402
21.2             30756
21.3             36290
21.4             19827
weight_lifted        0
dtype: int64