# Limpeza e preparação de dados para análise

Este notebook tem a por finalidade a preparação de um DataSet para posterior análise de dados.

In [779]:
# importando bibliotecas
import pandas as pd
import numpy as np
from summarytools import dfSummary

## 1 - Carregando os dados

In [780]:
# Lendo os dados
open_masc = pd.read_csv('data/men_main_21.1.csv', sep=',', low_memory=False)

## 2 - Visualizando dados
Essa visualização é importante para compreender o que precisa ser feito no dataset.

Verifica-se que algumas colunas de valores estão em string e precisam ser convertidos para numéricos para possibilitar que os calculos sejam feitos.

Há uma coluna que condensa dois dados, peso e altura do atleta, deverá ser convertida em duas colunas.

Há outras colunas que condensam duas informações, por exemplo, a coluna '21.1' armazena dados da posição do atleta naquela prova e também do tempo em que o atleta concluiu a prova. *Vamos ficar apenas com a posição do atleta na prova*.


In [781]:
# Primeira visualização do Dataframe
open_masc.head()

Unnamed: 0.1,Unnamed: 0,first_name,lastname,Placing,country,continent,Age,affiliate,Height and Weight,points,21.1,21.1 time,21.2,21.2 time,21.3,21.3 time,21.4,weight,Unnamed: 18
0,0,JEFFRE,ADLER,1,Canada,North America,Age 27,CrossFit Wonderland,69 in | 197 lb,101.0,20th (11:55),605 reps,8th (9:14),225 reps,27th (8:15),180 reps,46th (317 lbs),Weight lifted: 317 lbs.,
1,1,SCOTT,PANCHIK,2,United States,North America,Age 33,CrossFit Mentality,69 in | 187 lb,141.0,33rd (12:25),605 reps,47th (9:52),225 reps,5th (7:48),180 reps,56th (316 lbs),Weight lifted: 316 lbs.,
2,2,TRAVIS,MEAD,3,United States,North America,Age 34,Iron Valley CrossFit,73 in | 205 lb,165.0,87th (13:02),605 reps,24th (9:38),225 reps,48th (8:26),180 reps,6th (345 lbs),Weight lifted: 345 lbs.,
3,3,SAXON,PANCHIK,4,United States,North America,Age 25,CrossFit Cliffside,69 in | 180 lb,217.0,5th (11:25),605 reps,68th (9:59),225 reps,87th (8:41),180 reps,57th (316 lbs),Weight lifted: 316 lbs.,
4,4,RICHARD,FRONING JR.,5,United States,North America,Age 33,CrossFit Mayhem,69 in | 194 lb,254.0,58th (12:45),605 reps,91st (10:04),225 reps,5th (7:48),180 reps,100th (312 lbs),Weight lifted: 312 lbs.,


In [782]:
# Verificando o "tamanho" do df
open_masc.shape

(137464, 19)

In [783]:
open_masc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137464 entries, 0 to 137463
Data columns (total 19 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         137464 non-null  int64  
 1   first_name         137463 non-null  object 
 2   lastname           137456 non-null  object 
 3   Placing            137464 non-null  object 
 4   country            137464 non-null  object 
 5   continent          137464 non-null  object 
 6   Age                137464 non-null  object 
 7   affiliate          112681 non-null  object 
 8   Height and Weight  87327 non-null   object 
 9   points             137463 non-null  float64
 10  21.1               137464 non-null  object 
 11  21.1 time          8238 non-null    object 
 12  21.2               137462 non-null  object 
 13  21.2 time          65789 non-null   object 
 14  21.3               137461 non-null  object 
 15  21.3 time          15202 non-null   object 
 16  21

In [784]:
# Primeira descrição dos dados numéricos, 
open_masc.describe()

Unnamed: 0.1,Unnamed: 0,points
count,137464.0,137463.0
mean,68731.5,266044.715029
std,39682.583039,132747.169208
min,0.0,101.0
25%,34365.75,154847.0
50%,68731.5,274493.0
75%,103097.25,386686.5
max,137463.0,467118.0


In [785]:
# Verificando quais são as colunas originais do df
open_masc.columns

Index(['Unnamed: 0', 'first_name', 'lastname', 'Placing', 'country',
       'continent', 'Age', 'affiliate', 'Height and Weight', 'points', '21.1',
       '21.1 time', '21.2', '21.2 time', '21.3', '21.3 time', '21.4', 'weight',
       'Unnamed: 18'],
      dtype='object')

## 3 - Tratando colunas

### 3.1 - Renomeando as colunas

In [786]:
open_masc =  open_masc.rename(columns={'Unnamed: 0': 'index', 'weight': 'weight_lifted', 'Age': 'age', 'Placing':'placing'})


In [787]:
open_masc = open_masc.drop(labels='Unnamed: 18', axis=1)
open_masc.columns

Index(['index', 'first_name', 'lastname', 'placing', 'country', 'continent',
       'age', 'affiliate', 'Height and Weight', 'points', '21.1', '21.1 time',
       '21.2', '21.2 time', '21.3', '21.3 time', '21.4', 'weight_lifted'],
      dtype='object')

### 3.2 - Separando as colunas de altura e peso do atleta
Ambas as colunas são dados numéricos.

Unidade de de medidas: peso(Kg), altura(cm)

#### Altura do atleta

In [788]:
open_masc['height'] = open_masc['Height and Weight']

In [789]:
def define_height(value):
    x = None
    if type(value) == str:
        a = value.split()
        if a[1] == 'in':
            x = int(a[0]) * 2.54
            x = round(x,2)
        elif a[1] == 'cm':
            x = int(a[0])
            x = round(x,2)
    return x

In [790]:
open_masc['height'] = open_masc['height'].apply(define_height)

#### Peso do atleta

In [791]:
open_masc['weight'] = open_masc['Height and Weight']

In [792]:
def define_weight(value):
    x = None
    if type(value) == str:
        a = value.split()
        if a[-1] == 'lb':
            x = int(a[-2]) / 2.2
            x = round(x,2)
        elif a[-1] == 'kg':
            x = int(a[-2])
            x = round(x,2)
    return x

In [793]:
open_masc['weight'] = open_masc['weight'].apply(define_weight)

Removendo a coluna de peso e altura 

In [794]:
open_masc.drop(['Height and Weight'], axis=1, inplace=True)

In [795]:
open_masc.head()

Unnamed: 0,index,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.1 time,21.2,21.2 time,21.3,21.3 time,21.4,weight_lifted,height,weight
0,0,JEFFRE,ADLER,1,Canada,North America,Age 27,CrossFit Wonderland,101.0,20th (11:55),605 reps,8th (9:14),225 reps,27th (8:15),180 reps,46th (317 lbs),Weight lifted: 317 lbs.,175.26,89.55
1,1,SCOTT,PANCHIK,2,United States,North America,Age 33,CrossFit Mentality,141.0,33rd (12:25),605 reps,47th (9:52),225 reps,5th (7:48),180 reps,56th (316 lbs),Weight lifted: 316 lbs.,175.26,85.0
2,2,TRAVIS,MEAD,3,United States,North America,Age 34,Iron Valley CrossFit,165.0,87th (13:02),605 reps,24th (9:38),225 reps,48th (8:26),180 reps,6th (345 lbs),Weight lifted: 345 lbs.,185.42,93.18
3,3,SAXON,PANCHIK,4,United States,North America,Age 25,CrossFit Cliffside,217.0,5th (11:25),605 reps,68th (9:59),225 reps,87th (8:41),180 reps,57th (316 lbs),Weight lifted: 316 lbs.,175.26,81.82
4,4,RICHARD,FRONING JR.,5,United States,North America,Age 33,CrossFit Mayhem,254.0,58th (12:45),605 reps,91st (10:04),225 reps,5th (7:48),180 reps,100th (312 lbs),Weight lifted: 312 lbs.,175.26,88.18


### 3.3 - Transformando a coluna de idade em numérico

Durante o processo para realizar a transformação da coluna idade (de string para numérico) observou-se inconsistência nos dados da linha 57645, optando-se pela remoção da mesma

In [796]:
open_masc.iloc[57645]

index                         57645
first_name                    NICOL
lastname                        NaN
placing                        MARC
country                       57646
continent            United Kingdom
age                          Europe
affiliate                    Age 41
points                          NaN
21.1                         237470
21.1 time        48384th (236 reps)
21.2                            NaN
21.2 time        63386th (214 reps)
21.3                            NaN
21.3 time        68919th (126 reps)
21.4                            NaN
weight_lifted     56781st (176 lbs)
height                          NaN
weight                          NaN
Name: 57645, dtype: object

In [797]:
open_masc.drop(labels=57645, axis=0, inplace=True, errors='raise')

In [798]:
def transforma_idade(texto):
    if texto == "":
        x = 0
    else:
        texto.split(" ")
        x = texto[1]
    return(texto[-2:])

In [799]:
open_masc['age'] = open_masc['age'].apply(transforma_idade)


In [800]:
# Convertendo a coluna age (idade) de 'string' para numérico
open_masc['age'] = pd.to_numeric(open_masc['age'])


#### 3.4 - Tratando os resultados das provas 

In [801]:
open_masc = open_masc.drop(labels='21.1 time', axis=1)
open_masc = open_masc.drop(labels='21.2 time', axis=1)
open_masc = open_masc.drop(labels='21.3 time', axis=1)
open_masc.columns

Index(['index', 'first_name', 'lastname', 'placing', 'country', 'continent',
       'age', 'affiliate', 'points', '21.1', '21.2', '21.3', '21.4',
       'weight_lifted', 'height', 'weight'],
      dtype='object')

In [802]:
# Incluindo texto nos campos nulos de acordo com a coluna
values = {'21.1': '0', '21.2': '0', '21.3': '0', '21.4': '0'}
open_masc.fillna(value = values, inplace=True) 

In [803]:
open_masc.head()

Unnamed: 0,index,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.2,21.3,21.4,weight_lifted,height,weight
0,0,JEFFRE,ADLER,1,Canada,North America,27,CrossFit Wonderland,101.0,20th (11:55),8th (9:14),27th (8:15),46th (317 lbs),Weight lifted: 317 lbs.,175.26,89.55
1,1,SCOTT,PANCHIK,2,United States,North America,33,CrossFit Mentality,141.0,33rd (12:25),47th (9:52),5th (7:48),56th (316 lbs),Weight lifted: 316 lbs.,175.26,85.0
2,2,TRAVIS,MEAD,3,United States,North America,34,Iron Valley CrossFit,165.0,87th (13:02),24th (9:38),48th (8:26),6th (345 lbs),Weight lifted: 345 lbs.,185.42,93.18
3,3,SAXON,PANCHIK,4,United States,North America,25,CrossFit Cliffside,217.0,5th (11:25),68th (9:59),87th (8:41),57th (316 lbs),Weight lifted: 316 lbs.,175.26,81.82
4,4,RICHARD,FRONING JR.,5,United States,North America,33,CrossFit Mayhem,254.0,58th (12:45),91st (10:04),5th (7:48),100th (312 lbs),Weight lifted: 312 lbs.,175.26,88.18


In [804]:
def define_posicao(p):
    p = p.split(" ")
    x = p[0]
    if ('th' or 'nd' or 'st' or 'rd' in x):
        x = x.replace('th', '')
        x = x.replace('nd', '')
        x = x.replace('st', '')
        x = x.replace('rd', '')
    return x

In [805]:
open_masc['21.1'] = open_masc['21.1'].apply(define_posicao)
open_masc['21.2'] = open_masc['21.2'].apply(define_posicao)
open_masc['21.3'] = open_masc['21.3'].apply(define_posicao)
open_masc['21.4'] = open_masc['21.4'].apply(define_posicao)
open_masc['21.1'] = pd.to_numeric(open_masc['21.1'])
open_masc['21.2'] = pd.to_numeric(open_masc['21.2'])
open_masc['21.3'] = pd.to_numeric(open_masc['21.3'])
open_masc['21.4'] = pd.to_numeric(open_masc['21.4'])

### 3.5 - Transformando a coluna de peso levantado
Considerando unidade de medida Kg

In [806]:
open_masc.head()

Unnamed: 0,index,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.2,21.3,21.4,weight_lifted,height,weight
0,0,JEFFRE,ADLER,1,Canada,North America,27,CrossFit Wonderland,101.0,20,8,27,46,Weight lifted: 317 lbs.,175.26,89.55
1,1,SCOTT,PANCHIK,2,United States,North America,33,CrossFit Mentality,141.0,33,47,5,56,Weight lifted: 316 lbs.,175.26,85.0
2,2,TRAVIS,MEAD,3,United States,North America,34,Iron Valley CrossFit,165.0,87,24,48,6,Weight lifted: 345 lbs.,185.42,93.18
3,3,SAXON,PANCHIK,4,United States,North America,25,CrossFit Cliffside,217.0,5,68,87,57,Weight lifted: 316 lbs.,175.26,81.82
4,4,RICHARD,FRONING JR.,5,United States,North America,33,CrossFit Mayhem,254.0,58,91,5,100,Weight lifted: 312 lbs.,175.26,88.18


In [807]:
# Preenchendo pesos nullos com valor zero, por que considera-se que o atleta não realizou a prova
open_masc['weight_lifted'] = open_masc['weight_lifted'].fillna('0')

In [808]:
open_masc.tail()

Unnamed: 0,index,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.2,21.3,21.4,weight_lifted,height,weight
137459,137459,JACK,NORTON,137351,United Kingdom,Europe,31,,467118.0,117347,111374,101151,137246,0,,
137460,137460,JAMES,DOUGHTY,137351,United States,North America,32,,467118.0,117347,111374,101151,137246,0,,
137461,137461,CHAD,ROTT,137351,United States,North America,46,Polarize CrossFit,467118.0,117347,111374,101151,137246,0,,
137462,137462,NATE,RICHARDS,137351,United States,North America,39,CrossFit Home Office Scotts Valley,467118.0,117347,111374,101151,137246,0,193.04,93.18
137463,137463,ANDREW,WEINSTEIN,137351,United States,North America,51,,467118.0,117347,111374,101151,137246,0,,


In [809]:
# Iniciando o processo para converter weight_lift para numerico
# Função para remover a string 'lbs' e converter peso para quilos
def substitui_texto(value):
    value = value.replace('Weight lifted: ', '')
    if ('l' in value):
        x = value.replace(' lbs', '')
    elif ('k' in value):
        x = value.replace(' kg', '')
    else:
        x = value
    x = float(x) / 2.205
    
    return(round(x,2))

In [810]:
#Funaçao para substituir valores inválidos por peso zero
def remove_peso_invalido(value):
    if ('(' in value):
        value = '0'
    return value

In [811]:
open_masc['weight_lifted'] = open_masc['weight_lifted'].apply(remove_peso_invalido)

In [812]:
open_masc['weight_lifted'] = open_masc['weight_lifted'].apply(substitui_texto)


In [813]:
# Convertendo a coluna weight lifted de 'string' para numérico
open_masc['weight_lifted'] = pd.to_numeric(open_masc['weight_lifted'])

In [814]:
open_masc['weight_lifted']

0         143.76
1         143.31
2         156.46
3         143.31
4         141.50
           ...  
137459      0.00
137460      0.00
137461      0.00
137462      0.00
137463      0.00
Name: weight_lifted, Length: 137463, dtype: float64

In [815]:
import numpy as np

In [816]:
c = max(open_masc['weight_lifted'])
index = np.where(open_masc['weight_lifted'] == c)
outlier = open_masc.iloc[index]

In [817]:
print("Nota: O atleta", open_masc['index'].iloc[100332], "apresenta dados inconsistentes referentes ao seu levantamento de peso")
display(outlier)


Nota: O atleta 100333 apresenta dados inconsistentes referentes ao seu levantamento de peso


Unnamed: 0,index,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.2,21.3,21.4,weight_lifted,height,weight
100332,100332,BRUNO,NUNES,100333,France,Europe,46,CrossFit Chelles,379889.0,102482,100378,92293,84736,1666.21,,


## 4 - Preenchendo missings
Preencher strings que estão faltando (como nome, país de origem, afiliação, etc). Dadas as características dos dados, optou-se por fazer o preenchimento dos campos que estão vazios ou nullos. Por exemplo, nas colunas referentes ao resultado de uma determinada prova em específico (como a 21.3 ou 21.2) o fato de não ter sido atribuído um valor, indica que o atleta não realizou a prova, portanto atribuiu-se, conforme conveniente, valor zero. Na coluna de afiliação os missings foram tratados como "não afiliados", pois entende-se que se tratam de atletas que treinam em academias próprias ou não afiliada à Crossfit porém que obtiveram o direito de validar suas provas em local licenciado.

In [818]:
# Conferindo os continentes listados, 
continent = np.array(open_masc['continent'])
unique_continent = np.unique(continent)
print(unique_continent)

['Africa' 'Asia' 'Europe' 'North America' 'Oceania' 'South America']


In [819]:
# Visualizando todos os países listados
country = np.array(open_masc['country'])
unique_country = np.unique(country)
print("Total de países participantes: ", len(unique_country))
print(unique_country)

Total de países participantes:  156
['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola' 'Argentina'
 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain'
 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bolivia'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Brunei Darussalam'
 'Bulgaria' 'Cambodia' 'Cameroon' 'Canada' 'Cape Verde' 'Chile' 'China'
 'Colombia' 'Congo, The Democratic Republic of the' 'Costa Rica' 'Croatia'
 'Cyprus' 'Czech Republic' "Côte d'Ivoire" 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Estonia' 'Ethiopia'
 'Fiji' 'Finland' 'France' 'Georgia' 'Germany' 'Ghana' 'Greece'
 'Guatemala' 'Guinea' 'Guyana' 'Haiti' 'Honduras' 'Hungary' 'Iceland'
 'India' 'Indonesia' 'Iran' 'Iraq' 'Ireland' 'Israel' 'Italy' 'Jamaica'
 'Japan' 'Jordan' 'Kazakhstan' 'Kenya' 'Korea, Republic of' 'Kosovo'
 'Kuwait' 'Kyrgyzstan' 'Laos' 'Latvia' 'Lebanon' 'Libya' 'Liechtenstein'
 'Lithuania' 'Luxembourg' 'Macedonia' 'Madagascar

In [820]:
# Verificando onde estão os dados nulos
open_masc.isnull().sum()

index                0
first_name           1
lastname             7
placing              0
country              0
continent            0
age                  0
affiliate        24783
points               0
21.1                 0
21.2                 0
21.3                 0
21.4                 0
weight_lifted        0
height           55349
weight           50962
dtype: int64

A coluna 'first_name' se refere ao primeiro nome do atleta conforme cadastrado no site da Crossfit Games, considerando que apenas um atleta apresenta missing nessa colunas e o atleta possui todos os outros dados consistentes, optou-se por atribuir o valor '-' à essa variável faltante.

A coluna 'last_name' se refere ao sobrenome do atleta conforme cadastrado no site da Crossfit Games, considerando que todos os atletas que não possuem sobrenome cadastrado possuem todos os outros valores consistentes optou-se por preservá-los e atribuir o valor '-' à essa variável faltante.

Os atletas não afiliados recebrão o valor 'not_affiliate' na coluna respectiva

Nota-se que existe um número considerável de atletas cujos dados de peso e altura não foram informados, visando resguardar os outros dados referentes aos resultados dos atletas optou-se por atribuir o valor "uninformed" aos respectivos casos.

In [821]:
# Encontrando quem tem first_name nulo
open_masc[pd.isna(open_masc['first_name'])]

Unnamed: 0,index,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.2,21.3,21.4,weight_lifted,height,weight
93138,93138,,YEONGDON,93139,"Korea, Republic of",Asia,28,CrossFit 660,359096.0,56467,95714,101151,105764,0.0,,


In [822]:
# Encontrando quem tem lastname nulo
open_masc[pd.isna(open_masc['lastname'])]

Unnamed: 0,index,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.2,21.3,21.4,weight_lifted,height,weight
2040,2040,HYEONJONG,,2041,"Korea, Republic of",Asia,31,CrossFit Zest,17676.0,3890,3593,5047,5146,114.29,,78.18
7732,7732,YOUNGSEON,,7733,"Korea, Republic of",Asia,23,Golden Crown CrossFit Seodaemun,48734.0,5280,10317,10886,22251,97.51,,
19017,19017,TAKSEOK,,19018,"Korea, Republic of",Asia,41,CrossFit Geumchon,96625.0,11332,40439,22176,22678,97.51,170.0,75.0
73598,73598,MICHAEL,,73599,Canada,North America,51,CrossFit Calgary,290309.0,66960,86614,77177,59558,79.37,,
81080,81080,JONGMYUNG,,81081,"Korea, Republic of",Asia,28,CrossFit Lagom,315173.0,81668,58866,85707,88932,70.29,,
86324,86324,DAVID,,86325,United States,North America,37,CrossFit East Oahu,333850.0,87458,92161,78825,75406,66.67,177.8,88.64
133060,133060,BRIAN,,132987,United States,North America,33,,459276.0,117347,111374,101151,129404,0.0,175.26,75.0


In [823]:
# Incluindo texto nos campos nulos de acordo com a coluna
values = {'lastname': '-', 'first_name': '-', 'affiliate': 'not_affiliate', 'height': 'uninformed', 'weight': 'uninformed'}
open_masc.fillna(value = values, inplace=True) 

### 4.1 - Convertendo colunas de dados categóricos
e o indíce em String

In [824]:
open_masc['continent'] = open_masc['continent'].astype("category")
open_masc['country'] = open_masc['country'].astype("category")
open_masc['affiliate'] = open_masc['affiliate'].astype("category")

In [825]:
open_masc['index'] = open_masc['index'].astype("str")

## 5 - Verificando a qualidade dos dados


In [826]:
open_masc.isna().sum()
# Nenhum valor null

index            0
first_name       0
lastname         0
placing          0
country          0
continent        0
age              0
affiliate        0
points           0
21.1             0
21.2             0
21.3             0
21.4             0
weight_lifted    0
height           0
weight           0
dtype: int64

In [827]:
# Verificando consistencia dos dados - INCONSISTENTE
# peso_maximo = open_masc['weight_lifted'].max()
# peso_maximo

In [828]:
dfSummary(open_masc)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,index [object],1. 0 2. 91638 3. 91652 4. 91651 5. 91650 6. 91649 7. 91648 8. 91647 9. 91646 10. 91645 11. other,"1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 137,453 (100.0%)",,0 (0.0%)
2,first_name [object],1. MICHAEL 2. DAVID 3. DANIEL 4. CHRIS 5. JOHN 6. RYAN 7. JASON 8. MATTHEW 9. ANDREW 10. JAMES 11. other,"2,226 (1.6%) 1,995 (1.5%) 1,575 (1.1%) 1,486 (1.1%) 1,385 (1.0%) 1,365 (1.0%) 1,278 (0.9%) 1,246 (0.9%) 1,224 (0.9%) 1,184 (0.9%) 122,499 (89.1%)",,0 (0.0%)
3,lastname [object],1. SMITH 2. LEE 3. JOHNSON 4. JONES 5. KIM 6. BROWN 7. WILLIAMS 8. GARCIA 9. MILLER 10. WILSON 11. other,"713 (0.5%) 394 (0.3%) 392 (0.3%) 388 (0.3%) 384 (0.3%) 368 (0.3%) 330 (0.2%) 318 (0.2%) 312 (0.2%) 271 (0.2%) 133,593 (97.2%)",,0 (0.0%)
4,placing [object],1. 135018 2. 132082 3. 121767 4. 127322 5. 125679 6. 134099 7. 136916 8. 128150 9. 123553 10. 131081 11. other,"852 (0.6%) 595 (0.4%) 591 (0.4%) 450 (0.3%) 401 (0.3%) 380 (0.3%) 378 (0.3%) 378 (0.3%) 363 (0.3%) 353 (0.3%) 132,722 (96.6%)",,0 (0.0%)
5,country [category],1. United States 2. Australia 3. United Kingdom 4. France 5. Canada 6. Brazil 7. Spain 8. South Africa 9. Germany 10. Sweden 11. other,"65,145 (47.4%) 8,206 (6.0%) 6,080 (4.4%) 6,038 (4.4%) 5,321 (3.9%) 4,994 (3.6%) 4,844 (3.5%) 3,152 (2.3%) 2,705 (2.0%) 2,138 (1.6%) 28,840 (21.0%)",,0 (0.0%)
6,continent [category],1. North America 2. Europe 3. Oceania 4. Asia 5. South America 6. Africa,"73,659 (53.6%) 34,317 (25.0%) 9,930 (7.2%) 8,194 (6.0%) 7,076 (5.1%) 4,287 (3.1%)",,0 (0.0%)
7,age [int64],Mean (sd) : 34.7 (8.3) min < med < max: 16.0 < 34.0 < 54.0 IQR (CV) : 12.0 (4.2),39 distinct values,,0 (0.0%)
8,affiliate [category],1. not_affiliate 2. CrossFit Coraje 3. CrossFit 1530 4. P1 CrossFit 5. CrossFit Torian 6. CrossFit Linchpin 7. CrossFit Reykjavík 8. CrossFit 19.05 9. Plus64 CrossFit 10. FreeMove CrossFit 11. other,"24,783 (18.0%) 156 (0.1%) 127 (0.1%) 124 (0.1%) 121 (0.1%) 118 (0.1%) 117 (0.1%) 111 (0.1%) 109 (0.1%) 108 (0.1%) 111,589 (81.2%)",,0 (0.0%)
9,points [float64],Mean (sd) : 266044.7 (132747.2) min < med < max: 101.0 < 274493.0 < 467118.0 IQR (CV) : 231839.5 (2.0),"106,495 distinct values",,0 (0.0%)
10,21.1 [int64],Mean (sd) : 67253.6 (37547.4) min < med < max: 1.0 < 68727.0 < 117347.0 IQR (CV) : 68729.5 (1.8),"44,704 distinct values",,0 (0.0%)


In [840]:
#Salvando o dataframe para análise
open_masc.to_csv("data/crossfit_open_masculino_2021.csv")