# Limpeza e preparação de dados para análise

Este notebook tem a por finalidade a preparação de um DataSet para posterior análise de dados.

In [1]:
# importando bibliotecas
import pandas as pd
import numpy as np
from summarytools import dfSummary

## 1 - Carregando os dados

In [2]:
# Lendo os dados
open_masc = pd.read_csv('data/women_open_21.1.csv', sep=',', low_memory=False)

## 2 - Visualizando dados
Essa visualização é importante para compreender o que precisa ser feito no dataset.

Verifica-se que algumas colunas de valores estão em string e precisam ser convertidos para numéricos para possibilitar que os calculos sejam feitos.

Há uma coluna que condensa dois dados, peso e altura do atleta, deverá ser convertida em duas colunas.

Há outras colunas que condensam duas informações, por exemplo, a coluna '21.1' armazena dados da posição do atleta naquela prova e também do tempo em que o atleta concluiu a prova. *Vamos ficar apenas com a posição do atleta na prova*.


In [3]:
# Primeira visualização do Dataframe
open_masc.head()

Unnamed: 0.1,Unnamed: 0,first_name,lastname,Placing,country,continent,Age,affiliate,Height and Weight,points,21.1,21.1 time,21.2,21.2 time,21.3,21.3 time,21.4,weight
0,0,TIA-CLAIR,TOOMEY,1,Australia,Oceania,Age 27,CrossFit East Nashville,163 cm | 58 kg,26,6th (11:21),605 reps,11th (9:26),225 reps,1st (7:37),180 reps,8th (230 lbs),Weight lifted: 230 lbs.
1,1,EMMA,CARY,2,United States,North America,Age 17,Perform Overcome Excel CrossFit,64 in | 140 lb,66,14th (11:48),605 reps,1st (8:51),225 reps,14th (8:28),180 reps,37th (218 lbs),Weight lifted: 218 lbs.
2,2,KARA,SAUNDERS,3,Australia,Oceania,Age 31,CrossFit Carv,162 cm | 158 lb,111,63rd (13:11),605 reps,23rd (9:42),225 reps,20th (8:41),180 reps,5th (232 lbs),Weight lifted: 232 lbs.
3,3,KRISTI ERAMO,O'CONNELL,4,United States,North America,Age 32,CrossFit Polaris,62 in | 132 lb,189,121st (13:47),605 reps,21st (9:41),225 reps,4th (7:47),180 reps,43rd (217 lbs),Weight lifted: 217 lbs.
4,4,MALLORY,O'BRIEN,5,United States,North America,Age 17,Lion Brave CrossFit,63 in | 145 lb,211,4th (11:06),605 reps,163rd (10:38),225 reps,2nd (7:38),180 reps,42nd (217 lbs),Weight lifted: 217 lbs.


In [4]:
# Verificando o "tamanho" do df
open_masc.shape

(108600, 18)

In [5]:
open_masc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108600 entries, 0 to 108599
Data columns (total 18 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Unnamed: 0         108600 non-null  int64 
 1   first_name         108599 non-null  object
 2   lastname           108597 non-null  object
 3   Placing            108600 non-null  int64 
 4   country            108600 non-null  object
 5   continent          108600 non-null  object
 6   Age                108600 non-null  object
 7   affiliate          93538 non-null   object
 8   Height and Weight  50325 non-null   object
 9   points             108600 non-null  int64 
 10  21.1               108600 non-null  object
 11  21.1 time          8349 non-null    object
 12  21.2               108600 non-null  object
 13  21.2 time          50690 non-null   object
 14  21.3               108600 non-null  object
 15  21.3 time          8209 non-null    object
 16  21.4               1

In [6]:
# Primeira descrição dos dados numéricos, 
open_masc.describe()

Unnamed: 0.1,Unnamed: 0,Placing,points
count,108600.0,108600.0,108600.0
mean,54299.5,54289.647459,211080.913444
std,31350.263954,31334.182357,107166.944901
min,0.0,1.0,26.0
25%,27149.75,27150.75,120400.75
50%,54299.5,54300.5,224044.5
75%,81449.25,81450.25,303278.25
max,108599.0,108565.0,374151.0


In [7]:
# Verificando quais são as colunas originais do df
open_masc.columns

Index(['Unnamed: 0', 'first_name', 'lastname', 'Placing', 'country',
       'continent', 'Age', 'affiliate', 'Height and Weight', 'points', '21.1',
       '21.1 time', '21.2', '21.2 time', '21.3', '21.3 time', '21.4',
       'weight'],
      dtype='object')

## 3 - Tratando colunas

### 3.1 - Renomeando as colunas

In [8]:
open_masc =  open_masc.rename(columns={'Unnamed: 0': 'index', 'weight': 'weight_lifted', 'Age': 'age', 'Placing':'placing'})


### 3.2 - Separando as colunas de altura e peso do atleta
Ambas as colunas são dados numéricos.

Unidade de de medidas: peso(Kg), altura(cm)

#### Altura do atleta

In [9]:
open_masc['height'] = open_masc['Height and Weight']

In [10]:
def define_height(value):
    x = None
    if type(value) == str:
        a = value.split()
        if a[1] == 'in':
            x = int(a[0]) * 2.54
            x = round(x,2)
        elif a[1] == 'cm':
            x = int(a[0])
            x = round(x,2)
    return x

In [11]:
open_masc['height'] = open_masc['height'].apply(define_height)

#### Peso do atleta

In [12]:
open_masc['weight'] = open_masc['Height and Weight']

In [13]:
def define_weight(value):
    x = None
    if type(value) == str:
        a = value.split()
        if a[-1] == 'lb':
            x = int(a[-2]) / 2.2
            x = round(x,2)
        elif a[-1] == 'kg':
            x = int(a[-2])
            x = round(x,2)
    return x

In [14]:
open_masc['weight'] = open_masc['weight'].apply(define_weight)

Removendo a coluna de peso e altura 

In [15]:
open_masc.drop(['Height and Weight'], axis=1, inplace=True)

In [16]:
open_masc.head()

Unnamed: 0,index,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.1 time,21.2,21.2 time,21.3,21.3 time,21.4,weight_lifted,height,weight
0,0,TIA-CLAIR,TOOMEY,1,Australia,Oceania,Age 27,CrossFit East Nashville,26,6th (11:21),605 reps,11th (9:26),225 reps,1st (7:37),180 reps,8th (230 lbs),Weight lifted: 230 lbs.,163.0,58.0
1,1,EMMA,CARY,2,United States,North America,Age 17,Perform Overcome Excel CrossFit,66,14th (11:48),605 reps,1st (8:51),225 reps,14th (8:28),180 reps,37th (218 lbs),Weight lifted: 218 lbs.,162.56,63.64
2,2,KARA,SAUNDERS,3,Australia,Oceania,Age 31,CrossFit Carv,111,63rd (13:11),605 reps,23rd (9:42),225 reps,20th (8:41),180 reps,5th (232 lbs),Weight lifted: 232 lbs.,162.0,71.82
3,3,KRISTI ERAMO,O'CONNELL,4,United States,North America,Age 32,CrossFit Polaris,189,121st (13:47),605 reps,21st (9:41),225 reps,4th (7:47),180 reps,43rd (217 lbs),Weight lifted: 217 lbs.,157.48,60.0
4,4,MALLORY,O'BRIEN,5,United States,North America,Age 17,Lion Brave CrossFit,211,4th (11:06),605 reps,163rd (10:38),225 reps,2nd (7:38),180 reps,42nd (217 lbs),Weight lifted: 217 lbs.,160.02,65.91


### 3.3 - Transformando a coluna de idade em numérico

Durante o processo para realizar a transformação da coluna idade (de string para numérico) observou-se inconsistência nos dados da linha 57645, optando-se pela remoção da mesma

In [17]:
open_masc.iloc[57645]

index                             57645
first_name                      JESSICA
lastname                        BALLARD
placing                           57646
country                       Australia
continent                       Oceania
age                              Age 30
affiliate            CrossFit Dauntless
points                           237715
21.1                 43056th (179 reps)
21.1 time                           NaN
21.2                64541st (14:46 - s)
21.2 time                      225 reps
21.3             59769th (135 reps - s)
21.3 time                           NaN
21.4               70349th (84 lbs - s)
weight_lifted    Weight lifted: 84 lbs.
height                              NaN
weight                              NaN
Name: 57645, dtype: object

In [18]:
open_masc.drop(labels=57645, axis=0, inplace=True, errors='raise')

In [19]:
def transforma_idade(texto):
    if texto == "":
        x = 0
    else:
        texto.split(" ")
        x = texto[1]
    return(texto[-2:])

In [20]:
open_masc['age'] = open_masc['age'].apply(transforma_idade)


In [21]:
# Convertendo a coluna age (idade) de 'string' para numérico
open_masc['age'] = pd.to_numeric(open_masc['age'])


#### 3.4 - Tratando os resultados das provas 

In [22]:
open_masc = open_masc.drop(labels='21.1 time', axis=1)
open_masc = open_masc.drop(labels='21.2 time', axis=1)
open_masc = open_masc.drop(labels='21.3 time', axis=1)
open_masc.columns

Index(['index', 'first_name', 'lastname', 'placing', 'country', 'continent',
       'age', 'affiliate', 'points', '21.1', '21.2', '21.3', '21.4',
       'weight_lifted', 'height', 'weight'],
      dtype='object')

In [23]:
# Incluindo texto nos campos nulos de acordo com a coluna
values = {'21.1': '0', '21.2': '0', '21.3': '0', '21.4': '0'}
open_masc.fillna(value = values, inplace=True) 

In [24]:
open_masc.head()

Unnamed: 0,index,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.2,21.3,21.4,weight_lifted,height,weight
0,0,TIA-CLAIR,TOOMEY,1,Australia,Oceania,27,CrossFit East Nashville,26,6th (11:21),11th (9:26),1st (7:37),8th (230 lbs),Weight lifted: 230 lbs.,163.0,58.0
1,1,EMMA,CARY,2,United States,North America,17,Perform Overcome Excel CrossFit,66,14th (11:48),1st (8:51),14th (8:28),37th (218 lbs),Weight lifted: 218 lbs.,162.56,63.64
2,2,KARA,SAUNDERS,3,Australia,Oceania,31,CrossFit Carv,111,63rd (13:11),23rd (9:42),20th (8:41),5th (232 lbs),Weight lifted: 232 lbs.,162.0,71.82
3,3,KRISTI ERAMO,O'CONNELL,4,United States,North America,32,CrossFit Polaris,189,121st (13:47),21st (9:41),4th (7:47),43rd (217 lbs),Weight lifted: 217 lbs.,157.48,60.0
4,4,MALLORY,O'BRIEN,5,United States,North America,17,Lion Brave CrossFit,211,4th (11:06),163rd (10:38),2nd (7:38),42nd (217 lbs),Weight lifted: 217 lbs.,160.02,65.91


In [25]:
def define_posicao(p):
    p = p.split(" ")
    x = p[0].strip()
    if ('th' or 'nd' or 'st' or 'rd' in x):
        x = x.replace('th', '')
        x = x.replace('nd', '')
        x = x.replace('st', '')
        x = x.replace('rd', '')
    return x

In [26]:
open_masc['21.1'] = open_masc['21.1'].apply(define_posicao)
open_masc['21.2'] = open_masc['21.2'].apply(define_posicao)
open_masc['21.3'] = open_masc['21.3'].apply(define_posicao)
open_masc['21.4'] = open_masc['21.4'].apply(define_posicao)
open_masc['21.1'] = pd.to_numeric(open_masc['21.1'])
open_masc['21.2'] = pd.to_numeric(open_masc['21.2'])
open_masc['21.3'] = pd.to_numeric(open_masc['21.3'])
open_masc['21.4'] = pd.to_numeric(open_masc['21.4'])

### 3.5 - Transformando a coluna de peso levantado
Considerando unidade de medida Kg

In [27]:
open_masc.head()

Unnamed: 0,index,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.2,21.3,21.4,weight_lifted,height,weight
0,0,TIA-CLAIR,TOOMEY,1,Australia,Oceania,27,CrossFit East Nashville,26,6,11,1,8.0,Weight lifted: 230 lbs.,163.0,58.0
1,1,EMMA,CARY,2,United States,North America,17,Perform Overcome Excel CrossFit,66,14,1,14,37.0,Weight lifted: 218 lbs.,162.56,63.64
2,2,KARA,SAUNDERS,3,Australia,Oceania,31,CrossFit Carv,111,63,23,20,5.0,Weight lifted: 232 lbs.,162.0,71.82
3,3,KRISTI ERAMO,O'CONNELL,4,United States,North America,32,CrossFit Polaris,189,121,21,4,43.0,Weight lifted: 217 lbs.,157.48,60.0
4,4,MALLORY,O'BRIEN,5,United States,North America,17,Lion Brave CrossFit,211,4,163,2,42.0,Weight lifted: 217 lbs.,160.02,65.91


In [28]:
# Preenchendo pesos nullos com valor zero, por que considera-se que o atleta não realizou a prova
open_masc['weight_lifted'] = open_masc['weight_lifted'].fillna('0')

In [29]:
open_masc.tail()

Unnamed: 0,index,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.2,21.3,21.4,weight_lifted,height,weight
108595,108595,GEORGINA,NOELLEN,108565,Germany,Europe,32,CrossFit Neuss,374151,94308,90537,80806,108500.0,0,166.0,54.0
108596,108596,STACEY,MAIDA,108565,United States,North America,36,CrossFit Five Points,374151,94308,90537,80806,108500.0,0,165.1,78.18
108597,108597,FIONA,STEEL,108565,United Kingdom,Europe,31,,374151,94308,90537,80806,108500.0,0,,
108598,108598,RACHAEL,BUTLER,108565,Canada,North America,34,,374151,94308,90537,80806,108500.0,0,,59.09
108599,108599,VALERIA,MONTERO,108565,United States,North America,21,,374151,94308,90537,80806,108500.0,0,152.4,


In [30]:
# Iniciando o processo para converter weight_lift para numerico
# Função para remover a string 'lbs' e converter peso para quilos
def substitui_texto(value):
    value = value.replace('Weight lifted: ', '')
    if ('l' in value):
        x = value.replace(' lbs', '')
    elif ('k' in value):
        x = value.replace(' kg', '')
    else:
        x = value
    x = float(x) / 2.205
    
    return(round(x,2))

In [31]:
#Funaçao para substituir valores inválidos por peso zero
def remove_peso_invalido(value):
    if ('(' in value):
        value = '0'
    return value

In [32]:
open_masc['weight_lifted'] = open_masc['weight_lifted'].apply(remove_peso_invalido)

In [33]:
open_masc['weight_lifted'] = open_masc['weight_lifted'].apply(substitui_texto)


In [34]:
# Convertendo a coluna weight lifted de 'string' para numérico
open_masc['weight_lifted'] = pd.to_numeric(open_masc['weight_lifted'])

In [35]:
open_masc['weight_lifted']

0         104.31
1          98.87
2         105.22
3          98.41
4          98.41
           ...  
108595      0.00
108596      0.00
108597      0.00
108598      0.00
108599      0.00
Name: weight_lifted, Length: 108599, dtype: float64

In [36]:
import numpy as np

In [37]:
c = max(open_masc['weight_lifted'])
index = np.where(open_masc['weight_lifted'] == c)
outlier = open_masc.iloc[index]

In [38]:
print("Nota: O atleta", open_masc['index'].iloc[100332], "apresenta dados inconsistentes referentes ao seu levantamento de peso")
display(outlier)


Nota: O atleta 100333 apresenta dados inconsistentes referentes ao seu levantamento de peso


Unnamed: 0,index,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.2,21.3,21.4,weight_lifted,height,weight
83957,83957,NADA,ELARABY,83958,Egypt,Africa,31,,309900,92052,90537,74680,52631.0,2501.59,,


## 4 - Preenchendo missings
Preencher strings que estão faltando (como nome, país de origem, afiliação, etc). Dadas as características dos dados, optou-se por fazer o preenchimento dos campos que estão vazios ou nullos. Por exemplo, nas colunas referentes ao resultado de uma determinada prova em específico (como a 21.3 ou 21.2) o fato de não ter sido atribuído um valor, indica que o atleta não realizou a prova, portanto atribuiu-se, conforme conveniente, valor zero. Na coluna de afiliação os missings foram tratados como "não afiliados", pois entende-se que se tratam de atletas que treinam em academias próprias ou não afiliada à Crossfit porém que obtiveram o direito de validar suas provas em local licenciado.

In [39]:
# Conferindo os continentes listados, 
continent = np.array(open_masc['continent'])
unique_continent = np.unique(continent)
print(unique_continent)

['Africa' 'Asia' 'Europe' 'North America' 'Oceania' 'South America']


In [40]:
# Visualizando todos os países listados
country = np.array(open_masc['country'])
unique_country = np.unique(country)
print("Total de países participantes: ", len(unique_country))
print(unique_country)

Total de países participantes:  145
['Albania' 'Algeria' 'Andorra' 'Angola' 'Antigua and Barbuda' 'Argentina'
 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain'
 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Brunei Darussalam'
 'Bulgaria' 'Cambodia' 'Cameroon' 'Canada' 'Chile' 'China' 'Colombia'
 'Costa Rica' 'Croatia' 'Cyprus' 'Czech Republic' 'Denmark'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Estonia' 'Fiji'
 'Finland' 'France' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada'
 'Guatemala' 'Guyana' 'Haiti' 'Honduras' 'Hungary' 'Iceland' 'India'
 'Indonesia' 'Iran' 'Iraq' 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan'
 'Jordan' 'Kazakhstan' 'Kenya' 'Korea, Republic of' 'Kosovo' 'Kuwait'
 'Latvia' 'Lebanon' 'Libya' 'Liechtenstein' 'Lithuania' 'Luxembourg'
 'Macedonia' 'Madagascar' 'Malaysia' 'Malta' 'Mauritius' 'Mexico'
 'Micronesia' 'Moldova' 'Mongolia' 'Montenegro' 'Morocco' 'Mozambique'


In [41]:
# Verificando onde estão os dados nulos
open_masc.isnull().sum()

index                0
first_name           1
lastname             3
placing              0
country              0
continent            0
age                  0
affiliate        15062
points               0
21.1                 0
21.2                 0
21.3                 0
21.4                 1
weight_lifted        0
height           62867
weight           62517
dtype: int64

A coluna 'first_name' se refere ao primeiro nome do atleta conforme cadastrado no site da Crossfit Games, considerando que apenas um atleta apresenta missing nessa colunas e o atleta possui todos os outros dados consistentes, optou-se por atribuir o valor '-' à essa variável faltante.

A coluna 'last_name' se refere ao sobrenome do atleta conforme cadastrado no site da Crossfit Games, considerando que todos os atletas que não possuem sobrenome cadastrado possuem todos os outros valores consistentes optou-se por preservá-los e atribuir o valor '-' à essa variável faltante.

Os atletas não afiliados recebrão o valor 'not_affiliate' na coluna respectiva

Nota-se que existe um número considerável de atletas cujos dados de peso e altura não foram informados, visando resguardar os outros dados referentes aos resultados dos atletas optou-se por atribuir o valor "uninformed" aos respectivos casos.

In [42]:
# Encontrando quem tem first_name nulo
open_masc[pd.isna(open_masc['first_name'])]

Unnamed: 0,index,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.2,21.3,21.4,weight_lifted,height,weight
67999,67999,,ZHUANG,67999,China,Asia,41,CrossFit Wusi,269055,59389,74228,63733,71705.0,35.83,163.0,55.0


In [43]:
# Encontrando quem tem lastname nulo
open_masc[pd.isna(open_masc['lastname'])]

Unnamed: 0,index,first_name,lastname,placing,country,continent,age,affiliate,points,21.1,21.2,21.3,21.4,weight_lifted,height,weight
947,947,LEXIA,,948,United States,North America,29,CrossFit MKT,7966,48,187,1139,6592.0,71.2,,
32974,32974,HARA,,32975,"Korea, Republic of",Asia,34,CrossFit Apgujeong,141834,25958,48231,25658,41987.0,47.62,,
50854,50854,SAMANTHA,,50855,United States,North America,37,CrossFit MFP,209116,41081,48244,57770,62021.0,45.35,,


In [44]:
# Incluindo texto nos campos nulos de acordo com a coluna
values = {'lastname': '-', 'first_name': '-', 'affiliate': 'not_affiliate', 'height': 'uninformed', 'weight': 'uninformed'}
open_masc.fillna(value = values, inplace=True) 

### 4.1 - Convertendo colunas de dados categóricos
e o indíce em String

In [45]:
open_masc['continent'] = open_masc['continent'].astype("category")
open_masc['country'] = open_masc['country'].astype("category")
open_masc['affiliate'] = open_masc['affiliate'].astype("category")

In [46]:
open_masc['index'] = open_masc['index'].astype("str")

## 5 - Verificando a qualidade dos dados


In [47]:
open_masc.isna().sum()
# Nenhum valor null

index            0
first_name       0
lastname         0
placing          0
country          0
continent        0
age              0
affiliate        0
points           0
21.1             0
21.2             0
21.3             0
21.4             1
weight_lifted    0
height           0
weight           0
dtype: int64

In [48]:
# Verificando consistencia dos dados - INCONSISTENTE
# peso_maximo = open_masc['weight_lifted'].max()
# peso_maximo

In [49]:
dfSummary(open_masc)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,index [object],1. 0 2. 72409 3. 72407 4. 72406 5. 72405 6. 72404 7. 72403 8. 72402 9. 72401 10. 72400 11. other,"1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 108,589 (100.0%)",,0 (0.0%)
2,first_name [object],1. SARAH 2. JESSICA 3. JENNIFER 4. AMANDA 5. LAURA 6. NICOLE 7. ASHLEY 8. EMILY 9. MICHELLE 10. STEPHANIE 11. other,"1,436 (1.3%) 1,398 (1.3%) 1,149 (1.1%) 948 (0.9%) 926 (0.9%) 844 (0.8%) 844 (0.8%) 827 (0.8%) 824 (0.8%) 782 (0.7%) 98,621 (90.8%)",,0 (0.0%)
3,lastname [object],1. SMITH 2. JOHNSON 3. WILLIAMS 4. BROWN 5. JONES 6. MILLER 7. DAVIS 8. LEE 9. ANDERSON 10. WILSON 11. other,"632 (0.6%) 337 (0.3%) 326 (0.3%) 308 (0.3%) 307 (0.3%) 286 (0.3%) 230 (0.2%) 224 (0.2%) 215 (0.2%) 215 (0.2%) 105,519 (97.2%)",,0 (0.0%)
4,placing [int64],Mean (sd) : 54289.6 (31334.3) min < med < max: 1.0 < 54300.0 < 108565.0 IQR (CV) : 54300.0 (1.7),"85,320 distinct values",,0 (0.0%)
5,country [category],1. United States 2. Australia 3. Canada 4. United Kingdom 5. Brazil 6. France 7. South Africa 8. Spain 9. Germany 10. Sweden 11. other,"58,117 (53.5%) 7,452 (6.9%) 5,637 (5.2%) 4,886 (4.5%) 3,568 (3.3%) 2,944 (2.7%) 2,337 (2.2%) 2,153 (2.0%) 1,829 (1.7%) 1,802 (1.7%) 17,874 (16.5%)",,0 (0.0%)
6,continent [category],1. North America 2. Europe 3. Oceania 4. South America 5. Asia 6. Africa,"65,536 (60.3%) 22,399 (20.6%) 8,958 (8.2%) 4,570 (4.2%) 4,138 (3.8%) 2,998 (2.8%)",,0 (0.0%)
7,age [int64],Mean (sd) : 34.0 (8.5) min < med < max: 16.0 < 33.0 < 54.0 IQR (CV) : 12.0 (4.0),39 distinct values,,0 (0.0%)
8,affiliate [category],1. not_affiliate 2. Plus64 CrossFit 3. CrossFit Boynton Beach 4. CrossFit Reykjavík 5. CrossFit Blade 6. CrossFit 1530 7. CrossFit Fenton 8. P1 CrossFit 9. CrossFit Coraje 10. CrossFit Hendersonville 11. other,"15,062 (13.9%) 103 (0.1%) 102 (0.1%) 97 (0.1%) 93 (0.1%) 93 (0.1%) 92 (0.1%) 92 (0.1%) 85 (0.1%) 85 (0.1%) 92,695 (85.4%)",,0 (0.0%)
9,points [int64],Mean (sd) : 211080.7 (107167.4) min < med < max: 26.0 < 224031.0 < 374151.0 IQR (CV) : 182878.0 (2.0),"85,320 distinct values",,0 (0.0%)
10,21.1 [int64],Mean (sd) : 53355.3 (29962.5) min < med < max: 1.0 < 54301.0 < 94308.0 IQR (CV) : 54313.5 (1.8),"40,884 distinct values",,0 (0.0%)


In [50]:
#Salvando o dataframe para análise
open_masc.to_csv("data/crossfit_open_feminino_2021.csv", index=False)