In [None]:
# Import the libraries used throughout the notebook
import charset_normalizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the whole session, which also
# hides deprecation warnings coming from the plotting calls further down;
# consider narrowing the filter to specific warning categories.
warnings.filterwarnings('ignore')
# import missingno as msno # Useful for checking missing values / nulls

# Check the encoding of each dataset by sniffing the first 10000 bytes of the raw file
with open('athletes.csv', mode='rb') as rawdata:
    athletes_head_bytes = rawdata.read(10000)
rawdata1 = charset_normalizer.detect(athletes_head_bytes)
print(rawdata1)  # recorded output: {'encoding': 'utf-8', 'language': 'English', 'confidence': 1.0}

with open('olympic_games_results.csv', mode='rb') as rawdata:
    results_head_bytes = rawdata.read(10000)
rawdata2 = charset_normalizer.detect(results_head_bytes)
print(rawdata2)  # recorded output: {'encoding': 'utf-8', 'language': 'French', 'confidence': 1.0}



In [None]:
# Load both datasets into dataframes; the two files share the same encoding
CSV_ENCODING = 'utf-8'
athletes_df = pd.read_csv('athletes.csv', encoding=CSV_ENCODING)
olympic_df = pd.read_csv('olympic_games_results.csv', encoding=CSV_ENCODING)

# shape reports the number of rows and columns of the athletes dataframe
athletes_df.shape

In [None]:
# (rows, columns) of the dataframe loaded from olympic_games_results.csv
olympic_df.shape 

In [None]:
# IMPORTANT - combine both datasets into a single dataframe.
# Inner join on 'athlete_id': only rows whose id is present in both frames are kept.
merged_df = olympic_df.merge(athletes_df, how='inner', on='athlete_id')

# sample returns n randomly chosen rows of the dataframe
merged_df.sample(n=10)

In [None]:
# First 10 rows of the merged dataframe
merged_df.head(10)

In [None]:
# (rows, columns) of the merged dataframe
merged_df.shape

In [None]:
# Descriptive statistics of the merged dataframe
merged_df.describe()

In [None]:
# Per-column dtype and non-null count summary of the merged dataframe
merged_df.info()

In [None]:
# Number of missing values in each column of the merged dataframe
print(merged_df.isna().sum())

In [None]:
# Count rows that are exact repeats of an earlier row (first occurrence not counted)
duplicated = merged_df.duplicated(keep='first').sum()
print(duplicated)

In [None]:
# Split the column names by dtype: 'object' columns are handled as categorical,
# every other dtype is handled as numeric by the plotting cells below.
# NOTE(review): exclude='object' also lets bool/datetime columns through - confirm
# that is intended for the numeric plots.
var_num = list(merged_df.select_dtypes(exclude='object').columns)

var_cat = list(merged_df.select_dtypes(include='object').columns)

In [None]:
# Display the names of the object-dtype (categorical) columns
var_cat

In [None]:
# Check whether the 'noc' and 'NOC' columns hold exactly the same data
noc_lower = merged_df['noc']
noc_upper = merged_df['NOC']
compare_columns = noc_lower.equals(noc_upper)
compare_columns

In [None]:
# Print the distinct values found in every column of the merged dataframe
for col, column_values in merged_df.items():
    print(f"{col}: {column_values.unique()}\n\n")

In [None]:
# Remove the 'NOC' column (the notebook keeps the lowercase 'noc' column).
# Rebinding instead of inplace=True, together with errors='ignore', makes this
# cell idempotent: running it a second time no longer raises KeyError because
# the column is already gone.
merged_df = merged_df.drop(columns='NOC', errors='ignore')
merged_df.head()

In [None]:
# (rows, columns) of the merged dataframe after dropping the 'NOC' column
merged_df.shape

# ANÁLISIS UNIDIMENSIONAL

## Unidimensional numérico

In [None]:
# Colormap values: the ten "tab:" named colors, in the order they are cycled through
TABLEAU_CMP = tuple(
    f'tab:{color_name}'
    for color_name in (
        'blue', 'orange', 'green', 'red', 'purple',
        'brown', 'pink', 'gray', 'olive', 'cyan',
    )
)

In [None]:
# Build the figure: one row per numeric attribute, with the distribution on the
# left and the boxplot on the right.
fig, axes = plt.subplots(len(var_num), 2,
                         figsize=(20, 5 * len(var_num)),
                         gridspec_kw={'hspace': 0.4, 'wspace': 0.1})
ax = axes.ravel()  # flat view: even positions -> distribution, odd positions -> boxplot

# Distribution and boxplot of each attribute
for idx, atributo in enumerate(var_num):
    # Cycle through the palette so any number of attributes gets a color
    color_atributo = TABLEAU_CMP[idx % len(TABLEAU_CMP)]
    ax_hist = ax[2 * idx]
    ax_box = ax[2 * idx + 1]

    # Distribution: density-normalized histogram plus KDE curve.
    # sns.distplot is deprecated; histplot with stat='density', kde=True and
    # cut=3 is its replacement and draws the same kind of figure.
    sns.histplot(data=merged_df, x=atributo, bins=30, stat='density',
                 kde=True, kde_kws={'cut': 3}, alpha=0.15,
                 color=color_atributo, ax=ax_hist)

    # Title and labels of the histogram; the y axis shows a density, not a
    # count, so it is labelled accordingly
    ax_hist.set_title(f'HISTOGRAMA {atributo}')
    ax_hist.set_xlabel(f'Valores {atributo}')
    ax_hist.set_ylabel("Densidad")

    # Boxplot
    sns.boxplot(x=atributo, data=merged_df, ax=ax_box, color=color_atributo)

    # Title and labels of the boxplot
    ax_box.set_title(f'BOXPLOT {atributo}')
    ax_box.set_xlabel(f'Valores {atributo}')

## Unidimensional categórico

In [None]:
# "husl" palette with as many colors as there are categorical variables
colores = sns.color_palette("husl", len(var_cat))

In [None]:
# Build the figure: one count plot per categorical variable, stacked vertically.
# squeeze=False makes plt.subplots always return a 2-D array of Axes; without it
# a single categorical variable yields a bare Axes object and .ravel() fails.
fig, axes = plt.subplots(len(var_cat), 1,
                         figsize=(10, 5 * len(var_cat)),
                         gridspec_kw={'hspace': 0.4, 'wspace': 0.4},
                         squeeze=False)

ax = axes.ravel()

# Draw the plots
for idx, variable in enumerate(var_cat):

    # Swap in whichever seaborn plotting function is needed, adjusting the
    # dataframe name and the parameters to that function.
    # The column is passed by keyword (data= / x=), matching the boxplot call
    # in the numeric section, so the call does not depend on seaborn's
    # positional-argument order.
    # `colores` holds one color per variable, so each plot uses its own single
    # color; passing it as `palette` without `hue` is deprecated in seaborn.
    sns.countplot(data=merged_df, x=variable, ax=ax[idx],
                  color=colores[idx % len(colores)])

    ax[idx].set_title(f'HISTOGRAMA {variable}')
    ax[idx].set_xlabel(f'Valores atributo {variable}')
    ax[idx].set_ylabel("Frequencia")