Adicionando o banco

In [None]:
import pandas as pd
import numpy as np

# Read the raw match export into a DataFrame.
raw_path = "kendo_matches_work.csv"
df = pd.read_csv(raw_path)

# Quick sanity check: report the dimensions, then preview the first rows.
original_shape = df.shape
print("Shape original:", original_shape)
display(df.head())

In [None]:
# Drop index columns that were exported by mistake: any column whose name
# starts with "unnamed" (case-insensitive) is treated as such.
idx_cols = list(filter(lambda name: name.lower().startswith("unnamed"), df.columns))
if len(idx_cols) > 0:
    df = df.drop(labels=idx_cols, axis=1)
    print(f"Removidas colunas de índice exportado: {idx_cols}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of the time between ippons, drawn from the data as loaded
# (before the cleaning cells below have run).
# The column has not been converted to a numeric dtype at this point, so plot a
# numerically coerced copy; entries that cannot be parsed become NaN.
# `df` itself is left untouched.
seconds_for_plot = pd.to_numeric(df['seconds_between'], errors='coerce')

fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=seconds_for_plot, ax=ax)
ax.set_title('Boxplot of Seconds Between Ippons')
ax.set_xlabel('Seconds')
plt.show()

In [None]:
# Schema check before type standardisation / value mapping.
# Columns the rest of the notebook expects to find in the CSV.
expected_cols = [
    "match_id", "ippon_number", "seconds_between", "ippon_taken",
    "men", "kote", "do", "tsuki"
]

# Fail loudly when an expected column is absent, to avoid a silent error later.
available_cols = set(df.columns)
missing = [name for name in expected_cols if name not in available_cols]
if len(missing) != 0:
    raise ValueError(f"Colunas ausentes no CSV: {missing}")

In [None]:
# Trim leading/trailing whitespace from the cells of every object-dtype column.
# Missing values are kept as they are: casting the whole column with astype(str)
# would otherwise turn them into the literal string "nan".
for col in df.select_dtypes(include=["object"]).columns:
    is_missing = df[col].isna()
    stripped = df[col].astype(str).str.strip()
    # Use the stripped text where a value exists, the original (missing) cell elsewhere.
    df[col] = stripped.where(~is_missing, df[col])

In [None]:
# Lookup table turning common textual encodings of a flag into 0/1:
#   O / o / 0 / no / false / n          -> 0
#   I / i / l / | / 1 / yes / true / y  -> 1
# Lowercase "l" maps to 1, in line with "I" and "|" (it was wrongly mapped to 0).
# Keys are matched exactly, so capitalised words such as "Yes" are not covered.
binary_map = {
    "o": 0, "O": 0, "0": 0, "no": 0, "false": 0, "n": 0,
    "i": 1, "I": 1, "l": 1, "|": 1, "1": 1, "yes": 1, "true": 1, "y": 1
}


In [None]:
# Columns that are meant to hold a 0/1 flag.
bin_cols = ["men", "kote", "do", "tsuki", "ippon_taken"]

for col in bin_cols:
    # Translate known textual encodings to 0/1; any other value passes through unchanged.
    mapped = df[col].replace(binary_map)
    # Parse as numbers; text that cannot be parsed becomes NaN.
    numeric = pd.to_numeric(mapped, errors="coerce")
    # Blank out anything other than 0/1 before casting: a fractional value such as 0.5
    # cannot be cast to Int64 and would abort the cell with a TypeError.
    numeric = numeric.where(numeric.isin([0, 1]))
    # Nullable integer dtype, so missing flags stay missing instead of forcing floats.
    df[col] = numeric.astype('Int64')

In [None]:
# Convert every column to a numeric dtype; values that cannot be parsed become NaN.
# Columns whose values are all whole numbers get the nullable Int64 dtype.
# Columns holding fractional values stay floating point: casting those to Int64
# raises a TypeError ("cannot safely cast non-equivalent float64 to int64").
for col in df.columns:
    numeric = pd.to_numeric(df[col], errors='coerce')
    observed = numeric.dropna().astype('float64')
    if (observed % 1 == 0).all():
        df[col] = numeric.astype('Int64')
    else:
        df[col] = numeric.astype('float64')

In [None]:
# Validate the binary (0/1) fields.
# Any entry in a binary column that is neither 0 nor 1 is treated as an error
# and replaced with a missing value.
valid_flags = [0, 1]
for col in bin_cols:
    is_invalid = ~df[col].isin(valid_flags)
    df.loc[is_invalid, col] = np.nan

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-match totals for each strike column.
strike_summary = df.groupby('match_id')[['men', 'kote', 'do', 'tsuki']].sum()

# Stacked bars: one bar per match, one segment per strike column.
ax = strike_summary.plot(kind='bar', stacked=True, figsize=(15, 7))
ax.set_title('Distribution of Strikes per Match')
ax.set_xlabel('Match ID')
ax.set_ylabel('Number of Strikes')

# Tilt the tick labels, then fit the layout to the figure.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Drop rows whose seconds_between exceeds 300 s (the match duration).
# NOTE(review): rows with a missing seconds_between do not satisfy the comparison
# either, so they appear to be dropped as well -- confirm this is intended.
before_drop = len(df)
within_match_time = df['seconds_between'] <= 300
df = df[within_match_time].copy()
after_drop = len(df)

print(f"Número de linhas antes de remover: {before_drop}")
print(f"Número de linhas depois de remover: {after_drop}")

In [None]:
# Remove rows that are exact duplicates, reporting the row count before and after.
dup_before = len(df)
deduplicated = df.drop_duplicates()
df = deduplicated.copy()
dup_after = len(df)

print(f"Número de linhas antes de dropar duplicadas: {dup_before}")
print(f"Número de linhas depois de dropar duplicadas: {dup_after}")

In [None]:
# Discard rows in which none of the strike columns is flagged.
strike_cols = ["men", "kote", "do", "tsuki"]

# Row-wise total of the strike flags, computed once and reused for both checks:
# a row is dropped when that total is 0 or missing.
strikes_per_row = df[strike_cols].sum(axis=1)
mask_no_strike = strikes_per_row.eq(0) | strikes_per_row.isna()

# Row counts around the filter, for the report below.
before_drop = len(df)
df = df[~mask_no_strike].copy()
after_drop = len(df)

print(f"Número de linhas antes de remover: {before_drop}")
print(f"Número de linhas depois de remover: {after_drop}")

display(df.head())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Total of each strike column per match, recomputed on the filtered data.
strike_summary = df.groupby('match_id')[['men', 'kote', 'do', 'tsuki']].sum()

# Draw the stacked bar chart on an explicitly created axes.
fig, ax = plt.subplots(figsize=(15, 7))
strike_summary.plot(kind='bar', stacked=True, ax=ax)
ax.set(
    title='Distribution of Strikes per Match',
    xlabel='Match ID',
    ylabel='Number of Strikes',
)

# Tilt the tick labels, then fit the layout to the figure.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Write the cleaned table to disk, without the DataFrame index, and report the path.
clean_path = "kendo_matches_TRATADOS.csv"
df.to_csv(path_or_buf=clean_path, index=False)
print(f"\nArquivo salvo em: {clean_path}")