# Sumo Data Summary

This notebook summarizes the dataset `sumo_since_1957.csv`.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the CSV (path is relative to the working directory) into a DataFrame
# and preview its first five rows
df = pd.read_csv(filepath_or_buffer='sumo_since_1957.csv')
df.head(n=5)

In [None]:
# Dataset shape, shown as a (number of rows, number of columns) tuple
df.shape

In [None]:
# Dataset info: prints a per-column summary (non-null count and dtype)
# together with the frame's memory usage
df.info()

In [None]:
# Descriptive statistics for numerical columns
# (called without `include`, describe() reports only the numeric columns of a
# mixed-dtype frame; text columns are presumably left out here for that reason)
df.describe()

In [None]:
# Number of distinct Rikishi and Heya, shown as a (rikishi, heya) tuple.
# nunique() skips missing values, so a blank name is not counted as an extra
# wrestler or stable (len(unique()) would count NaN as a value of its own).
df['Rikishi'].nunique(), df['Heya'].nunique()

In [None]:
# Ten Rikishi with the most wins overall:
# total the wins per Rikishi, order from most to fewest, keep the first ten
(
    df.groupby(by='Rikishi')['wins']
    .sum()
    .sort_values(ascending=False)
    .head(n=10)
)

In [None]:
# Plot distributions of height and weight side by side.
# Each panel is described by (column, title, x-axis label, bar colour) so both
# histograms are drawn by the same code and receive the same labelling.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    ('height_cm', 'Height Distribution', 'Height (cm)', 'skyblue'),
    ('weight_kg', 'Weight Distribution', 'Weight (kg)', 'salmon'),
]
for ax, (column, title, xlabel, color) in zip(axes, panels):
    # Drop missing measurements before binning
    ax.hist(df[column].dropna(), bins=30, color=color)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    # The panels do not share a y-axis, so each one carries its own label
    ax.set_ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Missing values per column: flag every missing cell, then total the flags
# column by column
missing_mask = df.isna()
missing_mask.sum()

In [None]:
# Pairwise correlation matrix (corr() default method) for the selected
# numeric columns
numeric_columns = ['height_cm', 'weight_kg', 'wins', 'losses', 'ties']
df[numeric_columns].corr()

In [None]:
# Distribution of wins
fig, ax = plt.subplots(figsize=(6, 4))
# Drop missing values before binning, as the height/weight histograms do
ax.hist(df['wins'].dropna(), bins=30, color='limegreen')
ax.set_title('Wins Distribution')
ax.set_xlabel('Wins')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Bar chart of the ten most frequent ranks
# (value_counts() orders by frequency, so head(10) keeps the ten most common)
rank_counts = df['Rank'].value_counts().head(10)
ax = rank_counts.plot.bar(figsize=(8,4), color='orange', title='Top Ranks by Frequency')
ax.set_xlabel('Rank')
ax.set_ylabel('Count')
plt.tight_layout()
plt.show()