# Sudoku Difficulty Classifier - Exploratory Data Analysis

This notebook explores the Sudoku dataset and engineered features to understand:
1. Data distribution across difficulty levels
2. Feature characteristics and correlations
3. Feature separability across difficulty levels (a rough proxy for feature importance)
4. Data quality and potential issues

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

%matplotlib inline

## 1. Load Data

In [None]:
# Read the engineered-feature tables for both splits
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_features.csv', '../data/test_features.csv')
)

# Report split sizes; the feature count leaves out one column for the target
train_shape, test_shape = train_df.shape, test_df.shape
print(f"Training set shape: {train_shape}")
print(f"Test set shape: {test_shape}")
print(f"\nFeatures: {train_shape[1] - 1}")

# Preview the first rows of the training table
train_df.head()

## 2. Target Variable Distribution

In [None]:
# Check class distribution
#
# `difficulty` is an ordinal string label, so bars are laid out
# easy -> medium -> hard and every bar's colour is looked up by label.
# Sorting the counts alphabetically yields easy, hard, medium, which would pair
# a positional green/orange/red colour list with the wrong classes.
difficulty_palette = {'easy': 'green', 'medium': 'orange', 'hard': 'red'}


def difficulty_counts(df):
    """Count rows per difficulty label, ordered easy, medium, hard.

    A known label that is absent from ``df`` is reported with a count of 0.
    Labels outside the palette are kept and appended after the known ones
    in sorted order, so unexpected classes remain visible.
    """
    counts = df['difficulty'].value_counts()
    unexpected = sorted(label for label in counts.index if label not in difficulty_palette)
    return counts.reindex(list(difficulty_palette) + unexpected, fill_value=0)


def plot_difficulty_counts(ax, counts, n_rows, title):
    """Draw ``counts`` as bars on ``ax`` and label each bar with its share of ``n_rows``."""
    labels = list(counts.index)
    bar_colors = [difficulty_palette.get(label, 'gray') for label in labels]
    bars = ax.bar(labels, counts.values, color=bar_colors)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Difficulty Level')
    ax.set_ylabel('Count')
    ax.grid(axis='y', alpha=0.3)

    # Percentage labels sit a fixed 3 points above each bar, so their placement
    # does not depend on how large the counts are.
    for bar, count in zip(bars, counts.values):
        pct = count / n_rows * 100 if n_rows else 0.0
        ax.annotate(f'{pct:.1f}%',
                    xy=(bar.get_x() + bar.get_width() / 2, count),
                    xytext=(0, 3), textcoords='offset points',
                    ha='center', va='bottom', fontweight='bold')


fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Training set
train_counts = difficulty_counts(train_df)
plot_difficulty_counts(axes[0], train_counts, len(train_df),
                       'Training Set - Difficulty Distribution')

# Test set
test_counts = difficulty_counts(test_df)
plot_difficulty_counts(axes[1], test_counts, len(test_df),
                       'Test Set - Difficulty Distribution')

plt.tight_layout()
plt.show()

print("Training set distribution:")
print(train_counts)
print("\nTest set distribution:")
print(test_counts)

## 3. Feature Statistics and Summary

In [None]:
# Basic statistics
# With pandas defaults, describe() summarises only the numeric columns of a
# mixed-dtype frame (count, mean, std, min, quartiles, max).
train_df.describe()

In [None]:
# Number of null entries in each column of the training table
missing_per_column = train_df.isnull().sum()
print("Missing values:", missing_per_column, sep="\n")

# Storage dtype of each column
print("\nData types:", train_df.dtypes, sep="\n")

## 4. Feature Distributions by Difficulty

In [None]:
# Select key features to visualize
key_features = [
    'empty_cells',
    'fill_ratio',
    'empty_per_row_std',
    'empty_per_box_std',
    'isolated_cells',
    'digit_entropy'
]

# Difficulty is ordinal: fix the box order explicitly so every panel reads
# easy -> medium -> hard rather than the alphabetical easy, hard, medium.
# NOTE(review): rows whose label is not one of these three are not drawn.
difficulty_order = ['easy', 'medium', 'hard']

# Create box plots: one panel per feature on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.ravel()

for ax, feature in zip(axes, key_features):
    sns.boxplot(data=train_df, x='difficulty', y=feature,
                order=difficulty_order, ax=ax)
    ax.set_title(f'{feature}', fontsize=12, fontweight='bold')
    ax.set_xlabel('Difficulty')
    ax.set_ylabel('Value')

fig.suptitle('Feature Distributions by Difficulty Level', fontsize=16, fontweight='bold', y=1.02)
fig.tight_layout()
plt.show()

## 5. Feature Correlations

In [None]:
# Compute correlation matrix (excluding target)
# The target is removed by name rather than by position: slicing off "the last
# column" drops a real feature (and keeps the string target) whenever
# `difficulty` is not the final column of the CSV.
feature_cols = train_df.columns.drop('difficulty')
corr_matrix = train_df[feature_cols].corr()

# Plot correlation heatmap
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=0.5, ax=ax)
ax.set_title('Feature Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
fig.tight_layout()
plt.show()

# Find highly correlated features.
# Only the strict upper triangle is scanned, so each unordered pair is reported
# once and the diagonal (always 1.0) is skipped. NaN correlations (e.g. from a
# constant column) never pass the comparison and are therefore not reported.
corr_threshold = 0.8
n_cols = len(corr_matrix.columns)
high_corr_pairs = [
    (corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j])
    for i in range(n_cols)
    for j in range(i + 1, n_cols)
    if abs(corr_matrix.iloc[i, j]) > corr_threshold
]

if high_corr_pairs:
    print(f"\nHighly correlated feature pairs (|r| > {corr_threshold}):")
    for f1, f2, corr in high_corr_pairs:
        print(f"  {f1} <-> {f2}: {corr:.3f}")
else:
    print(f"\nNo highly correlated feature pairs found (|r| > {corr_threshold})")

## 6. Feature Distributions (Density-Normalized Histograms)

In [None]:
# Overlay one density-normalised histogram per difficulty class for each key feature
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.ravel()

for ax, feature in zip(axes, key_features):
    for level in ['easy', 'medium', 'hard']:
        # Values of this feature for the puzzles labelled with the current class
        level_values = train_df.loc[train_df['difficulty'] == level, feature]
        ax.hist(level_values, alpha=0.5, label=level, bins=20, density=True)

    ax.set_title(f'{feature}', fontsize=12, fontweight='bold')
    ax.set(xlabel='Value', ylabel='Density')
    ax.legend()
    ax.grid(alpha=0.3)

fig.suptitle(
    'Feature Distributions by Difficulty (Histograms)',
    fontsize=16,
    fontweight='bold',
    y=1.02,
)
fig.tight_layout()
plt.show()

## 7. Pairwise Feature Relationships

In [None]:
# Columns shown in the pairplot: three features plus the target used for colouring
pairplot_features = ['empty_cells', 'isolated_cells', 'digit_entropy', 'difficulty']

# Scatter matrix coloured by class, with per-class KDE curves on the diagonal
class_colors = {'easy': 'green', 'medium': 'orange', 'hard': 'red'}
sns.pairplot(
    train_df.loc[:, pairplot_features],
    hue='difficulty',
    palette=class_colors,
    diag_kind='kde',
    height=2.5,
)
plt.suptitle('Pairwise Feature Relationships', y=1.02, fontsize=16, fontweight='bold')
plt.show()

## 8. Feature Separability Analysis

In [None]:
# Calculate mean values per difficulty for each feature
# (rows: difficulty level, columns: features)
feature_means = train_df.groupby('difficulty')[feature_cols].mean()

print("Mean feature values by difficulty:")
print(feature_means.T)

# Calculate coefficient of variation of the class means for each feature:
# std of the per-class means divided by the magnitude of their average.
# The denominator is |mean| so a feature whose class means are negative still
# gets a non-negative score instead of sinking to the bottom of the ranking.
# A zero average maps to a score of 0 to avoid dividing by zero.
cv_scores = {}
for feature in feature_cols:
    class_means = feature_means[feature].values
    center = np.mean(class_means)
    cv_scores[feature] = np.std(class_means) / abs(center) if center != 0 else 0

# Sort by CV (features with higher CV have class means that differ more,
# relative to their magnitude). Within-class spread is not taken into account,
# so treat the ranking as a rough heuristic.
cv_df = pd.DataFrame(list(cv_scores.items()), columns=['feature', 'cv_score'])
cv_df = cv_df.sort_values('cv_score', ascending=False)

print("\nFeatures ranked by separability (Coefficient of Variation):")
print(cv_df.head(10))

## 9. Data Quality Checks

In [None]:
# Check for outliers using IQR method
def count_outliers(df, column):
    """Count the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or more than 1.5 * IQR above the third quartile of the column.

    Args:
        df: DataFrame holding the data.
        column: Name of the column to inspect.

    Returns:
        The number of rows falling outside those two bounds, as an ``int``.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    is_outlier = (values < q1 - whisker) | (values > q3 + whisker)
    return int(is_outlier.sum())

# Tally IQR outliers for every feature column of the training table
outlier_counts = {col: count_outliers(train_df, col) for col in feature_cols}

# Rank features from most to fewest outliers
outlier_df = (
    pd.DataFrame(list(outlier_counts.items()), columns=['feature', 'outlier_count'])
    .sort_values('outlier_count', ascending=False)
)

print("Features with most outliers (IQR method):")
print(outlier_df.head(10))

## 10. Key Insights Summary

### From this EDA, we can observe:

1. **Class Balance**: The dataset has a relatively balanced distribution across easy, medium, and hard difficulties

2. **Feature Separability**: 
   - `empty_cells` and `fill_ratio` show clear separation between difficulty levels
   - `isolated_cells` increases with difficulty
   - Variance features (`empty_per_row_std`, `empty_per_box_std`) help distinguish puzzles

3. **Feature Correlations**:
   - Strong negative correlation between `empty_cells` and `filled_cells` (expected)
   - Mean/std features for rows, columns, and boxes are moderately correlated

4. **Data Quality**: 
   - No missing values
   - Some outliers present but this is expected in puzzle characteristics

5. **Feature Engineering Success**:
   - Engineered features show good discriminative power
   - Combination of basic counts, variance measures, and structural features provides rich representation