# Chapter 4: Feature Abstraction
## Tennis Analysis - Temporal Features and Pattern Detection

This notebook demonstrates the concepts from ML4QS Chapter 4 applied to tennis analysis:
- Temporal features: delta_y (velocity/change detection)
- Rolling window aggregation: mid_y_rolling_mean
- Pattern detection for ball hits using derivative analysis
- Converting pixel coordinates to real-world measurements
- Spatial feature extraction from court geometry

In [None]:
import sys
sys.path.append('../tennis_analysis-main')

import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import signal
from scipy.fft import fft, fftfreq
import seaborn as sns

## 1. Load Preprocessed Data

Load the preprocessed ball position data from Chapter 3.

In [None]:
# Read the cached tracker output.
# NOTE(review): unpickling can run arbitrary code, so only load stub files you trust.
with open('../tennis_analysis-main/tracker_stubs/ball_detections.pkl', 'rb') as stub_file:
    ball_detections = pickle.load(stub_file)

# Take the entry stored under key 1 for every frame; frames without it yield an
# empty row (presumably NaN after DataFrame construction - verify), which the
# interpolate()/bfill() chain then fills in.
ball_positions = [frame_detections.get(1, []) for frame_detections in ball_detections]
df_ball = (
    pd.DataFrame(ball_positions, columns=['x1', 'y1', 'x2', 'y2'])
    .interpolate()
    .bfill()
)

# Midpoint of the bounding box along each axis
df_ball['center_x'] = df_ball['x1'].add(df_ball['x2']).div(2)
df_ball['center_y'] = df_ball['y1'].add(df_ball['y2']).div(2)
# Alias kept so later cells can use the name from the original analysis
df_ball['mid_y'] = df_ball['center_y']

print(f"Dataset shape: {df_ball.shape}")
print("\nFirst 5 rows:")
print(df_ball[['center_x', 'center_y', 'mid_y']].head())

## 2. Temporal Feature Extraction

Extract temporal features similar to ML4QS Chapter 4's TemporalAbstraction.

In [None]:
# Trailing-window aggregates over the last `window_size` frames
# (min_periods=1 means the first frames use however many samples exist so far)
window_size = 5
df_features = df_ball.copy()

rolling_options = dict(window=window_size, min_periods=1, center=False)

# Vertical position: mean / spread / envelope
mid_y_windows = df_features['mid_y'].rolling(**rolling_options)
df_features['mid_y_rolling_mean'] = mid_y_windows.mean()
df_features['mid_y_rolling_std'] = mid_y_windows.std()
df_features['mid_y_rolling_min'] = mid_y_windows.min()
df_features['mid_y_rolling_max'] = mid_y_windows.max()

# Horizontal position: mean / spread only
center_x_windows = df_features['center_x'].rolling(**rolling_options)
df_features['center_x_rolling_mean'] = center_x_windows.mean()
df_features['center_x_rolling_std'] = center_x_windows.std()

print("Rolling window features created:")
print(df_features[['mid_y', 'mid_y_rolling_mean', 'mid_y_rolling_std']].head(10))

## 3. Velocity and Acceleration Features (Delta Analysis)

Calculate first and second derivatives (frame-to-frame differences of the rolling-mean positions, in pixels per frame) for motion analysis.

In [None]:
# Work from the smoothed (rolling-mean) positions so single-frame jitter
# does not dominate the differences.
smoothed_y = df_features['mid_y_rolling_mean']
smoothed_x = df_features['center_x_rolling_mean']

# Velocity: change per frame (first row is NaN because there is no previous frame)
velocity_y = smoothed_y.diff()
velocity_x = smoothed_x.diff()
df_features['delta_y'] = velocity_y
df_features['delta_x'] = velocity_x

# Acceleration: change of velocity per frame
df_features['accel_y'] = velocity_y.diff()
df_features['accel_x'] = velocity_x.diff()

# Magnitude and heading (radians, from arctan2) of the per-frame displacement
df_features['speed'] = np.sqrt(velocity_x.pow(2) + velocity_y.pow(2))
df_features['direction'] = np.arctan2(velocity_y, velocity_x)

print("Velocity and acceleration features:")
print(df_features[['delta_y', 'delta_x', 'accel_y', 'speed']].describe())

## 4. Frequency Domain Features

Apply a Fourier transform, similar to ML4QS Chapter 4's FrequencyAbstraction.

In [None]:
def extract_frequency_features(signal_data, sampling_rate=30):
    """
    Extract frequency domain features using FFT
    Similar to ML4QS Chapter 4 FrequencyAbstraction

    Parameters
    ----------
    signal_data : array-like
        One-dimensional numeric signal (pandas Series, NumPy array or plain
        sequence). NaN samples are discarded before the transform.
    sampling_rate : float, default 30
        Samples per unit time; the returned frequencies use the matching
        unit (Hz when this is samples per second).

    Returns
    -------
    dict
        Empty when fewer than 10 non-NaN samples remain. Otherwise contains
        'dominant_freq', 'max_magnitude', 'mean_magnitude', 'freq_centroid'
        and 'spectral_energy', all computed over the strictly positive
        frequencies (so the DC component is ignored).
    """
    # Convert once so lists/arrays work too, then drop missing samples
    samples = np.asarray(signal_data, dtype=float)
    samples = samples[~np.isnan(samples)]

    if len(samples) < 10:
        return {}

    # Apply FFT
    spectrum = fft(samples)
    frequencies = fftfreq(len(samples), 1 / sampling_rate)

    # Keep the strictly positive half of the spectrum
    positive_mask = frequencies > 0
    magnitudes = np.abs(spectrum[positive_mask])
    positive_freqs = frequencies[positive_mask]

    # A signal with no energy above DC has no meaningful centroid; report 0.0
    # instead of dividing 0 by 0 (which would give NaN and a RuntimeWarning).
    total_magnitude = np.sum(magnitudes)
    if total_magnitude > 0:
        freq_centroid = np.sum(positive_freqs * magnitudes) / total_magnitude
    else:
        freq_centroid = 0.0

    return {
        # For an all-zero spectrum argmax picks the first positive frequency
        'dominant_freq': positive_freqs[np.argmax(magnitudes)],
        'max_magnitude': np.max(magnitudes),
        'mean_magnitude': np.mean(magnitudes),
        'freq_centroid': freq_centroid,
        'spectral_energy': np.sum(magnitudes**2),
    }

# Spectral summaries of the unsmoothed vertical and horizontal ball positions
freq_features_y = extract_frequency_features(df_features['mid_y'])
freq_features_x = extract_frequency_features(df_features['center_x'])

# Same report layout for both axes; the X heading carries the blank separator line
for heading, axis_features in (
    ("Frequency domain features (Y coordinate):", freq_features_y),
    ("\nFrequency domain features (X coordinate):", freq_features_x),
):
    print(heading)
    for feature_name, feature_value in axis_features.items():
        print(f"  {feature_name}: {feature_value:.4f}")

## 5. Pattern Detection for Ball Hits

Implement the ball hit detection algorithm using derivative analysis.

In [None]:
# Ball hit detection: a frame is a hit when the vertical velocity flips sign
# there and stays on the new side for enough of the following frames.
df_features['ball_hit'] = 0
minimum_change_frames_for_hit = 25

# Number of frames inspected after a candidate reversal (20% slack over the minimum)
lookahead_frames = int(minimum_change_frames_for_hit * 1.2)
delta_y_values = df_features['delta_y'].to_numpy()

for i in range(1, len(df_features) - lookahead_frames):
    current_delta = delta_y_values[i]
    next_delta = delta_y_values[i + 1]

    # NaN deltas compare False on both sides, so they never start a candidate
    turns_negative = current_delta > 0 and next_delta < 0
    turns_positive = current_delta < 0 and next_delta > 0
    if not (turns_negative or turns_positive):
        continue

    # Frames i+1 .. i+lookahead_frames; the loop bound keeps this slice in range
    following_deltas = delta_y_values[i + 1:i + lookahead_frames + 1]
    if turns_negative:
        change_count = np.count_nonzero(following_deltas < 0)
    else:
        change_count = np.count_nonzero(following_deltas > 0)

    # Require at least `minimum_change_frames_for_hit` frames on the new side
    if change_count > minimum_change_frames_for_hit - 1:
        df_features.loc[i, 'ball_hit'] = 1

# Frames flagged as hits
ball_hit_frames = df_features.index[df_features['ball_hit'] == 1].tolist()
print(f"Detected {len(ball_hit_frames)} ball hits at frames: {ball_hit_frames}")

## 6. Coordinate Transformation Features

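Convert pixel coordinates to approximate real-world measurements and court-relative positions. The conversion below is a simplified linear scaling that assumes the court fills most of the frame, so the resulting distances and speeds are rough estimates.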

In [None]:
# Court dimensions (tennis court in meters)
COURT_LENGTH = 23.77  # meters
COURT_WIDTH = 10.97   # meters

# Assumed video frame size in pixels (used when no override is passed)
VIDEO_WIDTH = 1920
VIDEO_HEIGHT = 1080


# Simple linear transformation (assumes court fills most of frame)
def pixel_to_court_coordinates(x_pixel, y_pixel, video_width=None, video_height=None):
    """
    Convert pixel coordinates to court coordinates
    This is a simplified transformation

    The pixel position is normalised by the frame size and scaled linearly;
    no perspective correction is applied. Scalars and NumPy arrays / pandas
    Series are both accepted because only elementwise arithmetic is used.

    NOTE(review): image x is scaled by COURT_LENGTH and image y by
    COURT_WIDTH - confirm this matches the camera orientation of the footage.

    Parameters
    ----------
    x_pixel, y_pixel : number or array-like
        Position in image pixels.
    video_width, video_height : number, optional
        Frame size in pixels. Default to the module-level VIDEO_WIDTH and
        VIDEO_HEIGHT, looked up at call time.

    Returns
    -------
    tuple
        (court_x, court_y) in meters; court_y is flipped so it grows as the
        pixel row index shrinks.

    Raises
    ------
    ValueError
        If the frame width or height is not positive.
    """
    if video_width is None:
        video_width = VIDEO_WIDTH
    if video_height is None:
        video_height = VIDEO_HEIGHT
    if video_width <= 0 or video_height <= 0:
        raise ValueError("video_width and video_height must be positive")

    # Normalize to [0,1]
    x_norm = x_pixel / video_width
    y_norm = y_pixel / video_height

    # Map to court dimensions
    court_x = x_norm * COURT_LENGTH
    court_y = (1 - y_norm) * COURT_WIDTH  # Flip Y axis

    return court_x, court_y

# Apply coordinate transformation to the whole columns at once; the helper
# only performs elementwise arithmetic, so it accepts Series directly.
court_x_values, court_y_values = pixel_to_court_coordinates(
    df_features['center_x'], df_features['center_y']
)
df_features['court_x'] = court_x_values
df_features['court_y'] = court_y_values

# Per-frame displacement in meters times 30 frames per second gives m/s
for axis in ('x', 'y'):
    df_features[f'velocity_{axis}_ms'] = df_features[f'court_{axis}'].diff().mul(30)

df_features['speed_ms'] = np.sqrt(
    df_features['velocity_x_ms']**2 + df_features['velocity_y_ms']**2
)

print("Court coordinate features:")
print(df_features[['court_x', 'court_y', 'speed_ms']].describe())

## 7. Spatial Zone Features

Extract court zone-based features for spatial analysis.

In [None]:
# Define court zones
def get_court_zone(court_x, court_y):
    """
    Label a court position with one of nine zones.

    Each axis is cut into thirds of COURT_LENGTH (x) and COURT_WIDTH (y).
    Values below the first cut map to 'back' / 'left', values below the
    second cut to 'middle' / 'center', and everything else (including values
    that fail both comparisons) to 'front' / 'right'.

    Returns the label as "<x_zone>_<y_zone>", e.g. "middle_left".
    """
    x_zone = (
        'back' if court_x < COURT_LENGTH / 3
        else 'middle' if court_x < 2 * COURT_LENGTH / 3
        else 'front'
    )
    y_zone = (
        'left' if court_y < COURT_WIDTH / 3
        else 'center' if court_y < 2 * COURT_WIDTH / 3
        else 'right'
    )
    return "_".join((x_zone, y_zone))

# Label every frame with its court zone
df_features['court_zone'] = list(
    map(get_court_zone, df_features['court_x'], df_features['court_y'])
)

# Frames per zone, most frequent first
zone_counts = df_features['court_zone'].value_counts()
print("Ball position distribution by court zone:")
print(zone_counts)

# Share of all frames spent in each zone
total_frames = len(df_features)
zone_percentages = zone_counts / total_frames * 100
print("\nTime percentage in each zone:")
for zone_name, zone_percentage in zone_percentages.items():
    print(f"  {zone_name}: {zone_percentage:.1f}%")

## 8. Statistical Features Over Time Windows

Extract statistical features over different time windows.

In [None]:
# Multiple window sizes for temporal abstraction
window_sizes = [3, 10, 20, 50]


def _count_direction_changes(window_values):
    """Count sign changes between consecutive non-NaN values of one window."""
    # Drop NaN first: the first delta_y is NaN, and NaN != 0 would otherwise
    # register as a spurious direction change in every window that contains it.
    finite_values = window_values[~np.isnan(window_values)]
    return float(np.count_nonzero(np.diff(np.sign(finite_values))))


for window in window_sizes:
    # Trailing windows; min_periods=1 lets the first frames use partial windows
    y_windows = df_features['mid_y'].rolling(window, min_periods=1)
    speed_windows = df_features['speed'].rolling(window, min_periods=1)

    # Position statistics
    df_features[f'y_mean_{window}'] = y_windows.mean()
    df_features[f'y_std_{window}'] = y_windows.std()
    df_features[f'y_range_{window}'] = y_windows.max() - y_windows.min()

    # Speed statistics
    df_features[f'speed_mean_{window}'] = speed_windows.mean()
    df_features[f'speed_max_{window}'] = speed_windows.max()

    # Direction change indicators (raw=True hands the helper a plain ndarray
    # instead of building a Series for every window)
    df_features[f'direction_changes_{window}'] = (
        df_features['delta_y']
        .rolling(window, min_periods=1)
        .apply(_count_direction_changes, raw=True)
    )

# Display some statistical features.
# Match on the exact "_<window>" suffix: a substring test such as '_3' in col
# would also pick up unrelated columns like '..._30'.
window_suffixes = tuple(f'_{w}' for w in window_sizes)
stat_cols = [col for col in df_features.columns if col.endswith(window_suffixes)]
print(f"Created {len(stat_cols)} statistical features over different time windows")
print("\nSample statistical features:")
print(df_features[stat_cols[:6]].head())

## 9. Visualization of Features and Patterns

Visualize extracted features and detected patterns.

In [None]:
# 3x2 dashboard: position, velocity, speed, court trajectory, acceleration, correlations
fig, axes = plt.subplots(3, 2, figsize=(15, 12))
(ax_position, ax_velocity), (ax_speed, ax_court), (ax_accel, ax_corr) = axes
frames = range(len(df_features))

# Shared look for the detected-hit markers, and the rows they are drawn from
hit_marker = dict(color='red', s=100, marker='o', zorder=5)
hit_rows = df_features.loc[ball_hit_frames]

# Ball position and rolling mean
ax_position.plot(frames, df_features['mid_y'], 'b-', alpha=0.5, label='Raw position')
ax_position.plot(frames, df_features['mid_y_rolling_mean'], 'r-', linewidth=2, label='Rolling mean')
ax_position.scatter(ball_hit_frames, hit_rows['mid_y_rolling_mean'], label='Ball hits', **hit_marker)
ax_position.set_title('Ball Y Position with Detected Hits')
ax_position.set_ylabel('Y Position (pixels)')
ax_position.legend()

# Velocity (delta_y) with a zero reference line
ax_velocity.plot(frames, df_features['delta_y'], 'g-', linewidth=1)
ax_velocity.axhline(y=0, color='k', linestyle='--', alpha=0.5)
ax_velocity.scatter(ball_hit_frames, hit_rows['delta_y'], **hit_marker)
ax_velocity.set_title('Vertical Velocity (delta_y)')
ax_velocity.set_ylabel('Velocity (pixels/frame)')

# Speed over time, raw and 10-frame mean
ax_speed.plot(frames, df_features['speed'], 'purple', linewidth=1)
ax_speed.plot(frames, df_features['speed_mean_10'], 'orange', linewidth=2, label='10-frame mean')
ax_speed.set_title('Ball Speed')
ax_speed.set_ylabel('Speed (pixels/frame)')
ax_speed.legend()

# Trajectory in court coordinates with hit locations
ax_court.plot(df_features['court_x'], df_features['court_y'], 'b-', alpha=0.7, linewidth=1)
ax_court.scatter(hit_rows['court_x'], hit_rows['court_y'], **hit_marker)
ax_court.set_title('Ball Trajectory on Court')
ax_court.set_xlabel('Court X (meters)')
ax_court.set_ylabel('Court Y (meters)')

# Acceleration with a zero reference line
ax_accel.plot(frames, df_features['accel_y'], 'brown', linewidth=1)
ax_accel.axhline(y=0, color='k', linestyle='--', alpha=0.5)
ax_accel.set_title('Vertical Acceleration')
ax_accel.set_ylabel('Acceleration (pixels/frame²)')
ax_accel.set_xlabel('Frame')

# Grid on every line plot (the heatmap panel is left without one)
for line_axis in (ax_position, ax_velocity, ax_speed, ax_court, ax_accel):
    line_axis.grid(True)

# Feature correlation heatmap
feature_cols = ['mid_y_rolling_mean', 'delta_y', 'speed', 'accel_y', 'y_std_10', 'direction_changes_10']
corr_matrix = df_features[feature_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax_corr)
ax_corr.set_title('Feature Correlation Matrix')

plt.tight_layout()
plt.show()

## 10. Feature Summary and Export

Create final feature set for machine learning applications.

In [None]:
# Select final feature set
feature_columns = [
    # Position features
    'center_x', 'center_y', 'court_x', 'court_y',

    # Temporal features
    'mid_y_rolling_mean', 'mid_y_rolling_std',
    'center_x_rolling_mean', 'center_x_rolling_std',

    # Motion features
    'delta_y', 'delta_x', 'speed', 'direction',
    'accel_y', 'accel_x',
    'velocity_x_ms', 'velocity_y_ms', 'speed_ms',

    # Statistical features (select representative ones)
    'y_mean_10', 'y_std_10', 'y_range_10',
    'speed_mean_10', 'speed_max_10',
    'direction_changes_10',

    # Target variable
    'ball_hit'
]

# Create final feature dataset
df_features_final = df_features[feature_columns].copy()
df_features_final['frame'] = range(len(df_features_final))
df_features_final['court_zone'] = df_features['court_zone']

# Handle any remaining NaN values (e.g. the leading rows of diff-based columns)
df_features_final = df_features_final.fillna(0)

print(f"Final feature set shape: {df_features_final.shape}")
print(f"Ball hits detected: {df_features_final['ball_hit'].sum()}")

# Feature importance based on absolute correlation with ball_hit
numeric_features = df_features_final.select_dtypes(include=[np.number]).columns
correlations = (
    df_features_final[numeric_features]
    .corrwith(df_features_final['ball_hit'])
    .abs()
    .sort_values(ascending=False)
)

# The target correlates perfectly with itself, so remove it BEFORE taking the
# top 10; filtering it out afterwards left only nine rows, numbered from 2.
top_correlations = correlations.drop('ball_hit', errors='ignore').head(10)

print("\nTop 10 features correlated with ball hits:")
for rank, (feature, corr) in enumerate(top_correlations.items(), start=1):
    print(f"{rank:2d}. {feature:20s}: {corr:.4f}")

# Save feature set
df_features_final.to_csv('tennis_features_chapter4.csv', index=False)
print("\nFeature set saved to 'tennis_features_chapter4.csv'")

# Summary statistics
print("\nFeature summary statistics:")
print(df_features_final[numeric_features].describe())

## Summary

This notebook demonstrated Chapter 4 concepts:

1. **Temporal Features**: Created rolling window aggregations and time-based statistics
2. **Velocity Analysis**: Calculated delta_y and delta_x for motion analysis
3. **Pattern Detection**: Implemented ball hit detection using derivative analysis
4. **Frequency Features**: Extracted spectral characteristics using FFT
5. **Coordinate Transformation**: Converted pixels to real-world court coordinates
6. **Spatial Features**: Created court zone classifications and spatial statistics
7. **Multi-scale Features**: Generated features over multiple time windows
8. **Feature Selection**: Ranked features by their absolute correlation with the detected ball-hit label

The extracted features provide a comprehensive representation of ball motion patterns suitable for machine learning applications in tennis analysis.