# Tennis Dataset Analysis - ML4QS Chapter 2
## Comprehensive Analysis of Multi-Video Tennis Temporal Dataset

- **Temporal Data Analysis**: Time-series analysis of player and ball movements
- **Data Quality Assessment**: Missing data patterns and detection rates
- **Feature Engineering**: Derived metrics from raw position data
- **Multi-object Tracking**: Coordinated analysis of players and ball
- **Court-calibrated Measurements**: Real-world distances and speeds
- **Statistical Analysis**: Performance metrics and movement patterns

## Data Loading and Initial Exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

import data_analysis_help.dataset_analysis_helper as helper

# Plot appearance. The palette is set after the style on purpose, so the
# order of these two calls is kept.
plt.style.use('default')
sns.set_palette("husl")

# Notebook banner: title, rule, and the wall-clock start time.
banner_lines = (
    "TENNIS DATASET ANALYSIS - ML4QS Chapter 2",
    "=" * 60,
    f"Analysis started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
)
for banner_line in banner_lines:
    print(banner_line)

In [None]:
# CSV holding the combined temporal dataset; the name is reused later when
# the analysis results are exported.
dataset_file = 'tennis_comprehensive_temporal_dataset_20250622_012949.csv'

# Only the read itself sits inside the try block, so a problem while printing
# the summary (for example a missing 'time_seconds' column) surfaces as its
# own error instead of being reported as a loading failure.
print(f"Loading dataset: {dataset_file}")
try:
    # The first CSV column becomes the index; parse_dates=True asks pandas to
    # parse that index as dates.
    df = pd.read_csv(dataset_file, index_col=0, parse_dates=True)
except FileNotFoundError:
    print(f"Dataset file not found: {dataset_file}")
    print("Please ensure the file exists in the current directory")
    raise
except Exception as e:
    # Report and re-raise: nothing below can run without the dataframe.
    print(f"Error loading dataset: {e}")
    raise
else:
    print("Dataset loaded successfully")
    print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
    print(f"Time range: {df.index.min()} to {df.index.max()}")
    total_seconds = df['time_seconds'].max()
    print(f"Duration: {total_seconds:.1f} seconds ({total_seconds/60:.1f} minutes)")

In [None]:
print("DATASET STRUCTURE OVERVIEW")
print("=" * 50)

# Mapping of category name -> column names, as produced by the project helper.
column_categories = helper.categorize_columns(df)

print("Column Categories:")
for group_name, group_cols in column_categories.items():
    n_cols = len(group_cols)
    print(f"  {group_name}: {n_cols} columns")
    # Small groups are listed in full; larger ones show three names and a count.
    if n_cols > 5:
        preview = f"{', '.join(group_cols[:3])}... (+{n_cols-3} more)"
    else:
        preview = ', '.join(group_cols)
    print(f"    {preview}")

print("\nData Types:")
# Number of columns per dtype.
data_types = df.dtypes.value_counts()
for dtype_name, n_columns in data_types.items():
    print(f"  {dtype_name}: {n_columns} columns")

In [None]:
print("BASIC DATASET STATISTICS")
print("=" * 50)

# Detection summary keyed by object name; every entry read below provides
# 'rate' and 'count', and the single-object entries also provide 'total'.
detection_stats = helper.calculate_detection_rates(df)

if 'ball' in detection_stats:
    ball_stats = detection_stats['ball']
    print(f"Ball Detection Rate: {ball_stats['rate']:.1%} ({ball_stats['count']:,} / {ball_stats['total']:,} frames)")

for player_id in [1, 2]:
    player_key = f'player_{player_id}'
    if player_key in detection_stats:
        player_stats = detection_stats[player_key]
        print(f"Player {player_id} Detection Rate: {player_stats['rate']:.1%} ({player_stats['count']:,} / {player_stats['total']:,} frames)")

if 'both_players' in detection_stats:
    both_stats = detection_stats['both_players']
    print(f"Both Players Detected: {both_stats['rate']:.1%} ({both_stats['count']:,} frames)")

if 'all_objects' in detection_stats:
    all_stats = detection_stats['all_objects']
    print(f"All Objects Detected: {all_stats['rate']:.1%} ({all_stats['count']:,} frames)")

print("\nCourt Information:")
court_cols = [col for col in df.columns if col.startswith('court_') and 'meters' in col]
for col in court_cols[:5]:
    # Report the first non-missing value. Taking row 0 directly would print
    # "nan" whenever the first frame lacks the measurement but later frames
    # have it.
    observed = df[col].dropna()
    value = observed.iloc[0] if not observed.empty else 'N/A'
    measurement_name = col.replace('court_', '').replace('_meters', '').replace('_', ' ').title()
    print(f"  {measurement_name}: {value}")

# Prefer the ball's video-source column; fall back to player 1's when the
# ball column is absent.
video_source_col = next(
    (col for col in ('ball_video_source', 'player_1_video_source') if col in df.columns),
    None,
)
if video_source_col is not None:
    unique_videos = df[video_source_col].dropna().nunique()
    print("\nVideo Information:")
    print(f"  Unique videos in dataset: {unique_videos}")

## Data Quality Assessment

In [None]:
print("DATA QUALITY ASSESSMENT")
print("=" * 50)

# Per-column missing-data summary; each entry is read for 'percentage',
# 'count' and 'total'.
missing_info = helper.analyze_missing_data(df)

print("Missing Data in Key Tracking Columns:")
for column_name, gaps in missing_info.items():
    print(f"  {column_name}: {gaps['percentage']:.1f}% missing ({gaps['count']:,} / {gaps['total']:,} frames)")

continuity_stats = helper.analyze_detection_continuity(df)

print("\nDetection Continuity Analysis:")
if 'ball' in continuity_stats:
    streaks = continuity_stats['ball']
    print(f"  Ball detection streaks: {streaks['streak_count']} streaks")
    print(f"    Average streak length: {streaks['avg_length']:.1f} frames")
    # Frames are converted to seconds with the same fixed divisor of 30 as before.
    longest_streak = streaks['max_length']
    print(f"    Longest streak: {longest_streak} frames ({longest_streak/30:.1f}s)")
    print(f"    Shortest streak: {streaks['min_length']} frames")

consistency_info = helper.check_temporal_consistency(df)

print("\nTemporal Consistency:")
if consistency_info:
    print(f"  Expected frame interval: {consistency_info['expected_interval']:.4f}s (30 fps)")
    print(f"  Actual average interval: {consistency_info['actual_interval']:.4f}s")
    if consistency_info['is_consistent']:
        timing_label = 'Good'
    else:
        timing_label = 'Variable'
    print(f"  Timing consistency: {timing_label}")

In [None]:
# Build the data-quality figure with the project helper and render it.
fig = helper.create_data_quality_plots(df)
plt.show()

print("Data quality visualizations created")
# The interval is recomputed here purely for the message below; it is not
# returned by the helper.
# NOTE(review): presumably this mirrors the down-sampling applied inside
# create_data_quality_plots (about 1000 points) — confirm against the helper.
sample_interval = max(1, len(df) // 1000)
print(f"Sampled every {sample_interval} frames for visualization clarity")

## Movement and Speed Analysis

In [None]:
def _print_movement_summary(heading, summary):
    """Print the heading followed by the speed and distance lines of one summary."""
    print(heading)
    print(f"   Total movement samples: {summary['count']:,}")
    print(f"   Average speed: {summary['mean_speed']:.1f} km/h")
    print(f"   Median speed: {summary['median_speed']:.1f} km/h")
    print(f"   Maximum speed: {summary['max_speed']:.1f} km/h")
    print(f"   Speed variability (std): {summary['std_speed']:.1f} km/h")
    print(f"   Total distance traveled: {summary['total_distance']:.1f} meters")


print("MOVEMENT AND SPEED ANALYSIS")
print("=" * 50)

# Movement summaries keyed by 'player_1', 'player_2' and 'ball' when present.
movement_stats = helper.calculate_movement_stats(df)

for player_id in (1, 2):
    player_key = f'player_{player_id}'
    if player_key in movement_stats:
        _print_movement_summary(
            f"\nPlayer {player_id} Movement Statistics:",
            movement_stats[player_key],
        )

if 'ball' in movement_stats:
    _print_movement_summary("\nBall Movement Statistics:", movement_stats['ball'])

region_stats = helper.analyze_court_regions(df)

print("\nMovement Patterns:")
for player_id in (1, 2):
    player_key = f'player_{player_id}'
    if player_key not in region_stats:
        continue
    print(f"\n   Player {player_id} court region usage:")
    # Only the first five regions, in the order the helper returned them.
    leading_regions = list(region_stats[player_key].items())[:5]
    for region_name, usage in leading_regions:
        region_label = region_name.replace('_', ' ').title()
        print(f"     {region_label}: {usage['percentage']:.1f}% ({usage['count']:,} frames)")

In [None]:
# Build the movement-analysis figure with the project helper and render it.
fig = helper.create_movement_analysis_plots(df)
plt.show()

print("Movement analysis visualizations created")

## Temporal Pattern Analysis

In [None]:
print("TEMPORAL PATTERN ANALYSIS")
print("=" * 50)

print("Detection Pattern Analysis:")

if 'video_transition' in df.columns:
    n_transitions = df['video_transition'].sum()
    print(f"   Total video transitions: {n_transitions}")
    if n_transitions > 0:
        # Total elapsed time divided evenly over the transition count.
        mean_gap = df['time_seconds'].max() / n_transitions
        print(f"   Average time between transitions: {mean_gap:.1f} seconds")

trajectory_stats = helper.analyze_ball_trajectory(df)

if trajectory_stats:
    traj = trajectory_stats
    print("\nBall Trajectory Analysis:")
    print(f"   Ball movement range: X = {traj['x_range']:.1f} pixels, Y = {traj['y_range']:.1f} pixels")
    print(f"   Ball center position: ({traj['center_x']:.1f}, {traj['center_y']:.1f}) pixels")
    print(f"   Ball movement direction: {traj['upward_frames']} upward frames, {traj['downward_frames']} downward frames")

interaction_stats = helper.analyze_player_interactions(df)

# The heading is printed even when the helper returns nothing to report.
print("\nPlayer Interaction Analysis:")
if interaction_stats:
    print(f"   Frames with both players: {interaction_stats['frames_with_both']:,}")
    print(f"   Average distance between players: {interaction_stats['avg_distance_pixels']:.1f} pixels")
    print(f"   Minimum distance: {interaction_stats['min_distance_pixels']:.1f} pixels")
    print(f"   Maximum distance: {interaction_stats['max_distance_pixels']:.1f} pixels")
    # The metric distance is optional in the helper's result.
    if 'avg_distance_meters' in interaction_stats:
        print(f"   Average distance (meters): {interaction_stats['avg_distance_meters']:.1f}m")

activity_stats, activity_score = helper.calculate_activity_score(df)

print("\nActivity Intensity Analysis:")
if activity_stats:
    print(f"   Average activity score: {activity_stats['mean_score']:.2f} objects per frame")
    # One row per activity level: (printed label, frame-count key, share key).
    activity_rows = (
        ("High activity frames (3 objects)", 'high_activity_frames', 'high_activity_pct'),
        ("Medium activity frames (2 objects)", 'medium_activity_frames', 'medium_activity_pct'),
        ("Low activity frames (1 object)", 'low_activity_frames', 'low_activity_pct'),
        ("No activity frames (0 objects)", 'no_activity_frames', 'no_activity_pct'),
    )
    for row_label, frames_key, share_key in activity_rows:
        print(f"   {row_label}: {activity_stats[frames_key]:,} ({activity_stats[share_key]:.1%})")

variance_stats = helper.analyze_speed_variance(df)

print("\nSpeed Variance Analysis:")
for player_id in (1, 2):
    player_key = f'player_{player_id}'
    if player_key not in variance_stats:
        continue
    spread = variance_stats[player_key]
    print(f"   Player {player_id}:")
    print(f"     Speed variability (CV): {spread['coefficient_variation']:.2f}")
    print(f"     High-speed frames (>{spread['high_speed_threshold']:.1f} km/h): {spread['high_speed_frames']} ({spread['high_speed_percentage']:.1%})")

In [None]:
# Build the temporal-pattern figure with the project helper and render it.
fig = helper.create_temporal_pattern_plots(df)
plt.show()

print("Temporal pattern visualizations created")

## Performance Metrics and Summary

In [None]:
print("PERFORMANCE METRICS AND SUMMARY")
print("=" * 60)

# Combines the dataframe with the movement, detection and region results
# computed in earlier cells.
summary_stats = helper.generate_summary_statistics(df, movement_stats, detection_stats, region_stats)

for section_name, section_metrics in summary_stats.items():
    print(f"\n{section_name}:")
    for metric_name, metric_value in section_metrics.items():
        print(f"   {metric_name}: {metric_value}")

insights, findings = helper.generate_insights_and_findings(detection_stats, movement_stats)

print("\nML4QS CHAPTER 2 INSIGHTS")
print("=" * 50)

for insight_text in insights:
    print(f"  {insight_text}")

print("\nKey Findings:")
# Findings are numbered starting from 1.
for position, finding_text in enumerate(findings, start=1):
    print(f"  {position}. {finding_text}")

# Closing banner, emitted as four separate lines.
print(
    "\nANALYSIS COMPLETE!",
    "This dataset demonstrates successful application of ML4QS Chapter 2 concepts",
    "Ready for advanced tennis analytics and machine learning applications",
    "=" * 60,
    sep="\n",
)

In [None]:
print("EXPORTING ANALYSIS RESULTS")
print("=" * 40)

# Timestamp used by the helper and in the sample file name.
analysis_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

export_info = helper.export_analysis_results(summary_stats, insights, findings, dataset_file, analysis_timestamp)

if not export_info['success']:
    print(f"Error exporting results: {export_info['error']}")
else:
    json_path = export_info['json_file']
    print(f"Analysis results exported: {json_path}")
    report_path = export_info['report_file']
    print(f"Analysis report exported: {report_path}")

    # A random sample is written only for datasets above 10,000 rows.
    sample_filename = None
    if len(df) > 10000:
        sample_size = 10000
        sample_filename = f'tennis_dataset_sample_{analysis_timestamp}.csv'
        # Fixed seed keeps the sample reproducible across runs.
        df.sample(n=sample_size, random_state=42).to_csv(sample_filename)
        print(f"Dataset sample exported: {sample_filename} ({sample_size:,} rows)")

    print("\nExport Summary:")
    print(f"   JSON results: {json_path}")
    print(f"   Text report: {report_path}")
    if sample_filename is not None:
        print(f"   Data sample: {sample_filename}")

print(
    "\nTENNIS DATASET ANALYSIS COMPLETE!",
    "Dataset successfully analyzed using ML4QS Chapter 2 methodologies",
    "Results demonstrate comprehensive temporal data analysis capabilities",
    sep="\n",
)