# Merging files and creating dataset

### Importing required libraries

In [1]:
# --- Built-in ---
import calendar
import gc
import re
import warnings
from datetime import datetime, timedelta
from pathlib import Path
from collections import defaultdict

# --- Third-party ---
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import HTML, display
from scipy import stats
from scipy.stats import mannwhitneyu, pearsonr, spearmanr, ttest_ind, f_oneway
from tabulate import tabulate
import warnings
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# --- Settings ---
warnings.filterwarnings("ignore")

### Step 1 : Define base class for processing parquet files

In [2]:
class TennisDataProcessor:
    def __init__(self, base_path):
        self.base_path = Path(base_path)
        self.all_data = {}
        self.failed_files = []
        self.data_by_prefix = {}

    def extract_file_components(self, filename):
        """Split a file stem shaped like ``<prefix>_<number>`` into its parts.

        Returns ``(prefix, number)`` with ``number`` converted to int. When
        the name does not start with that pattern, ``(filename, 0)`` is
        returned instead.
        """
        parsed = re.match(r"([a-zA-Z_]+)_(\d+)", filename)
        if parsed is None:
            return filename, 0

        prefix, digits = parsed.groups()
        return prefix, int(digits)

    def get_sorted_folders(self):
        """Get sorted list of folders"""
        folders = [f for f in self.base_path.iterdir() if f.is_dir()]
        folders_sorted = sorted(folders, key=lambda x: x.name)
        return folders_sorted

    def get_file_types_in_folder(self, folder_path):
        """Extract file types in a folder"""
        parquet_files = list(folder_path.glob("*.parquet"))

        prefixes = set()
        prefix_counts = {}

        for file_path in parquet_files:
            prefix, _ = self.extract_file_components(file_path.stem)
            prefixes.add(prefix)

            if prefix not in prefix_counts:
                prefix_counts[prefix] = 0
            prefix_counts[prefix] += 1

        return prefixes, prefix_counts

    def find_missing_files(self):
        """Find folders missing required files"""
        folders = self.get_sorted_folders()

        print("\nüîç Checking missing files...")
        print("=" * 80)

        # Collect all prefixes
        all_prefixes = set()
        folder_prefixes = {}

        for folder in folders:
            prefixes, _ = self.get_file_types_in_folder(folder)
            folder_prefixes[folder.name] = prefixes
            all_prefixes.update(prefixes)

        # Find missing files
        missing_report = {}

        for prefix in sorted(all_prefixes):
            folders_with_prefix = [f for f, p in folder_prefixes.items() if prefix in p]
            folders_without_prefix = [
                f for f, p in folder_prefixes.items() if prefix not in p
            ]

            if folders_without_prefix:
                missing_report[prefix] = {
                    "found_in": len(folders_with_prefix),
                    "missing_in": folders_without_prefix,
                }

        # Show report
        if missing_report:
            print("\n‚ö†Ô∏è Missing files:")
            for prefix, info in missing_report.items():
                print(f"\nüìÅ '{prefix}':")
                print(f"   ‚úì Present in {info['found_in']} folders")
                print(f"   ‚úó Missing in folders:")
                for folder_name in info["missing_in"][:5]:
                    print(f"      - {folder_name}")
                if len(info["missing_in"]) > 5:
                    print(f"      ... and {len(info['missing_in']) - 5} more folders")
        else:
            print("‚úÖ All files exist in all folders")

        # Check similar names
        print("\nüîç Checking similar names...")
        similar_names = self.find_similar_names(all_prefixes)
        if similar_names:
            print("\n‚ö†Ô∏è Similar names found:")
            for group in similar_names:
                print(f"   ‚Ä¢ {', '.join(group)}")

        print("=" * 80)
        return missing_report

    def find_similar_names(self, names):
        """Group names that look like spelling variants of one another.

        Two names count as similar when their ``difflib.SequenceMatcher``
        ratio is above 0.8. Names are visited in sorted order; each
        still-unassigned name seeds a group and collects every later
        unassigned name that is similar to the seed, so a name appears in at
        most one group.

        Parameters:
        -----------
        names: iterable of strings (list, set, generator, ...)

        Returns:
        --------
        list of groups, each a list of two or more similar names
        """
        from difflib import SequenceMatcher

        threshold = 0.8  # 80% similarity

        # Sorting materialises one-shot iterables and makes the result
        # independent of set iteration order, which can differ between runs.
        ordered = sorted(names)

        similar_groups = []
        processed = set()

        for position, seed in enumerate(ordered):
            if seed in processed:
                continue
            processed.add(seed)

            group = [seed]
            for candidate in ordered[position + 1 :]:
                if candidate in processed:
                    continue
                if SequenceMatcher(None, seed, candidate).ratio() > threshold:
                    group.append(candidate)
                    processed.add(candidate)

            if len(group) > 1:
                similar_groups.append(group)

        return similar_groups

    def show_available_folders(self, show_count=10, show_file_types=True):
        """Display list of available folders with details"""
        folders = self.get_sorted_folders()

        print("\n" + "=" * 80)
        print("üìÅ Available folders:")
        print("=" * 80)

        for i, folder in enumerate(folders[:show_count], 1):
            total_files = len(list(folder.glob("*.parquet")))

            print(f"\n{i:3}. üìÖ {folder.name}")
            print(f"     üìä Total files: {total_files}")

            if show_file_types and total_files > 0:
                prefixes, prefix_counts = self.get_file_types_in_folder(folder)

                if prefixes:
                    print(f"     üìÇ File types ({len(prefixes)} types):")

                    sorted_prefixes = sorted(prefix_counts.items())
                    max_items_per_line = 3

                    for j in range(0, len(sorted_prefixes), max_items_per_line):
                        line_items = sorted_prefixes[j : j + max_items_per_line]
                        line_text = "        "
                        for prefix, count in line_items:
                            line_text += f"‚Ä¢ {prefix}({count})  "
                        print(line_text)

        if len(folders) > show_count:
            print(f"\n     ... and {len(folders) - show_count} more folders")

        print("\n" + "=" * 80)
        print(f"üìä Summary:")
        print(f"   ‚Ä¢ Total folders: {len(folders)}")
        total_all_files = sum(len(list(f.glob("*.parquet"))) for f in folders)
        print(f"   ‚Ä¢ Total files: {total_all_files:,}")
        print("=" * 80 + "\n")

        return folders

    def inspect_parquet_file(
        self, file_path=None, folder_idx=1, file_prefix=None, rows=5
    ):
        """
        Inspect content of a parquet file

        Parameters:
        -----------
        file_path: direct file path
        folder_idx: folder index
        file_prefix: target file prefix
        rows: number of rows to display
        """
        if file_path:
            # Using direct path
            target_file = Path(file_path)
        else:
            # Finding file based on parameters
            folders = self.get_sorted_folders()
            if folder_idx > len(folders):
                print(f"‚ùå Folder number {folder_idx} does not exist")
                return None

            folder = folders[folder_idx - 1]

            if file_prefix:
                # First file with this prefix
                files = list(folder.glob(f"{file_prefix}_*.parquet"))
                if not files:
                    print(
                        f"‚ùå No file with prefix '{file_prefix}' found in folder {folder.name}"
                    )
                    return None
                target_file = files[0]
            else:
                # Select first file
                files = list(folder.glob("*.parquet"))
                if not files:
                    print(f"‚ùå No parquet files found in folder {folder.name}")
                    return None
                target_file = files[0]

        print(f"\nüìÑ Inspecting file: {target_file.name}")
        print(f"üìÅ From folder: {target_file.parent.name}")
        print("=" * 80)

        try:
            # Read file
            df = pd.read_parquet(target_file)

            # General info
            print(f"\nüìä General Info:")
            print(f"   ‚Ä¢ Rows: {len(df):,}")
            print(f"   ‚Ä¢ Columns: {len(df.columns)}")
            print(
                f"   ‚Ä¢ Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB"
            )

            # Column info
            print(f"\nüìã Columns and Types:")
            for col in df.columns:
                print(f"   ‚Ä¢ {col}: {df[col].dtype}")

            # First rows
            print(f"\nüîç Sample Data ({rows} rows):")
            print(df.head(rows).to_string())

            # Descriptive stats
            print(f"\nüìà Descriptive Statistics:")
            print(df.describe().to_string())

            # Null values
            null_counts = df.isnull().sum()
            if null_counts.any():
                print(f"\n‚ö†Ô∏è Null Values:")
                for col, count in null_counts[null_counts > 0].items():
                    print(f"   ‚Ä¢ {col}: {count} ({count/len(df)*100:.1f}%)")

            return df

        except Exception as e:
            print(f"‚ùå Error reading file: {e}")
            return None

    def select_folders_to_process(self, folder_range="all"):
        """Select folders for processing"""
        all_folders = self.get_sorted_folders()
        print(f"Total folders: {len(all_folders)}")

        selected_folders = []

        if folder_range == "all":
            selected_folders = all_folders
            print(f"‚úì All {len(selected_folders)} folders selected")

        elif isinstance(folder_range, int):
            if 1 <= folder_range <= len(all_folders):
                selected_folders = [all_folders[folder_range - 1]]
                print(
                    f"‚úì Folder number {folder_range} selected: {selected_folders[0].name}"
                )
            else:
                print(f"‚úó Invalid folder number: {folder_range}")

        elif isinstance(folder_range, tuple) and len(folder_range) == 2:
            start, end = folder_range
            if 1 <= start <= end <= len(all_folders):
                selected_folders = all_folders[start - 1 : end]
                print(
                    f"‚úì {len(selected_folders)} folders selected from {start} to {end}"
                )
                print(f"  From: {selected_folders[0].name}")
                print(f"  To: {selected_folders[-1].name}")
            else:
                print(f"‚úó Invalid range: {start} to {end}")

        elif isinstance(folder_range, str) and ":" in folder_range:
            start_date, end_date = folder_range.split(":")
            selected_folders = [
                f for f in all_folders if start_date <= f.name <= end_date
            ]
            if selected_folders:
                print(f"‚úì {len(selected_folders)} folders selected in date range")
                print(f"  From: {selected_folders[0].name}")
                print(f"  To: {selected_folders[-1].name}")
            else:
                print(f"‚úó No folder found between {start_date} and {end_date}")

        elif isinstance(folder_range, str):
            selected_folders = [f for f in all_folders if f.name == folder_range]
            if selected_folders:
                print(f"‚úì Folder '{folder_range}' selected")
            else:
                print(f"‚úó Folder '{folder_range}' not found")

        return selected_folders

    def read_parquet_safe(self, file_path):
        """Read parquet file with error handling

        Tries, in order: pandas with the fastparquet engine, pandas with the
        pyarrow engine, and pyarrow's ``read_table`` converted with
        ``ignore_metadata=True``.

        Returns:
        --------
        The DataFrame from the first reader that succeeds, or None when every
        reader fails.
        """

        def _read_via_pyarrow_table(path):
            # Imported lazily so a missing pyarrow only fails this attempt.
            import pyarrow.parquet as pq

            return pq.read_table(path).to_pandas(ignore_metadata=True)

        readers = (
            lambda path: pd.read_parquet(path, engine="fastparquet"),
            lambda path: pd.read_parquet(path, engine="pyarrow"),
            _read_via_pyarrow_table,
        )

        for reader in readers:
            try:
                return reader(file_path)
            except Exception:
                # Best-effort: a missing engine or an unreadable file simply
                # moves on to the next reader. Catching Exception instead of
                # using a bare except lets KeyboardInterrupt and SystemExit
                # propagate, so a long batch run can still be interrupted.
                continue

        return None

    def process_folders(self, folder_range="all", sample_size=None):
        """Process selected folders"""
        selected_folders = self.select_folders_to_process(folder_range)

        if not selected_folders:
            print("No folder was selected for processing!")
            return

        print(f"\nStarting processing of {len(selected_folders)} folders...")

        for folder_idx, folder in enumerate(selected_folders, 1):
            print(f"\n{'='*50}")
            print(f"Processing folder {folder_idx}/{len(selected_folders)}: {folder.name}")
            print(f"{'='*50}")

            parquet_files = list(folder.glob("*.parquet"))

            if sample_size and sample_size < len(parquet_files):
                parquet_files = parquet_files[:sample_size]
                print(
                    f"üìå Only {sample_size} files out of {len(list(folder.glob('*.parquet')))} files will be processed"
                )

            success_count = 0

            for file_idx, file_path in enumerate(parquet_files, 1):
                if file_idx % 100 == 0:
                    print(f"  Processing file {file_idx}/{len(parquet_files)}...")

                prefix, file_number = self.extract_file_components(file_path.stem)
                df = self.read_parquet_safe(file_path)

                if df is not None:
                    df["source_date"] = folder.name
                    df["source_file"] = file_path.stem
                    df["file_prefix"] = prefix
                    df["file_number"] = file_number
                    df["folder_order"] = folder_idx

                    if prefix not in self.data_by_prefix:
                        self.data_by_prefix[prefix] = []

                    self.data_by_prefix[prefix].append(
                        {
                            "date": folder.name,
                            "folder_order": folder_idx,
                            "file_number": file_number,
                            "data": df,
                        }
                    )

                    success_count += 1
                else:
                    self.failed_files.append(
                        {"folder": folder.name, "file": file_path.name}
                    )

            print(f"  ‚úì {success_count} files read out of {len(parquet_files)}")
            gc.collect()

        print(f"\n{'='*50}")
        print(f"Processing completed!")
        print(f"  - Folders processed: {len(selected_folders)}")
        print(f"  - Prefix groups detected: {len(self.data_by_prefix)}")
        for prefix, items in self.data_by_prefix.items():
            print(f"    ‚Ä¢ {prefix}: {len(items)} files")
        print(f"{'='*50}")

    def save_current_batch(self, output_path, batch_name=None):
        """Save currently processed data"""
        output_path = Path(output_path)
        output_path.mkdir(exist_ok=True)

        if not batch_name:
            batch_name = datetime.now().strftime("%Y%m%d_%H%M%S")

        batch_folder = output_path / f"batch_{batch_name}"
        batch_folder.mkdir(exist_ok=True)

        for prefix, items in self.data_by_prefix.items():
            sorted_items = sorted(
                items, key=lambda x: (x["folder_order"], x["file_number"])
            )
            dfs = [item["data"] for item in sorted_items]

            if dfs:
                combined_df = pd.concat(dfs, ignore_index=True)
                safe_prefix = prefix.replace("/", "_").replace("\\", "_")

                combined_df.to_csv(batch_folder / f"{safe_prefix}.csv", index=False)
                print(f"‚úì Saved: {safe_prefix}.csv ({len(combined_df)} rows)")

        if self.failed_files:
            pd.DataFrame(self.failed_files).to_csv(
                batch_folder / "failed_files.csv", index=False
            )

        print(f"\n‚úì All data saved in '{batch_folder}'")
        return batch_folder

    def clear_memory(self):
        """Clear memory"""
        self.data_by_prefix = {}
        self.all_data = {}
        gc.collect()
        print("‚úì Memory cleared")


def combine_batches(batch_folder_path):
    """Combine all saved batches

    Reads every CSV (except ``failed_files.csv``) from each ``batch_*``
    directory under ``batch_folder_path`` and concatenates the files that
    share a name.

    Batches are visited in natural order, so ``batch_folders_1_10`` comes
    before ``batch_folders_11_20`` and the combined rows keep the order in
    which the batches were produced.

    Returns:
    --------
    dict mapping each file stem (prefix) to its combined DataFrame
    """

    def _natural_key(path):
        # re.split with a capturing group alternates text and digit runs, so
        # odd positions are always numbers and compare numerically.
        tokens = re.split(r"(\d+)", path.name)
        return [int(tok) if idx % 2 else tok for idx, tok in enumerate(tokens)]

    batch_folder = Path(batch_folder_path)
    all_batches = sorted(
        (entry for entry in batch_folder.glob("batch_*") if entry.is_dir()),
        key=_natural_key,
    )

    combined_data = {}

    for batch in all_batches:
        print(f"Reading {batch.name}...")

        for csv_file in sorted(batch.glob("*.csv")):
            # The failure log is bookkeeping, not data.
            if csv_file.name == "failed_files.csv":
                continue
            combined_data.setdefault(csv_file.stem, []).append(pd.read_csv(csv_file))

    final_data = {}
    for prefix, dfs in combined_data.items():
        final_data[prefix] = pd.concat(dfs, ignore_index=True)
        print(f"‚úì {prefix}: {len(final_data[prefix])} rows")

    return final_data

### Step 2 : Initial data exploration

In [None]:
# Root directory that holds one sub-folder of parquet files per entry
base_path = "../tennis_data"

# Create the processor for that directory
processor = TennisDataProcessor(base_path)

# Print an overview, listing only the first 3 folders in detail
processor.show_available_folders(show_count=3)

# Report prefixes that are missing from some folders (and look-alike names)
missing_report = processor.find_missing_files()

# Print a summary of one "away_team" file from the first folder
df_sample = processor.inspect_parquet_file(folder_idx=1, file_prefix="away_team")

### Step 3 : Process 60 folders in 6 batches of 10 (execution may take some time)

In [None]:
# Process folders 1..60 in consecutive batches of 10. Each batch gets a fresh
# processor that is saved and cleared before the next batch starts.
base_path = "../tennis_data"
output_path = "../CSV_Files"

TOTAL_FOLDERS = 60
BATCH_SIZE = 10
banner = "#" * 60

for batch_num in range(TOTAL_FOLDERS // BATCH_SIZE):
    # Folder numbers are 1-based and the range is inclusive.
    start = batch_num * BATCH_SIZE + 1
    end = min(start + BATCH_SIZE - 1, TOTAL_FOLDERS)

    print(f"\n{banner}")
    print(f"Processing batch {batch_num + 1}: folders {start} to {end}")
    print(banner)

    processor = TennisDataProcessor(base_path)
    processor.process_folders(folder_range=(start, end))
    processor.save_current_batch(output_path, batch_name=f"folders_{start}_{end}")
    processor.clear_memory()

    print(f"‚úì Batch {batch_num + 1} completed")

### Step 4 : Final merging and saving

In [None]:
def save_final_data(final_data, output_path="./final_data", format="parquet"):
    """
    Save final data with error handling

    Each DataFrame is written as ``<prefix>_complete.<ext>``. With
    ``format="parquet"`` a failed parquet write falls back to CSV.

    Parameters:
    -----------
    final_data: dict of DataFrames
    output_path: save path
    format: 'parquet', 'csv', or 'both'

    Returns:
    --------
    (successful_saves, failed_saves): two lists of prefixes. For 'both', a
    prefix counts as successful only when both files were written.

    Raises:
    -------
    ValueError: when ``format`` is not one of the supported values
    """
    # Fail fast: an unknown format would otherwise write nothing and report
    # every dataset as failed without saying why.
    supported_formats = ("parquet", "csv", "both")
    if format not in supported_formats:
        raise ValueError(
            f"Unsupported format {format!r}; expected one of {supported_formats}"
        )

    wants_parquet = format in ("parquet", "both")
    wants_csv = format in ("csv", "both")

    # Create output directory
    output_dir = Path(output_path)
    output_dir.mkdir(exist_ok=True, parents=True)

    rule = "=" * 60
    print(f"\n{rule}")
    print(f"üìÅ Saving data to: {output_dir.absolute()}")
    print(rule)

    successful_saves = []
    failed_saves = []

    for prefix, df in final_data.items():
        print(f"\nüìù Processing {prefix}...")
        print(f"   ‚Ä¢ Rows: {len(df):,}")
        print(f"   ‚Ä¢ Columns: {len(df.columns)}")
        print(f"   ‚Ä¢ Approx Size: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

        parquet_ok = False
        csv_ok = False

        # Save as parquet
        if wants_parquet:
            parquet_file = output_dir / f"{prefix}_complete.parquet"
            try:
                df.to_parquet(parquet_file)
                print(f"   ‚úì Parquet saved: {parquet_file.name}")
                parquet_ok = True
            except Exception as e:
                print(f"   ‚úó Error saving Parquet: {e}")

        # Save as CSV: on request, or as a fallback when a parquet-only save
        # did not succeed.
        if wants_csv or not parquet_ok:
            csv_file = output_dir / f"{prefix}_complete.csv"
            try:
                df.to_csv(csv_file, index=False)
                print(f"   ‚úì CSV saved: {csv_file.name}")
                csv_ok = True
            except Exception as e:
                print(f"   ‚úó Error saving CSV: {e}")

        if format == "both":
            success = parquet_ok and csv_ok
        else:
            success = parquet_ok or csv_ok

        if success:
            successful_saves.append(prefix)
        else:
            failed_saves.append(prefix)

    # Final report
    print(f"\n{rule}")
    print(f"üìä Final Report:")
    print(f"   ‚úì Successful: {len(successful_saves)} files")
    print(f"   ‚úó Failed: {len(failed_saves)} files")

    if failed_saves:
        print(f"\n‚ö†Ô∏è Failed files:")
        for prefix in failed_saves:
            print(f"   - {prefix}")

    print(f"{rule}\n")

    return successful_saves, failed_saves


# Concatenate the per-batch CSV files under ../CSV_Files into one DataFrame
# per file prefix
final_data = combine_batches("../CSV_Files")

# Write each combined DataFrame to ../Final_parquet and keep the lists of
# prefixes that were saved successfully and that failed
successful, failed = save_final_data(
    final_data,
    output_path="../Final_parquet",
    format="parquet",
)

### Step 5 : Verify the integrity of the saved data

In [None]:
def load_final_data(
    directory_path="../Final_parquet",
    file_format="parquet",
):
    """Load saved files

    Parameters:
    -----------
    directory_path: folder containing the ``<prefix>_complete.<ext>`` files
    file_format: 'parquet' reads ``*.parquet``; any other value reads
        ``*.csv``

    Returns:
    --------
    dict mapping each prefix to its DataFrame; empty when the folder does not
    exist. Files that fail to load are reported and skipped.
    """
    dir_path = Path(directory_path)

    if not dir_path.exists():
        print(f"‚ùå Folder {directory_path} does not exist")
        return {}

    is_parquet = file_format == "parquet"
    extension = ".parquet" if is_parquet else ".csv"
    marker = "_complete"

    # Sorted so that the load order (and the dict order) is reproducible.
    files = sorted(dir_path.glob(f"*{extension}"))

    print(f"üìñ Reading {len(files)} {file_format} files...")

    loaded_data = {}
    for file in files:
        # Strip the marker only at the end of the name; replacing it
        # everywhere would mangle prefixes that contain "_complete".
        stem = file.stem
        prefix = stem[: -len(marker)] if stem.endswith(marker) else stem

        try:
            df = pd.read_parquet(file) if is_parquet else pd.read_csv(file)
        except Exception as e:
            print(f"   ‚úó Error reading {file.name}: {e}")
            continue

        loaded_data[prefix] = df
        print(f"   ‚úì {prefix}: {len(df):,} rows")

    return loaded_data


# Read the saved parquet files back into a dict keyed by prefix
loaded_data = load_final_data(
    "../Final_parquet",
    file_format="parquet"
)

# Spot-check: show the first rows of the away_team table, if it was loaded
if "away_team" in loaded_data:
    print(f"\nSample data for away_team:")
    print(loaded_data["away_team"].head())

### Step 6 : Create dataset for analysis and data exploration

In [None]:
class TennisDataExplorer:
    def __init__(
        self, data_path="../Final_parquet"
    ):
        self.data_path = Path(data_path)
        self.datasets = {}
        self.metadata = {}

    def load_all_datasets(self):
        """Load all datasets"""
        files = {
            "away_team": "away_team_complete.parquet",
            "pbp": "pbp_complete.parquet",
            "away_team_score": "away_team_score_complete.parquet",
            "event": "event_complete.parquet",
            "home_team": "home_team_complete.parquet",
            "home_team_score": "home_team_score_complete.parquet",
            "round": "round_complete.parquet",
            "season": "season_complete.parquet",
            "time": "time_complete.parquet",
            "tournament": "tournament_complete.parquet",
            "venue": "venue_complete.parquet",
            "odds": "odds_complete.parquet",
            "statistics": "statistics_complete.parquet",
            "power": "power_complete.parquet",
            "votes": "votes_complete.parquet",
        }

        print("üîÑ Loading dataframes and aggregating into dataset")
        for name, filename in files.items():
            file_path = self.data_path / filename
            if file_path.exists():
                try:
                    # Only a few initial rows for quick check
                    self.datasets[name] = pd.read_parquet(file_path)
                    print(f"   ‚úì {name}: {len(self.datasets[name]):,} rows")
                except Exception as e:
                    print(f"   ‚úó Error loading {name}: {e}")

        return self.datasets

    def analyze_dataset_structure(self):
        """Analyze structure of all datasets"""
        analysis_report = {}

        for name, df in self.datasets.items():
            print(f"\n{'='*60}")
            print(f"üìä Analysis {name}")
            print(f"{'='*60}")

            analysis = {
                "shape": df.shape,
                "columns": list(df.columns),
                "dtypes": df.dtypes.to_dict(),
                "null_counts": df.isnull().sum().to_dict(),
                "memory_usage": df.memory_usage(deep=True).sum() / 1024**2,  # MB
                "sample_data": df.head(3).to_dict(),
            }

            # Show key info
            print(f"üìè Dimensions: {analysis['shape'][0]:,} √ó {analysis['shape'][1]}")
            print(f"üíæ Memory: {analysis['memory_usage']:.2f} MB")
            print(f"üìã Columns: {', '.join(analysis['columns'][:5])}")
            if len(analysis["columns"]) > 5:
                print(f"           ... and {len(analysis['columns'])-5} more columns")

            # Check potential key columns
            key_columns = []
            for col in df.columns:
                if any(
                    key in col.lower()
                    for key in ["id", "match", "game", "player", "team"]
                ):
                    key_columns.append(col)
                    unique_count = df[col].nunique()
                    print(f"   üîë {col}: {unique_count:,} unique values")

            analysis["potential_keys"] = key_columns
            analysis_report[name] = analysis

            # Show sample data
            print(f"\nüìù Sample data:")
            print(df.head(2).to_string())

        self.metadata = analysis_report
        return analysis_report

    def find_relationships(self):
        """Find relationships between datasets"""
        print("\n" + "=" * 60)
        print("üîó Checking relationships between datasets")
        print("=" * 60)

        relationships = {}

        # Collect all columns
        all_columns = {}
        for name, df in self.datasets.items():
            all_columns[name] = set(df.columns)

        # Find common columns
        for name1 in self.datasets:
            relationships[name1] = {}
            for name2 in self.datasets:
                if name1 != name2:
                    common_cols = all_columns[name1].intersection(all_columns[name2])
                    if common_cols:
                        relationships[name1][name2] = list(common_cols)

        # Show relationships
        for dataset, relations in relationships.items():
            if relations:
                print(f"\nüìä {dataset}:")
                for related, columns in relations.items():
                    if columns:
                        print(f"   ‚ÜîÔ∏è {related}: {', '.join(columns[:3])}")

        return relationships

    def create_data_profile(self):
        """Create complete data profile"""
        profile = {
            "overview": {},
            "columns_info": {},
            "relationships": {},
            "recommendations": [],
        }

        print("\n" + "=" * 60)
        print("üìà Creating data profile")
        print("=" * 60)

        # Overall info
        total_rows = sum(len(df) for df in self.datasets.values())
        total_columns = sum(len(df.columns) for df in self.datasets.values())
        total_memory = (
            sum(df.memory_usage(deep=True).sum() for df in self.datasets.values())
            / 1024**2
        )

        profile["overview"] = {
            "total_datasets": len(self.datasets),
            "total_rows": total_rows,
            "total_columns": total_columns,
            "total_memory_mb": total_memory,
        }

        print(f"\nüìä Overall summary:")
        print(f"   ‚Ä¢ Number of datasets: {len(self.datasets)}")
        print(f"   ‚Ä¢ Total rows: {total_rows:,}")
        print(f"   ‚Ä¢ Total columns: {total_columns}")
        print(f"   ‚Ä¢ Total size: {total_memory:.2f} MB")

        # Deeper analysis for key datasets
        key_datasets = ["event", "tournament", "pbp", "statistics"]

        for ds_name in key_datasets:
            if ds_name in self.datasets:
                df = self.datasets[ds_name]
                print(f"\nüîç Deep analysis {ds_name}:")

                # Check date columns
                date_cols = [
                    col
                    for col in df.columns
                    if "date" in col.lower() or "time" in col.lower()
                ]
                if date_cols:
                    for col in date_cols[:2]:
                        try:
                            print(f"   üìÖ {col}: from {df[col].min()} to {df[col].max()}")
                        except:
                            pass

                # Check ID columns
                id_cols = [col for col in df.columns if "id" in col.lower()]
                if id_cols:
                    for col in id_cols[:3]:
                        print(f"   üîë {col}: {df[col].nunique():,} unique values")

        return profile

    def generate_sample_queries(self):
        """Generate useful sample queries"""
        print("\n" + "=" * 60)
        print("üîç Sample possible analyses")
        print("=" * 60)

        queries = []

        # Based on available datasets
        if "tournament" in self.datasets:
            queries.append("Analyze different tournaments and number of matches")

        if "statistics" in self.datasets:
            queries.append("Analyze players' performance statistics over time")

        if "pbp" in self.datasets:
            queries.append("Analyze play-by-play details and scoring patterns")

        if "odds" in self.datasets:
            queries.append("Evaluate prediction accuracy based on odds")

        if "power" in self.datasets:
            queries.append("Analyze shot power and its impact on match outcome")

        for i, query in enumerate(queries, 1):
            print(f"   {i}. {query}")

        return queries


# Create the explorer on the final parquet folder and load every dataset file
# found there; `datasets` is the same dict as `explorer.datasets`
explorer = TennisDataExplorer("../Final_parquet")
datasets = explorer.load_all_datasets()