# IPL Data Exploration

This notebook explores the processed IPL match data to understand feature distributions and relationships with the target variable (`team1_won`).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- Locate the project root ---
# The root is taken to be the parent of the directory this notebook runs from;
# change the relative hop below if the notebook lives somewhere else.
try:
    # Script execution: anchor on the directory containing this file.
    _anchor_dir = os.path.dirname(__file__)
except NameError:
    # Interactive kernels do not define __file__; anchor on the working directory.
    _anchor_dir = os.getcwd()
ROOT_DIR = os.path.abspath(os.path.join(_anchor_dir, '..'))

PROCESSED_DATA_DIR = os.path.join(ROOT_DIR, 'data', 'processed')

# --- Read both flavours of the processed match table ---
# Non-encoded features keep readable labels, which makes plots easier to interpret.
features_path = os.path.join(PROCESSED_DATA_DIR, 'matches_features.csv')
df_features = pd.read_csv(features_path)

# Encoded features are kept alongside for correlation analysis.
encoded_features_path = os.path.join(PROCESSED_DATA_DIR, 'matches_features_encoded.csv')
df_encoded = pd.read_csv(encoded_features_path)

# Report (rows, columns) for each table as a quick sanity check.
print(f"Processed Features Data Shape: {df_features.shape}")
print(f"Encoded Features Data Shape: {df_encoded.shape}")

# Preview the first rows of the non-encoded table.
display(df_features.head())

## Target Variable Distribution

Check the balance of the target variable `team1_won`.

In [None]:
# Bar chart of how often each outcome occurs, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='team1_won', data=df_features, ax=ax)
ax.set_title('Distribution of Match Outcome (1 = Team 1 Won)')
ax.set_xlabel('Team 1 Won')
ax.set_ylabel('Count')
# Swap the raw 0/1 tick labels for readable outcome names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['No (Team 2 Won)', 'Yes (Team 1 Won)'])
plt.show()

# Same information as proportions, to quantify the class balance.
outcome_share = df_features['team1_won'].value_counts(normalize=True)
print("Target Variable Distribution:")
print(outcome_share)

## Feature Correlation with Target

Let's look at the correlation of numerical features with the target variable in the encoded dataset. This helps identify potentially predictive features.

In [None]:
# Calculate the correlation matrix for the encoded data.
# Only numeric/boolean columns are kept: pandas >= 2.0 raises on non-numeric
# columns instead of silently ignoring them, and select_dtypes works on every
# pandas version (unlike the newer `numeric_only=` keyword of `corr`).
numeric_encoded = df_encoded.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_encoded.corr()

# Get correlation with the target variable 'team1_won'
target_correlation = correlation_matrix['team1_won'].drop('team1_won')  # Drop self-correlation

# Constant columns have an undefined (NaN) correlation; leave them out of the
# ranking so they neither show up as empty bars nor pollute the "bottom 10".
target_correlation_sorted = target_correlation.dropna().sort_values(ascending=False)

plt.figure(figsize=(10, 10))
sns.barplot(x=target_correlation_sorted.values, y=target_correlation_sorted.index, palette='vlag')
plt.title('Feature Correlation with team1_won')
plt.xlabel('Correlation Coefficient')
plt.ylabel('Features')
plt.tight_layout()
plt.show()

# The series is sorted in descending order, so head() holds the most positive
# correlations and tail() the most negative ones.
print("\nTop 10 Correlated Features with team1_won:")
print(target_correlation_sorted.head(10))
print("\nBottom 10 Correlated Features with team1_won:")
print(target_correlation_sorted.tail(10))

## Explore Specific Feature Relationships

Visualize the relationship between specific features and the outcome using the non-encoded data for better interpretability.

In [None]:
def plot_feature_vs_target(feature_name, df, target='team1_won'):
    """Plot how a single feature relates to the target column.

    Features whose dtype is numeric (per ``pd.api.types.is_numeric_dtype``)
    are drawn as a box plot split by ``target``. Every other dtype is drawn as
    a count plot of the feature's values, coloured by ``target``.

    Parameters
    ----------
    feature_name : str
        Column of ``df`` to visualise.
    df : pandas.DataFrame
        Data containing both ``feature_name`` and ``target``.
    target : str, default 'team1_won'
        Column used to split (box plot) or colour (count plot) the data.

    Raises
    ------
    KeyError
        If ``feature_name`` is not a column of ``df``.
    """
    # Fail before any figure is created so a typo does not leave an empty plot behind.
    if feature_name not in df.columns:
        raise KeyError(f"Column {feature_name!r} not found in DataFrame")

    # Decide the plot type once; the same decision drives the axis labels.
    is_numeric = pd.api.types.is_numeric_dtype(df[feature_name])

    fig, ax = plt.subplots(figsize=(12, 5))

    if is_numeric:
        # Boxplot for numerical features
        sns.boxplot(x=target, y=feature_name, data=df, ax=ax)
        ax.set_title(f'{feature_name} vs. Match Outcome')
        ax.set_xlabel('Team 1 Won (0=No, 1=Yes)')
        ax.set_ylabel(feature_name)
    else:
        # Countplot for categorical features (like toss decision)
        sns.countplot(x=feature_name, hue=target, data=df, palette='viridis', ax=ax)
        ax.set_title(f'{feature_name} Distribution by Match Outcome')
        ax.set_xlabel(feature_name)
        ax.set_ylabel('Count')
        # Category names can be long; tilt them so they do not overlap.
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    fig.tight_layout()
    plt.show()

# --- Plotting Examples ---
# Every feature is drawn from the non-encoded dataframe so that category
# labels (e.g. the toss decision) stay interpretable.
for example_feature in (
    'team1_h2h_winrate',     # Example 1: head-to-head win rate
    'team1_form',            # Example 2: team form (recent performance)
    'team1_venue_win_rate',  # Example 3: venue-specific win rate
    'toss_decision',         # Example 4: toss decision
    'team1_won_toss',        # Example 5: did team 1 win the toss?
):
    plot_feature_vs_target(example_feature, df_features)

# To explore more features, add their column names to the tuple above, e.g.:
#     'team1_streak'
#     'chose_to_bat'