# In [None]:
# T20 World Cup 2022 Analysis

# Importing essential libraries for data analysis and visualization
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Shared matplotlib/seaborn look: apply the dark-grid style sheet and the
# muted palette first, then override figure size, text sizes and font.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("muted")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'axes.titlesize': 16,
    'axes.labelsize': 12,
    'font.family': 'Arial',
})

# --- 1. Data Loading and Preprocessing ---
# Read the T20 World Cup 2022 CSV from the working directory into `df`.
df = pd.read_csv('t20-world-cup-22.csv')

# Print a quick overview: shape, per-column dtypes, and the first five rows.
# A single print with a newline separator emits the same text as one print
# call per item.
print(
    "Dataset Overview:",
    f"Shape: {df.shape}",
    "\nColumn Types:",
    df.dtypes,
    "\nFirst 5 Rows:",
    df.head(),
    sep="\n",
)

# Replace missing values with explicit per-column placeholders
# (e.g., for cancelled matches). Columns not listed here are left untouched.
df.fillna(
    value={
        'winner': 'No Result',
        'won by': 'No Result',
        'player of the match': 'N/A',
        'top scorer': 'N/A',
        'highest score': 0,
        'best bowler': 'N/A',
        'best bowling figure': 'N/A',
    },
    inplace=True,
)

# Coerce the score columns to numeric types; entries that are missing or
# cannot be parsed as numbers end up as 0.
for score_column in ('first innings score', 'second innings score', 'highest score'):
    df[score_column] = pd.to_numeric(df[score_column], errors='coerce').fillna(0)

# --- 2. Exploratory Data Analysis (EDA) ---
# Match outcomes by winner
# Count wins per team up front so each team gets a single bar whose height is
# its win total. Matches without a winner (missing values, or the 'No Result'
# placeholder written during preprocessing) are excluded: they are not a
# team's win and should not appear as a bar on a "wins by team" chart.
team_wins = df.loc[df['winner'] != 'No Result', 'winner'].value_counts()
fig1 = px.bar(x=team_wins.index, y=team_wins.values,
              title='Match Wins by Team',
              labels={'x': 'Team', 'y': 'Number of Wins'},
              color=team_wins.index, color_discrete_sequence=px.colors.qualitative.Set2)
fig1.update_layout(showlegend=False, height=500, title_x=0.5)
fig1.show()

# Toss decision impact
# Grouped histogram: for each toss decision, one bar per match winner.
fig2 = px.histogram(df, x='toss decision', color='winner', barmode='group',
                    title='Toss Decision Impact on Match Outcome',
                    labels={'toss decision': 'Toss Decision'})
# The y-axis of a histogram is a computed count rather than a DataFrame
# column, so the `labels` mapping does not rename it; set the title directly.
fig2.update_layout(height=500, title_x=0.5, yaxis_title='Number of Matches')
fig2.show()

# Top performers: Player of the Match
# Preprocessing writes 'N/A' where no award was recorded; drop that
# placeholder before counting so it cannot rank as a "player".
# value_counts() already ignores any remaining missing values.
awarded = df.loc[df['player of the match'] != 'N/A', 'player of the match']
top_players = awarded.value_counts().head(10)
fig3 = px.bar(x=top_players.index, y=top_players.values,
              title='Top 10 Players of the Match',
              labels={'x': 'Player', 'y': 'Number of Awards'},
              color=top_players.index, color_discrete_sequence=px.colors.qualitative.Plotly)
fig3.update_layout(showlegend=False, height=500, title_x=0.5)
fig3.show()

# Venue analysis
# Stacked histogram: number of matches per venue, split by tournament stage.
fig4 = px.histogram(df, x='venue', color='stage', barmode='stack',
                    title='Matches Played by Venue and Stage',
                    labels={'venue': 'Venue'})
# The y-axis of a histogram is a computed count rather than a DataFrame
# column, so the `labels` mapping does not rename it; set the title directly.
fig4.update_layout(height=500, title_x=0.5, yaxis_title='Number of Matches')
fig4.show()

# Batting vs. bowling performance (first innings)
# Preprocessing replaces missing first-innings scores with 0 but leaves the
# wickets column unfilled, so the raw score column would add a spurious spike
# at 0 that the wickets panel does not have. Keep only positive scores so both
# panels describe innings that were actually recorded.
recorded_scores = df.loc[df['first innings score'] > 0, 'first innings score']
fig5 = make_subplots(rows=1, cols=2, subplot_titles=('First Innings Scores', 'First Innings Wickets'))
fig5.add_trace(go.Histogram(x=recorded_scores, nbinsx=20, name='Score'), row=1, col=1)
fig5.add_trace(go.Histogram(x=df['first innings wickets'], nbinsx=10, name='Wickets'), row=1, col=2)
fig5.update_layout(title_text='First Innings Performance Distribution', title_x=0.5, height=500)
fig5.show()

# --- Save Visualizations ---
# Export each interactive figure to its own HTML file for presentation.
for figure, html_name in (
    (fig1, 'match_wins.html'),
    (fig2, 'toss_decision.html'),
    (fig3, 'top_players.html'),
    (fig4, 'venue_analysis.html'),
    (fig5, 'innings_performance.html'),
):
    figure.write_html(html_name)

# Write the processed DataFrame to CSV without the index column, then confirm
# both outputs on stdout.
df.to_csv('processed_t20_world_cup.csv', index=False)
print(
    "\nProcessed dataset saved as 'processed_t20_world_cup.csv'.",
    "Interactive visualizations saved as HTML files.",
    sep="\n",
)