# **Loading & Understanding Data**


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Path of the deliveries CSV read into the working frame `df`.
DATA_PATH = '/content/deliveries (1).csv'
df = pd.read_csv(DATA_PATH)

# Schema overview: column names, non-null counts and dtypes.
print("Dataset Information:\n")
df.info()

# Preview of the first five records.
first_rows = df.head()
print("\nFirst 5 rows:\n")
print(first_rows)

Dataset Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260920 entries, 0 to 260919
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   match_id          260920 non-null  int64 
 1   inning            260920 non-null  int64 
 2   batting_team      260920 non-null  object
 3   bowling_team      260920 non-null  object
 4   over              260920 non-null  int64 
 5   ball              260920 non-null  int64 
 6   batter            260920 non-null  object
 7   bowler            260920 non-null  object
 8   non_striker       260920 non-null  object
 9   batsman_runs      260920 non-null  int64 
 10  extra_runs        260920 non-null  int64 
 11  total_runs        260920 non-null  int64 
 12  extras_type       14125 non-null   object
 13  is_wicket         260920 non-null  int64 
 14  player_dismissed  12950 non-null   object
 15  dismissal_kind    12950 non-null   object
 16  fielder         

# **Handling Missing Values**

In [None]:
# Per-column count of null entries, reported before any fill is applied.
null_counts = df.isnull().sum()
print("\nMissing Values before handling:\n")
print(null_counts)


Missing Values before handling:

match_id                 0
inning                   0
batting_team             0
bowling_team             0
over                     0
ball                     0
batter                   0
bowler                   0
non_striker              0
batsman_runs             0
extra_runs               0
total_runs               0
extras_type         246795
is_wicket                0
player_dismissed    247970
dismissal_kind      247970
fielder             251566
dtype: int64


# **Define replacements for missing values**

In [None]:
# Sentinel labels to substitute for missing entries, keyed by column name.
replacements = dict(
    extras_type="No Extra",
    player_dismissed="Not Out",
    dismissal_kind="Not Applicable",
    fielder="No Fielder Involved",
)



# Total Extra Runs Conceded by Teams (Sorted)

In [None]:
import pandas as pd
import plotly.graph_objects as go
# Fill missing values with their sentinel labels, then store the column as
# categorical. The fill has to happen BEFORE the categorical conversion:
# missing entries are not categories, so a categorical ``replace`` keyed on
# ``pd.NA`` leaves them untouched and the sentinels are never written.
for col, replacement in replacements.items():
    column = df[col]
    if isinstance(column.dtype, pd.CategoricalDtype) and replacement not in column.cat.categories:
        # Column is already categorical (e.g. the cell was run before) but
        # does not know the sentinel yet; register it so fillna can use it.
        column = column.cat.add_categories([replacement])
    df[col] = column.fillna(replacement).astype("category")

# Total extra runs conceded by each bowling team, largest first.
top_extra_teams = (
    df.groupby("bowling_team")["extra_runs"]
    .sum()
    .reset_index()
    .sort_values(by="extra_runs", ascending=False)
)

# Bar chart of the sorted totals.
fig = go.Figure(data=[
    go.Bar(x=top_extra_teams["bowling_team"], y=top_extra_teams["extra_runs"], marker_color="red")
])

fig.update_layout(title="Total Extra Runs Conceded by Teams (Sorted)",
                  xaxis_title="Bowling Team",
                  yaxis_title="Total Extra Runs",
                  template="plotly_dark")

fig.show()



## **Total Runs Scored Per Over (All Matches)**

In [None]:
import pandas as pd
import plotly.express as px

# Build a dedicated frame for the per-over view instead of reassigning `df`.
# Filtering the shared frame here would silently remove rows from every
# later cell that reads `df`, and assigning a column on a derived frame can
# trigger pandas' chained-assignment warning when the cell is re-run.
overs_df = df.dropna(subset=['over']).copy()  # rows without an over cannot be grouped
overs_df['over'] = overs_df['over'].astype(int)

# Keep only positively numbered overs.
# NOTE(review): if this dataset numbers overs from 0, this filter discards the
# opening over of every innings - confirm the indexing before relying on it.
overs_df = overs_df[overs_df['over'] > 0]

# Sum of total runs for each over number across all matches.
total_runs_per_over = overs_df.groupby('over')['total_runs'].sum().reset_index()

# Bar chart with the total printed above each bar.
fig = px.bar(
    total_runs_per_over,
    x='over',
    y='total_runs',
    text='total_runs',
    title='Total Runs Scored Per Over (All Matches)',
    labels={'over': 'Over Number', 'total_runs': 'Total Runs'},
)

# Customize appearance
fig.update_traces(
    texttemplate='%{text}',
    textposition='outside',
    marker_color='royalblue'
)
fig.update_xaxes(dtick=1, title_text="Over Number")  # one tick per over
fig.update_yaxes(title_text="Total Runs")

# Show the plot
fig.show()


In [None]:
# Line-chart view of the per-over totals, with point markers and value labels.
line_chart_options = dict(
    x='over',
    y='total_runs',
    text='total_runs',
    title='Total Runs Scored Per Over (Line Chart)',
    labels={'over': 'Over Number', 'total_runs': 'Total Runs'},
    markers=True,
)
fig = px.line(total_runs_per_over, **line_chart_options)
fig.show()




In [None]:
import plotly.graph_objects as go
import pandas as pd

# Mean of the per-over totals held in `total_runs_per_over`
# (i.e. the average height of the points plotted below).
avg_runs_per_over = total_runs_per_over['total_runs'].mean()

# Create line plot using Plotly Graph Objects
fig = go.Figure()

# Line + markers for the total runs in each over.
fig.add_trace(go.Scatter(
    x=total_runs_per_over['over'],
    y=total_runs_per_over['total_runs'],
    mode='lines+markers',
    name='Total Runs',
    line=dict(color='royalblue', width=3),
    marker=dict(size=8, symbol='circle', color='royalblue')
))

# Dashed horizontal reference line at the average, labelled at the top right.
fig.add_hline(y=avg_runs_per_over,
              line=dict(color='firebrick', width=2, dash='dash'),
              annotation_text=f'Avg: {avg_runs_per_over:.2f} Runs/Over',
              annotation_position="top right",
              annotation_font=dict(size=12, color="firebrick")
)

# Customize layout
fig.update_layout(
    title='Runs Per Over in IPL Matches (with Average Line)',
    xaxis_title='Over Number',
    yaxis_title='Total Runs',
    xaxis=dict(tickmode='linear', tick0=1, dtick=1),  # one tick per integer over, anchored at 1
    template='plotly_white',
    width=900,
    height=500
)

# Show plot
fig.show()


In [None]:
import plotly.express as px
import pandas as pd

# Runs recorded on each delivery (the `total_runs` column of `df`).
runs_per_ball = df['total_runs']

# Histogram of runs per delivery. The column is referenced by name on `df`.
fig = px.histogram(
    df,
    x='total_runs',
    text_auto=True,
    title="Distribution of Runs Scored Per Ball",
    color_discrete_sequence=['teal']  # Set color scheme
)

# Force one bar per integer run value. `nbins` is only an upper bound on the
# bin count, so plotly may pick a wider bin and merge neighbouring run values;
# unit-width bins centred on the integers cannot.
fig.update_traces(
    xbins=dict(
        start=float(runs_per_ball.min()) - 0.5,
        end=float(runs_per_ball.max()) + 0.5,
        size=1,
    )
)

# Axis titles are set explicitly on the layout so they do not depend on how
# plotly names the plotted column.
fig.update_layout(
    bargap=0.1,  # Small gap between bars
    xaxis=dict(tickmode='linear', dtick=1, title_text='Runs Per Ball'),  # tick at every integer
    yaxis=dict(title_text='Count'),
    template="plotly_white"
)

fig.show()


In [None]:
import plotly.express as px
import pandas as pd

# Runs recorded on each delivery, taken from the working frame `df`.
runs_per_ball = df['total_runs']

# Frequency table: one row per distinct runs-per-ball value and how often it occurs.
run_distribution = (
    runs_per_ball.value_counts()
    .reset_index()
    .set_axis(['runs', 'count'], axis=1)
)

# Pie chart of the frequency table.
fig = px.pie(
    run_distribution,
    values='count',
    names='runs',
    title="Proportion of Runs Scored Per Ball",
    color_discrete_sequence=px.colors.sequential.Teal_r,
)

# Offset the zero-run slice slightly from the rest of the pie.
slice_offsets = []
for run in run_distribution['runs']:
    slice_offsets.append(0.05 if run == 0 else 0)

fig.update_traces(textinfo='percent+label', pull=slice_offsets)

fig.show()


In [None]:
# Ten batters with the highest summed `batsman_runs`.
runs_by_batter = df.groupby('batter')['batsman_runs'].sum()
top_batsmen = runs_by_batter.nlargest(10).reset_index()
fig1 = px.bar(
    top_batsmen,
    x='batter',
    y='batsman_runs',
    text='batsman_runs',
    title='Top 10 Batsmen by Total Runs',
)
fig1.show()

In [None]:
# Total number of wickets taken per over.
# A row counts as a wicket when a dismissed player is recorded. Testing
# `notna()` as well as the 'Not Out' sentinel keeps the count right whether or
# not missing values have been filled: NaN != 'Not Out' evaluates to True, so
# the sentinel test alone would count every unfilled row as a wicket.
dismissed = df['player_dismissed']
is_dismissal = dismissed.notna() & (dismissed != 'Not Out')
wickets_per_over = df[is_dismissal].groupby('over').size()
fig2 = px.bar(x=wickets_per_over.index, y=wickets_per_over.values, title='Total Wickets Taken per Over', labels={'x':'Over', 'y':'Wickets'})
fig2.show()

# Pie chart: share of batsman runs conceded by the 10 bowlers with the most deliveries



In [None]:
# Share of batsman runs conceded by the ten bowlers with the most recorded
# deliveries (rows in `df`).
top_bowlers = df['bowler'].value_counts().nlargest(10).index
bowled_by_top = df['bowler'].isin(top_bowlers)
df_batsman_bowler = (
    df[bowled_by_top]
    .groupby('bowler')['batsman_runs']
    .sum()
    .reset_index()
)
fig3 = px.pie(
    df_batsman_bowler,
    names='bowler',
    values='batsman_runs',
    title='Percentage of Runs Against Top 10 Bowlers',
)
fig3.show()

# Total runs scored by each team

In [None]:
# Total runs scored by each team.
# The chart plots `total_runs` so that a team's total includes extras and not
# only runs off the bat (presumably total_runs = batsman_runs + extra_runs per
# delivery - confirm against the data). `batsman_runs` is still aggregated so
# the frame keeps the column it had before.
total_runs_team = (
    df.groupby('batting_team')[['batsman_runs', 'total_runs']]
    .sum()
    .reset_index()
)
fig4 = px.bar(total_runs_team, x='batting_team', y='total_runs', title='Total Runs Scored by Each Team', text='total_runs')
fig4.show()

In [None]:
# Top 5 bowlers with the most wickets.
# A row counts as a wicket when a dismissed player is recorded. Testing
# `notna()` as well as the 'Not Out' sentinel keeps the count right whether or
# not missing values have been filled (NaN != 'Not Out' is True, so the
# sentinel test alone would count every unfilled row).
# NOTE(review): every recorded dismissal is credited to the bowler of that
# delivery regardless of `dismissal_kind` - confirm whether dismissals not
# earned by the bowler (e.g. run outs) should be excluded.
dismissed = df['player_dismissed']
is_dismissal = dismissed.notna() & (dismissed != 'Not Out')
top_wicket_bowlers = df[is_dismissal].groupby('bowler').size().nlargest(5).reset_index(name='wickets')

# Plot
fig = px.bar(top_wicket_bowlers, x='bowler', y='wickets',
             title='Top 5 Bowlers with Most Wickets',
             labels={'bowler': 'Bowler', 'wickets': 'Total Wickets'},
             text='wickets')

fig.show()



In [None]:
# Dismissal kinds counted for this chart.
keeper_kinds = ['caught', 'stumped']
wicketkeeper_dismissals = df[df['dismissal_kind'].isin(keeper_kinds)]

# Five most frequent values of `fielder` among those dismissals.
# NOTE(review): nothing here checks that the fielder was keeping wicket, so
# 'caught' rows presumably include catches by any fielder - confirm.
top_wicketkeepers = (
    wicketkeeper_dismissals.groupby('fielder')
    .size()
    .nlargest(5)
    .reset_index(name='dismissals')
)

# Bar chart of the five counts.
fig = px.bar(
    top_wicketkeepers,
    x='fielder',
    y='dismissals',
    text='dismissals',
    title='Top 5 Wicketkeepers with Most Dismissals',
    labels={'fielder': 'Wicketkeeper', 'dismissals': 'Total Dismissals'},
)

fig.show()






In [None]:
# Points awarded per wicket when combining runs and wickets into one score.
WICKET_WEIGHT = 20

# Total runs scored by each player (as batter).
batsman_runs = df.groupby('batter')['batsman_runs'].sum().reset_index()

# Total wickets on each player's deliveries (as bowler).
# A row counts as a wicket when a dismissed player is recorded. Testing
# `notna()` as well as the 'Not Out' sentinel keeps the count right whether or
# not missing values have been filled (NaN != 'Not Out' is True, so the
# sentinel test alone would count every unfilled row).
dismissed = df['player_dismissed']
is_dismissal = dismissed.notna() & (dismissed != 'Not Out')
bowler_wickets = df[is_dismissal].groupby('bowler').size().reset_index(name='wickets')

# Inner join keeps only players who appear both as a batter and as a
# wicket-taking bowler.
all_rounders = pd.merge(batsman_runs, bowler_wickets, left_on='batter', right_on='bowler', how='inner')

# Combined score: runs plus a fixed number of points per wicket.
all_rounders['all_rounder_score'] = all_rounders['batsman_runs'] + (all_rounders['wickets'] * WICKET_WEIGHT)

# Select top 5 all-rounders
top_all_rounders = all_rounders.nlargest(5, 'all_rounder_score')

# Plot
fig = px.bar(top_all_rounders, x='batter', y='all_rounder_score',
             title='Top 5 All-Rounders (Runs + Wickets)',
             labels={'batter': 'All-Rounder', 'all_rounder_score': 'All-Rounder Score'},
             text='all_rounder_score')

fig.show()



# Pie chart for wickets taken by bowling teams

In [None]:
# Pie chart for wickets taken by bowling teams.
# A row counts as a wicket when a dismissed player is recorded. Testing
# `notna()` as well as the 'Not Out' sentinel keeps the count right whether or
# not missing values have been filled (NaN != 'Not Out' is True, so the
# sentinel test alone would count every unfilled row).
dismissed = df['player_dismissed']
is_dismissal = dismissed.notna() & (dismissed != 'Not Out')
wickets_by_team = df[is_dismissal].groupby('bowling_team').size().reset_index(name='wickets')
fig5 = px.pie(wickets_by_team, values='wickets', names='bowling_team', title='Wickets Taken by Bowling Teams')
fig5.show()

In [None]:
# Batsman runs in two phases of the innings, selected by over number
# (both bounds inclusive).
# NOTE(review): the ranges 1-6 and 16-20 assume overs are numbered from 1 -
# confirm against the data's over indexing.
in_powerplay = df['over'].between(1, 6)
in_death_overs = df['over'].between(16, 20)
powerplay_runs = df.loc[in_powerplay, 'batsman_runs'].sum()
death_over_runs = df.loc[in_death_overs, 'batsman_runs'].sum()

# One bar per phase, shown side by side.
phase_bars = [
    go.Bar(name='Powerplay (1-6)', x=['Powerplay'], y=[powerplay_runs]),
    go.Bar(name='Death Overs (16-20)', x=['Death Overs'], y=[death_over_runs]),
]
fig6 = go.Figure(data=phase_bars)
fig6.update_layout(title='Runs Scored in Powerplay and Death Overs', barmode='group')
fig6.show()

In [None]:
# Ten players who appear most often in the `non_striker` column.
non_striker_counts = df['non_striker'].value_counts()
top_non_strikers = non_striker_counts.nlargest(10).reset_index()
top_non_strikers.columns = ['non_striker', 'count']
fig7 = px.bar(
    top_non_strikers,
    x='non_striker',
    y='count',
    title='Top 10 Most Frequent Non-Strikers',
    labels={'non_striker': 'Non-Striker', 'count': 'Appearances'},
)
fig7.show()