In [30]:
import pandas as pd

data = pd.read_csv("innings_data.csv")

print(data.head())

            batter          bowler non_striker  runs_batter  runs_extras  \
0  Shayan Jahangir  Arshdeep Singh   SR Taylor            0            0   
1         AGS Gous  Arshdeep Singh   SR Taylor            0            0   
2         AGS Gous  Arshdeep Singh   SR Taylor            0            0   
3         AGS Gous  Arshdeep Singh   SR Taylor            0            1   
4         AGS Gous  Arshdeep Singh   SR Taylor            2            0   

   runs_total wickets_0_player_out wickets_0_kind                      team  \
0           0      Shayan Jahangir            lbw  United States of America   
1           0                  NaN            NaN  United States of America   
2           0                  NaN            NaN  United States of America   
3           1                  NaN            NaN  United States of America   
4           2                  NaN            NaN  United States of America   

   over  ...  wickets_0_fielders_0_name review_by review_umpire revi

### There are a lot of missing values in the dataset. Let's have a look at the missing values and data types.

In [31]:
#checking for the missing values in the dataset
missing_values = data.isnull().sum();

# checking data types of columns
data_types = data.dtypes

missing_values

batter                         0
bowler                         0
non_striker                    0
runs_batter                    0
runs_extras                    0
runs_total                     0
wickets_0_player_out         225
wickets_0_kind               225
team                           0
over                           0
extras_wides                 231
wickets_0_fielders_0_name    228
review_by                    235
review_umpire                235
review_batter                235
review_decision              235
review_type                  235
extras_legbyes               234
wickets_0_fielders_1_name    235
extras_noballs               235
extras_penalty               235
dtype: int64

In [32]:
data_types

batter                        object
bowler                        object
non_striker                   object
runs_batter                    int64
runs_extras                    int64
runs_total                     int64
wickets_0_player_out          object
wickets_0_kind                object
team                          object
over                           int64
extras_wides                 float64
wickets_0_fielders_0_name     object
review_by                     object
review_umpire                 object
review_batter                 object
review_decision               object
review_type                   object
extras_legbyes               float64
wickets_0_fielders_1_name     object
extras_noballs               float64
extras_penalty               float64
dtype: object

### Grouping the data for analysis

In [33]:
# total runs scored by each team
t_runs = data.groupby("team")["runs_total"].sum()

# wickets taken by each team
## notna() detects non-missing values for the items in the current dataset
t_wickets = data['wickets_0_player_out'].notna().groupby(data['team']).sum()

# total extras
t_extras = data[["team", "runs_extras", "extras_wides","extras_noballs","extras_legbyes","extras_penalty"]].groupby("team").sum()

# runs scored by each batter
batter_runs = data.groupby('batter')["runs_batter"].sum()

# balls faced by each batsman
balls_faced = data.groupby("batter").size()

# Strike rate of each batter
strike_rate = (batter_runs/balls_faced)*100

# Boundaries hit by each batter
boundaries = data[(data["runs_batter"] == 4) | (data["runs_batter"] == 6)].groupby(["batter","runs_batter"]).size().unstack(fill_value=0)

# Wickets taken by each bowler
w_taken = data["wickets_0_player_out"].notna().groupby(data["bowler"]).sum()

# Runs conceded by each bowler
runs_conceded = data.groupby("bowler")["runs_total"].sum()

# Balls bowled by each bowler
balls_bowled = data.groupby("bowler").size()

# economy rate of each bowler
economy = runs_conceded / (balls_bowled/6)

# dot balls bowled by each bowler
dot_balls = data[data["runs_total"] == 0].groupby("bowler").size()

# add all these stats into the dataframe
batter_stats = pd.DataFrame({
    "Runs": batter_runs,
    "Balls Faced": balls_faced,
    "Strike Rate" : strike_rate,
}).join(boundaries)

bowler_stats = pd.DataFrame({
    "Wickets": w_taken,
    "Runs Conceded": runs_conceded,
    "Balls Bowled" : balls_bowled,
    "Economy Rate": economy,
    "Dot Balls": dot_balls,
})

In [34]:
data

Unnamed: 0,batter,bowler,non_striker,runs_batter,runs_extras,runs_total,wickets_0_player_out,wickets_0_kind,team,over,...,wickets_0_fielders_0_name,review_by,review_umpire,review_batter,review_decision,review_type,extras_legbyes,wickets_0_fielders_1_name,extras_noballs,extras_penalty
0,Shayan Jahangir,Arshdeep Singh,SR Taylor,0,0,0,Shayan Jahangir,lbw,United States of America,0,...,,,,,,,,,,
1,AGS Gous,Arshdeep Singh,SR Taylor,0,0,0,,,United States of America,0,...,,,,,,,,,,
2,AGS Gous,Arshdeep Singh,SR Taylor,0,0,0,,,United States of America,0,...,,,,,,,,,,
3,AGS Gous,Arshdeep Singh,SR Taylor,0,1,1,,,United States of America,0,...,,,,,,,,,,
4,AGS Gous,Arshdeep Singh,SR Taylor,2,0,2,,,United States of America,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,SA Yadav,SN Netravalkar,S Dube,0,0,0,,,India,17,...,,,,,,,,,,
232,SA Yadav,SN Netravalkar,S Dube,1,0,1,,,India,17,...,,,,,,,,,,
233,SA Yadav,Ali Khan,S Dube,1,0,1,,,India,18,...,,,,,,,,,,
234,S Dube,Ali Khan,SA Yadav,0,1,1,,,India,18,...,,,,,,,,,,


In [35]:
t_extras

Unnamed: 0_level_0,runs_extras,extras_wides,extras_noballs,extras_legbyes,extras_penalty
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
India,9,2.0,1.0,1.0,5.0
United States of America,8,7.0,0.0,1.0,0.0


In [36]:
t_wickets

team
India                       3
United States of America    8
Name: wickets_0_player_out, dtype: int64

In [37]:
t_runs

team
India                       111
United States of America    110
Name: runs_total, dtype: int64

In [38]:
import plotly.graph_objects as go

# cumsum() :  calculate the cumulative sum of array elements along a specified axis or across all axes.
ind_runs_progression = data[data["team"]=="India"].groupby("over")["runs_total"].sum().cumsum()
usa_runs_progression = data[data["team"]=="United States of America"].groupby("over")["runs_total"].sum().cumsum()

# Purpose : initialize an empty figure object to which you can add one or more traces using the add_trace() method and customize its layout using the update_layout() method.
f=go.Figure()

# add_trace : This parameter specifies the trace object that defines the data and how it should be visualized.
f.add_trace(go.Scatter(
    x=ind_runs_progression.index,
    y=ind_runs_progression.values,
    mode="lines+markers",
    name="India"
))

f.add_trace(go.Scatter(
    x=usa_runs_progression.index,
    y=usa_runs_progression.values,
    mode="lines+markers",
    name="USA"
))

# update_layout : It is to modify various layout attributes of a Plotly figure. These attributes include the overall appearance and structure of the plot
f.update_layout(
    title   ="Runs Progression over Overs",
    xaxis_title ="Overs",
    yaxis_title="Cumulative Runs",
    legend_title="Teams",
    template="plotly_white"
)

f.show()

In [39]:
ind_wickets = data[(data["team"]=="India") & data["wickets_0_player_out"].notna()].groupby("over").size()

ind_wickets_count = ind_wickets.shape[0]
print(f"Wickets fallen of India: {ind_wickets_count}")

usa_wickets = data[(data["team"]=="United States of America") & data["wickets_0_player_out"].notna()].groupby("over").size()

usa_wickets_count = usa_wickets.shape[0]
print(f"Wickets fallen of USA: {usa_wickets_count}")

f=go.Figure()

f.add_trace(go.Bar(
    x=ind_wickets.index,
    y=ind_wickets.values,
    name="India",
    marker_color="blue",
    opacity=0.9
))

f.add_trace(go.Bar(
    x=usa_wickets.index,
    y=usa_wickets.values,
    name="USA",
    marker_color="red",
    opacity=0.9
))

f.update_layout(
    title="Wickets Timeline",
    xaxis_title="Overs",
    yaxis_title="Number of Wickets",
    barmode="group",
    template="plotly_white",
    legend_title="Teams"
)

f.show()

Wickets fallen of India: 3
Wickets fallen of USA: 7


In [40]:
import plotly.express as px

bar = px.bar(
    batter_stats,
    x=batter_stats.index,
    y="Runs",
    title="Run Distribution by Batsman",
    labels={"x":"Batsman","Runs":"Runs Scored"},
    template="plotly_white"
)

bar.update_layout(
    xaxis_title="Batter",
    yaxis_title="Runs Scored",
    # xaxis=dict(tickangle=90) is a concise way to specify that you want to rotate the x-axis tick labels by 90 degrees in a plot created with a Python plotting library like Plotly or Matplotlib
    xaxis=dict(tickangle=90)
)

bar.show()

In [41]:
f = go.Figure()

bowler_stats["Initials"] =  bowler_stats.index.str.split().str[0].str[0] + bowler_stats.index.str.split().str[-1].str[0]


f.add_trace(go.Scatter(
    x=bowler_stats["Economy Rate"],
    y=bowler_stats["Wickets"],
    mode="markers+text",
    text=bowler_stats["Initials"],
    textposition="top center",
    textfont=dict(
        family="sans serif",
        size=11,
        color="black"
    ),
    marker=dict(color="red", size=10),
    name="Bowlers"
))

f.update_layout(
    title="Bowling Performance",
    xaxis_title="Economy Rate",
    yaxis_title="Wickets Taken",
    template="plotly_white",
    autosize=False,
    width=850,
    height=600
)

f.show()

In [42]:
ind_partnership_data= data[data["team"]=="India"].groupby(["over","batter","non_striker"])["runs_total"].sum().reset_index()

# creating pivot table for better visualization
ind_partnership_pivot = ind_partnership_data.pivot(index="over", columns=["batter","non_striker"], values="runs_total").fillna(0)

# converting the pivot table to along format
ind_partnership_long = ind_partnership_pivot.reset_index().melt(id_vars="over", var_name=["batter","non_striker"], value_name="runs_total")

# create a stacked bar chart
f= go.Figure()

# adding bars for each patrnership
for (batter, non_striker) in ind_partnership_pivot.columns:
  partnership_data = ind_partnership_long[(ind_partnership_long["batter"] == batter) & (ind_partnership_long["non_striker"]==non_striker)]
  f.add_trace(go.Bar(
    x=partnership_data["over"],
    y=partnership_data["runs_total"],
    name=f'{batter} & {non_striker}'
))

f.update_layout(
      title="Partnership Contributions - INDIA",
      xaxis_title="Over",
      yaxis_title="Runs",
      barmode="stack",
      template="plotly_white",
      legend_title="Partnership",
      legend=dict(
          x=1.05,
          y=1,
          traceorder="normal",
          font=dict(size=10)
      ),
      autosize=False,
      width=900,
      height=600
)

f.show()

In [43]:
usa_partnership_data= data[data["team"]=="United States of America"].groupby(["over","batter","non_striker"])["runs_total"].sum().reset_index()

# creating pivot table for better visualization
usa_partnership_pivot = usa_partnership_data.pivot(index="over", columns=["batter","non_striker"], values="runs_total").fillna(0)

# converting the pivot table to along format
usa_partnership_long = usa_partnership_pivot.reset_index().melt(id_vars="over", var_name=["batter","non_striker"], value_name="runs_total")

# create a stacked bar chart
f= go.Figure()

# adding bars for each patrnership
for (batter, non_striker) in usa_partnership_pivot.columns:
  #  filters the DataFrame usa_partnership_long to select rows where both the "batter" column matches the value stored in the variable batter, and the "non_striker" column matches the value stored in the variable non_striker.
  partnership_data = usa_partnership_long[(usa_partnership_long["batter"] == batter) & (usa_partnership_long["non_striker"]==non_striker)]
  f.add_trace(go.Bar(
    x=partnership_data["over"],
    y=partnership_data["runs_total"],
    name=f'{batter} & {non_striker}'
))

f.update_layout(
      title="Partnership Contributions - USA",
      xaxis_title="Over",
      yaxis_title="Runs",

      # barmode="stack" is used in Plotly to stack bars of the same trace on top of each other within a group.
      barmode="stack",
      template="plotly_white",
      legend_title="Partnership",
      legend=dict(
          x=1.05,
          y=1,
          traceorder="normal",
          font=dict(size=10)
      ),
      autosize=False,
      width=900,
      height=600
)

f.show()

In [44]:
# cumulative runs for both teams by the over
#ind_cumulative_runs = data[data["team"] == "India"].groupby("over")["runs_total"].sum().cumsum()
usa_cumulative_runs = data[data["team"] == "United States of America"].groupby("over")["runs_total"].sum().cumsum()

# extracting key moments where wickets fell or significant runs were scored
#ind_key_moments = data[(data["team"]=="India")& data["wickets_0_player_out"].notna()]
usa_key_moments = data[(data["team"]=="United States of America")& data["wickets_0_player_out"].notna()]

#significant runs scored by India
ind_sig_runs = data[(data["team"]=="India") & (data["runs_total"]>=4)]

# significant runs scored by USa
usa_sig_runs = data[(data["team"]=="United States of America") & (data["runs_total"]>=4)]

usa_wickets_fall = data[(data["team"]=="United States of America") & data["wickets_0_player_out"].notna()].groupby("over").size().cumsum()

f=go.Figure()

f.add_trace(go.Scatter(
    x=usa_cumulative_runs.index,
    y=usa_cumulative_runs.values,
    mode="lines+markers",
    name="USA Cumulative Runs",
    line=dict(color="yellow")
))

f.add_trace(go.Scatter(
    x=usa_wickets_fall.index,
    y=usa_cumulative_runs.loc[usa_wickets_fall.index],
    mode="markers",
    name="USA Wickets",
    marker=dict(color="red", size=10)
))

# Adding annotations for key moments
# The iterrows() method in pandas iterates over rows of a DataFrame (usa_key_moments in this case), returning each row as a tuple containing the index and a pandas Series representing the row data.
for _, row in usa_key_moments.iterrows():
  f.add_annotation(
      x=row["over"],
      y=usa_cumulative_runs.loc[row["over"]],
      text=f'({row["batter"]}) ({row["over"]})',
      showarrow=True,
      arrowhead=3,
      ax=row["over"],
      ay=usa_cumulative_runs.loc[row["over"]],
      arrowcolor="black"
)

f.update_layout(
    title="USA Fall of Wickets in Innings",
    xaxis_title="Overs",
    yaxis_title="Cumulative Runs",
    template="plotly_white",
    legend_title="USA Innings",
    autosize=False,
    width=900,
    height=600
)

f.show()

In [45]:
india_cumulative_runs = data[data['team'] == 'India'].groupby('over')['runs_total'].sum().cumsum()
india_wickets_fall = data[(data['team'] == 'India') & data['wickets_0_player_out'].notna()].groupby('over').size().cumsum()
india_key_moments = data[(data['team'] == 'India') & data['wickets_0_player_out'].notna()].reset_index()

figure = go.Figure()

figure.add_trace(go.Scatter(
    x=india_cumulative_runs.index,
    y=india_cumulative_runs.values,
    mode='lines+markers',
    name='India Cumulative Runs',
    line=dict(color='blue')
))

figure.add_trace(go.Scatter(
    x=india_wickets_fall.index,
    y=india_cumulative_runs.loc[india_wickets_fall.index],
    mode='markers',
    name='India Wickets',
    # marker=dict(color='green', size=10) in Plotly is used to customize markers within traces, offering flexibility in visualizing and analyzing data points based on specific attributes such as color and size.
    marker=dict(color='green', size=10)
))

for _, row in india_key_moments.iterrows():
  # add_annotations : add text annotations to a figure. Annotations are text labels that can be placed at specified coordinates on a plot to provide additional context or information.
    figure.add_annotation(
        x=row['over'],
        y=india_cumulative_runs.loc[row['over']],
        text=f"{row['batter']} ({row['over']})",
        showarrow=True,
        arrowhead=3,
        ax=row['over'],
        ay=india_cumulative_runs.loc[row['over']] + 5,
        arrowcolor='black'
    )

figure.update_layout(
    title='India Fall of Wickets in Innings',
    xaxis_title='Overs',
    yaxis_title='Cumulative Runs',
    template='plotly_white',
    legend_title='India Innings',
    autosize=False,
    width=900,
    height=600
)

figure.show()

In [46]:
# Computing the run rates for the participant teams
ind_run_rate = data[data["team"] =="India"].groupby("over")["runs_total"].sum().mean()
usa_run_rate = data[data["team"] =="United States of America"].groupby("over")["runs_total"].sum().mean()

f=go.Figure()

f.add_trace(go.Bar(
    x=["India","USA"],
    y=[ind_run_rate, usa_run_rate],
    marker_color=["green","blue"]
))

f.add_annotation(
    x="India",
    y=ind_run_rate,
    text=f'{ind_run_rate:.2f}',
    showarrow=False,
    yshift=10
)

f.add_annotation(
    x="USA",
    y=usa_run_rate,
    text=f'{usa_run_rate:.2f}',
    showarrow=False,
    yshift=10
)

f.update_layout(
    title="Comparison of Average Run Rate per Over",
    xaxis_title="Team",
    yaxis_title="Average Run Rate per Over",
    template="plotly_white"
)

f.show()

In [47]:
ind_rrpo=data[data["team"]=="India"].groupby("over")["runs_total"].sum()
usa_rrpo=data[data["team"]=="United States of America"].groupby("over")["runs_total"].sum()

f=go.Figure()

f.add_trace(go.Scatter(
    x=ind_rrpo.index,
    y=ind_rrpo.values,
    mode="lines+markers",
    name="India Run Rate",
    line=dict(color="green")
))

f.add_trace(go.Scatter(
    x=usa_rrpo.index,
    y=usa_rrpo.values,
    mode="lines+markers",
    name="USA Run Rate",
    line=dict(color="blue")
))

f.update_layout(
    title="Comparison of Run Rate per Over",
    xaxis_title="Overs",
    yaxis_title="Runs",
    template="plotly_white",
    legend_title="Run Rate",
    width=1000,
    height=600
)
f.show()
