In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# load the match-level table and the ball-by-ball table from the working directory
matches, deli = (
    pd.read_csv(csv_name) for csv_name in ("matches.csv", "deliveries.csv")
)

# Question 1:

In [None]:
# head-to-head fixtures: both team1 and team2 must be Delhi (either name) or Chennai
dc_csk = matches[
    matches["team1"].isin(['Delhi Daredevils', 'Delhi Capitals', 'Chennai Super Kings'])
    & matches["team2"].isin(['Delhi Daredevils', 'Delhi Capitals', 'Chennai Super Kings'])
]

In [None]:
# head-to-head win tally per winner
dc_csk["winner"].value_counts()

In [None]:
# every match where Delhi (Daredevils or Capitals) appears as team1 or team2
dc = matches[
    matches["team1"].isin(['Delhi Daredevils', 'Delhi Capitals'])
    | matches["team2"].isin(['Delhi Daredevils', 'Delhi Capitals'])
]

dc.shape[0]

In [None]:
# how often each team won a match that involved DC
dc["winner"].value_counts()

In [None]:
# DC win ratio, computed from `dc` instead of hand-copied counts so it stays
# correct if the dataset changes. The franchise appears under two names, so
# both count as a win; rows with any other (or missing) winner count as non-wins.
float(dc.winner.isin(['Delhi Daredevils', 'Delhi Capitals']).mean())

In [None]:
# every match where Chennai Super Kings appears as team1 or team2
csk = matches[
    (matches["team1"] == 'Chennai Super Kings')
    | (matches["team2"] == 'Chennai Super Kings')
]

csk.shape[0]

In [None]:
# how often each team won a match that involved CSK
csk["winner"].value_counts()

In [None]:
# CSK win ratio, computed from `csk` instead of hand-copied counts so it stays
# correct if the dataset changes; rows with a missing winner count as non-wins.
float((csk.winner == 'Chennai Super Kings').mean())

## Explanation:

Based on the historical data, Chennai Super Kings has been the better-performing team: its overall win ratio (about 52%) is higher than that of Delhi Capitals (about 44%). Hence, it can be predicted that Chennai Super Kings will win today's match.

# Question 2:

In [None]:
# ball-by-ball rows where Chennai bat and Delhi (either name) bowl
csk_deli = deli[
    deli["batting_team"].isin(['Chennai Super Kings'])
    & deli["bowling_team"].isin(['Delhi Daredevils', 'Delhi Capitals'])
]
print(csk_deli.shape[0])

In [None]:
# total runs made by CSK in each match of the current csk_deli selection.
# Select total_runs before aggregating: summing every column is wasted work
# and, depending on the pandas version, text columns are dropped with a
# warning, concatenated, or rejected.
total_runs_match = csk_deli.groupby("match_id")["total_runs"].sum()

total_runs_match.mean()

In [None]:
# histogram of per-match CSK totals against DC, bucketed at 150 / 165 / 175
plt.hist(
    total_runs_match,
    bins=[min(total_runs_match), 150, 165, 175, max(total_runs_match)],
)
plt.gca().set(
    title="Runs by CSK in matches against DC",
    xlabel="Runs",
    ylabel="Frequency",
)
plt.show()

In [None]:
# rebinds csk_deli to every delivery with Chennai batting, whoever the opponent
csk_deli = deli[deli["batting_team"].isin(['Chennai Super Kings'])]

csk_deli.shape[0]

In [None]:
# average runs scored by CSK in a match.
# Select total_runs before aggregating: summing every column is wasted work
# and, depending on the pandas version, text columns are dropped with a
# warning, concatenated, or rejected.
total_runs_csk = csk_deli.groupby("match_id")["total_runs"].sum()

total_runs_csk.mean()

In [None]:
# histogram of per-match CSK totals across the IPL, bucketed at 150 / 165 / 175
plt.hist(
    total_runs_csk,
    bins=[min(total_runs_csk), 150, 165, 175, max(total_runs_csk)],
)
plt.gca().set(
    title="Runs by CSK in matches in IPL",
    xlabel="Runs",
    ylabel="Frequency",
)
plt.show()

## Explanation:

CSK mostly scores either below 150 or above 175 in IPL matches. Against DC, however, it has scored above 175 in the majority of matches. Today's venue is Sharjah, which has a batting-friendly pitch, but CSK's performance has been below par this season. Hence, CSK can be expected to score 166-175 runs in today's match.

# Question 3:

In [None]:
# ball-by-ball rows where Delhi (Daredevils or Capitals) is the batting side
dc_bat = deli[deli["batting_team"].isin(['Delhi Daredevils', 'Delhi Capitals'])]

In [None]:
# keep only DC's deliveries whose over number is 16, 17, 18, 19 or 20
dc_16_20 = dc_bat[dc_bat["over"].isin(range(16, 21))]

In [None]:
# wickets DC lost in overs 16-20 of each match. count() tallies the non-null
# player_dismissed entries per match, so a match where no wicket fell in that
# phase stays in the series with a 0. Filtering to dismissals before grouping
# would drop those matches and inflate the mean/median computed from this series.
wickets_ipl = dc_16_20.groupby("match_id")["player_dismissed"].count()

In [None]:
# bar chart of how many matches saw each wicket count.
# The series is passed as x= explicitly: recent seaborn releases treat the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=wickets_ipl)
plt.title("Wickets lost per match by DC in IPL history in 16-20 overs")
plt.xlabel("No. of wickets")
plt.ylabel("Frequency")
plt.show()

In [None]:
# central tendency of wickets per match: mean on the first line, median on the second
print(wickets_ipl.mean(), wickets_ipl.median(), sep="\n")

In [None]:
# per-match DC wickets in overs 16-20 when Chennai bowl, then how often each count occurs
(
    dc_16_20[dc_16_20["bowling_team"] == 'Chennai Super Kings']
    .groupby("match_id")["player_dismissed"]
    .count()
    .value_counts()
)

## Explanation:

The wickets taken by CSK in overs 16-20 are the same as the wickets lost by DC in overs 16-20. Since the latter is more homogeneous, the analysis is based on it. Based on historical data, CSK will take 0-2 wickets in overs 16-20 of today's match. However, allowing for the performance of the two teams this season, it seems more likely that 3-4 DC wickets will fall.

# Question 4:

In [None]:
# function to calculate balls taken to score a hundred
def get_balls_for_100(df, target=100):
    """Return, per match, the number of balls taken to reach ``target`` runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Ball-by-ball rows in playing order with the columns ``match_id``,
        ``total_runs`` and ``extra_runs``.
    target : int, default 100
        Cumulative score to reach.

    Returns
    -------
    list of int
        One entry per match (in order of first appearance) in which the
        cumulative ``total_runs`` reaches ``target``; matches that never get
        there are skipped. An empty frame yields an empty list.

    Notes
    -----
    Every row with non-zero ``extra_runs`` is excluded from the ball count.
    NOTE(review): that also excludes byes/leg-byes, not only wides and
    no-balls -- confirm this is the intended definition of an "extra ball".
    """
    num_balls = []

    # one group per match; sort=False keeps the order in which matches appear
    for _, data in df.groupby("match_id", sort=False):

        # True from the delivery on which the running score reaches the target
        reached = data["total_runs"].cumsum() >= target

        # skip matches that never reach the target explicitly, rather than
        # filtering out suspiciously small counts afterwards
        if not reached.any():
            continue

        # position (within this match) of the first delivery at/after the target;
        # positional slicing avoids any dependence on the frame's index labels
        end_pos = int(reached.to_numpy().argmax())
        balls_taken_df = data.iloc[: end_pos + 1]

        # deliveries carrying extras are not counted as balls faced
        extra_balls = int((balls_taken_df["extra_runs"] != 0).sum())

        num_balls.append(len(balls_taken_df) - extra_balls)

    return num_balls

In [None]:
# balls taken to score a 100 considering all the matches of CSK
# (csk_deli is assigned twice in this notebook: first to CSK-vs-DC deliveries,
# later to every delivery with CSK batting; this cell relies on the later one)
num = get_balls_for_100(csk_deli)

In [None]:
# histogram of the balls CSK needed to reach 100, across all opponents
plt.hist(num, bins=[0, 40, 60, 70, max(num)])
plt.title("Balls taken to score 100 runs")
# the x-axis holds ball counts (the values returned by get_balls_for_100), not runs
plt.xlabel("No. of balls")
plt.ylabel("Frequency")
plt.show()

In [None]:
# balls taken to score a 100 considering all the matches of CSK against DC
num_dc = get_balls_for_100(csk_deli.query("bowling_team.isin(['Delhi Daredevils', 'Delhi Capitals'])"))

# histogram
# NOTE(review): the last bin edge is max(num), taken from the all-opponents
# sample -- presumably so both histograms share the same buckets; confirm.
plt.hist(num_dc, bins=[0, 40, 60, 70, max(num)])
# title names the opponent so this plot is distinguishable from the all-opponents one
plt.title("Balls taken to score 100 runs against DC")
# the x-axis holds ball counts (the values returned by get_balls_for_100), not runs
plt.xlabel("No. of balls")
plt.ylabel("Frequency")
plt.show()

## Explanation:

Following the usual trend, CSK will take more than 70 balls to reach a score of 100.

# Question 5:

In [None]:
# number of deliveries in the whole dataset that conceded no-ball runs
len(deli.query("noball_runs != 0"))

In [None]:
# number of distinct match ids present in the deliveries table
deli["match_id"].unique().size

In [None]:
# average no. of no balls per match, derived from the data rather than from
# hand-copied counts so it stays correct if the dataset changes
len(deli[deli.noball_runs != 0]) / len(deli.match_id.unique())

In [None]:
# no-ball deliveries bowled by Chennai or by Delhi (under either name)
noballs_csk_dc = deli[
    (deli["noball_runs"] != 0)
    & deli["bowling_team"].isin(['Delhi Daredevils', 'Delhi Capitals', 'Chennai Super Kings'])
]

In [None]:
# total number of no-ball deliveries by CSK and DC
noballs_csk_dc.shape[0]

In [None]:
# no. of matches in which CSK and DC have bowled.
# Counted over all of their deliveries: noballs_csk_dc only contains matches
# with at least one no-ball, so counting its match ids would leave out every
# match in which they bowled without conceding one.
deli[deli.bowling_team.isin(['Delhi Daredevils', 'Delhi Capitals', 'Chennai Super Kings'])].match_id.nunique()

In [None]:
# average no. of no balls per match by CSK and DC, computed from the data.
# The denominator is every match in which either team bowled (not only the
# matches that contain a no-ball), so no-ball-free matches pull the average down.
len(noballs_csk_dc) / deli[
    deli.bowling_team.isin(['Delhi Daredevils', 'Delhi Capitals', 'Chennai Super Kings'])
].match_id.nunique()

In [None]:
# no. of no balls per match by CSK and DC, as a frequency table.
# Start from every delivery these teams bowled so that matches without a
# single no-ball show up with a count of 0 instead of being absent.
deli[
    deli.bowling_team.isin(['Delhi Daredevils', 'Delhi Capitals', 'Chennai Super Kings'])
].assign(is_noball=lambda d: d.noball_runs != 0).groupby("match_id")["is_noball"].sum().value_counts()

## Explanation:

Considering all the statistics above, 0-1 no-balls can be expected to be bowled in today's match.