In [None]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# load the match-level table and the ball-by-ball table from the working directory
matches, deli = (pd.read_csv(csv_name) for csv_name in ("matches.csv", "deliveries.csv"))

# Question 1:

In [None]:
# head-to-head fixtures: keep rows where BOTH team1 and team2 are one of the two sides
csk_mi = matches[
    matches[["team1", "team2"]].isin(["Chennai Super Kings", "Mumbai Indians"]).all(axis=1)
]

In [None]:
# how many of the head-to-head fixtures each side won
csk_mi["winner"].value_counts()

In [None]:
# every fixture in which CSK appears, as either team1 or team2
csk = matches[matches[["team1", "team2"]].eq("Chennai Super Kings").any(axis=1)]

# number of CSK fixtures
csk.shape[0]

In [None]:
# tally of the winning side across all CSK fixtures
csk["winner"].value_counts()

In [None]:
# CSK win ratio: share of CSK fixtures whose winner is CSK.
# Derived from `csk` instead of hand-typed counts, so it cannot drift from the data.
# The denominator is every CSK fixture, including any without a recorded winner.
(csk.winner == "Chennai Super Kings").mean()

In [None]:
# every fixture in which MI appears, as either team1 or team2
mi = matches[matches[["team1", "team2"]].eq("Mumbai Indians").any(axis=1)]

# number of MI fixtures
mi.shape[0]

In [None]:
# tally of the winning side across all MI fixtures
mi["winner"].value_counts()

In [None]:
# MI win ratio: share of MI fixtures whose winner is MI.
# Derived from `mi` instead of hand-typed counts, so it cannot drift from the data.
# The denominator is every MI fixture, including any without a recorded winner.
(mi.winner == "Mumbai Indians").mean()

# Question 2:

In [None]:
# ball-by-ball rows where the batsman column is "Q de Kock"
de_kock = deli.loc[deli["batsman"] == "Q de Kock"]

In [None]:
# Boundary runs (4s and 6s) De Kock scored off his first 10 rows in each match.
# `runs` holds one entry per match, in the order matches first appear in `de_kock`
# (later cells slice this list positionally, so the order matters).

# head(10) keeps at most the first 10 rows per match, so innings shorter than
# 10 balls need no special case
first_ten = de_kock.groupby("match_id", sort=False).head(10)

# non-boundary balls are zeroed rather than dropped, so a match without any
# boundary still yields an entry (0); no try/except is needed for that case
is_boundary = first_ten.batsman_runs.isin([4, 6])
runs = (
    first_ten.batsman_runs.where(is_boundary, 0)
    .groupby(first_ten.match_id, sort=False)
    .sum()
    .tolist()
)

In [None]:
# number of entries in `runs`, i.e. one per distinct match_id present in `de_kock`
len(runs)

In [None]:
# distribution of early boundary runs over all innings;
# the last bin edge is the largest value observed in `runs`
fig, ax = plt.subplots()
ax.hist(runs, bins=[0, 10, 15, 20, max(runs)])
ax.set_title("Runs scored off boundaries in the 1st 10 balls")
ax.set_xlabel("Runs")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# same histogram restricted to the last 16 entries of `runs`
# NOTE(review): presumably these are his 2019 innings — confirm against the season data
fig, ax = plt.subplots()
ax.hist(runs[-16:], bins=[0, 10, 15, 20, max(runs)])
ax.set_title("Runs scored off boundaries in the 1st 10 balls in 2019")
ax.set_xlabel("Runs")
ax.set_ylabel("Frequency")
plt.show()

# Question 3:

In [None]:
# MI bowlers to analyse (names as spelled in the deliveries data)
bowlers = [
    "JJ Bumrah",
    "RD Chahar",
    "TA Boult",
    "KH Pandya",
    "HH Pandya",
    "KA Pollard",
    "NM Coulter-Nile",
]

In [None]:
# per-match dot-ball ratio for a list of bowlers
def get_dots_ratio(bowlers):
    """Return ``{bowler: [dot-ball ratio per match, ...]}``.

    Reads the module-level ``deli`` frame. For each bowler, every match in
    which the bowler has at least one row contributes one value: the number
    of rows with ``total_runs == 0`` divided by the number of rows bowled in
    that match. Matches appear in order of first occurrence. A bowler with
    no rows maps to an empty list.

    Side effect: prints each bowler's name followed by their match count.
    """
    ratios_by_bowler = {}

    for name in bowlers:
        print(name)
        spell = deli[deli.bowler == name]
        match_ids = spell.match_id.unique()
        print(len(match_ids))

        per_match = []
        for mid in match_ids:
            in_match = spell[spell.match_id == mid]
            scoreless = int((in_match.total_runs == 0).sum())
            per_match.append(scoreless / len(in_match))

        ratios_by_bowler[name] = per_match

    return ratios_by_bowler
    

In [None]:
# per-match dot-ball ratios for every name currently in `bowlers`;
# the call also prints each bowler's name and match count
dot_ratio = get_dots_ratio(bowlers)

In [None]:
# central tendency of the per-match dot-ball ratios, one bowler at a time
for bowler_name, match_ratios in dot_ratio.items():
    print(bowler_name)                # bowler
    print(np.median(match_ratios))    # median dot-ball ratio
    print(np.mean(match_ratios))      # mean dot-ball ratio

# Question 4:

In [None]:
def get_overs(df):
    """Count, for each match in ``df``, the overs containing both a 4 and a 6.

    ``df`` needs ``match_id``, ``over`` and ``batsman_runs`` columns. An over
    qualifies when at least one of its rows has ``batsman_runs == 4`` and at
    least one has ``batsman_runs == 6``. Returns a list with one count per
    match, in order of the match's first appearance in ``df``.
    """

    def _has_four_and_six(balls):
        scored = balls.batsman_runs.values
        return (scored == 4).any() and (scored == 6).any()

    per_match = []
    for match_id in df.match_id.unique():
        innings = df[df.match_id == match_id]
        qualifying = sum(
            1
            for over_no in innings.over.unique()
            if _has_four_and_six(innings[innings.over == over_no])
        )
        per_match.append(qualifying)
    return per_match

In [None]:
# deliveries on which Mumbai Indians were the batting side
mi_deli = deli.loc[deli["batting_team"] == "Mumbai Indians"]

In [None]:
# distribution, over MI batting innings, of the per-match count of overs with both a 4 and a 6
fig, ax = plt.subplots()
ax.hist(get_overs(mi_deli))
ax.set_title("Overs with both a 4 and a 6")
ax.set_xlabel("No. of overs")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# median of the per-match counts returned by get_overs for MI's batting deliveries
np.median(get_overs(mi_deli))

In [None]:
# deliveries on which Chennai Super Kings were the batting side
csk_deli = deli.loc[deli["batting_team"] == "Chennai Super Kings"]

In [None]:
# distribution, over CSK batting innings, of the per-match count of overs with both a 4 and a 6
fig, ax = plt.subplots()
ax.hist(get_overs(csk_deli))
ax.set_title("Overs with both a 4 and a 6")
ax.set_xlabel("No. of overs")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# median of the per-match counts returned by get_overs for CSK's batting deliveries
np.median(get_overs(csk_deli))

# Question 5:

In [None]:
def get_bls(bowlers):
    """Return ``{bowler: [score per match, ...]}`` for the given bowlers.

    Reads the module-level ``deli`` frame. For each match in which a bowler
    has at least one row, the score is::

        (wickets + dots - boundaries) / balls

    where, over that bowler's rows in the match,
    ``balls`` is the number of rows,
    ``dots`` the rows with ``total_runs == 0``,
    ``wickets`` the rows with a non-null ``player_dismissed`` and
    ``boundaries`` the rows with ``batsman_runs`` equal to 4 or 6.

    Matches appear in order of first occurrence; a bowler with no rows maps
    to an empty list.

    Side effect: prints each bowler's name followed by their match count.
    """
    bls_bowlers = {}

    for b in bowlers:
        print(b)
        d = deli[deli.bowler == b]
        match_ids = d.match_id.unique()
        print(len(match_ids))

        score = []
        for m_id in match_ids:
            data = d[d.match_id == m_id]
            balls = len(data)
            dots = len(data[data.total_runs == 0])
            # NOTE(review): counts every dismissal recorded on the bowler's
            # deliveries — confirm whether run-outs should be excluded
            wickets = len(data[data.player_dismissed.notnull()])
            boundaries = len(data[data.batsman_runs.isin([4, 6])])

            # append inside the per-match loop so every match contributes a
            # score, and a bowler with no matches never reuses a stale value
            score.append((wickets + dots - boundaries) / balls)

        bls_bowlers[b] = score

    return bls_bowlers

In [None]:
# bowlers to score: the first seven are the MI list used earlier, followed by five more names
bowlers = [
    "JJ Bumrah",
    "RD Chahar",
    "TA Boult",
    "KH Pandya",
    "HH Pandya",
    "KA Pollard",
    "NM Coulter-Nile",
    "DL Chahar",
    "RA Jadeja",
    "SN Thakur",
    "S Curran",
    "PP Chawla",
]

In [None]:
# scores from get_bls for every name currently in `bowlers`;
# the call also prints each bowler's name and match count
result = get_bls(bowlers)

In [None]:
# central tendency of the scores returned by get_bls, one bowler at a time
for bowler_name, scores in result.items():
    print(bowler_name)           # bowler
    print(np.median(scores))     # median score
    print(np.mean(scores))       # mean score