# EDA(Exploratory Data Analysis) - group 11

## load packages

In [60]:
import numpy as np
import pandas as pd
import altair as alt

## load data

In [61]:
data = pd.read_csv("data/cricket_test.csv")
data = data.drop(columns = ["Unnamed: 0"])
data

Unnamed: 0,game_id,season,team,over,batter,batter_id,bowler,bowler_id,non_striker,non_striker_id,...,byes,wicket,player_out,player_out_id,fielders_name,fielders_id,wicket_type,runs_batter,runs_extras,runs_total
0,211028,2005,England,0,ME Trescothick,ea42ddb9,B Lee,dd09ff8e,GO Jones,2e929b99,...,0,0,,,,,,0,0,0
1,211028,2005,England,0,ME Trescothick,ea42ddb9,B Lee,dd09ff8e,GO Jones,2e929b99,...,0,0,,,,,,1,0,1
2,211028,2005,England,0,GO Jones,2e929b99,B Lee,dd09ff8e,ME Trescothick,ea42ddb9,...,0,0,,,,,,0,0,0
3,211028,2005,England,0,GO Jones,2e929b99,B Lee,dd09ff8e,ME Trescothick,ea42ddb9,...,0,0,,,,,,0,0,0
4,211028,2005,England,0,GO Jones,2e929b99,B Lee,dd09ff8e,ME Trescothick,ea42ddb9,...,0,0,,,,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2085,255954,2006/07,India,19,KD Karthik,c03f1114,RJ Peterson,26ff4c29,SK Raina,1dc12ab9,...,0,0,,,,,,6,0,6
2086,255954,2006/07,India,19,KD Karthik,c03f1114,RJ Peterson,26ff4c29,SK Raina,1dc12ab9,...,0,0,,,,,,0,0,0
2087,255954,2006/07,India,19,KD Karthik,c03f1114,RJ Peterson,26ff4c29,SK Raina,1dc12ab9,...,0,0,,,,,,1,0,1
2088,255954,2006/07,India,19,SK Raina,1dc12ab9,RJ Peterson,26ff4c29,KD Karthik,c03f1114,...,0,0,,,,,,1,0,1


## Univariate Analysis

In [120]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2090 entries, 0 to 2089
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   game_id         2090 non-null   int64 
 1   season          2090 non-null   object
 2   team            2090 non-null   object
 3   over            2090 non-null   int64 
 4   batter          2090 non-null   object
 5   batter_id       2090 non-null   object
 6   bowler          2090 non-null   object
 7   bowler_id       2090 non-null   object
 8   non_striker     2090 non-null   object
 9   non_striker_id  2090 non-null   object
 10  wides           2090 non-null   int64 
 11  noballs         2090 non-null   int64 
 12  legbyes         2090 non-null   int64 
 13  byes            2090 non-null   int64 
 14  wicket          2090 non-null   int64 
 15  player_out      124 non-null    object
 16  player_out_id   124 non-null    object
 17  fielders_name   81 non-null     object
 18  fielders

In [121]:
data.describe()

Unnamed: 0,game_id,over,wides,noballs,legbyes,byes,wicket,runs_batter,runs_extras,runs_total
count,2090.0,2090.0,2090.0,2090.0,2090.0,2090.0,2090.0,2090.0,2090.0,2090.0
mean,230583.629665,9.051196,0.032057,0.009091,0.032536,0.007656,0.05933,1.203349,0.08134,1.284689
std,14784.55453,5.723798,0.213086,0.117472,0.239462,0.148194,0.236298,1.570518,0.366221,1.560279
min,211028.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,222678.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,226374.0,9.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
75%,238195.0,14.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
max,255954.0,19.0,5.0,2.0,4.0,4.0,1.0,6.0,5.0,8.0


The dataset contains 22 columns and 2090 observations related to cricket. The ratio of quantitative to categorical (ordinal and nominal) data is approximately 50:50. Note that the columns **player_out, player_out_id, fielder_name, fielders_id, and wicket_type** show a significant amount of missing values. Player_out and player_out_id may be missing due to inapplicability (e.g., no one is out), which results in fielder_name, fielder_id, and wicket_type being invalid in these observations. Thus, these missing data are likely classified as **missing at random(MAR)**. Given the complexity of imputing these values, they are unlikely to be included in our further analysis.

In [119]:
def vis_bar(x_input):
    return alt.Chart(data).mark_bar().encode(
        x = x_input,
        y = "count()"
    ).properties(
        width = 150, 
        height = 150
    )

over = vis_bar("over")
wides = vis_bar("wides")
noballs = vis_bar("noballs")
legbyes = vis_bar("legbyes")
byes = vis_bar("byes")
wicket = vis_bar("wicket")
run_batter = vis_bar("runs_batter")
run_extras = vis_bar("runs_extras")
run_total = vis_bar("runs_total") 

v1 = alt.hconcat(over, wides, noballs)
v2 = alt.hconcat(legbyes, byes, wicket)
v3 = alt.hconcat(run_batter, run_extras, run_total)
alt.vconcat(v1, v2, v3)

In [131]:
season = vis_bar("season")
team = vis_bar("team")

alt.hconcat(season, team)

In [127]:
data["batter"].value_counts().reset_index()

Unnamed: 0,batter,count
0,GC Smith,155
1,ME Trescothick,98
2,SB Styris,85
3,RT Ponting,85
4,DR Martyn,77
...,...,...
89,MS Dhoni,2
90,KD Mills,1
91,MS Sinclair,1
92,M Zondeki,1


In [128]:
data["bowler"].value_counts().reset_index()

Unnamed: 0,bowler,count
0,SE Bond,66
1,B Lee,64
2,A Symonds,61
3,AR Adams,53
4,R Telemachus,52
...,...,...
64,JWM Dalrymple,12
65,TM Dilshan,12
66,LPC Silva,6
67,DPMD Jayawardene,6


In [129]:
data["non_striker"].value_counts().reset_index()

Unnamed: 0,non_striker,count
0,GC Smith,122
1,ME Trescothick,102
2,SB Styris,82
3,RT Ponting,80
4,DR Martyn,71
...,...,...
89,JWM Dalrymple,2
90,TT Bresnan,2
91,J Botha,1
92,DR Tuffey,1


Please be aware that all columns related to "ID" are not included in this analysis as they are primarily used for identification purposes and are unlikely to exhibit significant patterns. As observed, none of the quantitative attributes follow a normal distribution; the majority of them display right-skewed distributions. An interesting observation is found in the variable "Over" where the frequency decreases with increasing values, warranting further analysis. Regarding ordinal attributes, the dataset contains notably more data from the 2005/06 season compared to other seasons, as well as more teams from "New Zealand," "South Africa," and "Australia" than teams from other continents. Lastly, concerning nominal attributes, GC Smith appears to have considerably more observations than other players in both the "non-striker" and "batter" roles.

In general, the dataset we possess does not form a normal distribution, with samples unevenly distributed across categories. It is important to keep this in mind when training, testing, and interpreting the model, and to appropriately acknowledge this as a potential limitation.

## Multivariate Analysis - Correlation Analysis

In [106]:
corr_data = data[["over", "wides", "noballs", "legbyes", "byes", "wicket", "runs_batter", "runs_extras", "runs_total"]]
corr_df = corr_data.corr()
corr_df

Unnamed: 0,over,wides,noballs,legbyes,byes,wicket,runs_batter,runs_extras,runs_total
over,1.0,-0.09044,0.023514,-0.004708,-0.015135,0.08659,0.074299,-0.054284,0.062046
wides,-0.09044,1.0,-0.011648,-0.020451,-0.007775,-0.037792,-0.115327,0.561595,0.015731
noballs,0.023514,-0.011648,1.0,-0.01052,-0.004,0.01505,0.021111,0.305492,0.092954
legbyes,-0.004708,-0.020451,-0.01052,1.0,-0.007022,-0.034131,-0.104156,0.635758,0.044383
byes,-0.015135,-0.007775,-0.004,-0.007022,1.0,-0.012977,-0.0396,0.394259,0.052678
wicket,0.08659,-0.037792,0.01505,-0.034131,-0.012977,1.0,-0.192474,-0.04473,-0.204236
runs_batter,0.074299,-0.115327,0.021111,-0.104156,-0.0396,-0.192474,1.0,-0.14446,0.972655
runs_extras,-0.054284,0.561595,0.305492,0.635758,0.394259,-0.04473,-0.14446,1.0,0.089307
runs_total,0.062046,0.015731,0.092954,0.044383,0.052678,-0.204236,0.972655,0.089307,1.0


In [107]:
corr_ = corr_df.stack()
corr_ = corr_.reset_index()
corr_.columns = ['row', 'column', 'corr']
corr_

alt.Chart(corr_).mark_rect().encode(
    x = 'column',
    y = 'row',
    color = 'corr:Q',
    tooltip = 'corr:Q'
).properties(
    width = 400,
    height = 400
)

In [89]:
corr_table = corr_[corr_["row"] != corr_["column"]]
corr_table.sort_values("corr", ascending = False).head(15)

Unnamed: 0,row,column,corr
62,runs_batter,runs_total,0.972655
78,runs_total,runs_batter,0.972655
34,legbyes,runs_extras,0.635758
66,runs_extras,legbyes,0.635758
16,wides,runs_extras,0.561595
64,runs_extras,wides,0.561595
43,byes,runs_extras,0.394259
67,runs_extras,byes,0.394259
25,noballs,runs_extras,0.305492
65,runs_extras,noballs,0.305492


In [108]:
def vis_corr_scatter(x_input, y_input):
    return alt.Chart(corr_data).mark_point().encode(
        x = x_input,
        y = y_input
    ).properties(
        width = 150,
        height = 150
    )
    
batter_total_run_total = vis_corr_scatter("runs_batter:Q", "runs_total:Q")
legbyes_runs_extras = vis_corr_scatter("legbyes:Q", "runs_extras:Q")
wides_runs_extras = vis_corr_scatter("wides:Q", "runs_extras:Q")
byes_runs_extras = vis_corr_scatter("byes:Q", "runs_extras:Q")
noballs_runs_extras = vis_corr_scatter("noballs:Q", "runs_extras:Q")
alt.hconcat(batter_total_run_total, legbyes_runs_extras, wides_runs_extras, byes_runs_extras, noballs_runs_extras)

Among the quantitative attributes, the top 5 relationships in terms of strengths are:
1. **batter_total & run_total(r = 0.97)**
2. **legbyes & runs_extras(r = 0.64)**
3. **wides & runs_extra(r = 0.56**
4. **byes & runs_extra(r = 0.39)**
5. **noballs & run_extra(r = 0.31)**

Note that, using Pearson's r might not be able to capture non-linear relationships that potentially exist among these relationships. 