In [1]:
import numpy as np
import pandas as pd
import pickle


In [2]:
df = pickle.load(open('dataset_level2.pkl', 'rb'))

In [3]:
df.head()

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,5,Australia,England,0.1,0,0,Sydney,Stadium Australia
1,5,Australia,England,0.2,0,0,Sydney,Stadium Australia
2,5,Australia,England,0.3,1,0,Sydney,Stadium Australia
3,5,Australia,England,0.4,1,0,Sydney,Stadium Australia
4,5,Australia,England,0.5,1,0,Sydney,Stadium Australia


In [4]:
# Features required
# batting_team
# bowling_team
# city
# current_score
# ball_left
# wickets_left
# current_rr


In [5]:
df[df['city'].isnull()]

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,41,New Zealand,Sri Lanka,0.1,0,0,,Pallekele International Cricket Stadium
1,41,New Zealand,Sri Lanka,0.2,0,0,,Pallekele International Cricket Stadium
2,41,New Zealand,Sri Lanka,0.3,3,0,,Pallekele International Cricket Stadium
3,41,New Zealand,Sri Lanka,0.4,0,0,,Pallekele International Cricket Stadium
4,41,New Zealand,Sri Lanka,0.5,0,0,,Pallekele International Cricket Stadium
...,...,...,...,...,...,...,...,...
120,962,Bangladesh,Pakistan,19.2,1,0,,Pallekele International Cricket Stadium
121,962,Bangladesh,Pakistan,19.3,0,Shakib Al Hasan,,Pallekele International Cricket Stadium
122,962,Bangladesh,Pakistan,19.4,1,0,,Pallekele International Cricket Stadium
123,962,Bangladesh,Pakistan,19.5,4,0,,Pallekele International Cricket Stadium


In [6]:
df[df['city'].isnull()]['venue'].value_counts()

Dubai International Cricket Stadium        3092
Pallekele International Cricket Stadium    2066
Melbourne Cricket Ground                   1453
Sydney Cricket Ground                       749
Adelaide Oval                               498
Harare Sports Club                          372
Sharjah Cricket Stadium                     249
Sylhet International Cricket Stadium        128
Carrara Oval                                 64
Name: venue, dtype: int64

In [7]:
df['venue'].str.split().apply(lambda x:x[0])

0      Stadium
1      Stadium
2      Stadium
3      Stadium
4      Stadium
        ...   
123       Eden
124       Eden
125       Eden
126       Eden
127       Eden
Name: venue, Length: 64741, dtype: object

In [8]:
cities = np.where(df['city'].isnull(), df['venue'].str.split().apply(lambda x:x[0]), df['city'])

In [9]:
df['city'] = cities

In [10]:
df.isnull().sum()

match_id            0
batting_team        0
bowling_team        0
ball                0
runs                0
player_dismissed    0
city                0
venue               0
dtype: int64

In [11]:
df['city'].unique()

array(['Sydney', 'Johannesburg', 'Nelson', 'Southampton', 'Cape Town',
       'Dhaka', 'Guwahati', 'London', 'Barbados', 'Hamilton',
       'Wellington', 'Kolkata', 'Durban', 'Port Elizabeth', 'Chandigarh',
       'Manchester', 'Chittagong', 'Pallekele', 'Trinidad',
       'Visakhapatnam', 'Chattogram', 'Cardiff', 'Mirpur', 'St Lucia',
       'Dehradun', 'Colombo', 'Dubai', 'Adelaide', 'Antigua', 'Lahore',
       'Dominica', 'Lauderhill', 'Abu Dhabi', 'Auckland', 'Mumbai',
       'Indore', 'Bengaluru', 'Centurion', 'Chester-le-Street', 'Harare',
       'Hobart', 'Melbourne', 'Bangalore', 'Kandy', 'Guyana', 'Pune',
       'Bristol', 'Rajkot', 'Brisbane', 'Delhi', 'Kanpur', 'Lucknow',
       'Basseterre', 'Hambantota', 'Sylhet', 'Carrara', 'Birmingham',
       'Karachi', 'Sharjah', 'Mount Maunganui', 'King City', 'Perth',
       'Providence', 'Nottingham', 'Nagpur', 'Canberra', 'Ahmedabad',
       'Chennai', 'Nairobi', 'St Kitts', 'Christchurch', 'Hyderabad',
       'Victoria', 'Thiruvan

In [12]:
# filtering out cities where less than 5 matches were played (less than 600 bowls)
eligible_cities = df['city'].value_counts()[df['city'].value_counts() > 600].index.tolist()

In [13]:
df = df[df['city'].isin(eligible_cities)]


In [14]:
df['current_score'] = df.groupby('match_id').cumsum()['runs']

In [15]:
df.drop(columns=['venue'],inplace=True)


In [16]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score
0,5,Australia,England,0.1,0,0,Sydney,0
1,5,Australia,England,0.2,0,0,Sydney,0
2,5,Australia,England,0.3,1,0,Sydney,1
3,5,Australia,England,0.4,1,0,Sydney,2
4,5,Australia,England,0.5,1,0,Sydney,3
...,...,...,...,...,...,...,...,...
123,964,New Zealand,India,19.2,0,MJ Santner,Auckland,154
124,964,New Zealand,India,19.3,1,0,Auckland,155
125,964,New Zealand,India,19.4,1,0,Auckland,156
126,964,New Zealand,India,19.5,2,0,Auckland,158


In [17]:
df['over'] = df['ball'].apply(lambda x:str(x).split(".")[0])
df['ball_no'] = df['ball'].apply(lambda x:str(x).split(".")[1])
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no
0,5,Australia,England,0.1,0,0,Sydney,0,0,1
1,5,Australia,England,0.2,0,0,Sydney,0,0,2
2,5,Australia,England,0.3,1,0,Sydney,1,0,3
3,5,Australia,England,0.4,1,0,Sydney,2,0,4
4,5,Australia,England,0.5,1,0,Sydney,3,0,5
...,...,...,...,...,...,...,...,...,...,...
123,964,New Zealand,India,19.2,0,MJ Santner,Auckland,154,19,2
124,964,New Zealand,India,19.3,1,0,Auckland,155,19,3
125,964,New Zealand,India,19.4,1,0,Auckland,156,19,4
126,964,New Zealand,India,19.5,2,0,Auckland,158,19,5


In [18]:
df['balls_bowled'] = (df['over'].astype('int') * 6) + df['ball_no'].astype('int')
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled
0,5,Australia,England,0.1,0,0,Sydney,0,0,1,1
1,5,Australia,England,0.2,0,0,Sydney,0,0,2,2
2,5,Australia,England,0.3,1,0,Sydney,1,0,3,3
3,5,Australia,England,0.4,1,0,Sydney,2,0,4,4
4,5,Australia,England,0.5,1,0,Sydney,3,0,5,5
...,...,...,...,...,...,...,...,...,...,...,...
123,964,New Zealand,India,19.2,0,MJ Santner,Auckland,154,19,2,116
124,964,New Zealand,India,19.3,1,0,Auckland,155,19,3,117
125,964,New Zealand,India,19.4,1,0,Auckland,156,19,4,118
126,964,New Zealand,India,19.5,2,0,Auckland,158,19,5,119


In [19]:
df['balls_left'] = 120 - df['balls_bowled']
df['balls_left'] = df['balls_left'].apply(lambda x:0 if x<0 else x)
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left
0,5,Australia,England,0.1,0,0,Sydney,0,0,1,1,119
1,5,Australia,England,0.2,0,0,Sydney,0,0,2,2,118
2,5,Australia,England,0.3,1,0,Sydney,1,0,3,3,117
3,5,Australia,England,0.4,1,0,Sydney,2,0,4,4,116
4,5,Australia,England,0.5,1,0,Sydney,3,0,5,5,115
...,...,...,...,...,...,...,...,...,...,...,...,...
123,964,New Zealand,India,19.2,0,MJ Santner,Auckland,154,19,2,116,4
124,964,New Zealand,India,19.3,1,0,Auckland,155,19,3,117,3
125,964,New Zealand,India,19.4,1,0,Auckland,156,19,4,118,2
126,964,New Zealand,India,19.5,2,0,Auckland,158,19,5,119,1


In [20]:
df['player_dismissed'] = df['player_dismissed'].apply(lambda x:0 if x=='0' else 1)
df['player_dismissed'] = df['player_dismissed'].astype('int')
df['player_dismissed'] = df.groupby('match_id').cumsum()['player_dismissed']
df['wickets_left'] = 10 - df['player_dismissed']

In [21]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left,wickets_left
0,5,Australia,England,0.1,0,0,Sydney,0,0,1,1,119,10
1,5,Australia,England,0.2,0,0,Sydney,0,0,2,2,118,10
2,5,Australia,England,0.3,1,0,Sydney,1,0,3,3,117,10
3,5,Australia,England,0.4,1,0,Sydney,2,0,4,4,116,10
4,5,Australia,England,0.5,1,0,Sydney,3,0,5,5,115,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,964,New Zealand,India,19.2,0,7,Auckland,154,19,2,116,4,3
124,964,New Zealand,India,19.3,1,7,Auckland,155,19,3,117,3,3
125,964,New Zealand,India,19.4,1,7,Auckland,156,19,4,118,2,3
126,964,New Zealand,India,19.5,2,7,Auckland,158,19,5,119,1,3


In [22]:
df['crr'] = (df['current_score']*6)/df['balls_bowled']


In [23]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left,wickets_left,crr
0,5,Australia,England,0.1,0,0,Sydney,0,0,1,1,119,10,0.000000
1,5,Australia,England,0.2,0,0,Sydney,0,0,2,2,118,10,0.000000
2,5,Australia,England,0.3,1,0,Sydney,1,0,3,3,117,10,2.000000
3,5,Australia,England,0.4,1,0,Sydney,2,0,4,4,116,10,3.000000
4,5,Australia,England,0.5,1,0,Sydney,3,0,5,5,115,10,3.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,964,New Zealand,India,19.2,0,7,Auckland,154,19,2,116,4,3,7.965517
124,964,New Zealand,India,19.3,1,7,Auckland,155,19,3,117,3,3,7.948718
125,964,New Zealand,India,19.4,1,7,Auckland,156,19,4,118,2,3,7.932203
126,964,New Zealand,India,19.5,2,7,Auckland,158,19,5,119,1,3,7.966387


In [24]:
final_df = df.groupby('match_id').sum()['runs'].reset_index().merge(df, on='match_id')

In [25]:
final_df=final_df[['batting_team','bowling_team','city','current_score','balls_left','wickets_left','crr','runs_x']]

In [26]:
# shuffling for removing any bias
final_df = final_df.sample(final_df.shape[0])


In [27]:
final_df

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,runs_x
18166,New Zealand,South Africa,Durban,61,81,10,9.384615,151
28292,Pakistan,Australia,Dubai,36,86,9,6.352941,151
26863,Australia,England,Southampton,209,22,8,12.795918,248
11385,Afghanistan,India,St Lucia,34,75,7,4.533333,115
48148,New Zealand,Pakistan,Hamilton,28,98,9,7.636364,185
...,...,...,...,...,...,...,...,...
3113,New Zealand,England,St Lucia,0,116,10,0.000000,149
48611,South Africa,India,Johannesburg,179,12,7,9.944444,219
19203,England,South Africa,Nottingham,97,16,2,5.596154,111
7738,West Indies,Pakistan,Mirpur,84,30,5,5.600000,166


In [29]:
from sklearn.model_selection import train_test_split

X = final_df.drop(columns=['runs_x'])
y = final_df['runs_x']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [38]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-1.5.0-py3-none-manylinux2014_x86_64.whl (173.5 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.0


In [39]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error

In [40]:
transform = ColumnTransformer([
    ('trf', OneHotEncoder(sparse=False, drop='first'), 
     ['batting_team', 'bowling_team', 'city'])
],
remainder = 'passthrough')

In [41]:
pipe = Pipeline(steps=[
    ('step1',transform),
    ('step2',StandardScaler()),
    ('step3',XGBRegressor(n_estimators=1000,learning_rate=0.2,max_depth=12,random_state=1))
])

In [42]:
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)


In [44]:
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R2 score:", r2_score(y_test, y_pred))


Mean Absolute Error: 2.476540298663434
R2 score: 0.9616993755584586


In [45]:
pickle.dump(pipe, open('xgboost_model.pkl', 'wb'))


In [29]:
for col in ['batting_team', 'bowling_team', 'city']:
    final_df[col] = final_df[col].astype('category')

In [31]:
X_train.dtypes

batting_team     category
bowling_team     category
city             category
current_score       int64
balls_left          int64
wickets_left        int64
crr               float64
dtype: object

In [46]:
import autosklearn.regression

automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=60*5, per_run_time_limit=60, 
    tmp_folder='/tmp/autosklearn_regression_t20_1st_innings_2')



In [47]:
automl.fit(X_train, y_train, dataset_name='t20_1st_innings')
print(automl.show_models())


[(0.540000, SimpleRegressionPipeline({'data_preprocessor:__choice__': 'feature_type', 'feature_preprocessor:__choice__': 'polynomial', 'regressor:__choice__': 'extra_trees', 'data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'most_frequent', 'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'minmax', 'feature_preprocessor:polynomial:degree': 2, 'feature_preprocessor:polynomial:include_bias': 'False', 'feature_preprocessor:polynomial:interaction_only': 'True', 'regressor:extra_trees:bootstrap': 'True', 'regressor:extra_trees:criterion': 'mse', 'regressor:extra_trees:max_depth': 'None', 'regressor:extra_trees:max_features': 0.9615263480351033, 'regressor:extra_trees:max_leaf_nodes': 'None', 'regressor:extra_trees:min_impurity_d

In [52]:
import sklearn.metrics 

predictions = automl.predict(X_test)
print("R2 score:", sklearn.metrics.r2_score(y_test, predictions))
print("Mean Absolute Error:", sklearn.metrics.mean_absolute_error(y_test, predictions))

Mean Absolute Error: 4.89854557568769


In [53]:
print(automl.sprint_statistics())

auto-sklearn results:
  Dataset name: t20_1st_innings
  Metric: r2
  Best validation score: 0.930694
  Number of target algorithm runs: 6
  Number of successful target algorithm runs: 2
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 4
  Number of target algorithms that exceeded the memory limit: 0



In [54]:
pickle.dump(automl, open('automl_model.pkl', 'wb'))


In [58]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.5.0-py3-none-manylinux2014_x86_64.whl (173.5 MB)
[K     |████████████████████████████████| 173.5 MB 57 kB/s  eta 0:00:01    |██████████████▎                 | 77.7 MB 1.2 MB/s eta 0:01:18
Installing collected packages: xgboost
Successfully installed xgboost-1.5.0


In [59]:
import xgboost as xgb
