In [3]:
import os
import pickle

import joblib
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [4]:
# Load the prepared dataset from disk.
# The context manager closes the file handle as soon as unpickling finishes;
# the previous bare open() call never closed it explicitly.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load this file if it comes from a trusted source.
with open('../data/final_data.pkl', 'rb') as data_file:
    df = pickle.load(data_file)

In [5]:
# Target: the column the model is trained to predict.
y = df['final_score']
# Features: every column except the target.
X = df.drop('final_score', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is row-wise and random. If several rows describe the
# same match, related rows can land in both train and test and inflate the
# held-out metrics -- confirm against how final_data.pkl was built.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [6]:
# Display the training feature frame as the cell output (pandas truncates long frames when rendering).
X_train

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five
53875,England,New Zealand,Auckland,41,90,9,8.200000,40.0
79256,South Africa,Australia,Cape Town,100,25,7,6.315789,33.0
27217,South Africa,India,Nottingham,59,68,8,6.807692,24.0
3717,India,Australia,Melbourne,67,75,10,8.933333,56.0
50963,Pakistan,Australia,Abu Dhabi,131,16,4,7.557692,37.0
...,...,...,...,...,...,...,...,...
63772,South Africa,West Indies,St George's,109,36,7,7.785714,29.0
79352,England,Pakistan,Dubai,79,53,7,7.074627,32.0
46551,South Africa,Pakistan,Lahore,96,39,3,7.111111,45.0
13807,Bangladesh,Pakistan,Pallekele,175,1,5,8.823529,42.0


In [7]:
# Preprocessing step: one-hot encode the three categorical columns and pass
# every other column through unchanged.
# drop='first' removes one indicator per categorical column, and
# sparse_output=False makes the encoder return a dense array.
# handle_unknown is left at its default ('error'), so a category that was not
# seen during fit raises at transform time.
trf = ColumnTransformer(
    transformers=[
        (
            'trf',
            OneHotEncoder(drop='first', sparse_output=False),
            ['batting_team', 'bowling_team', 'city'],
        ),
    ],
    remainder='passthrough',
)

In [8]:
# Render the column transformer's configuration as the cell output.
trf

0,1,2
,transformers,"[('trf', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [9]:
# End-to-end model: encode categoricals -> standardize all columns -> XGBoost regressor.
# NOTE(review): gradient-boosted trees are generally insensitive to feature
# scaling, so step2 may be unnecessary -- confirm before removing it, since the
# saved artifact includes this step.
pipe = Pipeline([
    ('step1', trf),
    ('step2', StandardScaler()),
    (
        'step3',
        XGBRegressor(
            n_estimators=1000,
            max_depth=12,
            learning_rate=0.1,
            random_state=1,
        ),
    ),
])

In [10]:
# Render the full pipeline's configuration (all three steps) as the cell output.
pipe

0,1,2
,steps,"[('step1', ...), ('step2', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('trf', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [11]:
# Fit the whole pipeline on the training split, then predict the held-out split.
# Pipeline.fit returns the pipeline itself, so the two calls can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Report held-out metrics, one per line: R^2 first, then mean absolute error.
print(
    r2_score(y_test, y_pred),
    mean_absolute_error(y_test, y_pred),
    sep='\n',
)

0.9873684048652649
1.7817789316177368


In [12]:
# Persist the fitted pipeline (preprocessing + regressor) as a single artifact.
# Create the output directory first: on a checkout where ../model does not
# exist, the dump would otherwise fail with FileNotFoundError only after the
# training cell has already run.
os.makedirs('../model', exist_ok=True)
joblib.dump(pipe, '../model/model.pkl')

['../model/model.pkl']