In [22]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import pickle
import seaborn as sns


In [16]:
# Load the dataset from 'dataset.csv', resolved relative to the kernel's
# current working directory.
df = pd.read_csv('dataset.csv')

In [26]:
# Schema check: prints column names, non-null counts and dtypes of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106604 entries, 0 to 106603
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   batting_team       106604 non-null  object 
 1   bowling_team       106604 non-null  object 
 2   city               106604 non-null  object 
 3   runs_left          106604 non-null  int64  
 4   balls_left         106604 non-null  int64  
 5   wickets_remaining  106604 non-null  int64  
 6   total_run_x        106604 non-null  int64  
 7   crr                106604 non-null  float64
 8   rrr                106604 non-null  float64
 9   results            106604 non-null  int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 8.1+ MB


In [18]:
# Preprocessing step: one-hot encode the three string columns and pass every
# other column through unchanged.
#   * drop='first' omits the first category of each encoded column.
#   * sparse_output=False makes the encoder return a dense array.
# NOTE(review): handle_unknown is left at the encoder default, so a team or
# city that was not present at fit time will raise at transform time.
trf = ColumnTransformer(
    transformers=[
        (
            'trf',
            OneHotEncoder(drop='first', sparse_output=False),
            ['batting_team', 'bowling_team', 'city'],
        ),
    ],
    remainder='passthrough',
)

In [20]:
# `get_feature_names_out()` is only available on a fitted transformer, so fit
# on `df` first (the same frame the following cell transforms). Without this,
# the cell fails on a fresh kernel because `trf` has not been fitted yet.
trf.fit(df)
trf.get_feature_names_out()

array(['trf__batting_team_Delhi Capitals',
       'trf__batting_team_Gujarat Lions',
       'trf__batting_team_Gujarat Titans',
       'trf__batting_team_Kochi Tuskers Kerala',
       'trf__batting_team_Kolkata Knight Riders',
       'trf__batting_team_Lucknow Super Giants',
       'trf__batting_team_Mumbai Indians',
       'trf__batting_team_Pune Warriors',
       'trf__batting_team_Punjab Kings',
       'trf__batting_team_Rajasthan Royals',
       'trf__batting_team_Rising Pune Supergiant',
       'trf__batting_team_Rising Pune Supergiants',
       'trf__batting_team_Royal Challengers Bangalore',
       'trf__batting_team_Sunrisers Hyderabad',
       'trf__bowling_team_Delhi Capitals',
       'trf__bowling_team_Gujarat Lions',
       'trf__bowling_team_Gujarat Titans',
       'trf__bowling_team_Kochi Tuskers Kerala',
       'trf__bowling_team_Kolkata Knight Riders',
       'trf__bowling_team_Lucknow Super Giants',
       'trf__bowling_team_Mumbai Indians',
       'trf__bowling_team_P

In [21]:
# Fit the transformer on the full frame and wrap the resulting array in a
# DataFrame labelled with the generated feature names, then inspect it.
# The transform must run first: the feature names are read from the fitted
# transformer.
encoded_values = trf.fit_transform(df)
df1 = pd.DataFrame(data=encoded_values, columns=trf.get_feature_names_out())
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106604 entries, 0 to 106603
Data columns (total 69 columns):
 #   Column                                         Non-Null Count   Dtype  
---  ------                                         --------------   -----  
 0   trf__batting_team_Delhi Capitals               106604 non-null  float64
 1   trf__batting_team_Gujarat Lions                106604 non-null  float64
 2   trf__batting_team_Gujarat Titans               106604 non-null  float64
 3   trf__batting_team_Kochi Tuskers Kerala         106604 non-null  float64
 4   trf__batting_team_Kolkata Knight Riders        106604 non-null  float64
 5   trf__batting_team_Lucknow Super Giants         106604 non-null  float64
 6   trf__batting_team_Mumbai Indians               106604 non-null  float64
 7   trf__batting_team_Pune Warriors                106604 non-null  float64
 8   trf__batting_team_Punjab Kings                 106604 non-null  float64
 9   trf__batting_team_Rajasthan Royals   

In [25]:
# Full model: the preprocessing transformer followed by a random forest.
# The forest is seeded so that the fitted model, the reported accuracy and the
# pickled artifact are reproducible across kernel restarts; the seed matches
# the one passed to train_test_split in this notebook.
ra_pipe = Pipeline([
    ('step1', trf),
    ('step2', RandomForestClassifier(random_state=100))
])

In [27]:
# Features are every column except the last; the target is the last column.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 25% of the rows for evaluation, with a fixed seed.
# NOTE(review): the split is row-wise and random. If several rows describe
# the same match (presumably they do — confirm), rows from one match can land
# in both train and test, which would inflate the test accuracy.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=100,
)

In [28]:
# Fit the whole pipeline (encoder, then classifier) on the training split.
ra_pipe.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [29]:
# Predicted labels for the held-out test rows; scored against y_test below.
ra_y_pred = ra_pipe.predict(x_test)

In [30]:
# Test-set accuracy expressed as a percentage.
accuracy_pct = accuracy_score(y_test, ra_y_pred) * 100
print(accuracy_pct, "% accuracy")

99.90244268507749 % accuracy


In [47]:
# Pick one training row where Chennai Super Kings is the batting team
# (the 11th such row by position) to use as a sample input.
csk_batting_rows = x_train.loc[x_train['batting_team'] == 'Chennai Super Kings']
csk_batting_rows.iloc[10]

batting_team         Chennai Super Kings
bowling_team                Punjab Kings
city                             Chennai
runs_left                             92
balls_left                            74
wickets_remaining                      8
total_run_x                          157
crr                                 8.48
rrr                                 7.46
Name: 67592, dtype: object

In [48]:
# One-row input frame with the same nine feature columns, in the same order,
# as the training features. The values mirror the training row displayed by
# the previous cell.
data = pd.DataFrame(
    [
        {
            'batting_team': 'Chennai Super Kings',
            'bowling_team': 'Punjab Kings',
            'city': 'Chennai',
            'runs_left': 92,
            'balls_left': 74,
            'wickets_remaining': 8,
            'total_run_x': 157,
            'crr': 8.48,
            'rrr': 7.46,
        }
    ]
)

In [49]:
# Per-class probabilities for the single hand-built row; columns are ordered
# as in `ra_pipe.classes_`.
# NOTE(review): this row was copied from x_train, so the result shows what the
# model learned for a seen sample, not out-of-sample behaviour.
ra_pipe.predict_proba(data)

array([[0.97, 0.03]])

In [54]:
# NOTE(review): scratch arithmetic; the result is not assigned or used by any
# later cell shown here. Explain its purpose or remove the cell.
65/7

9.285714285714286

In [55]:
# Persist the fitted pipeline (encoder + classifier) to 'model.pkl'.
# The context manager guarantees the file is flushed and closed even if
# pickling raises; the bare open() previously left the handle unclosed.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(ra_pipe, model_file)