In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

In [2]:

# Step 2: Load Data
try:
    match = pd.read_csv('matches_updated.csv')
    delivery = pd.read_csv('modified_deliveries.csv')
except FileNotFoundError as e:
    raise Exception(f"Error: {e}")

In [3]:
# Step 3: Preprocessing
# Drop columns that are not useful
columns_to_drop = ['match_id', 'date', 'player_of_match', 'venue', 'umpire1', 'umpire2', 'method']
match = match.drop(columns=columns_to_drop, axis=1)

In [4]:
# Handle `result_margin`
match['result_margin_value'] = pd.to_numeric(match['result_margin'], errors='coerce')  # Extract numeric part
match['result_margin_type'] = match['result_margin'].str.extract(r'(\D+)', expand=False).fillna('runs')  # Extract type
match.drop('result_margin', axis=1, inplace=True)

In [5]:
# Convert numerical columns to numeric and detect invalid values
numerical_features = ['result_margin_value', 'target_runs', 'target_overs']
for col in numerical_features:
    match[col] = pd.to_numeric(match[col], errors='coerce')
    print(f"After conversion, {col} has {match[col].isnull().sum()} NaN values.")
    

After conversion, result_margin_value has 17 NaN values.
After conversion, target_runs has 3 NaN values.
After conversion, target_overs has 3 NaN values.


In [6]:
# Inspect categorical features
categorical_features = ['season', 'city', 'match_type', 'team1', 'team2', 'toss_winner', 'toss_decision', 'result_margin_type']
print("\nCategorical columns:")
for col in categorical_features:
    print(f"{col} unique values: {match[col].nunique()}")


Categorical columns:
season unique values: 17
city unique values: 37
match_type unique values: 8
team1 unique values: 19
team2 unique values: 19
toss_winner unique values: 19
toss_decision unique values: 2
result_margin_type unique values: 2


In [7]:
# Check target variable
print("\nTarget variable (winner) unique values:", match['winner'].nunique())



Target variable (winner) unique values: 20


In [8]:
# Split features and target
X = match.drop('winner', axis=1)
y = match['winner']

In [9]:
# Step 4: Preprocessing Pipeline
numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))  # Impute missing numerical values
])

categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing categorical values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_preprocessor, numerical_features),
        ('cat', categorical_preprocessor, categorical_features)
    ]
)


In [10]:
# Step 5: Model Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])


In [11]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
# Check for non-numeric values in numerical columns
for col in numerical_features:
    invalid_entries = match[pd.to_numeric(match[col], errors='coerce').isnull() & match[col].notnull()]
    if not invalid_entries.empty:
        print(f"Invalid values in column {col}:\n{invalid_entries}")


In [15]:
# GridSearchCV for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

try:
    grid_search.fit(X_train, y_train)
except ValueError as e:
    print(f"Error during GridSearchCV fitting: {e}")
    print("Check for NaN or unexpected values in your dataset.")


TypeError: '<' not supported between instances of 'str' and 'float'