In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [18]:
pip install openpyxl


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\user\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [19]:
# Step 1: Load Excel data
file_path = "Sports Data.xlsx"  # Ensure the file is in the same folder
xls = pd.ExcelFile(file_path)

In [20]:
# Step 2: Load the main data sheet
df = xls.parse("Sports data for DSBA")

In [21]:
# Step 3: Convert wrongly typed columns to numeric
df['Players_scored_zero'] = pd.to_numeric(df['Players_scored_zero'], errors='coerce')
df['player_highest_wicket'] = pd.to_numeric(df['player_highest_wicket'], errors='coerce')

In [22]:
# Step 4: Fill missing values
# Fill numeric columns with mean
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())


In [23]:
# Fill categorical columns with mode
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].apply(lambda col: col.fillna(col.mode()[0]))


In [24]:
# Step 5: Encode categorical features
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [25]:
# Step 6: Prepare features and target
X = df.drop(['Game_number', 'Result'], axis=1)
# Save column order used in training
with open("feature_columns.txt", "w") as f:
    f.write("\n".join(X.columns))

y = df['Result']  # Encoded 1 = Win, 0 = Loss


In [26]:
# Step 7: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [27]:
# Step 8: Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [28]:
# Step 9: Evaluate model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [29]:
print(f"✅ Model Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

✅ Model Accuracy: 95.73%

Classification Report:

              precision    recall  f1-score   support

           0       1.00      0.76      0.86       104
           1       0.95      1.00      0.97       482

    accuracy                           0.96       586
   macro avg       0.98      0.88      0.92       586
weighted avg       0.96      0.96      0.95       586



In [30]:
import joblib

# Save the trained model to a file
joblib.dump(model, 'cricket_win_predictor.pkl')
print("✅ Model saved as 'cricket_win_predictor.pkl'")


✅ Model saved as 'cricket_win_predictor.pkl'


In [31]:
import joblib
import pandas as pd

# Load the saved model
model = joblib.load('cricket_win_predictor.pkl')

# Example: Load a new match data row (replace with actual data)
new_match_data = pd.DataFrame([{
    'Avg_team_Age': 26.5,
    'Match_light_type': 1,   # Use the same label encoding as original
    'Match_format': 2,       # e.g., 0 = ODI, 1 = Test, 2 = T20
    'Bowlers_in_team': 4,
    'Wicket_keeper_in_team': 1,
    'All_rounder_in_team': 3,
    'First_selection': 0,    # e.g., 0 = Batting, 1 = Bowling
    'Opponent': 3,
    'Season': 1,
    'Audience_number': 50000,
    'Offshore': 0,
    'Max_run_scored_1over': 16,
    'Max_wicket_taken_1over': 2,
    'Extra_bowls_bowled': 1,
    'Min_run_given_1over': 2,
    'Min_run_scored_1over': 4,
    'Max_run_given_1over': 10,
    'extra_bowls_opponent': 0,
    'player_highest_run': 85,
    'Players_scored_zero': 1,
    'player_highest_wicket': 3
}])

# Predict match outcome
prediction = model.predict(new_match_data)
print("Predicted Match Result:", "Win" if prediction[0] == 1 else "Loss")


Predicted Match Result: Win


In [32]:
import pandas as pd
import joblib

# Load trained model and feature column order
model = joblib.load("cricket_win_predictor.pkl")
with open("feature_columns.txt") as f:
    feature_order = f.read().splitlines()

# Mappings (same as before)
match_format_map = {"Test": 1, "T20": 2, "ODI": 0}
match_light_map = {"Day": 0, "Night": 1, "Day and Night": 2}
first_selection_map = {"Batting": 0, "Bowling": 1}
season_map = {"Rainy": 0, "Winter": 1}
opponent_map = {"England": 0, "Australia": 1, "Sri Lanka": 2}

# Base values
base_features = {
    'Avg_team_Age': 26.0,
    'Wicket_keeper_in_team': 1,
    'All_rounder_in_team': 3,
    'Bowlers_in_team': 3,
    'First_selection': first_selection_map['Bowling'],
    'Audience_number': 50000,
    'Max_run_scored_1over': 14,
    'Max_wicket_taken_1over': 3,
    'Extra_bowls_bowled': 1,
    'Min_run_given_1over': 2,
    'Min_run_scored_1over': 4,
    'Max_run_given_1over': 10,
    'extra_bowls_opponent': 0,
    'player_highest_run': 75,
    'Players_scored_zero': 2,
    'player_highest_wicket': 3
}

# Match configs
matches = [
    {"Match_format": "Test", "Opponent": "England", "Match_light_type": "Day", "Season": "Rainy", "Offshore": "Yes"},
    {"Match_format": "T20", "Opponent": "Australia", "Match_light_type": "Day and Night", "Season": "Winter", "Offshore": "No"},
    {"Match_format": "T20", "Opponent": "Australia", "Match_light_type": "Day and Night", "Season": "Winter", "Offshore": "No"},
    {"Match_format": "ODI", "Opponent": "Sri Lanka", "Match_light_type": "Day and Night", "Season": "Winter", "Offshore": "No"},
    {"Match_format": "ODI", "Opponent": "Sri Lanka", "Match_light_type": "Day and Night", "Season": "Winter", "Offshore": "No"},
]

# Predict
print("🏏 Match Predictions\n-------------------")
for i, match in enumerate(matches, 1):
    features = base_features.copy()
    features.update({
        'Match_format': match_format_map[match['Match_format']],
        'Opponent': opponent_map[match['Opponent']],
        'Match_light_type': match_light_map[match['Match_light_type']],
        'Season': season_map[match['Season']],
        'Offshore': 1 if match['Offshore'] == "Yes" else 0
    })

    # Ensure correct order of features
    match_df = pd.DataFrame([features])[feature_order]

    # Predict
    prediction = model.predict(match_df)[0]
    result = "✅ WIN" if prediction == 1 else "❌ LOSS"
    print(f"Match {i}: vs {match['Opponent']} ({match['Match_format']}) -> {result}")


🏏 Match Predictions
-------------------


ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.
