In [1]:
import os
import json
import pandas as pd


# NOTE(review): `data_folder` and `matches` are not referenced by any later
# cell visible in this notebook — presumably leftovers from an earlier
# JSON-parsing step; confirm before removing.
data_folder = r"C:\Users\hp\Desktop\all_male_json"

matches = []

# Load the match-results table from the current working directory; every
# later cell cleans, filters, or models this frame.
df=pd.read_csv("match_results_dataset.csv")

In [2]:
# Drop rows containing any missing value first (this includes rows whose only
# gap is in `city`), then discard the `city` column itself.
# Rebinding `df` instead of mutating it in place, together with
# errors="ignore", makes the cell safe to re-run: the in-place version raised
# KeyError on a second execution because `city` had already been removed.
df = df.dropna().drop(columns="city", errors="ignore")

In [3]:
# Normalise team names so stray whitespace cannot break the membership test below.
df['team1'] = df['team1'].astype(str).str.strip()
df['team2'] = df['team2'].astype(str).str.strip()

# Teams to keep. Spellings must match the dataset's values exactly; a team with
# no qualifying matches simply will not appear in the printed list.
selected_teams = [
    "India", "Australia", "England", "New Zealand", "Pakistan",
    "South Africa", "Sri Lanka", "Bangladesh", "Afghanistan"
]

# Keep only matches where BOTH sides are in the selected list, and renumber
# the rows 0..n-1 so the index is contiguous.
both_selected = df['team1'].isin(selected_teams) & df['team2'].isin(selected_teams)
filtered_df = df[both_selected].reset_index(drop=True)

# Sanity check: the teams that actually survived the filter.
print("Filtered team list:")
print(sorted(set(filtered_df['team1']) | set(filtered_df['team2'])))

# index must be the boolean False: the string "False" is truthy, so the
# previous call still wrote the row index as an extra unnamed column.
filtered_df.to_csv("ven.csv", index=False)


Filtered team list:
['Australia', 'Bangladesh', 'England', 'India', 'New Zealand', 'Pakistan', 'South Africa', 'Sri Lanka']


In [4]:
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

**Encoding**

In [5]:
# Categorical columns to label-encode (the five features plus the `winner`
# target), and a registry that will hold one fitted encoder per column.
columns = [
    'team1',
    'team2',
    'venue',
    'toss_winner',
    'toss_decision',
    'winner',
]
encoders = dict()

In [6]:
# Fit an independent LabelEncoder per column, overwrite the column with its
# integer codes, and keep the fitted encoder so values can be encoded/decoded
# later. Note this mutates `filtered_df` in place.
for name in columns:
    encoder = LabelEncoder()
    encoder.fit(filtered_df[name])
    filtered_df[name] = encoder.transform(filtered_df[name])
    encoders[name] = encoder

In [7]:
# Inspect the frame after encoding: each column listed in `columns` now holds
# integer codes instead of strings.
filtered_df

Unnamed: 0,team1,team2,venue,toss_winner,toss_decision,winner
0,0,6,222,6,0,6
1,0,6,18,6,1,6
2,0,5,25,0,0,0
3,0,5,25,0,0,0
4,0,5,222,0,1,0
...,...,...,...,...,...,...
2092,1,3,181,3,1,3
2093,7,0,186,7,0,7
2094,7,0,149,0,1,0
2095,7,0,149,7,0,7


**Target and features**

In [8]:
# Separate the encoded target from the predictors: `y` is the winner column,
# `x` is every other column of the filtered frame.
target_column = "winner"
x = filtered_df.drop(columns=[target_column])
y = filtered_df[target_column]

**Train test split**

In [9]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the same rows land in each split on every run. No `stratify` is passed, so
# class proportions may differ between the two splits.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

**Model train**

In [10]:
# 100-tree random forest. random_state is fixed so that the randomness inside
# the forest is repeatable; without it, each run trains a different model and
# reports a different accuracy from the same train/test split.
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [11]:
# Train the classifier on the training split only.
model.fit(x_train,y_train)

**test**

In [12]:
# Predict encoded winner labels for the held-out rows.
y_pred=model.predict(x_test)

In [13]:
# Fraction of held-out matches whose predicted winner equals the true winner.
acc=accuracy_score(y_test,y_pred)

In [14]:
# Display the accuracy as a fraction in [0, 1].
acc

0.5785714285714286

In [15]:
# Same accuracy expressed as a percentage.
acc*100

57.85714285714286

**Saving model**

In [16]:
from joblib import dump, load

# Persist the fitted model together with its label encoders in one file, so a
# consumer can encode raw strings and decode predictions with the exact
# mappings used during training.
artifact = {"model": model, "encoders": encoders}
dump(artifact, r"C:\Users\hp\match_prediction.joblib")

['C:\\Users\\hp\\match_prediction.joblib']

In [17]:
# Raw (un-encoded) description of the match to predict.
test_input = {
    "team1": "India",
    "team2": "England",
    "venue": "Edgbaston",
    "toss_winner": "England",
    "toss_decision": "bat"
}

# Feature columns in the exact order the model was trained on. Building the
# input from this list (rather than from the dict's own key order) guarantees
# the frame handed to the model lines up with the training columns.
feature_order = list(x.columns)

# Fail early, with a readable message, unless the input carries exactly the
# training features.
missing = [col for col in feature_order if col not in test_input]
unexpected = [col for col in test_input if col not in feature_order]
if missing or unexpected:
    raise ValueError(
        f"test_input must contain exactly {feature_order}; "
        f"missing={missing}, unexpected={unexpected}"
    )

# Check if all values exist in encoders (LabelEncoder cannot transform a
# label it never saw during fitting).
for col in feature_order:
    if test_input[col] not in encoders[col].classes_:
        raise ValueError(f"{test_input[col]} not in encoder for {col}")

# Encode each raw value with the encoder fitted for its column.
encoded_input = {col: encoders[col].transform([test_input[col]])[0] for col in feature_order}
input_df = pd.DataFrame([encoded_input], columns=feature_order)

# Predict, then map the integer class back to the team name.
winner_encoded = model.predict(input_df)[0]
winner = encoders["winner"].inverse_transform([winner_encoded])[0]
print("🏏 Predicted Winner:", winner)

🏏 Predicted Winner: India
