In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

#**matches.csv**

##Data Understanding

In [None]:
matches=pd.read_csv('/content/matches.csv')

In [None]:
matches

In [None]:
matches.head()

In [None]:
matches.tail()

In [None]:
matches.info()

In [None]:
matches.shape

In [None]:
matches.dtypes

In [None]:
matches.isna()

In [None]:
matches.isna().sum()

In [None]:
matches['city'] = matches['city'].fillna('Unknown')
matches['result_margin'] = matches['result_margin'].fillna(0)
matches['target_runs'] = matches['target_runs'].fillna(0)
matches['target_overs'] = matches['target_overs'].fillna(0)
matches['player_of_match'] = matches['player_of_match'].fillna('None')
matches['winner']=matches['winner'].fillna('Draw')
matches['method']=matches['method'].fillna('None')

In [None]:
matches.isna().sum()

In [None]:
matches.describe()

In [None]:
matches.duplicated().sum()

In [None]:
matches.nunique()

In [None]:
# Number of matches per season. Series.value_counts() replaces the deprecated
# top-level pd.value_counts() helper and returns the same counts.
matches['season'].value_counts()

##EDA

In [None]:
categorical_cols = matches.select_dtypes(include='object').columns
for col in categorical_cols:
    print(f"\n🔹 Value Counts: {col}")
    print(matches[col].value_counts())

In [None]:
# Histograms for numerical features
matches.hist(bins=30, figsize=(18, 15), color='skyblue')
plt.suptitle("Histograms of Numerical Features", fontsize=16)
plt.show()

In [None]:
# Bar plots for categorical variables
for col in ['match_type', 'toss_decision', 'result', 'super_over']:
    sns.countplot(x=col, data=matches)
    plt.title(f"Count Plot of {col}")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Correlation matrix
corr_matrix = matches.select_dtypes(include=['float64', 'int64']).corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap of Numerical Features")
plt.show()

##Handling Outliers

In [None]:
numeric_cols = matches.select_dtypes(include='number').columns

# Plot boxplots for each numerical column in a grid
n_cols = 3
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols*5, n_rows*4))

for i, col in enumerate(numeric_cols, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.boxplot(y=matches[col])
    plt.title(col)

plt.tight_layout()
plt.show()

In [None]:
def iqr_trim(s, k=1.5):
    """Repeatedly drop values outside the IQR fences until none are left.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to trim.
    k : float, default 1.5
        Fence multiplier: values outside [Q1 - k*IQR, Q3 + k*IQR] are dropped.

    Returns
    -------
    pandas.Series
        The surviving values (NaNs removed), keeping their original index labels.
    """
    current = s
    while True:
        lower_q, upper_q = current.quantile([0.25, 0.75])
        spread = upper_q - lower_q
        low_fence = lower_q - k * spread
        high_fence = upper_q + k * spread
        # Out-of-fence values become NaN; values already NaN stay NaN.
        masked = current.where(current.between(low_fence, high_fence))
        survivors = masked.dropna()
        if masked.equals(current):
            # A full pass removed nothing, so the series is stable.
            return survivors
        current = survivors


In [None]:
ID = matches['id']
ID = pd.DataFrame(ID)
ID

In [None]:
matches.drop(['id'], axis=1, inplace=True)
matches

##Encoding

In [None]:
matches.head()

###Frequency Encoding

In [None]:
# Frequency encoding: each high-cardinality categorical value is replaced by the
# number of rows in which it occurs. The value -> count dictionaries are kept in
# the *_freq variables.
# NOTE(review): the counts are computed over the whole frame, before the
# train/test split performed further down, so test rows influence the encoding.
# NOTE(review): this cell is not idempotent; running it twice maps the counts
# themselves rather than the original names.
city_freq = matches['city'].value_counts().to_dict()
matches['city'] = matches['city'].map(city_freq)
player_of_match_freq = matches['player_of_match'].value_counts().to_dict()
matches['player_of_match'] = matches['player_of_match'].map(player_of_match_freq)
umpire1_freq = matches['umpire1'].value_counts().to_dict()
matches['umpire1'] = matches['umpire1'].map(umpire1_freq)
umpire2_freq = matches['umpire2'].value_counts().to_dict()
matches['umpire2'] = matches['umpire2'].map(umpire2_freq)
venue_freq = matches['venue'].value_counts().to_dict()
matches['venue'] = matches['venue'].map(venue_freq)
date_freq = matches['date'].value_counts().to_dict()
matches['date'] = matches['date'].map(date_freq)

In [None]:
matches.head()

###Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(matches['winner'])


In [None]:
# Integer-encode the low-cardinality categorical columns.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# integer codes (in particular those of 'winner') can be mapped back to their
# original names later. Re-fitting one shared encoder would keep only the
# classes of the last column.
label_encoders = {}
for col in ['team1', 'team2', 'toss_winner', 'match_type', 'winner', 'result']:
    label_encoders[col] = LabelEncoder()
    matches[col] = label_encoders[col].fit_transform(matches[col])

In [None]:
matches.head()

###One Hot Encoding

In [None]:
matches=pd.get_dummies(matches,dtype=int)

In [None]:
matches.head()

##Scaling

In [None]:
matches.shape

In [None]:
matches.corr()

In [None]:
matches.describe()

In [None]:
matches.nunique()

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

In [None]:
# Columns rescaled to [0, 1].
scaled_cols = ['season', 'city', 'match_type', 'player_of_match', 'venue',
               'team1', 'team2', 'toss_winner', 'result_margin', 'target_runs',
               'target_overs', 'umpire1', 'umpire2']

# Fit the scaler once on all columns. MinMaxScaler scales every column
# independently, so the values equal those of thirteen per-column fits, but the
# fitted `scaler` now retains the min/max of every column (in `scaled_cols`
# order) instead of only the last one, which makes it reusable at inference.
# NOTE(review): the scaler is fitted before the train/test split below, so test
# rows contribute to the min/max.
scaled_values = scaler.fit_transform(matches[scaled_cols])
for position, col in enumerate(scaled_cols):
    matches[col] = scaled_values[:, position]

In [None]:
matches.describe()

In [None]:
matches.head()

##Train Test Split

In [None]:
# Model inputs (X) and target (y) for the match-winner classifiers.
# NOTE(review): 'player_of_match' is presumably only known once a match has
# finished; if so it leaks the outcome into the features. Confirm whether it
# should be removed.
features = ['season', 'city', 'match_type', 'player_of_match', 'venue',
            'team1', 'team2', 'toss_winner', 'toss_decision_bat','toss_decision_field', 'umpire1', 'umpire2']
X = matches[features]
# Integer-encoded winner column produced by the label-encoding step above.
y = matches['winner']

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


###Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.metrics import classification_report, confusion_matrix
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)
results = []

results.append(("Logistic Regression", acc_lr, None, lr))
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\n=== Classification Report (Logistic Regression) ===")
print(classification_report(y_test, y_pred_lr))


###Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train and evaluate a Random Forest on the match features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict once and reuse the predictions for every metric.
y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)

# Register the model like every other classifier so it appears in the
# accuracy comparison table.
results.append(("Random Forest", acc_rf, None, rf))

print("\n=== Random Forest ===")
print("Accuracy:", acc_rf)
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))



###SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train SVM model
svm = SVC()
svm.fit(X_train, y_train)

# Predict
y_pred_svm = svm.predict(X_test)

# Accuracy
acc_svm = accuracy_score(y_test, y_pred_svm)
results.append(("SVM", acc_svm, None, svm))

# Output
print("\n=== Support Vector Machine ===")
print("Accuracy:", acc_svm)
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))


###KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)
results.append(("KNN", acc_knn, None, knn))
print("\n=== K-Nearest Neighbors ===")
print("Accuracy:", acc_knn)
print("Classification Report:")
print(classification_report(y_test, y_pred_knn))


###Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train Naive Bayes model
nb = GaussianNB()
nb.fit(X_train, y_train)

# Predict
y_pred_nb = nb.predict(X_test)

# Accuracy
acc_nb = accuracy_score(y_test, y_pred_nb)
results.append(("Naive Bayes", acc_nb, None, nb))

# Output
print("\n=== Naive Bayes ===")
print("Accuracy:", acc_nb)
print("Classification Report:")
print(classification_report(y_test, y_pred_nb))


###XGBoost Classifier

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train the XGBoost model
model = XGBClassifier()
model.fit(X_train, y_train)

# Predict
y_pred_gb = model.predict(X_test)

# Accuracy
acc_gb = accuracy_score(y_test, y_pred_gb)
results.append(("XGBoost", acc_gb, None, model))

# Output (the header names the model actually trained in this cell)
print("\n=== XGBoost ===")
print("Accuracy:", acc_gb)
print("Classification Report:")
print(classification_report(y_test, y_pred_gb))

###Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train Decision Tree model
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Predict
y_pred_dt = dt.predict(X_test)

# Accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
results.append(("Decision Tree", acc_dt, None, dt))

# Output
print("\n=== Decision Tree ===")
print("Accuracy:", acc_dt)
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))



###Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train Gradient Boosting model
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbc.fit(X_train, y_train)

# Predict
y_pred_gbc = gbc.predict(X_test)

# Accuracy
acc_gbc = accuracy_score(y_test, y_pred_gbc)
results.append(("Gradient Boosting", acc_gbc, None, gbc))

# Output
print("\n=== Gradient Boosting Classifier ===")
print("Accuracy:", acc_gbc)
print("Classification Report:")
print(classification_report(y_test, y_pred_gbc))



In [None]:
# Collapse `results` to one entry per model name; when a training cell has been
# run more than once, the most recent entry wins.
unique_results = {}
# The loop variable is deliberately not called `model`: that global holds the
# fitted XGBoost classifier and must not be rebound here.
for name, acc, _, fitted_model in results:
    unique_results[name] = (acc, fitted_model)
print("=== Model Accuracy Comparison ===")
print(f"{'Model':<20} {'Accuracy':<10}")
for name, (acc, _) in unique_results.items():
    print(f"{name:<20} {acc:<10.4f}")


##Data Visualization

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# `model` is a generic global that other cells rebind, so it may no longer be
# the XGBoost classifier at this point. Fetch the fitted estimator by name from
# the comparison table instead.
models = {
    "XGBoost": unique_results["XGBoost"][1],
    "Gradient Boosting": gbc
}

xgb_pred = models["XGBoost"].predict(X_test)
gb_pred = models["Gradient Boosting"].predict(X_test)

# Side-by-side confusion matrices on the held-out test split.
fig, axs = plt.subplots(1, 2, figsize=(12, 5))

ConfusionMatrixDisplay.from_predictions(y_test, xgb_pred, ax=axs[0], cmap='Blues')
axs[0].set_title("XGBoost Confusion Matrix")

ConfusionMatrixDisplay.from_predictions(y_test, gb_pred, ax=axs[1], cmap='Greens')
axs[1].set_title("Gradient Boosting Confusion Matrix")

plt.tight_layout()
plt.show()


In [None]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
model = XGBClassifier(eval_metric='mlogloss', random_state=42)
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", scores)
print("Mean Accuracy:", np.mean(scores))
print("Standard Deviation:", np.std(scores))


##HPT

In [None]:
# Randomized hyperparameter search for the XGBoost match-winner model.
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import joblib
import warnings
warnings.filterwarnings("ignore")

# Step 1: base estimator, kept lightweight.
# - 'mlogloss' matches the multi-class winner target and the metric used by the
#   cross-validation cell.
# - The deprecated `use_label_encoder` argument is no longer passed.
xgb = XGBClassifier(
    eval_metric='mlogloss',
    tree_method='hist',     # Fast and memory-efficient
    n_jobs=1,               # Avoid using all CPU cores
    random_state=42
)

# Step 2: search space.
param_dist = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.2],
    'reg_alpha': [0, 1],
    'reg_lambda': [0, 1],
}

# Step 3: randomized search with few iterations and folds to stay cheap.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=10,             # Fewer combinations = faster
    scoring='accuracy',
    cv=2,                  # Fewer fits than 5-fold
    verbose=1,
    random_state=42,
    n_jobs=1               # Single-threaded to avoid memory spikes
)

# Step 4: search on the training split. The 10% subsample that used to be drawn
# here was never passed to fit(), so it has been removed.
random_search.fit(X_train, y_train)

# Step 5: report the best configuration and its cross-validated score.
print("✅ Best Parameters:", random_search.best_params_)
print("✅ Best CV Accuracy (training split):", random_search.best_score_)

# Step 6: refit the best configuration on all rows for the saved model.
# Because this includes the test rows, the saved model must not be scored on
# X_test afterwards.
best_model = random_search.best_estimator_
best_model.fit(X, y)

# Step 7: persist the final model.
joblib.dump(best_model, "xgb_best_model.pkl")
print("✅ Model saved as 'xgb_best_model.pkl'")


##Prediction

In [None]:
# Lookup tables that turn a raw name into the value the model saw in training.
# They are rebuilt from the raw CSV because by now the `matches` columns hold
# encoded and scaled numbers, so tables built from them would be keyed by floats
# and never match a city, venue or player name.
raw_matches = pd.read_csv('/content/matches.csv')
raw_matches['city'] = raw_matches['city'].fillna('Unknown')
raw_matches['player_of_match'] = raw_matches['player_of_match'].fillna('None')


def _scaled_frequency_map(column):
    """Map each raw value to its min-max scaled occurrence count.

    This mirrors training, where a column was replaced by its value counts and
    those counts were then min-max scaled to [0, 1].
    """
    counts = column.value_counts()
    span = counts.max() - counts.min()
    if span == 0:
        # A constant column is scaled to 0 by MinMaxScaler.
        return {name: 0.0 for name in counts.index}
    return ((counts - counts.min()) / span).to_dict()


city_freq = _scaled_frequency_map(raw_matches['city'])
player_of_match_freq = _scaled_frequency_map(raw_matches['player_of_match'])
umpire1_freq = _scaled_frequency_map(raw_matches['umpire1'])
umpire2_freq = _scaled_frequency_map(raw_matches['umpire2'])
venue_freq = _scaled_frequency_map(raw_matches['venue'])
# 'date' was frequency-encoded but never scaled in training, so keep raw counts.
date_freq = raw_matches['date'].value_counts().to_dict()


In [None]:
import joblib

joblib.dump(city_freq, "city_freq.pkl")
joblib.dump(player_of_match_freq, "player_of_match_freq.pkl")
joblib.dump(umpire1_freq, "umpire1_freq.pkl")
joblib.dump(umpire2_freq, "umpire2_freq.pkl")
joblib.dump(venue_freq, "venue_freq.pkl")
joblib.dump(date_freq, "date_freq.pkl")


In [None]:
city_freq = joblib.load("city_freq.pkl")
player_of_match_freq = joblib.load("player_of_match_freq.pkl")
umpire1_freq = joblib.load("umpire1_freq.pkl")
umpire2_freq = joblib.load("umpire2_freq.pkl")
venue_freq = joblib.load("venue_freq.pkl")
date_freq = joblib.load("date_freq.pkl")


In [None]:
import joblib

best_xgb_model = joblib.load("xgb_best_model.pkl")


In [None]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Fit the winner encoder on the original team names. By this point `y` holds
# integer codes, and an encoder fitted on those would only translate integers
# to integers. LabelEncoder orders its classes by sorting, so fitting on the raw
# names (with the same 'Draw' fill used during cleaning) reproduces the codes
# the models were trained on.
winner_names = pd.read_csv('/content/matches.csv')['winner'].fillna('Draw')
winner_encoder = LabelEncoder()
y_encoded = winner_encoder.fit_transform(winner_names)
joblib.dump(winner_encoder, 'winner_encoder.pkl')


In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder

# Predict the winner of a single, hand-specified match.
#
# The model was trained on encoded and scaled columns, so the new match must go
# through the same transformations. The fitted encoders and scalers were not
# kept during training, so they are rebuilt here from the raw CSV:
#   - frequency columns: value -> occurrence count -> min-max scaled
#   - label columns:     value -> position among sorted classes -> min-max scaled
#   - season:            min-max scaled
raw_matches = pd.read_csv('/content/matches.csv')
raw_matches['city'] = raw_matches['city'].fillna('Unknown')
raw_matches['player_of_match'] = raw_matches['player_of_match'].fillna('None')
raw_matches['winner'] = raw_matches['winner'].fillna('Draw')

# Fitted on team names so predictions decode to names. LabelEncoder sorts its
# classes, so the codes match those used in training.
winner_encoder = LabelEncoder().fit(raw_matches['winner'])


def _min_max(value, low, high):
    """Scale `value` to [0, 1] as MinMaxScaler does for a column spanning [low, high]."""
    return 0.0 if high == low else (value - low) / (high - low)


def _encode_frequency(column, value):
    """Return the scaled occurrence count of `value`, or 0.0 when it is unseen."""
    counts = raw_matches[column].value_counts()
    if value not in counts.index:
        return 0.0
    return _min_max(counts[value], counts.min(), counts.max())


def _encode_label(column, value):
    """Return the scaled label code of `value`; warn and return 0.0 when it is unseen."""
    classes = sorted(raw_matches[column].unique())
    if value not in classes:
        print(f"Warning: {value!r} does not occur in training column {column!r}; encoding it as 0.")
        return 0.0
    return _min_max(classes.index(value), 0, len(classes) - 1)


new_match = {
    'season': 2023,
    'city': 'Delhi',
    'date': '2023-04-15',
    'match_type': 'T20',
    'player_of_match': 'David Warner',
    'venue': 'Arun Jaitley Stadium',
    'team2': 'Delhi Capitals',
    'team1': 'Mumbai Indians',
    'toss_winner': 'Mumbai Indians',
    'result': 'normal',
    'result_margin': 5,
    'target_runs': 175,
    'target_overs': 20,
    'umpire1': 'Chris Gaffaney',
    'umpire2': 'Nitin Menon',
    'toss_decision_bat': 0,
    'toss_decision_field': 1,
    'super_over_N': 1,
    'super_over_Y': 0,
    'method_D/L': 0,
    'method_None': 1
}

# Apply the training-time encodings to the model inputs.
encoded_match = dict(new_match)
encoded_match['season'] = _min_max(
    new_match['season'], raw_matches['season'].min(), raw_matches['season'].max()
)
for col in ['city', 'player_of_match', 'venue', 'umpire1', 'umpire2']:
    encoded_match[col] = _encode_frequency(col, new_match[col])
for col in ['match_type', 'team1', 'team2', 'toss_winner']:
    encoded_match[col] = _encode_label(col, new_match[col])

new_df = pd.DataFrame([encoded_match])

# Keep exactly the training columns, in training order; any column the match
# does not provide is filled with 0.
new_df_encoded = new_df.reindex(columns=X.columns, fill_value=0).astype(float)

y_pred = best_xgb_model.predict(new_df_encoded)
probs = best_xgb_model.predict_proba(new_df_encoded)[0]
predicted_team = winner_encoder.inverse_transform(y_pred)[0]

team1 = new_match['team1']
team2 = new_match['team2']
label_classes = list(winner_encoder.classes_)

if predicted_team == team1 or predicted_team == team2:
    winning_team = predicted_team
elif team1 in label_classes and team2 in label_classes:
    # The model chose a team that is not playing, so compare only the
    # probabilities of the two actual participants.
    team1_index = label_classes.index(team1)
    team2_index = label_classes.index(team2)
    winning_team = team1 if probs[team1_index] > probs[team2_index] else team2
else:
    # At least one participant never appears as a winner in the data; the
    # original fallback of reporting team2 is kept.
    winning_team = team2

print("Winning Team:", winning_team)


##Deep Learning

In [None]:
!pip install tensorflow


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# 1. Load the data
matches = pd.read_csv("matches.csv")

# 2. Drop rows with missing values in required columns
matches.dropna(subset=['team1', 'team2', 'toss_winner', 'toss_decision',
                       'winner', 'venue', 'season', 'city'], inplace=True)

# 3. Group rare teams as "Other"
win_counts = matches['winner'].value_counts()
rare_teams = win_counts[win_counts < 20].index
matches['winner_grouped'] = matches['winner'].apply(lambda x: 'Other' if x in rare_teams else x)

# 4. Define features and target
features = ['team1', 'team2', 'toss_winner', 'toss_decision', 'venue', 'season', 'city']
X = matches[features]
y = matches['winner_grouped']

# 5. Encode target
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)

# 6. One-hot encode categorical features
column_transformer = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), features)
], sparse_threshold=0)

X_encoded = column_transformer.fit_transform(X)

# 7. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# 8. Compute class weights
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(class_weights))

# 9. Build the model
model = Sequential([
    Dense(256, input_shape=(X_encoded.shape[1],), activation='relu'),
    Dropout(0.4),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer=Adam(0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# 10. Train the model
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    class_weight=class_weights,
    verbose=1
)

# 11. Evaluate on test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\n✅ Final Test Accuracy: {accuracy:.4f} - Loss: {loss:.4f}")

# 12. Classification report
y_pred = np.argmax(model.predict(X_test), axis=1)
print("\n📋 Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# 13. Confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
disp.plot(xticks_rotation='vertical', cmap='Blues')
plt.title("Confusion Matrix: Match Winner Prediction")
plt.tight_layout()
plt.show()


In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"✅ Matches.csv - Test Accuracy: {accuracy:.2f}")

#**deliveries.csv**

##Data Understanding

In [None]:
deliveries=pd.read_csv('/content/deliveries.csv')

In [None]:
deliveries

In [None]:
deliveries.head()

In [None]:
deliveries.tail()

In [None]:
deliveries.info()

In [None]:
deliveries.shape

In [None]:
deliveries.dtypes

In [None]:
deliveries.isna().sum()

In [None]:
deliveries.duplicated().sum()

In [None]:
deliveries.describe()

In [None]:
deliveries.nunique()

In [None]:
columns_to_drop = [
    'Unnamed: 0', 'Match ID', 'Date', 'Venue',
    'Bat First', 'Bat Second', 'Innings', 'Over', 'Ball', 'Winner', 'Chased Successfully',
    'Non Striker', 'Total Non Striker Runs', 'Non Striker Balls Faced',
    'Extra Runs', 'Extra Type', 'Ball Rebowled', 'Runs From Ball',
    'Player Out', 'Method', 'Player Out Balls Faced', 'Player Out Runs',
    'Innings Runs', 'Innings Wickets', 'Target Score', 'Runs to Get', 'Balls Remaining'
]
deliveries = deliveries.drop(columns=columns_to_drop)

In [None]:
print(deliveries.columns.tolist())

In [None]:
deliveries.info()

In [None]:

# Per-batter totals: sum of the two batting columns over all of a batter's rows.
# NOTE(review): if 'Total Batter Runs' / 'Batter Balls Faced' are running totals
# within an innings rather than per-ball values, summing them overcounts.
# Confirm against the dataset description.
batter_stats = deliveries.groupby('Batter').agg({
    'Total Batter Runs': 'sum',
    'Batter Balls Faced': 'sum'
}).reset_index()
# Runs per 100 balls. A batter with zero balls faced yields inf or NaN here.
batter_stats['Strike Rate'] = (batter_stats['Total Batter Runs'] / batter_stats['Batter Balls Faced']) * 100
batter_stats = batter_stats.rename(columns={'Batter': 'Player'})
# Balls bowled = number of delivery rows per bowler.
# NOTE(review): this counts every row; if the data contains re-bowled deliveries
# (a 'Ball Rebowled' column is dropped earlier), overs may be overstated. Confirm.
balls_bowled_df = deliveries.groupby('Bowler').size().reset_index(name='Balls Bowled')
balls_bowled_df['Overs Bowled'] = balls_bowled_df['Balls Bowled'] / 6
balls_bowled_df = balls_bowled_df.rename(columns={'Bowler': 'Player'})
# Per-bowler totals of wickets and runs conceded.
bowler_stats = deliveries.groupby('Bowler').agg({
    'Wicket': 'sum',
    'Bowler Runs Conceded': 'sum'
}).reset_index().rename(columns={'Bowler': 'Player'})
bowler_stats = pd.merge(bowler_stats, balls_bowled_df, on='Player', how='left')
# Runs conceded per over bowled.
bowler_stats['Economy Rate'] = bowler_stats['Bowler Runs Conceded'] / bowler_stats['Overs Bowled']
# Outer merge keeps players who only batted or only bowled; their missing side
# is NaN.
player = pd.merge(batter_stats, bowler_stats, on='Player', how='outer')




In [None]:
player.info()

In [None]:
def classify_balanced_v2(row):
    """Assign a performance category to one player row.

    `row` is indexed by column name and provides 'Total Batter Runs',
    'Strike Rate', 'Balls Bowled', 'Wicket' and 'Economy Rate'.

    The first matching rule wins, in this order: 'Good Batter', 'Good Bowler',
    'All-Rounder', 'Average Batter', 'Average Bowler', 'Low Performer'.
    """

    def batting_at_least(min_runs, min_strike_rate):
        return row['Total Batter Runs'] >= min_runs and row['Strike Rate'] >= min_strike_rate

    def bowling_at_least(min_wickets, max_economy):
        return row['Wicket'] >= min_wickets and row['Economy Rate'] <= max_economy

    # Batting tiers
    strong_bat = batting_at_least(500, 115)
    decent_bat = batting_at_least(200, 105)

    # Bowling tiers only apply to players with a minimum bowling workload.
    qualifies_as_bowler = row['Balls Bowled'] >= 150 and row['Wicket'] >= 3
    if qualifies_as_bowler:
        strong_bowl = bowling_at_least(10, 8.5)
        decent_bowl = bowling_at_least(5, 9.5)
    else:
        strong_bowl = False
        decent_bowl = False

    if strong_bat:
        return 'Good Batter'
    if strong_bowl:
        return 'Good Bowler'
    if decent_bat and decent_bowl:
        return 'All-Rounder'
    if decent_bat:
        return 'Average Batter'
    if decent_bowl:
        return 'Average Bowler'
    return 'Low Performer'

player['Category'] = player.apply(classify_balanced_v2, axis=1)
print(player['Category'].value_counts())
print(player['Category'].value_counts(normalize=True) * 100)



In [None]:
# Famous players to check
famous_players = ['V Kohli', 'MS Dhoni']

# Apply classification function if not already done
player['Category'] = player.apply(classify_balanced_v2, axis=1)

# Filter and display categories
for name in famous_players:
    result = player[player['Player'].str.lower() == name.lower()]
    if not result.empty:
        print(f"{name}: {result['Category'].values[0]}")
    else:
        print(f"{name}: Not found in dataset.")


In [None]:
print(player.columns.tolist())

In [None]:
player.isna().sum()

In [None]:
player = player.fillna(0)
player = player[~np.isinf(player.select_dtypes(include=[np.number])).any(axis=1)].copy()


In [None]:
player.isna().sum()

In [None]:
player.info()

##EDA

In [None]:
# Histograms for numerical features
numeric_cols = player.select_dtypes(include=np.number).columns
finite_cols = [col for col in numeric_cols if np.isfinite(player[col]).all()]
player[finite_cols].hist(bins=30, figsize=(18, 15), color='skyblue')
plt.suptitle("Histograms of Numerical Features", fontsize=16)
plt.tight_layout()
plt.show()

In [None]:

# Bar plots for categorical variables

top_batters = player.sort_values(by='Total Batter Runs', ascending=False).head(10)

plt.figure(figsize=(10, 5))
sns.barplot(x='Player', y='Total Batter Runs', data=top_batters)
plt.title("Top 10 Batters by Total Runs")
plt.xlabel("Player")
plt.ylabel("Total Runs")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



In [None]:
# Correlation matrix
corr_matrix = player.select_dtypes(include=['float64', 'int64']).corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5)
plt.title("Correlation Heatmap of Numerical Features in Deliveries Dataset")
plt.tight_layout()
plt.show()

##Handling Outliers

In [None]:

numeric_cols = player.select_dtypes(include='number').columns
n_cols = 4
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(n_cols * 5, n_rows * 4))

for i, col in enumerate(numeric_cols, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.boxplot(y=player[col], color='skyblue')
    plt.title(col)

plt.suptitle("Boxplots for Numerical Columns in Player Dataset", fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()


In [None]:
# NOTE(review): a function with this name and the same logic is already defined
# in the matches.csv part of this notebook; this definition shadows it. Neither
# appears to be called in the cells visible here. Consider keeping a single copy.
def iqr_trim(s, k=1.5):
    """Repeatedly remove values outside the IQR fences until none are removed.

    s: pandas Series of numeric values.
    k: fence multiplier; values outside [Q1 - k*IQR, Q3 + k*IQR] are removed.
    Returns the remaining values with NaNs dropped.
    """
    while True:
        q1, q3 = s.quantile([0.25, 0.75])
        iqr = q3 - q1
        lb, ub = q1 - k * iqr, q3 + k * iqr
        # Values outside the fences become NaN.
        new_s = s.where((s >= lb) & (s <= ub))
        # Series.equals treats NaNs in the same position as equal, so this is
        # True once a pass removes nothing.
        if new_s.equals(s):
            return new_s.dropna()
        s = new_s.dropna()


##Encoding

In [None]:
player.head()

In [None]:
categorical_cols = player.select_dtypes(include='object').columns
print(categorical_cols)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the named encoder, not a throw-away instance, so `le.classes_` and
# `le.inverse_transform` can translate Category_Label codes back to category
# names.
le = LabelEncoder()
player['Category_Label'] = le.fit_transform(player['Category'])





In [None]:
player.head()

In [None]:
player.describe()



##Scaling

In [None]:

from sklearn.preprocessing import StandardScaler
columns_to_scale = [
    'Total Batter Runs', 'Batter Balls Faced', 'Strike Rate',
    'Wicket', 'Bowler Runs Conceded', 'Balls Bowled',
    'Overs Bowled', 'Economy Rate'
]
# Standardise the numeric player statistics (zero mean, unit variance).
scaler = StandardScaler()
scaled_values = scaler.fit_transform(player[columns_to_scale])
# Reuse player's own index. Rows were filtered out earlier without resetting
# the index, and concat(axis=1) aligns on index labels, so a fresh 0..n-1 index
# would pair scaled values with the wrong players and introduce NaN rows.
scaled_df = pd.DataFrame(scaled_values, columns=columns_to_scale, index=player.index)
player_scaled = pd.concat([player[['Player']], scaled_df, player[['Category','Category_Label']]], axis=1)
player = player_scaled



In [None]:
player.describe()

In [None]:
player.head()

##Train Test Split

In [None]:

player = player.dropna()
x = player.drop(columns=['Player', 'Category', 'Category_Label'], errors='ignore')
y = player['Category_Label']
print("x shape:", x.shape)
print("y shape:", y.shape)


In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

###Logistic Regression

In [None]:

from sklearn.linear_model import LogisticRegression
log=LogisticRegression()

In [None]:
model_log=log.fit(x_train,y_train)
pred_log=model_log.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

accuracy = accuracy_score(y_test, pred_log)
precision = precision_score(y_test, pred_log, average='weighted')
recall = recall_score(y_test, pred_log, average='weighted')

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')


###KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
x_train_sample = x_train.copy()
y_train_sample = y_train.copy()

# Mean cross-validated accuracy for each candidate k.
# k is chosen using the training split only. Scoring candidates on the test set
# would leak test information into model selection and make the final test
# accuracy optimistic.
metric_k = []
neighbors = np.arange(3, 15)

for k in neighbors:
    k_model = KNeighborsClassifier(n_neighbors=k, metric='minkowski', n_jobs=-1)
    fold_scores = cross_val_score(
        k_model, x_train_sample, y_train_sample, cv=5, scoring='accuracy'
    )
    metric_k.append(fold_scores.mean())


In [None]:
plt.plot(neighbors,metric_k,'o-')
plt.xlabel('K value')
plt.ylabel('Accuracy')
plt.grid()

In [None]:
k_model=KNeighborsClassifier(n_neighbors=8,metric='minkowski')

In [None]:
 k_model.fit(x_train,y_train)
 y_pred=k_model.predict(x_test)
 acc=accuracy_score(y_test,y_pred)
 print("Accuracy",acc)

###SVM

In [None]:

from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

svm_model = LinearSVC(max_iter=1000, dual=False)
svm_model.fit(x_train, y_train)
y_pred = svm_model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)


###Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)
dt_pred = model.predict(x_test)
print("=== Decision Tree ===")
print(f"Accuracy: {accuracy_score(y_test, dt_pred):.2f}")
print(f"Precision (macro): {precision_score(y_test, dt_pred, average='macro'):.2f}")
print("Classification Report:\n", classification_report(y_test, dt_pred))


###Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest on the player features. The seed is fixed, like the other seeded
# models in this notebook, so the reported metrics are reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
rf_pred = model.predict(x_test)
y_proba = model.predict_proba(x_test)

print("=== Random Forest ===")
print("Accuracy:", accuracy_score(y_test, rf_pred))
print("Precision (macro):", precision_score(y_test, rf_pred, average='macro'))
print("Classification Report:\n", classification_report(y_test, rf_pred))

###Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(x_train, y_train)
nb_pred = model.predict(x_test)
y_proba = model.predict_proba(x_test)

print("=== Naive Bayes ===")
print("Accuracy:", accuracy_score(y_test, nb_pred))
print("Precision (macro):", precision_score(y_test, nb_pred, average='macro'))
print("Classification Report:\n", classification_report(y_test, nb_pred))

###XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, classification_report
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(x_train, y_train)
xgb_pred = xgb_model.predict(x_test)
y_proba_xgb = xgb_model.predict_proba(x_test)
print("=== XGBoost Classifier ===")
print("Accuracy:", accuracy_score(y_test, xgb_pred))
print("Precision (macro):", precision_score(y_test, xgb_pred, average='macro'))
print("Classification Report:\n", classification_report(y_test, xgb_pred))


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Candidate classifiers for the player-category task. Stochastic models are
# seeded so the comparison table is reproducible. The deprecated
# `use_label_encoder` argument is no longer passed to XGBoost.
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN (k=8)": KNeighborsClassifier(n_neighbors=8),
    "SVM": LinearSVC(max_iter=1000, dual=False),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Naive Bayes": GaussianNB(),
    "XGBoost": XGBClassifier(random_state=42, eval_metric='mlogloss'),
}

# One (name, accuracy, precision, recall) tuple per model.
results = []

for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    acc = accuracy_score(y_test, y_pred)
    # Weighted averages; zero_division=0 is set on both metrics so undefined
    # per-class scores are handled the same way.
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)

    results.append((name, acc, prec, rec))

print("=== Model Comparison ===")
print(f"{'Model':<20} {'Accuracy':<10} {'Precision':<10} {'Recall':<10}")
for name, acc, prec, rec in results:
    print(f"{name:<20} {acc:<10.4f} {prec:<10.4f} {rec:<10.4f}")


##Data Visualization

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Test-set predictions of the two models being compared side by side.
dt_pred = models["Decision Tree"].predict(x_test)
xgb_pred = models["XGBoost"].predict(x_test)

# One confusion matrix per model, drawn left to right on a shared figure.
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    ("Decision Tree", dt_pred, 'Blues'),
    ("XGBoost", xgb_pred, 'Oranges'),
]
for panel_ax, (panel_title, panel_pred, panel_cmap) in zip(axs, panels):
    ConfusionMatrixDisplay.from_predictions(y_test, panel_pred, ax=panel_ax, cmap=panel_cmap)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
# 5-fold cross-validation of a freshly constructed, seeded XGBoost classifier on
# the full feature matrix `x` and target `y`.
xgb_model = XGBClassifier(random_state=42, eval_metric='mlogloss')
cv_scores = cross_val_score(xgb_model, x, y, cv=5)

# Per-fold scores followed by their mean and spread.
cv_summary = (
    ("Cross-validation scores:", cv_scores),
    ("Mean accuracy:", cv_scores.mean()),
    ("Standard deviation:", cv_scores.std()),
)
for summary_label, summary_value in cv_summary:
    print(summary_label, summary_value)



In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
# Refit a seeded XGBoost classifier on the training split and print the
# per-class precision / recall / F1 table for the test split.
xgb_model = XGBClassifier(random_state=42, eval_metric='mlogloss').fit(x_train, y_train)
xgb_pred = xgb_model.predict(x_test)
print(
    "=== XGBoost Classification Report ===",
    classification_report(y_test, xgb_pred),
    sep="\n",
)


## Hyperparameter Tuning (HPT)

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
# Search space for the XGBoost grid search: 3 * 3 * 3 * 2 * 2 = 108 parameter
# combinations, keyed by XGBClassifier argument name.
param_grid = dict(
    max_depth=[3, 5, 10],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 100, 200],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
)


In [None]:

# Exhaustive search over `param_grid` with 5-fold cross-validation on the
# training split, scored by accuracy and run on all available cores.
xgb_model = XGBClassifier(random_state=42, eval_metric='mlogloss')
search_options = dict(cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search = GridSearchCV(xgb_model, param_grid, **search_options)
grid_search.fit(x_train, y_train)

# Winning parameter combination and its mean cross-validated accuracy.
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)



In [None]:
from sklearn.metrics import accuracy_score
# Score the estimator selected by the grid search on the held-out test split.
best_xgb_model = grid_search.best_estimator_
y_best_pred = best_xgb_model.predict(x_test)
accuracy_best = accuracy_score(y_true=y_test, y_pred=y_best_pred)
print("Best XGBoost Accuracy:", accuracy_best)


## Prediction

In [None]:

# Hand-entered statistics for one hypothetical player, keyed by feature name.
# NOTE(review): 350 runs conceded over 80 overs is about 4.4 runs per over, not
# the 7.0 entered as 'Economy Rate' — confirm the intended example values.
player_stats = {
    'Total Batter Runs': 560,
    'Batter Balls Faced': 420,
    'Strike Rate': 133.3,
    'Wicket': 12,
    'Bowler Runs Conceded': 350,
    'Balls Bowled': 480,
    'Overs Bowled': 80,
    'Economy Rate': 7.0,
}
new_player = pd.DataFrame([player_stats])

# Classify the single-row frame with the XGBoost model from the comparison cell.
prediction = models["XGBoost"].predict(new_player)

# Class index -> display name.
# NOTE(review): presumably the order in which the training labels were encoded —
# verify against the encoder used when the target was built.
label_map = dict(enumerate([
    "All-Rounder",
    "Average Batter",
    "Average Bowler",
    "Good Batter",
    "Good Bowler",
    "Low Performer",
]))
predicted_class = prediction[0]
print("Predicted Label:", label_map[predicted_class])



## Deep Learning

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Step 1: Read deliveries.csv (malformed lines are skipped), trim whitespace
# from the header names and replace every missing value with 0.
deliveries = pd.read_csv("deliveries.csv", on_bad_lines='skip', low_memory=False)
deliveries.columns = deliveries.columns.str.strip()
deliveries = deliveries.fillna(0)

# Step 2: Derived rates, used only to build the label and not fed to the model.
# A zero denominator is swapped for 1 so the division never produces inf/NaN.
balls_faced = deliveries['Batter Balls Faced'].replace(0, 1)
deliveries['Strike Rate'] = deliveries['Total Batter Runs'] / balls_faced * 100
valid_balls = deliveries['Valid Ball'].replace(0, 1)
deliveries['Economy Rate'] = deliveries['Bowler Runs Conceded'] / valid_balls * 6

# Step 3: Define classification logic for labeling (used only here)
def classify(row):
    """Return the performance label for one row of statistics.

    `row` is indexed by column name. The batting rule is checked first:
    at least 30 'Total Batter Runs' with a 'Strike Rate' of 130 or more gives
    'good batter'. Otherwise at most 25 'Bowler Runs Conceded' with an
    'Economy Rate' of 6 or less gives 'good bowler'. Everything else is
    'average'.
    """
    strong_batting = row['Total Batter Runs'] >= 30 and row['Strike Rate'] >= 130
    if strong_batting:
        return 'good batter'

    # NOTE(review): a row with zero runs conceded and zero economy also passes
    # this test — confirm rows without any bowling should be labelled this way.
    tight_bowling = row['Bowler Runs Conceded'] <= 25 and row['Economy Rate'] <= 6
    return 'good bowler' if tight_bowling else 'average'

# Create target label (classify is applied once per row).
deliveries['performance_category'] = deliveries.apply(classify, axis=1)

# Step 4: Define input features (excluding derived columns to force learning)
features = ['Total Batter Runs', 'Batter Balls Faced', 'Bowler Runs Conceded', 'Valid Ball']
X = deliveries[features]
y = deliveries['performance_category']

# Step 5: Encode target variable as integer class ids
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Step 6: Train-test split on the raw features. Splitting happens before scaling
# so that no statistic of the test rows can influence the training inputs.
# NOTE: this rebinds y_train / y_test, names the scikit-learn cells earlier in
# the notebook also use; re-running those cells afterwards needs a fresh split.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Step 7: Normalize input features. The scaler learns min/max from the training
# rows only and the same transformation is then applied to the test rows.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 8: Build Neural Network model. The softmax width follows the number of
# classes the encoder actually found instead of a hard-coded 3.
num_classes = len(le.classes_)
model = Sequential([
    Dense(32, input_dim=X.shape[1], activation='relu'),
    Dense(16, activation='relu'),
    Dense(num_classes, activation='softmax')
])
# Integer class ids are used as targets, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 9: Train the model, holding out 20% of the training rows for validation
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)


In [None]:
# Score the trained network on the held-out rows and report the accuracy.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"\n✅ Deliveries.csv - Test Accuracy: {accuracy:.2f}")


# **Frontend**

In [None]:
# Stop any process whose command line contains "streamlit" (left over from an
# earlier run) so a later launch starts from a clean state.
!pkill -f streamlit


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib
# Reload the matches table and replace every missing value with "Unknown".
matches = pd.read_csv("matches.csv")
matches.fillna("Unknown", inplace=True)

# Categorical columns to encode.
columns_needed = [
    'city', 'match_type', 'player_of_match', 'venue',
    'toss_decision', 'umpire1', 'umpire2'
]

# Fit one LabelEncoder per column, overwrite the column with its integer codes
# and persist the fitted encoder as "<column>_encoder.pkl".
for column_name in columns_needed:
    encoder_path = f"{column_name}_encoder.pkl"
    le = LabelEncoder()
    matches[column_name] = le.fit_transform(matches[column_name])
    joblib.dump(le, encoder_path)
    print(f"✅ Saved: {encoder_path}")


In [None]:
# Same export as the previous cell, for the three team-valued columns: each one
# gets its own independently fitted encoder, saved as "<column>_encoder.pkl".
team_columns = ['team1', 'team2', 'toss_winner']
for team_col in team_columns:
    team_encoder_path = f"{team_col}_encoder.pkl"
    le = LabelEncoder()
    matches[team_col] = le.fit_transform(matches[team_col])
    joblib.dump(le, team_encoder_path)
    print(f"✅ Saved: {team_encoder_path}")


In [None]:
# List the working directory (a bare `ls` relies on IPython automagic).
ls


In [None]:
# Rename the saved model to "ipl_model.pkl", the file name app.py loads.
# NOTE(review): no cell in this part of the notebook writes xgb_best_model.pkl —
# confirm it is saved earlier, otherwise this command fails.
!mv xgb_best_model.pkl ipl_model.pkl


In [None]:
!pip install streamlit pyngrok --quiet


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib
import numpy as np
import os
import warnings
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import plotly.express as px

warnings.filterwarnings('ignore')

# Configure page
st.set_page_config(page_title="IPL Analytics Dashboard", layout="wide")
st.title("🏏 IPL Analytics Hub")

# Custom CSS
# The stylesheet deliberately does NOT hide `.stAlert`: st.error, st.warning,
# st.info and st.success all render inside that container, so hiding it would
# make every status message of this app (including the Player Performance
# result shown with st.success) invisible.
st.markdown("""
<style>
    .stException { border-left: 3px solid #ff2b2b !important; }
    .stMarkdown { margin-bottom: 1rem; }
    .stTabs [data-baseweb="tab-list"] { gap: 10px; }
    .stTabs [data-baseweb="tab"] {
        padding: 8px 16px;
        border-radius: 4px 4px 0 0;
        background: #f0f2f6;
    }
    .stTabs [aria-selected="true"] {
        background: white;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    [data-testid="stMetricValue"] { font-size: 1.2rem; }
    div[data-testid="stExpander"] div[role="button"] p {
        font-size: 1.2rem;
        font-weight: bold;
    }
    .metric-card {
        background: #f0f2f6;
        padding: 15px;
        border-radius: 10px;
        margin-bottom: 15px;
    }
    .stDataFrame { width: 100%; }
</style>
""", unsafe_allow_html=True)

# Load data and models
@st.cache_resource
def load_assets():
    """Load the model, the label encoders and the matches table from disk.

    Returns a dict with the keys "model", "encoders", "deliveries", "matches",
    "stacking_model" and "scaler". Only "model", "encoders" and "matches" are
    populated here; anything that cannot be found stays None (or an empty dict
    for "encoders"). Failures are reported through Streamlit messages instead of
    being raised.
    """
    assets = dict.fromkeys(
        ("model", "encoders", "deliveries", "matches", "stacking_model", "scaler")
    )
    assets["encoders"] = {}

    # --- model and encoders -------------------------------------------------
    try:
        if not os.path.exists("ipl_model.pkl"):
            st.warning("Model file 'ipl_model.pkl' not found. Match prediction will not be available.")
        else:
            assets["model"] = joblib.load("ipl_model.pkl")

        # Feature name -> pickle written by the notebook; missing files are skipped.
        encoder_files = {
            "city": "city_encoder.pkl",
            "match_type": "match_type_encoder.pkl",
            "player_of_match": "player_of_match_encoder.pkl",
            "venue": "venue_encoder.pkl",
            "team1": "team1_encoder.pkl",
            "team2": "team2_encoder.pkl",
            "toss_winner": "toss_winner_encoder.pkl",
            "toss_decision": "toss_decision_encoder.pkl",
            "winner": "winner_encoder.pkl"
        }
        for feature_name in encoder_files:
            encoder_path = encoder_files[feature_name]
            if not os.path.exists(encoder_path):
                continue
            assets["encoders"][feature_name] = joblib.load(encoder_path)

    except Exception as e:
        st.error(f"⚠ Model loading error: {str(e)}")

    # --- matches table ------------------------------------------------------
    try:
        if os.path.exists("matches.csv"):
            matches_df = pd.read_csv("matches.csv")
            assets["matches"] = matches_df
            # Ensure required columns exist (absent ones are added as all-NaN)
            for required_col in ('id', 'season', 'date', 'team1', 'team2', 'winner', 'city', 'venue'):
                if required_col not in matches_df.columns:
                    matches_df[required_col] = np.nan
            # Unparseable dates become NaT rather than raising.
            if 'date' in matches_df.columns:
                matches_df['date'] = pd.to_datetime(matches_df['date'], errors='coerce')
        else:
            st.error("Matches data file 'matches.csv' not found. Some features may be limited.")

    except Exception as e:
        st.error(f"❌ Data loading error: {str(e)}")

    return assets

assets = load_assets()

# Create tabs
tab1, tab2 = st.tabs(["🎯 Match Predictor", "⭐ Player Performance"])

# TAB 1: Match Predictor
# Collects the fixture details in a form, label-encodes them with the saved
# encoders, asks the loaded model for a prediction and then shows the
# head-to-head history of the two teams.
with tab1:
    st.header("IPL Match Winner Predictor")

    if assets["model"] is None:
        st.error("❌ Prediction model not available. Please ensure 'ipl_model.pkl' is in the same directory.")
    else:
        form = st.form("prediction_form")
        with form:
            col1, col2 = st.columns(2)

            with col1:
                # Season selection
                if assets["matches"] is not None and not assets["matches"]['season'].isna().all():
                    seasons = sorted(assets["matches"]['season'].dropna().unique().tolist(), reverse=True)
                else:
                    seasons = list(range(2008, 2024))
                season = st.selectbox("Season", seasons, index=0)

                # City selection
                if "city" in assets["encoders"]:
                    cities = sorted(assets["encoders"]["city"].classes_.tolist())
                elif assets["matches"] is not None and not assets["matches"]['city'].isna().all():
                    cities = sorted(assets["matches"]['city'].dropna().unique().tolist())
                else:
                    cities = ["Mumbai", "Chennai", "Kolkata", "Delhi"]
                city = st.selectbox("City", cities, index=0)

                # Match type - assuming fixed types if not encoded
                match_types = ["Group Stage", "Playoff", "Final"]
                if "match_type" in assets["encoders"]:
                    match_types = assets["encoders"]["match_type"].classes_.tolist()
                match_type = st.selectbox("Match Type", match_types, index=0)

                # Venue
                if "venue" in assets["encoders"]:
                    venues = sorted(assets["encoders"]["venue"].classes_.tolist())
                elif assets["matches"] is not None and not assets["matches"]['venue'].isna().all():
                    venues = sorted(assets["matches"]['venue'].dropna().unique().tolist())
                else:
                    venues = ["Wankhede Stadium", "Eden Gardens", "M. Chinnaswamy Stadium"]
                venue = st.selectbox("Venue", venues, index=0)

            with col2:
                # Team selection
                teams = []
                if "team1" in assets["encoders"]:
                    teams = sorted(assets["encoders"]["team1"].classes_.tolist())
                elif assets["matches"] is not None:
                    teams = sorted(list(set(assets["matches"]['team1'].dropna().unique()) | set(assets["matches"]['team2'].dropna().unique())))
                else:
                    teams = ["Chennai Super Kings", "Mumbai Indians",
                             "Royal Challengers Bangalore", "Delhi Capitals"]

                # NOTE(review): widgets inside st.form do not trigger a rerun until the
                # form is submitted, so the Team 2 and Toss Winner option lists below are
                # built from the Team 1 value of the previous run — consider moving the
                # team pickers outside the form.
                team1 = st.selectbox("Team 1", teams, index=0)
                team2_options = [t for t in teams if t != team1]
                team2 = st.selectbox("Team 2", team2_options, index=min(1, len(team2_options)-1))

                # Toss info
                toss_winner = st.selectbox("Toss Winner", [team1, team2], index=0)

                toss_decisions = ["bat", "field"]
                if "toss_decision" in assets["encoders"]:
                    toss_decisions = assets["encoders"]["toss_decision"].classes_.tolist()
                toss_decision = st.selectbox("Toss Decision", toss_decisions, index=0)

            submitted = form.form_submit_button("Predict Winner", type="primary")

        if submitted:
            with st.spinner("Analyzing match..."):
                try:
                    # The season options come straight from the CSV and may not be plain
                    # integers (for example a "YYYY/YY" style label). Fall back to the
                    # leading four characters instead of failing the whole prediction.
                    # NOTE(review): confirm this matches how 'season' was encoded for training.
                    try:
                        season_value = int(season)
                    except (TypeError, ValueError):
                        season_value = int(str(season)[:4])

                    # Prepare input data
                    input_data = {
                        "season": season_value,
                        "city": city,
                        "match_type": match_type,
                        "venue": venue,
                        "team1": team1,
                        "team2": team2,
                        "toss_winner": toss_winner,
                        "toss_decision": toss_decision.lower(),
                    }

                    # Encode categorical features; features without an encoder pass through as-is
                    encoded_data = {}
                    for feature in input_data:
                        if feature in assets["encoders"]:
                            try:
                                encoded_data[feature] = assets["encoders"][feature].transform([input_data[feature]])[0]
                            except ValueError:
                                st.warning(f"Category '{input_data[feature]}' for '{feature}' not seen during training. Using default value.")
                                encoded_data[feature] = 0
                        else:
                            encoded_data[feature] = input_data[feature]

                    # Create DataFrame for prediction
                    input_df = pd.DataFrame([encoded_data])

                    # Align the columns with what the model was fitted on; features the
                    # form does not collect are filled with 0.
                    if hasattr(assets["model"], 'feature_names_in_'):
                        missing_features = set(assets["model"].feature_names_in_) - set(input_df.columns)
                        for feature in missing_features:
                            input_df[feature] = 0
                        input_df = input_df[assets["model"].feature_names_in_]
                    else:
                        st.warning("Model does not have 'feature_names_in_'. Assuming feature order is consistent.")

                    # Make prediction
                    prediction_ok = False
                    try:
                        prediction = assets["model"].predict(input_df)[0]

                        # Determine winner based on prediction. When a winner encoder was
                        # loaded, an integer prediction is a class index and is decoded back
                        # to the team name; without it the original 0 -> team1 / otherwise
                        # team2 mapping is kept.
                        winner_encoder = assets["encoders"].get("winner")
                        if winner_encoder is not None and isinstance(prediction, (int, np.integer)):
                            winner = str(winner_encoder.inverse_transform([prediction])[0])
                        elif isinstance(prediction, (int, float, np.integer)):
                            winner = team1 if prediction == 0 else team2
                        else:
                            winner = str(prediction)

                        # Get confidence score if available (highest class probability)
                        confidence = None
                        if hasattr(assets["model"], 'predict_proba'):
                            proba = assets["model"].predict_proba(input_df)[0]
                            confidence = np.max(proba) * 100

                        prediction_ok = True

                    except Exception as e:
                        st.error(f"Prediction failed: {str(e)}")
                        winner = "Prediction error"
                        confidence = None

                    # Display results; only celebrate when a prediction was actually made
                    if prediction_ok:
                        st.balloons()
                    col1, col2 = st.columns(2)
                    with col1:
                        st.markdown(f"""
                        <div class="metric-card">
                            <h3>🏆 Predicted Winner</h3>
                            <h2 style='color:#0068c9'>{winner}</h2>
                        </div>
                        """, unsafe_allow_html=True)

                    if confidence is not None:
                        with col2:
                            st.markdown(f"""
                            <div class="metric-card">
                                <h3>📊 Confidence Score</h3>
                                <h2 style='color:#0068c9'>{confidence:.1f}%</h2>
                            </div>
                            """, unsafe_allow_html=True)

                    # Head-to-head record, matching the fixture in either team order
                    if assets["matches"] is not None:
                        past_matches = assets["matches"][
                            ((assets["matches"]["team1"] == team1) &
                             (assets["matches"]["team2"] == team2)) |
                            ((assets["matches"]["team1"] == team2) &
                             (assets["matches"]["team2"] == team1))
                        ]
                        if not past_matches.empty:
                            team1_wins = len(past_matches[past_matches['winner'] == team1])
                            team2_wins = len(past_matches[past_matches['winner'] == team2])
                            st.write(f"*Head-to-Head*: {team1} {team1_wins}-{team2_wins} {team2}")
                            st.subheader("Past Matches between these teams:")
                            st.dataframe(past_matches[['date', 'winner', 'result', 'venue']].sort_values('date', ascending=False))
                        else:
                            st.info("No past matches found between these two teams in the dataset.")

                except Exception as e:
                    st.error(f"❌ Processing failed: {str(e)}")

# TAB 2: Player Performance
with tab2:
    # Model-free rating: the form values are combined into a batting score and a
    # bowling score, and fixed thresholds on those scores pick one of six labels.
    st.header("Player Performance Analyzer")

    # Simplified performance categories
    performance_categories = {
        0: "⭐ Emerging Player",
        1: "🏏 Specialist Batter",
        2: "🎯 Specialist Bowler",
        3: "🌟 Star All-rounder",
        4: "💎 Consistent Performer",
        5: "👑 Match Winner"
    }

    # Create a simple form for player stats
    with st.form("player_form"):
        st.subheader("Enter Player Statistics")

        batting_column, bowling_column = st.columns(2)

        with batting_column:
            batting_avg = st.number_input("Batting Average", min_value=0.0, max_value=100.0, value=25.0)
            strike_rate = st.number_input("Strike Rate", min_value=0.0, max_value=200.0, value=120.0)
            runs = st.number_input("Total Runs", min_value=0, value=1000)
            fifties = st.number_input("50s", min_value=0, value=10)

        with bowling_column:
            bowling_avg = st.number_input("Bowling Average", min_value=0.0, max_value=100.0, value=30.0)
            economy = st.number_input("Economy Rate", min_value=0.0, max_value=15.0, value=7.5)
            wickets = st.number_input("Total Wickets", min_value=0, value=50)
            best_figures = st.text_input("Best Bowling Figures", value="3/20")

        submitted_player = st.form_submit_button("Analyze Performance")

    if submitted_player:
        # Simple heuristic-based performance evaluation
        # NOTE(review): runs, wickets and best_figures are collected above but do
        # not feed into either score.
        batting_score = batting_avg * strike_rate / 100 + fifties * 5
        bowling_score = (100 - bowling_avg) * (10 - economy)
        total_score = batting_score + bowling_score

        # NOTE(review): with the input bounds above bowling_score cannot exceed
        # 100 * 10 = 1000, so the `> 1500` and `> 2000` bowling tests can never be
        # true, and batting_score only passes 1500 with several hundred fifties.
        # Categories 2 and 3 are therefore unreachable as written — confirm the
        # intended thresholds.
        if batting_score > 1500 and bowling_score > 1500:
            category = 3  # Star All-rounder
        elif batting_score > 2000:
            # Consistent Performer when the bowling also contributes, else Specialist Batter
            category = 4 if bowling_score > 500 else 1
        elif bowling_score > 2000:
            # Consistent Performer when the batting also contributes, else Specialist Bowler
            category = 4 if batting_score > 500 else 2
        elif total_score > 2000:
            category = 5  # Match Winner
        else:
            category = 0  # Emerging Player

        st.success(f"### Performance Category: {performance_categories[category]}")

        # Show some insights
        with st.expander("Performance Insights"):
            st.write(f"**Batting Score:** {batting_score:.1f} (Average: {batting_avg}, SR: {strike_rate})")
            st.write(f"**Bowling Score:** {bowling_score:.1f} (Average: {bowling_avg}, Economy: {economy})")

            # One explanatory sentence per category; any other category gets the
            # match-winner text.
            insight_by_category = {
                0: "This player shows potential but needs more experience to become a consistent performer.",
                1: "A reliable batter who can anchor the innings or accelerate as needed.",
                2: "A wicket-taking bowler who can bowl economically in pressure situations.",
                3: "A rare all-round talent who contributes significantly with both bat and ball.",
                4: "A dependable player who performs consistently across matches.",
            }
            st.info(insight_by_category.get(
                category,
                "A match-winner who can single-handedly change the course of the game.",
            ))

        # Simple visualization
        chart_data = {
            'Metric': ['Batting', 'Bowling', 'Overall'],
            'Score': [batting_score, bowling_score, total_score]
        }
        fig = px.bar(chart_data, x='Metric', y='Score', title='Performance Breakdown')
        st.plotly_chart(fig, use_container_width=True)

In [None]:

# Register the ngrok auth token for the tunnel opened in the next cell.
# `<YOUR TOKEN>` is a placeholder: replace it with a real token before running,
# and keep the real value out of any shared or committed copy of the notebook.
!ngrok config add-authtoken <YOUR TOKEN>

In [None]:
import os
import threading
from pyngrok import ngrok

def run():
    """Start the Streamlit server for app.py; os.system blocks until the command exits."""
    os.system("streamlit run app.py")

# Run the blocking server call on a background thread so this cell can continue.
thread = threading.Thread(target=run)
thread.start()

# Open an ngrok tunnel to local port 8501 and print its public URL.
# NOTE(review): 8501 is presumably the port Streamlit listens on, since no
# --server.port is passed above — confirm.
public_url = ngrok.connect(8501)
print("🌐 Streamlit app is live at:", public_url)
