In [45]:
import pandas as pd

# Efficiently load only the necessary columns
df = pd.read_csv("ODI_Match_Data.csv", usecols=[
    'match_id', 'innings', 'ball', 'runs_off_bat', 'extras', 'player_dismissed'
])

# Filter second innings
df = df[df['innings'] == 2]

# Save as a smaller file
df.to_csv("second_innings_filtered.csv", index=False)

################################################################################

# Assuming your filtered DataFrame is named `df`
df['total_runs'] = df['runs_off_bat'] + df['extras']
df['over'] = df['ball'].astype(str).str.extract(r'^(\d+)\.').astype(float)

# Group by match and estimate target score
df['target_score'] = df.groupby('match_id')['total_runs'].transform('sum') + 1
df['current_score'] = df.groupby('match_id')['total_runs'].cumsum()
df['is_wicket'] = df['player_dismissed'].notna().astype(int)
df['wickets_lost'] = df.groupby('match_id')['is_wicket'].cumsum()
df['wickets_in_hand'] = 10 - df['wickets_lost']
df['overs_remaining'] = 50 - df['over']
df['run_rate'] = df['current_score'] / (50 - df['overs_remaining'] + 0.1)
df['required_run_rate'] = (df['target_score'] - df['current_score']) / df['overs_remaining'].replace(0, 1)
df['match_pressure_index'] = df['required_run_rate'] / (df['run_rate'] + 0.1)

# Match phase
df['match_phase'] = pd.cut(df['over'], bins=[0, 10, 40, 50], labels=['early', 'middle', 'death'])
df = pd.get_dummies(df, columns=['match_phase'])

# Get the final ball of each second-innings match
final_ball_df = df.groupby('match_id').tail(1).copy()
final_ball_df['match_result'] = (final_ball_df['current_score'] >= final_ball_df['target_score']).astype(int)
final_ball_df['toss_winner'] = 0
final_ball_df['venue_advantage'] = 0


features = [
    'target_score', 'current_score', 'wickets_in_hand', 'overs_remaining',
    'required_run_rate', 'toss_winner', 'venue_advantage', 'match_pressure_index',
    'match_phase_early', 'match_phase_middle', 'match_phase_death'
]

#######################################################################################


########################################################################################

X = final_ball_df[features].dropna()
y = final_ball_df.loc[X.index, 'match_result']


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

import joblib
joblib.dump(model, "match_outcome_model_retrained.pkl")

['match_outcome_model_retrained.pkl']

In [47]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("ODI_Match_Data.csv", low_memory=False)

# Step 1: Total runs per ball
df['total_runs'] = df['runs_off_bat'] + df['extras']

# Step 2: Extract over number
df['over'] = df['ball'].astype(str).str.extract(r'^(\d+)\.').astype(float)

# Step 3: Identify first and second innings
first_innings = df[df['innings'] == 1].groupby('match_id')['total_runs'].sum().reset_index()
first_innings.columns = ['match_id', 'target_score']

# Step 4: Focus on second innings only
second_df = df[df['innings'] == 2].copy()

# Merge target score into second innings
second_df = pd.merge(second_df, first_innings, on='match_id', how='left')

# Step 5: Cumulative features
second_df['current_score'] = second_df.groupby('match_id')['total_runs'].cumsum()
second_df['is_wicket'] = second_df['player_dismissed'].notna().astype(int)
second_df['wickets_lost'] = second_df.groupby('match_id')['is_wicket'].cumsum()
second_df['wickets_in_hand'] = 10 - second_df['wickets_lost']
second_df['overs_remaining'] = 50 - second_df['over']
second_df['required_run_rate'] = (second_df['target_score'] - second_df['current_score']) / second_df['overs_remaining'].replace(0, np.nan)

# Step 6: Match Phase
second_df['match_phase'] = pd.cut(second_df['over'], bins=[0, 10, 40, 50], labels=['early_phase', 'middle_phase', 'death_phase'])
second_df = pd.get_dummies(second_df, columns=['match_phase'])

# Step 7: Match Pressure Index
second_df['run_rate'] = second_df['current_score'] / (50 - second_df['overs_remaining'] + 0.1)
second_df['match_pressure_index'] = second_df['required_run_rate'] / (second_df['run_rate'] + 0.1)

# Optional placeholders for features not directly in data
second_df['toss_winner'] = 0  # Placeholder, unless calculated
second_df['venue_advantage'] = 0  # Placeholder, unless mapped from venue



In [49]:
features = [
    'target_score', 'current_score', 'wickets_in_hand',
    'overs_remaining', 'required_run_rate',
    'toss_winner', 'venue_advantage', 'match_pressure_index',
    'match_phase_early_phase', 'match_phase_middle_phase', 'match_phase_death_phase'
]

# For label
second_df['final_score'] = second_df.groupby('match_id')['current_score'].transform('max')
second_df['match_result'] = (second_df['current_score'] >= second_df['target_score']).astype(int)

# Drop missing or divide-by-zero errors
model_data = second_df.dropna(subset=features + ['match_result'])

X = model_data[features]
y = model_data['match_result']


In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

print("Model Accuracy:", model.score(X_test, y_test))
print(classification_report(y_test, model.predict(X_test)))


Model Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    115043
           1       1.00      1.00      1.00       467

    accuracy                           1.00    115510
   macro avg       1.00      1.00      1.00    115510
weighted avg       1.00      1.00      1.00    115510



In [41]:
print(final_ball_df['match_result'].value_counts())


match_result
0    2316
Name: count, dtype: int64


In [38]:
# Step 1: Pivot the data with innings as columns
pivot = innings_scores.pivot(index='match_id', columns='innings', values='total_runs')

# Step 2: Keep only matches that have exactly 2 innings (1st and 2nd)
pivot = pivot[[1, 2]].dropna()
pivot.columns = ['innings_1', 'innings_2']

# Step 3: Derive match result
pivot['match_result'] = (pivot['innings_2'] > pivot['innings_1']).astype(int)


In [40]:
final_df = final_df.merge(pivot[['match_result']], left_on='match_id', right_index=True, how='inner')


In [17]:
final_df

NameError: name 'final_df' is not defined

In [42]:
X = final_df[features].dropna()
y = final_df['match_result']


KeyError: "['target_score', 'toss_winner', 'venue_advantage'] not in index"

In [44]:
features = [
    'current_score', 'wickets_in_hand', 'overs_remaining',
    'required_run_rate', 'match_pressure_index',
    'match_phase_early', 'match_phase_middle', 'match_phase_death'
]


In [46]:
X = final_df[features].dropna()
y = final_df['match_result']


In [48]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

joblib.dump(model, "match_outcome_model_retrained.pkl")


['match_outcome_model_retrained.pkl']

Death_Over_Process

In [50]:
import pandas as pd

# Load your Excel file
df = pd.read_excel("T20_WC_24_All_Matches_Dataset.xlsx")

# Filter for death overs (16–20)
death_overs_df = df[(df['over'] >= 16.0) & (df['over'] <= 20.0)].copy()

# Prepare metrics
death_overs_df['bowlerName'] = death_overs_df['bowlerName'].fillna('Unknown')
death_overs_df['isDotBall'] = (death_overs_df['runs'] == 0).astype(int)
death_overs_df['isExtra'] = death_overs_df['isWide'] | death_overs_df['isNoBall']
death_overs_df['isWicket'] = death_overs_df['isWicket'].astype(bool).astype(int)

# Aggregate performance
bowler_stats = death_overs_df.groupby('bowlerName').agg(
    balls_bowled=('over', 'count'),
    dot_balls=('isDotBall', 'sum'),
    runs_conceded=('bowlerRuns', 'sum'),
    extras=('isExtra', 'sum'),
    wickets=('isWicket', 'sum')
).reset_index()

# Calculated fields
bowler_stats['dot_percent'] = (bowler_stats['dot_balls'] / bowler_stats['balls_bowled']) * 100
bowler_stats['economy'] = bowler_stats['runs_conceded'] / (bowler_stats['balls_bowled'] / 6)
bowler_stats['discipline_ratio'] = (1 - (bowler_stats['extras'] / bowler_stats['balls_bowled'])) * 100

# Top performers
top_bowlers = bowler_stats.sort_values(by=['wickets', 'dot_percent'], ascending=False).head(10)
print(top_bowlers)


         bowlerName  balls_bowled  dot_balls  runs_conceded  extras  wickets  \
21     Chris Jordan            37         14             52       2       10   
8    Arshdeep Singh            54         22             62       6        9   
73    Naveen-ul-Haq            44         16             58       4        7   
80      Pat Cummins            43         13             47       1        7   
84   Rishad Hossain            30         16             34       0        6   
48    Kagiso Rabada            57         24             59       3        6   
6     Andre Russell            32         12             55       1        6   
103     Trent Boult            24         15             10       0        5   
100    Taskin Ahmed            31         17             28       1        5   
40   Jasprit Bumrah            46         24             29       0        5   

     dot_percent    economy  discipline_ratio  
21     37.837838   8.432432         94.594595  
8      40.740741   6.88

In [56]:
import pandas as pd

# Load your T20 World Cup dataset
df = pd.read_excel("T20_WC_24_All_Matches_Dataset.xlsx")

# Filter death overs (16 to 20)
death_overs_df = df[(df['over'] >= 16.0) & (df['over'] <= 20.0)].copy()

# Preprocess
death_overs_df['bowlerName'] = death_overs_df['bowlerName'].fillna('Unknown')
death_overs_df['isDotBall'] = (death_overs_df['runs'] == 0).astype(int)
death_overs_df['isExtra'] = death_overs_df['isWide'] | death_overs_df['isNoBall']
death_overs_df['isWicket'] = death_overs_df['isWicket'].astype(bool).astype(int)

# Group by bowler
bowler_stats = death_overs_df.groupby('bowlerName').agg(
    balls_bowled=('over', 'count'),
    dot_balls=('isDotBall', 'sum'),
    runs_conceded=('bowlerRuns', 'sum'),
    extras=('isExtra', 'sum'),
    wickets=('isWicket', 'sum')
).reset_index()

# Calculate stats
bowler_stats['dot_percent'] = (bowler_stats['dot_balls'] / bowler_stats['balls_bowled']) * 100
bowler_stats['economy'] = bowler_stats['runs_conceded'] / (bowler_stats['balls_bowled'] / 6)
bowler_stats['discipline_ratio'] = (1 - (bowler_stats['extras'] / bowler_stats['balls_bowled'])) * 100

# Save to CSV
bowler_stats.to_csv("death_over_bowler_stats.csv", index=False)


In [59]:
import streamlit as st
import pandas as pd

st.set_page_config(page_title="Death Over Bowler Recommender", page_icon="🏏")
st.title("🏏 Best Death Over Bowler Recommender")

st.markdown("""
Select a pressure level and get recommended bowlers based on wickets, dot ball %, economy, and discipline (death overs only).
""")

# 📁 Load pre-computed stats (create this CSV from your notebook)
try:
    bowler_stats = pd.read_csv("death_over_bowler_stats.csv")
except FileNotFoundError:
    st.error("Please make sure 'death_over_bowler_stats.csv' exists in the same folder.")
    st.stop()

# 🔍 Scenario Selection
scenario = st.selectbox("Select Pressure Scenario", ["Low", "Medium", "High"])

# 🎯 Sort logic based on scenario
if scenario == "Low":
    result = bowler_stats.sort_values(by=["discipline_ratio", "dot_percent"], ascending=False)
elif scenario == "Medium":
    result = bowler_stats.sort_values(by=["wickets", "dot_percent"], ascending=False)
else:  # High pressure
    result = bowler_stats.sort_values(by=["wickets", "discipline_ratio"], ascending=False)

# 📊 Display top 5
st.subheader("Top Recommended Bowlers")
st.dataframe(result.head(5)[["bowlerName", "wickets", "dot_percent", "economy", "discipline_ratio"]])

st.caption("🔍 Based on overs 16–20 only (death overs)")

DeltaGenerator()

In [61]:
import pandas as pd

# Load the dataset
df = pd.read_excel("T20_WC_24_All_Matches_Dataset.xlsx")

# Fill missing names
df['batsmanName'] = df['batsmanName'].fillna("Unknown")

# Batting stats
batsman_stats = df.groupby('batsmanName').agg(
    matches=('matchID', 'nunique'),
    innings=('inningID', 'nunique'),
    total_runs=('batsmanRuns', 'sum'),
    balls_faced=('batsmanBall', 'sum'),
    boundaries=('isBoundary', 'sum'),
).reset_index()

# Metrics
batsman_stats = batsman_stats[batsman_stats['balls_faced'] > 0]
batsman_stats['batting_avg'] = batsman_stats['total_runs'] / batsman_stats['innings']
batsman_stats['strike_rate'] = (batsman_stats['total_runs'] / batsman_stats['balls_faced']) * 100
batsman_stats['boundary_rate'] = batsman_stats['boundaries'] / batsman_stats['balls_faced'] * 100

# Normalize
for col in ['batting_avg', 'strike_rate', 'boundary_rate']:
    batsman_stats[f'{col}_z'] = (batsman_stats[col] - batsman_stats[col].mean()) / batsman_stats[col].std()

# Save
batsman_stats.to_csv("batsman_performance_stats.csv", index=False)


In [66]:
import pandas as pd

# Load the match dataset
df = pd.read_excel("T20_WC_24_All_Matches_Dataset.xlsx")

# 🔧 Preprocessing
df['bowlerName'] = df['bowlerName'].fillna('Unknown')
df['isWicket'] = df['isWicket'].astype(int)
df['isDotBall'] = (df['runs'] == 0).astype(int)
df['isExtra'] = df['isWide'].fillna(0) + df['isNoBall'].fillna(0)

# 🧮 Aggregated stats per bowler
bowler_stats = df.groupby('bowlerName').agg(
    matches=('matchID', 'nunique'),
    balls_bowled=('bowlerRuns', 'count'),
    runs_conceded=('bowlerRuns', 'sum'),
    wickets=('isWicket', 'sum'),
    dot_balls=('isDotBall', 'sum'),
    extras=('isExtra', 'sum')
).reset_index()

# 🔍 Filter bowlers with actual deliveries
bowler_stats = bowler_stats[bowler_stats['balls_bowled'] > 0]

# 📊 Derived metrics
bowler_stats['economy'] = bowler_stats['runs_conceded'] / (bowler_stats['balls_bowled'] / 6)
bowler_stats['strike_rate'] = bowler_stats['balls_bowled'] / bowler_stats['wickets'].replace(0, 1)
bowler_stats['dot_percent'] = (bowler_stats['dot_balls'] / bowler_stats['balls_bowled']) * 100
bowler_stats['discipline_ratio'] = 100 - ((bowler_stats['extras'] / bowler_stats['balls_bowled']) * 100)
bowler_stats['wickets_per_match'] = bowler_stats['wickets'] / bowler_stats['matches']

# 🧮 Z-score Normalization (for radar plots, clustering, ML)
for col in ['economy', 'strike_rate', 'dot_percent', 'discipline_ratio', 'wickets_per_match']:
    z_col = f"{col}_z"
    bowler_stats[z_col] = (bowler_stats[col] - bowler_stats[col].mean()) / bowler_stats[col].std()

# 💾 Save the file
bowler_stats.to_csv("bowler_performance_stats.csv", index=False)
print("✅ Bowler stats saved to 'bowler_performance_stats.csv'")


✅ Bowler stats saved to 'bowler_performance_stats.csv'


In [68]:
import pandas as pd

# Load your Excel dataset
df = pd.read_excel("T20_WC_24_All_Matches_Dataset.xlsx")

# Preprocessing
df['batting_team'] = df['batting_team'].fillna('Unknown')
df['over'] = df['over'].astype(float)
df['runs'] = df['runs'].fillna(0)
df['isWicket'] = df['isWicket'].astype(int)

# Phase categorization
def assign_phase(over):
    if over < 6:
        return 'Powerplay'
    elif over < 16:
        return 'Middle'
    else:
        return 'Death'

df['phase'] = df['over'].apply(assign_phase)

# Aggregate by team and phase
team_phase_stats = df.groupby(['batting_team', 'phase']).agg(
    runs_total=('runs', 'sum'),
    balls_faced=('over', 'count'),
    wickets_lost=('isWicket', 'sum')
).reset_index()

# Calculate run rate
team_phase_stats['run_rate'] = team_phase_stats['runs_total'] / (team_phase_stats['balls_faced'] / 6)

# Pivot table for visual use
phase_pivot = team_phase_stats.pivot(index='batting_team', columns='phase', values='run_rate').fillna(0).reset_index()

# Save for Streamlit or Power BI
phase_pivot.to_csv("team_phase_efficiency.csv", index=False)


KeyError: 'batting_team'

In [70]:
import pandas as pd

df = pd.read_excel("T20_WC_24_All_Matches_Dataset.xlsx")
print(df.columns.tolist())


['matchID', 'matchNo', 'match', 'currentInning', 'inningID', 'over', 'runningOver', 'runs', 'batsmanRuns', 'batsmanBall', 'bowlerRuns', 'shortText', 'batsmanPlayerID', 'batsmanName', 'bowlerPlayerID', 'bowlerName', 'isBoundary', 'isWide', 'isNoBall', 'isLegBye', 'isBye', 'isWicket', 'isBowlerWicket', 'wicketText', 'wktBatsmanName', 'wktBowlerName', 'wktBatsmanRuns', 'wktbatsmanBalls', 'commentary']


In [72]:
import pandas as pd

# Load dataset
df = pd.read_excel("T20_WC_24_All_Matches_Dataset.xlsx")

# Convert types and clean
df['over'] = df['over'].astype(float)
df['runs'] = df['runs'].fillna(0)
df['isWicket'] = df['isWicket'].astype(int)
df['batsmanName'] = df['batsmanName'].fillna("Unknown")

# Phase classification
def assign_phase(over):
    if over < 6:
        return 'Powerplay'
    elif over < 16:
        return 'Middle'
    else:
        return 'Death'

df['phase'] = df['over'].apply(assign_phase)

# Aggregate phase stats per match and batsman (proxy team)
phase_stats = df.groupby(['matchID', 'batsmanName', 'phase']).agg(
    runs_total=('runs', 'sum'),
    balls_faced=('over', 'count'),
    wickets_lost=('isWicket', 'sum')
).reset_index()

phase_stats['run_rate'] = phase_stats['runs_total'] / (phase_stats['balls_faced'] / 6)

# Pivot to get phase-wise view per batsman
pivot = phase_stats.pivot_table(index='batsmanName', columns='phase', values='run_rate', aggfunc='mean').fillna(0).reset_index()

# Save to CSV
pivot.to_csv("phase_efficiency_batsman_proxy.csv", index=False)
print("✅ Saved to phase_efficiency_batsman_proxy.csv")


✅ Saved to phase_efficiency_batsman_proxy.csv


In [None]:
##Streamlit Process

In [77]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# Load ODI Match Data
df = pd.read_csv("ODI_Match_Data.csv", low_memory=False)

# Basic cleaning
df['venue'] = df['venue'].fillna("Unknown")
df['batting_team'] = df['batting_team'].fillna("Unknown")
df['bowling_team'] = df['bowling_team'].fillna("Unknown")

# For simulation, we’ll assume:
# Toss winner = batting_team in 1st innings
df['toss_decision'] = df['innings'].apply(lambda x: 'bat' if x == 1 else 'bowl')

# Aggregate outcome per match
df['total_runs'] = df['runs_off_bat'] + df['extras']
match_scores = df.groupby(['match_id', 'innings'])['total_runs'].sum().unstack()
match_scores['match_result'] = (match_scores[2] > match_scores[1]).astype(int)

# Merge with toss and venue info
meta = df[['match_id', 'venue', 'batting_team', 'innings']].drop_duplicates()
toss_info = meta[meta['innings'] == 1].copy()
toss_info = toss_info.rename(columns={'batting_team': 'toss_winner'})
toss_info['toss_decision'] = 'bat'
toss_info = toss_info[['match_id', 'venue', 'toss_winner', 'toss_decision']]
toss_info = toss_info.merge(match_scores[['match_result']], left_on='match_id', right_index=True)

# Encode categorical variables
X = pd.get_dummies(toss_info[['venue', 'toss_winner', 'toss_decision']], drop_first=True)
y = toss_info['match_result']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Decision Tree
model = DecisionTreeClassifier(max_depth=4, random_state=42)
model.fit(X_train, y_train)

# Save model
joblib.dump(model, "toss_decision_model.pkl")

# Report
print("Classification Report:\n")
print(classification_report(y_test, model.predict(X_test)))


Classification Report:

              precision    recall  f1-score   support

           0       0.56      0.97      0.71       258
           1       0.70      0.09      0.16       218

    accuracy                           0.57       476
   macro avg       0.63      0.53      0.43       476
weighted avg       0.62      0.57      0.45       476



In [79]:
import streamlit as st
import pandas as pd
import joblib

# Load model and options
model = joblib.load("toss_decision_model.pkl")
venues = ['Lord\'s', 'MCG', 'Eden Gardens', 'Wankhede', 'Old Trafford']  # Sample venues
teams = ['India', 'Australia', 'England', 'Pakistan', 'South Africa']

# App config
st.set_page_config(page_title="Captaincy Toss Decision Simulator", page_icon="🧢")
st.title("🧢 Toss Decision Recommender")
st.markdown("Simulate the better toss decision based on historical match outcomes.")

# Inputs
venue = st.selectbox("🏟️ Select Venue", venues)
toss_winner = st.selectbox("🏏 Toss Winner", teams)
toss_decision = st.radio("🧭 Toss Decision", ['bat', 'bowl'])

# Create input DataFrame for prediction
input_df = pd.DataFrame({
    'venue': [venue],
    'toss_winner': [toss_winner],
    'toss_decision': [toss_decision]
})
input_encoded = pd.get_dummies(input_df)
model_features = model.feature_names_in_
missing_cols = set(model_features) - set(input_encoded.columns)
for col in missing_cols:
    input_encoded[col] = 0
input_encoded = input_encoded[model_features]

# Predict
prediction = model.predict(input_encoded)[0]
result_label = "👍 Win Likely" if prediction == 1 else "👎 Loss Likely"
st.subheader(f"🎯 Prediction: {result_label}")


  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0
  input_encoded[col] = 0


DeltaGenerator()

In [83]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import classification_report
import joblib

# 🔹 Load match data
df = pd.read_csv("ODI_Match_Data.csv", low_memory=False)

# 🔹 Basic cleanup
df['venue'] = df['venue'].fillna("Unknown")
df['batting_team'] = df['batting_team'].fillna("Unknown")
df['bowling_team'] = df['bowling_team'].fillna("Unknown")

# 🔹 Assume toss_winner = batting_team in 1st innings
df['toss_decision'] = df['innings'].apply(lambda x: 'bat' if x == 1 else 'bowl')
df['total_runs'] = df['runs_off_bat'] + df['extras']

# ✅ STEP 1: Total runs per match and innings
innings_runs = df.groupby(['match_id', 'innings'])['total_runs'].sum().reset_index()

# ✅ STEP 2: Filter out matches beyond 2 innings (e.g. Super Over, Tests)
innings_runs = innings_runs[innings_runs['innings'].isin([1, 2])]

# ✅ STEP 3: Pivot to wide format
innings_pivot = innings_runs.pivot(index='match_id', columns='innings', values='total_runs')
innings_pivot = innings_pivot.dropna()  # Keep only matches with both innings

# ✅ STEP 4: Rename columns and compute result
innings_pivot.columns = ['1st_innings', '2nd_innings']
innings_pivot['match_result'] = (innings_pivot['2nd_innings'] > innings_pivot['1st_innings']).astype(int)

# ✅ STEP 5: Extract toss metadata (from 1st innings perspective)
meta = df[['match_id', 'venue', 'batting_team', 'bowling_team', 'innings']].drop_duplicates()
toss_meta = meta[meta['innings'] == 1].copy()
toss_meta['toss_winner'] = toss_meta['batting_team']
toss_meta['toss_decision'] = 'bat'  # Because they batted first

# ✅ STEP 6: Join metadata with match result
toss_df = toss_meta.merge(innings_pivot[['match_result']], left_on='match_id', right_index=True)

# ✅ STEP 7: Handle class imbalance
df_win = toss_df[toss_df['match_result'] == 1]
df_loss = toss_df[toss_df['match_result'] == 0]

df_loss_up = resample(df_loss, replace=True, n_samples=len(df_win), random_state=42)
df_balanced = pd.concat([df_win, df_loss_up])

# ✅ STEP 8: Encode categorical features
X = pd.get_dummies(df_balanced[['venue', 'toss_winner', 'bowling_team', 'toss_decision']], drop_first=True)
y = df_balanced['match_result']

# ✅ STEP 9: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ STEP 10: Train Decision Tree model
model = DecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(X_train, y_train)

# ✅ STEP 11: Save the model
joblib.dump(model, "toss_decision_model_v2.pkl")

# ✅ STEP 12: Evaluate performance
print("📊 Classification Report:")
print(classification_report(y_test, model.predict(X_test)))


📊 Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.27      0.38       220
           1       0.53      0.84      0.65       217

    accuracy                           0.55       437
   macro avg       0.58      0.56      0.51       437
weighted avg       0.58      0.55      0.51       437



In [85]:
import pandas as pd

# Load the dataset
df = pd.read_excel("T20_WC_24_All_Matches_Dataset.xlsx")

# Clean and derive basic info
df['runs'] = df['runs'].fillna(0)
df['isWicket'] = df['isWicket'].astype(int)
df['balls'] = 1  # Every row is one ball

# Group by Bowler vs Batsman
matchups = df.groupby(['bowlerName', 'batsmanName']).agg(
    total_runs=('runs', 'sum'),
    balls_faced=('balls', 'sum'),
    dismissals=('isWicket', 'sum')
).reset_index()

# Calculate Strike Rate
matchups['strike_rate'] = (matchups['total_runs'] / matchups['balls_faced']) * 100
matchups['dismissal_ratio'] = matchups['dismissals'] / matchups['balls_faced']

# Optional: Filter low-ball encounters
matchups = matchups[matchups['balls_faced'] >= 6]  # At least 1 over bowled

# Save to CSV
matchups.to_csv("bowler_vs_batsman_matchups.csv", index=False)
print("✅ Matchup matrix saved to 'bowler_vs_batsman_matchups.csv'")


✅ Matchup matrix saved to 'bowler_vs_batsman_matchups.csv'


In [None]:
#WIN PROBABILITY#

In [None]:
#Logistic regression#

In [87]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import joblib

# Load ball-by-ball data
df = pd.read_csv("ODI_Match_Data.csv", low_memory=False)

# Total runs per ball
df['total_runs'] = df['runs_off_bat'] + df['extras']

# Filter 2nd innings only
df = df[df['innings'] == 2]

# Group by match/over
df['over'] = df['ball'].astype(str).str.extract(r'^(\d+)').astype(float)
grouped = df.groupby(['match_id', 'over']).agg(
    current_score=('total_runs', 'sum'),
    wickets=('player_dismissed', lambda x: x.notna().sum())
).groupby(level=0).cumsum().reset_index()

# Target scores (from 1st innings)
targets = df[df['innings'] == 2].groupby('match_id')['total_runs'].sum().groupby(level=0).sum()
targets.name = 'target_score'

# Merge
grouped = grouped.merge(targets, on='match_id', how='left')
grouped['overs_remaining'] = 50 - grouped['over']
grouped['wickets_in_hand'] = 10 - grouped['wickets']
grouped['required_run_rate'] = (grouped['target_score'] - grouped['current_score']) / grouped['overs_remaining'].replace(0, 1)

# Final label
grouped['match_result'] = (grouped['current_score'] >= grouped['target_score']).astype(int)

# Filter valid
grouped = grouped.dropna()

# Train model
features = ['current_score', 'wickets_in_hand', 'overs_remaining', 'required_run_rate']
X = grouped[features]
y = grouped['match_result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

# Save model
joblib.dump(model, "win_probability_model.pkl")
print("✅ Model trained and saved.")


✅ Model trained and saved.


In [None]:
#XGBBoost##

In [89]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# Load the ODI ball-by-ball dataset
df = pd.read_csv("ODI_Match_Data.csv", low_memory=False)

# Feature engineering
df['total_runs'] = df['runs_off_bat'] + df['extras']
df['over'] = df['ball'].astype(str).str.extract(r'^(\d+)').astype(float)

# Use only 2nd innings
df = df[df['innings'] == 2]

# Group by match and over
grouped = df.groupby(['match_id', 'over']).agg(
    current_score=('total_runs', 'sum'),
    wickets=('player_dismissed', lambda x: x.notna().sum())
).groupby(level=0).cumsum().reset_index()

# Target scores from 1st innings
targets = df.groupby('match_id')['total_runs'].sum().groupby(level=0).sum()
targets.name = 'target_score'

grouped = grouped.merge(targets, on='match_id', how='left')

# Create features
grouped['overs_remaining'] = 50 - grouped['over']
grouped['wickets_in_hand'] = 10 - grouped['wickets']
grouped['required_run_rate'] = (grouped['target_score'] - grouped['current_score']) / grouped['overs_remaining'].replace(0, 1)
grouped['match_result'] = (grouped['current_score'] >= grouped['target_score']).astype(int)

# Clean and drop NaN
grouped = grouped.dropna()

# Features and labels
features = ['current_score', 'wickets_in_hand', 'overs_remaining', 'required_run_rate']
X = grouped[features]
y = grouped['match_result']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost classifier
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Save the model
joblib.dump(model, "win_probability_xgb_model.pkl")

# Evaluation
y_pred = model.predict(X_test)
print("✅ XGBoost Model Trained")
print(classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ XGBoost Model Trained
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18396
           1       0.93      0.96      0.94       508

    accuracy                           1.00     18904
   macro avg       0.96      0.98      0.97     18904
weighted avg       1.00      1.00      1.00     18904



In [None]:
###########PRESSURE INDEX #######

In [91]:
import pandas as pd

# Load dataset
df = pd.read_csv("ODI_Match_Data.csv", low_memory=False)

# Use only second innings
df = df[df['innings'] == 2]

# Extract over from ball number
df['over'] = df['ball'].astype(str).str.extract(r'^(\d+)').astype(float)
df['total_runs'] = df['runs_off_bat'] + df['extras']

# Group by match and over to get progressive stats
df_overwise = df.groupby(['match_id', 'over']).agg(
    runs=('total_runs', 'sum'),
    wickets=('player_dismissed', lambda x: x.notna().sum())
).groupby('match_id').cumsum().reset_index()

# Target from 1st innings
targets = df[df['innings'] == 2].groupby('match_id')['total_runs'].sum()
df_overwise = df_overwise.merge(targets, on='match_id', how='left')
df_overwise['overs_completed'] = df_overwise['over']
df_overwise['overs_remaining'] = 50 - df_overwise['over']
df_overwise['current_run_rate'] = df_overwise['runs'] / df_overwise['overs_completed'].replace(0, 0.1)
df_overwise['required_run_rate'] = (df_overwise['total_runs'] - df_overwise['runs']) / df_overwise['overs_remaining'].replace(0, 0.1)

# Pressure Index
df_overwise['pressure_index'] = df_overwise['required_run_rate'] / df_overwise['current_run_rate']

# Save to CSV
df_overwise.to_csv("pressure_index_over_time.csv", index=False)
print("✅ Pressure Index data saved.")


✅ Pressure Index data saved.


In [None]:
############### DRS REVIEW #############

In [93]:
import pandas as pd
import re

# Load the ball-by-ball commentary dataset
df = pd.read_excel("T20_WC_24_All_Matches_Dataset.xlsx")

# Drop missing commentary
df = df[df['commentary'].notna()]

# Filter lines mentioning LBW or DRS
keywords = ['lbw', 'review', 'umpire', 'drs', 'impact', 'original decision', 'ball tracking', 'review retained', 'review lost']
pattern = '|'.join(keywords)
df['lower_comment'] = df['commentary'].str.lower()
drs_df = df[df['lower_comment'].str.contains(pattern)]

# Optional NLP tag: Is overturn?
def detect_outcome(text):
    if 'overturned' in text or 'changed to out' in text or 'reversed' in text:
        return 'Overturned'
    elif 'original decision stands' in text or 'umpire\'s call' in text:
        return 'Umpire Call'
    elif 'lost review' in text:
        return 'Lost Review'
    elif 'retained review' in text or 'successful review' in text:
        return 'Successful Review'
    return 'Unclear'

drs_df['review_outcome'] = drs_df['lower_comment'].apply(detect_outcome)

# Save it
drs_df.to_csv("drs_review_events.csv", index=False)
print("✅ DRS commentary lines extracted.")


✅ DRS commentary lines extracted.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drs_df['review_outcome'] = drs_df['lower_comment'].apply(detect_outcome)


In [None]:
############# VENUE INSIGHTS #############

In [95]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Load ODI data
df = pd.read_csv("ODI_Match_Data.csv", low_memory=False)

# Total runs per delivery
df['total_runs'] = df['runs_off_bat'] + df['extras']

# Get only first innings
df_first = df[df['innings'] == 1]

# Group by venue to get average 1st innings scores
venue_stats = df_first.groupby('venue').agg(
    avg_score=('total_runs', 'sum'),
    deliveries=('ball', 'count')
).reset_index()

# Add run rate
venue_stats['run_rate'] = venue_stats['avg_score'] / (venue_stats['deliveries'] / 6)

# Analyze spin vs pace success
spin_keywords = ['caught and bowled', 'bowled', 'lbw']
pace_keywords = ['caught', 'run out']

df_first['wicket_type'] = df_first['wicket_type'].fillna("none")
df_first['is_spin'] = df_first['wicket_type'].str.contains('|'.join(spin_keywords)).astype(int)
df_first['is_pace'] = df_first['wicket_type'].str.contains('|'.join(pace_keywords)).astype(int)

spin_stats = df_first.groupby('venue')[['is_spin', 'is_pace']].sum().reset_index()
venue_stats = venue_stats.merge(spin_stats, on='venue', how='left')

# Normalize
features = ['avg_score', 'run_rate', 'is_spin', 'is_pace']
X = venue_stats[features].fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
venue_stats['cluster'] = kmeans.fit_predict(X_scaled)

# Save
venue_stats.to_csv("venue_clustered_stats.csv", index=False)
print("✅ Clustered venue stats saved.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first['wicket_type'] = df_first['wicket_type'].fillna("none")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first['is_spin'] = df_first['wicket_type'].str.contains('|'.join(spin_keywords)).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_first['is_pace'] = df_first['wicket_type'

✅ Clustered venue stats saved.


In [None]:
##################### MOMENTUM SHIFT ################

In [97]:
import pandas as pd

# Load dataset
df = pd.read_csv("ODI_Match_Data.csv", low_memory=False)

# Use 2nd innings (chasing phase)
df = df[df['innings'] == 2]

# Extract over number
df['over'] = df['ball'].astype(str).str.extract(r'^(\d+)').astype(float)
df['total_runs'] = df['runs_off_bat'] + df['extras']
df['wicket'] = df['player_dismissed'].notna().astype(int)

# Group by match & over
grouped = df.groupby(['match_id', 'over']).agg(
    over_runs=('total_runs', 'sum'),
    wickets=('wicket', 'sum')
).reset_index()

# Run rate per over
grouped['run_rate'] = grouped['over_runs'] / 1.0  # 6 balls = 1 over
grouped['rr_delta'] = grouped.groupby('match_id')['run_rate'].diff().fillna(0)
grouped['wkt_delta'] = grouped.groupby('match_id')['wickets'].diff().fillna(0)

# Label momentum shifts
grouped['momentum_shift'] = ((grouped['rr_delta'].abs() >= 3) | (grouped['wickets'] >= 2)).astype(int)

# Save output
grouped.to_csv("momentum_shift_events.csv", index=False)
print("✅ Momentum shifts extracted.")


✅ Momentum shifts extracted.


In [None]:
######################## LLM SUMMARY ###############

In [99]:
import pandas as pd

# Load commentary dataset
df = pd.read_excel("T20_WC_24_All_Matches_Dataset.xlsx")

# Filter key info
df = df[['matchID', 'over', 'bowlerName', 'batsmanName', 'runs', 'isWicket', 'commentary']]
df = df.dropna(subset=['commentary'])

# Aggregate over-wise summaries
overwise = df.groupby(['matchID', 'over', 'bowlerName']).agg({
    'runs': 'sum',
    'isWicket': 'sum',
    'commentary': lambda x: ' '.join(x)
}).reset_index()

overwise.to_csv("llm_summary_input.csv", index=False)
print("✅ Over-wise data ready for NLP summarization.")


✅ Over-wise data ready for NLP summarization.


In [101]:
from transformers import pipeline
import pandas as pd

# Load aggregated data
df = pd.read_csv("llm_summary_input.csv")

# Load summarization pipeline (BART works well)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def create_summary(row):
    text = f"Over {row['over']} by {row['bowlerName']}. {row['commentary']}"
    try:
        summary = summarizer(text, max_length=50, min_length=15, do_sample=False)[0]['summary_text']
    except:
        summary = "⚠️ Unable to generate summary"
    return summary

# Apply to each row (limit to top 20 for performance)
df['summary'] = df.head(20).apply(create_summary, axis=1)
df[['matchID', 'over', 'bowlerName', 'summary']].to_csv("llm_generated_summaries.csv", index=False)

print("✅ Summaries generated using LLM.")


ModuleNotFoundError: No module named 'transformers'

In [103]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.1-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
   ---------------------------------------- 0.0/10.5 MB ? eta -:--:--
   ---------------------------------------- 0.1/10.5 MB 3.6 MB/s eta 0:00:03
   --- ------------------------------------ 0.9/10.5 MB 10.9 MB/s eta 0:00:01
   --------- ------------------------------ 2.4/10.5 MB 18.8 MB/s eta 0:00:01
   --------------- ------------------------ 4.0/10.5 MB 22.9 MB/s eta 0:00:01
   --------------------- ------------------ 5.5/10.5 MB 25.1 MB/s eta 0:00:01
   ----

In [107]:
from transformers import pipeline


ImportError: Traceback (most recent call last):
  File "C:\Users\ASUS\anaconda3\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

In [105]:
from transformers import pipeline
import pandas as pd

# Load aggregated data
df = pd.read_csv("llm_summary_input.csv")

# Load summarization pipeline (BART works well)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def create_summary(row):
    text = f"Over {row['over']} by {row['bowlerName']}. {row['commentary']}"
    try:
        summary = summarizer(text, max_length=50, min_length=15, do_sample=False)[0]['summary_text']
    except:
        summary = "⚠️ Unable to generate summary"
    return summary

# Apply to each row (limit to top 20 for performance)
df['summary'] = df.head(20).apply(create_summary, axis=1)
df[['matchID', 'over', 'bowlerName', 'summary']].to_csv("llm_generated_summaries.csv", index=False)

print("✅ Summaries generated using LLM.")


ImportError: Traceback (most recent call last):
  File "C:\Users\ASUS\anaconda3\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

In [109]:
import spacy
import pandas as pd

nlp = spacy.load("en_core_web_sm")
df = pd.read_csv("llm_summary_input.csv")

def rule_based_summary(row):
    commentary = row['commentary'].lower()
    bowler = row['bowlerName']
    runs = row['runs']
    wickets = row['isWicket']
    
    if wickets >= 2:
        return f"{bowler} struck twice in over {row['over']}, a game-changer."
    elif runs <= 4:
        return f"Tight over by {bowler}, giving just {runs} runs."
    elif runs >= 15:
        return f"{bowler} leaked {runs} in over {row['over']}, costly phase."
    else:
        return f"Balanced over from {bowler} with {runs} runs."

df['summary'] = df.apply(rule_based_summary, axis=1)
df[['matchID', 'over', 'bowlerName', 'summary']].to_csv("spacy_rule_summaries.csv", index=False)
print("✅ spaCy summaries saved.")


ImportError: cannot import name '_TrimmedRelease' from 'packaging.version' (C:\Users\ASUS\anaconda3\Lib\site-packages\packaging\version.py)

In [111]:
import spacy
nlp = spacy.load("en_core_web_sm")




In [113]:
from transformers import pipeline
import pandas as pd

# Load aggregated data
df = pd.read_csv("llm_summary_input.csv")

# Load summarization pipeline (BART works well)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def create_summary(row):
    text = f"Over {row['over']} by {row['bowlerName']}. {row['commentary']}"
    try:
        summary = summarizer(text, max_length=50, min_length=15, do_sample=False)[0]['summary_text']
    except:
        summary = "⚠️ Unable to generate summary"
    return summary

# Apply to each row (limit to top 20 for performance)
df['summary'] = df.head(20).apply(create_summary, axis=1)
df[['matchID', 'over', 'bowlerName', 'summary']].to_csv("llm_generated_summaries.csv", index=False)

print("✅ Summaries generated using LLM.")


ImportError: Traceback (most recent call last):
  File "C:\Users\ASUS\anaconda3\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

In [115]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
result = summarizer("The bowler delivered a brilliant over conceding only 3 runs and taking a wicket.", max_length=30)
print(result[0]['summary_text'])


ModuleNotFoundError: Could not import module 'pipeline'. Are this object's requirements defined correctly?

In [1]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
result = summarizer(
    "The bowler delivered a brilliant over conceding only 3 runs and taking a wicket.",
    max_length=30,
    min_length=10
)
print("✅ Summary:", result[0]['summary_text'])


config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu
Your max_length is set to 30, but your input_length is only 20. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)


✅ Summary: The bowler delivered a brilliant over conceding only 3 runs and taking a wicket.


In [3]:
# 🔧 Install required packages if not already
# pip install pandas transformers torch openpyxl

import pandas as pd
from transformers import pipeline

# Load dataset
df = pd.read_excel("T20_WC_24_All_Matches_Dataset.xlsx")

# Preprocess: keep key fields and clean
df = df[['matchID', 'over', 'bowlerName', 'runs', 'isWicket', 'commentary']]
df = df.dropna(subset=['commentary'])

# Group by over for summarization
agg_df = df.groupby(['matchID', 'over', 'bowlerName']).agg({
    'runs': 'sum',
    'isWicket': 'sum',
    'commentary': lambda x: ' '.join(x)
}).reset_index()

# Load transformer summarizer (BART)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Generate summaries (limit for speed, or loop all)
def generate_summary(row):
    text = f"Over {int(row['over'])} by {row['bowlerName']}. {row['commentary']}"
    try:
        result = summarizer(text, max_length=40, min_length=15, do_sample=False)
        return result[0]['summary_text']
    except:
        return "⚠️ Summary failed."

agg_df['summary'] = agg_df.head(20).apply(generate_summary, axis=1)  # limit rows if needed
agg_df.to_csv("llm_generated_summaries.csv", index=False)

print("✅ LLM-based summaries saved to 'llm_generated_summaries.csv'")


Device set to use cpu
Your max_length is set to 40, but your input_length is only 31. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)
Your max_length is set to 40, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 40, but your input_length is only 30. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)
Your max_length is set to 40, but your input_length is only 22. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', m

✅ LLM-based summaries saved to 'llm_generated_summaries.csv'


In [5]:
import pandas as pd
from transformers import pipeline

# Load and clean data
df = pd.read_excel("T20_WC_24_All_Matches_Dataset.xlsx")
df = df[['matchID', 'over', 'bowlerName', 'batsmanName', 'isWicket', 'runs', 'commentary']]
df = df.dropna(subset=['commentary'])

# Aggregate per over
grouped = df.groupby(['matchID', 'over', 'bowlerName']).agg({
    'runs': 'sum',
    'isWicket': 'sum',
    'batsmanName': lambda x: ', '.join(set(x)),
    'commentary': lambda x: ' '.join(x)
}).reset_index()

# Load summarizer
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Define sentiment tagger (basic rule-based)
def get_sentiment(text):
    pos_keywords = ["great", "brilliant", "excellent", "dot ball", "wicket", "tight"]
    neg_keywords = ["poor", "expensive", "wide", "no ball", "bad", "costly"]
    text = text.lower()
    if any(word in text for word in pos_keywords):
        return "Positive"
    elif any(word in text for word in neg_keywords):
        return "Negative"
    return "Neutral"

# Define impact rating
def impact(row):
    if row['isWicket'] >= 2 or row['runs'] <= 4:
        return "High"
    elif row['isWicket'] == 1 or row['runs'] <= 8:
        return "Medium"
    else:
        return "Low"

# Generate summaries
def summarize_row(row):
    text = f"Over {int(row['over'])} by {row['bowlerName']} to {row['batsmanName']}. {row['commentary']}"
    try:
        summary = summarizer(text, max_length=40, min_length=15, do_sample=False)
        return summary[0]['summary_text']
    except:
        return "Summary failed"

grouped['summary'] = grouped.head(20).apply(summarize_row, axis=1)
grouped['sentiment'] = grouped['commentary'].apply(get_sentiment)
grouped['impact'] = grouped.apply(impact, axis=1)

# Save final dataset
grouped.to_csv("llm_commentary_enhanced.csv", index=False)
print("✅ Saved: llm_commentary_enhanced.csv with batsman, sentiment, impact.")


Device set to use cpu
Your max_length is set to 40, but your input_length is only 34. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)
Your max_length is set to 40, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)
Your max_length is set to 40, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)
Your max_length is set to 40, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', m

✅ Saved: llm_commentary_enhanced.csv with batsman, sentiment, impact.
