In [1]:
# Basic Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, confusion_matrix, classification_report

# Load the ODI Match Data
df = pd.read_csv('ODI_Match_Data.csv')

# Display first few rows
print(df.head())

# Check column names and types
print(df.info())

# Basic Cleanup (optional, depending on dataset)
df.dropna(inplace=True)


  df = pd.read_csv('ODI_Match_Data.csv')


   match_id   season  start_date                           venue  innings  \
0   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   
1   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   
2   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   
3   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   
4   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   

   ball batting_team bowling_team     striker   non_striker  ... wides  \
0   0.1        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   
1   0.2        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   
2   0.3        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   
3   0.4        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   
4   0.5        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   

   noballs  byes  legbyes  penalty  wicket_type  player_dismissed  \
0      NaN   NaN      N

In [3]:
# Select important features
X = df[['current_score', 'wickets', 'overs', 'venue_id', 'batting_team_id']]  # Adjust based on your data
y = df['final_score']

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and Train the Linear Regression Model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict
y_pred = lr.predict(X_test)

# Evaluate
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('R2 Score:', r2_score(y_test, y_pred))

# Plot actual vs predicted
plt.scatter(y_test, y_pred)
plt.xlabel('Actual Final Score')
plt.ylabel('Predicted Final Score')
plt.title('First Innings Score Prediction')
plt.show()



KeyError: "None of [Index(['current_score', 'wickets', 'overs', 'venue_id', 'batting_team_id'], dtype='object')] are in the [columns]"

In [5]:
# Select important features
features = ['current_score', 'wickets', 'target', 'overs', 'venue_id']
X = df[features]
y = df['result']  # Assume 1 = Win, 0 = Lose

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and Train Naive Bayes Classifier
nb = GaussianNB()
nb.fit(X_train, y_train)

# Predict
y_pred = nb.predict(X_test)

# Evaluate
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


KeyError: "None of [Index(['current_score', 'wickets', 'target', 'overs', 'venue_id'], dtype='object')] are in the [columns]"

In [7]:
# Features for Bowler Selection
bowler_features = ['economy', 'dot_ball_percentage', 'bowling_average', 'wickets']
X = df[bowler_features]
y = df['best_bowler']  # Assuming this is the best bowler label in dataset

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

print('Random Forest Accuracy:', accuracy_score(y_test, rf_preds))

# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)

print('Decision Tree Accuracy:', accuracy_score(y_test, dt_preds))

# SVM
svm = SVC()
svm.fit(X_train, y_train)
svm_preds = svm.predict(X_test)

print('SVM Accuracy:', accuracy_score(y_test, svm_preds))


KeyError: "None of [Index(['economy', 'dot_ball_percentage', 'bowling_average', 'wickets'], dtype='object')] are in the [columns]"

In [9]:
# Top Scorers
top_scorers = df.groupby('batsman_name')['runs'].sum().sort_values(ascending=False).head(10)

plt.figure(figsize=(10,5))
sns.barplot(x=top_scorers.values, y=top_scorers.index, palette='viridis')
plt.title('Top 10 Batsmen by Runs')
plt.xlabel('Total Runs')
plt.ylabel('Batsman')
plt.show()

# Best Bowlers
top_bowlers = df.groupby('bowler_name')['wickets'].sum().sort_values(ascending=False).head(10)

plt.figure(figsize=(10,5))
sns.barplot(x=top_bowlers.values, y=top_bowlers.index, palette='rocket')
plt.title('Top 10 Bowlers by Wickets')
plt.xlabel('Total Wickets')
plt.ylabel('Bowler')
plt.show()


KeyError: 'batsman_name'

In [11]:
# Cricket Score Prediction, Death Over Bowler Selection and NLP Analysis

# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, classification_report
import random

# Load ODI Match Data
df = pd.read_csv('ODI_Match_Data.csv')

# Preprocessing
print(df.head())
print(df.info())
df.dropna(inplace=True)

# ---- Machine Learning Part ----

# Score Prediction (First Innings) using Linear Regression
X = df[['current_score', 'wickets', 'overs', 'venue_id', 'batting_team_id']]
y = df['final_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print('Linear Regression MAE:', mean_absolute_error(y_test, y_pred))
print('Linear Regression R2:', r2_score(y_test, y_pred))

plt.scatter(y_test, y_pred)
plt.xlabel('Actual Final Score')
plt.ylabel('Predicted Final Score')
plt.title('First Innings Score Prediction')
plt.show()

# Winning Prediction (Second Innings) using Naive Bayes
X = df[['current_score', 'wickets', 'target', 'overs', 'venue_id']]
y = df['result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

print('Naive Bayes Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Death Over Bowler Selection
bowler_features = ['economy', 'dot_ball_percentage', 'bowling_average', 'wickets']
X = df[bowler_features]
y = df['best_bowler']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

print('Random Forest Accuracy (Bowler Selection):', accuracy_score(y_test, rf_preds))

# ---- NLP Part ----

# Simulate 250 natural commentary lines
nlp = spacy.load('en_core_web_sm')

bowlers = ['Bumrah', 'Starc', 'Malinga', 'Rabada', 'Nortje']
pitch_types = ['yorker', 'bouncer', 'slower ball', 'short ball', 'full toss']
ground_regions = ['midwicket', 'cover', 'long-off', 'third man', 'square leg']

commentary_data = []

for i in range(250):
    bowler = random.choice(bowlers)
    pitch = random.choice(pitch_types)
    ground = random.choice(ground_regions)
    comment = f"{bowler} delivers a {pitch}, batsman mistimes towards {ground}!"
    commentary_data.append(comment)

# NLP Extraction
def extract_from_commentary(comments):
    data = []
    for comment in comments:
        doc = nlp(comment.lower())
        bowler = None
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                bowler = ent.text
        ground_zone = [g for g in ground_regions if g in comment.lower()]
        pitch_zone = [p for p in pitch_types if p in comment.lower()]
        data.append({
            'comment': comment,
            'bowler': bowler,
            'ground_zone': ground_zone,
            'pitch_zone': pitch_zone
        })
    return pd.DataFrame(data)

nlp_df = extract_from_commentary(commentary_data)
print(nlp_df.head())

# Bowler Ranking
bowler_rank = nlp_df['bowler'].value_counts()
print("Top Bowlers for Death Overs:")
print(bowler_rank)

# Visualization
bowler_rank.plot(kind='bar', color='teal')
plt.title('Death Over Specialist Bowlers (Based on NLP Commentary)')
plt.xlabel('Bowler')
plt.ylabel('Number of Successful Deliveries')
plt.show()

# ---- Player Stats Visualization ----

# Top 10 Batsmen
top_batsmen = df.groupby('batsman_name')['runs'].sum().sort_values(ascending=False).head(10)
plt.figure(figsize=(10,5))
sns.barplot(x=top_batsmen.values, y=top_batsmen.index, palette='Blues_d')
plt.title('Top 10 Batsmen by Total Runs')
plt.xlabel('Runs')
plt.ylabel('Batsman')
plt.show()

# Top 10 Bowlers
top_bowlers = df.groupby('bowler_name')['wickets'].sum().sort_values(ascending=False).head(10)
plt.figure(figsize=(10,5))
sns.barplot(x=top_bowlers.values, y=top_bowlers.index, palette='Reds')
plt.title('Top 10 Bowlers by Total Wickets')
plt.xlabel('Wickets')
plt.ylabel('Bowler')
plt.show()


  df = pd.read_csv('ODI_Match_Data.csv')


   match_id   season  start_date                           venue  innings  \
0   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   
1   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   
2   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   
3   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   
4   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   

   ball batting_team bowling_team     striker   non_striker  ... wides  \
0   0.1        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   
1   0.2        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   
2   0.3        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   
3   0.4        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   
4   0.5        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   

   noballs  byes  legbyes  penalty  wicket_type  player_dismissed  \
0      NaN   NaN      N

KeyError: "None of [Index(['current_score', 'wickets', 'overs', 'venue_id', 'batting_team_id'], dtype='object')] are in the [columns]"

In [15]:
# Load original ball-by-ball dataset
df = pd.read_csv('ODI_Match_Data.csv')

# Calculate total runs per ball
df['total_runs'] = df['runs_off_bat'] + df['extras']

# Create 'over' column
df['over'] = df['ball'].astype(int)

# Create 'is_wicket' column
df['is_wicket'] = df['wicket_type'].notnull().astype(int)

# Summarize for each match, innings, over
summary = df.groupby(['match_id', 'venue', 'innings', 'over', 'batting_team']).agg({
    'total_runs': 'sum',
    'is_wicket': 'sum'
}).reset_index()

# Calculate cumulative runs and cumulative wickets within each innings
summary['cumulative_runs'] = summary.groupby(['match_id', 'innings'])['total_runs'].cumsum()
summary['cumulative_wickets'] = summary.groupby(['match_id', 'innings'])['is_wicket'].cumsum()

# Final score per innings (for supervised learning target)
final_scores = summary.groupby(['match_id', 'innings'])['cumulative_runs'].max().reset_index()
final_scores.rename(columns={'cumulative_runs': 'final_score'}, inplace=True)

# Merge final scores into summary
summary = pd.merge(summary, final_scores, on=['match_id', 'innings'], how='left')

# Create pseudo match result based on innings 1 vs innings 2 score
match_results = final_scores.pivot(index='match_id', columns='innings', values='final_score').reset_index()
match_results.columns = ['match_id', '1st_innings_score', '2nd_innings_score']
match_results['result'] = (match_results['2nd_innings_score'] > match_results['1st_innings_score']).astype(int)  # 1 = Win chasing

# Merge result into main summary
summary = pd.merge(summary, match_results[['match_id', 'result']], on='match_id', how='left')

# View final processed dataset
print(summary.head())


  df = pd.read_csv('ODI_Match_Data.csv')


ValueError: Length mismatch: Expected axis has 5 elements, new values have 3 elements

In [17]:
# Result calculation (Safe version)
match_results = final_scores.pivot(index='match_id', columns='innings', values='final_score').reset_index()

# Filter only matches with 2 innings
match_results = match_results[[col for col in match_results.columns if col in [0, 1, 'match_id']]]

# Rename properly
match_results.columns = ['match_id', '1st_innings_score', '2nd_innings_score']

# Calculate result
match_results['result'] = (match_results['2nd_innings_score'] > match_results['1st_innings_score']).astype(int)

# Merge back
summary = pd.merge(summary, match_results[['match_id', 'result']], on='match_id', how='left')


ValueError: Length mismatch: Expected axis has 2 elements, new values have 3 elements

In [19]:
# Result calculation (Corrected Safe Version)

# Pivot to get 1st and 2nd innings final scores
match_results = final_scores.pivot(index='match_id', columns='innings', values='final_score').reset_index()

# Check pivoted columns first
print("Pivoted columns:", match_results.columns.tolist())

# Rename carefully
# We check first what innings are available (0,1) or (1,2)
# Sometimes innings are numbered differently, so do it carefully

# If innings are 1 and 2
if 1 in match_results.columns and 2 in match_results.columns:
    match_results = match_results[['match_id', 1, 2]]
    match_results.columns = ['match_id', '1st_innings_score', '2nd_innings_score']
elif 0 in match_results.columns and 1 in match_results.columns:
    match_results = match_results[['match_id', 0, 1]]
    match_results.columns = ['match_id', '1st_innings_score', '2nd_innings_score']
else:
    raise ValueError("Unexpected innings numbering! Check your dataset.")

# Now calculate result
match_results['result'] = (match_results['2nd_innings_score'] > match_results['1st_innings_score']).astype(int)

# Merge result back into the summary dataset
summary = pd.merge(summary, match_results[['match_id', 'result']], on='match_id', how='left')

# Done! ✅
print(summary.head())


Pivoted columns: ['match_id', 1, 2, 3, 4]
   match_id                venue  innings  over batting_team  total_runs  \
0     64814  McLean Park, Napier        1     0  New Zealand           2   
1     64814  McLean Park, Napier        1     1  New Zealand           7   
2     64814  McLean Park, Napier        1     2  New Zealand           1   
3     64814  McLean Park, Napier        1     3  New Zealand           2   
4     64814  McLean Park, Napier        1     4  New Zealand           1   

   is_wicket  cumulative_runs  cumulative_wickets  final_score  result  
0          0                2                   0          254       0  
1          0                9                   0          254       0  
2          1               10                   1          254       0  
3          0               12                   1          254       0  
4          0               13                   1          254       0  


In [21]:
# Result calculation (correct method)

# Pivot
match_results = final_scores.pivot(index='match_id', columns='innings', values='final_score').reset_index()

# Let's print pivoted columns to understand
print("Pivoted Columns:", match_results.columns.tolist())

# Now Select Only innings 1 and 2
required_columns = ['match_id']

# Try to select innings 1 and 2 safely
if 1 in match_results.columns and 2 in match_results.columns:
    required_columns += [1, 2]
elif 0 in match_results.columns and 1 in match_results.columns:
    required_columns += [0, 1]
else:
    raise Exception("Cannot find standard innings numbers in data!")

# Create filtered match_results
match_results = match_results[required_columns]

# Rename the columns properly
match_results.columns = ['match_id', '1st_innings_score', '2nd_innings_score']

# Calculate Match Result: 1 if chasing team wins, 0 otherwise
match_results['result'] = (match_results['2nd_innings_score'] > match_results['1st_innings_score']).astype(int)

# Now merge match result into your main summary data
summary = pd.merge(summary, match_results[['match_id', 'result']], on='match_id', how='left')

# Done! 🎯
print(summary.head())


Pivoted Columns: ['match_id', 1, 2, 3, 4]
   match_id                venue  innings  over batting_team  total_runs  \
0     64814  McLean Park, Napier        1     0  New Zealand           2   
1     64814  McLean Park, Napier        1     1  New Zealand           7   
2     64814  McLean Park, Napier        1     2  New Zealand           1   
3     64814  McLean Park, Napier        1     3  New Zealand           2   
4     64814  McLean Park, Napier        1     4  New Zealand           1   

   is_wicket  cumulative_runs  cumulative_wickets  final_score  result_x  \
0          0                2                   0          254         0   
1          0                9                   0          254         0   
2          1               10                   1          254         0   
3          0               12                   1          254         0   
4          0               13                   1          254         0   

   result_y  
0         0  
1         0  
2 

In [23]:
# Corrected Extraction Function
def extract_from_commentary(comments):
    data = []
    for comment in comments:
        comment_lower = comment.lower()
        
        # Manual bowler extraction (before 'delivers')
        if 'delivers' in comment_lower:
            bowler = comment_lower.split('delivers')[0].strip()
        else:
            bowler = None
        
        # Ground and pitch detection
        ground_zone = [g for g in ground_regions if g in comment_lower]
        pitch_zone = [p for p in pitch_types if p in comment_lower]
        
        data.append({
            'comment': comment,
            'bowler': bowler,
            'ground_zone': ground_zone,
            'pitch_zone': pitch_zone
        })
    
    return pd.DataFrame(data)

# Now run this corrected function
nlp_df = extract_from_commentary(commentary_data)
print(nlp_df.head())


NameError: name 'commentary_data' is not defined