In [1]:
# Cricket Score Prediction, Death Over Bowler Selection and NLP Analysis

# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, classification_report
import random

# Load ODI Match Data
df = pd.read_csv('ODI_Match_Data.csv')

# Preprocessing
print(df.head())
print(df.info())
df.dropna(inplace=True)

# ---- Machine Learning Part ----

# Score Prediction (First Innings) using Linear Regression
X = df[['current_score', 'wickets', 'overs', 'venue_id', 'batting_team_id']]
y = df['final_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print('Linear Regression MAE:', mean_absolute_error(y_test, y_pred))
print('Linear Regression R2:', r2_score(y_test, y_pred))

plt.scatter(y_test, y_pred)
plt.xlabel('Actual Final Score')
plt.ylabel('Predicted Final Score')
plt.title('First Innings Score Prediction')
plt.show()

# Winning Prediction (Second Innings) using Naive Bayes
X = df[['current_score', 'wickets', 'target', 'overs', 'venue_id']]
y = df['result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

print('Naive Bayes Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Death Over Bowler Selection
bowler_features = ['economy', 'dot_ball_percentage', 'bowling_average', 'wickets']
X = df[bowler_features]
y = df['best_bowler']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

print('Random Forest Accuracy (Bowler Selection):', accuracy_score(y_test, rf_preds))

# ---- NLP Part ----

# Simulate 250 natural commentary lines
nlp = spacy.load('en_core_web_sm')

bowlers = ['Bumrah', 'Starc', 'Malinga', 'Rabada', 'Nortje']
pitch_types = ['yorker', 'bouncer', 'slower ball', 'short ball', 'full toss']
ground_regions = ['midwicket', 'cover', 'long-off', 'third man', 'square leg']

commentary_data = []

for i in range(250):
    bowler = random.choice(bowlers)
    pitch = random.choice(pitch_types)
    ground = random.choice(ground_regions)
    comment = f"{bowler} delivers a {pitch}, batsman mistimes towards {ground}!"
    commentary_data.append(comment)

# NLP Extraction
def extract_from_commentary(comments):
    data = []
    for comment in comments:
        doc = nlp(comment.lower())
        bowler = None
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                bowler = ent.text
        ground_zone = [g for g in ground_regions if g in comment.lower()]
        pitch_zone = [p for p in pitch_types if p in comment.lower()]
        data.append({
            'comment': comment,
            'bowler': bowler,
            'ground_zone': ground_zone,
            'pitch_zone': pitch_zone
        })
    return pd.DataFrame(data)

nlp_df = extract_from_commentary(commentary_data)
print(nlp_df.head())

# Bowler Ranking
bowler_rank = nlp_df['bowler'].value_counts()
print("Top Bowlers for Death Overs:")
print(bowler_rank)

# Visualization
bowler_rank.plot(kind='bar', color='teal')
plt.title('Death Over Specialist Bowlers (Based on NLP Commentary)')
plt.xlabel('Bowler')
plt.ylabel('Number of Successful Deliveries')
plt.show()

# ---- Player Stats Visualization ----

# Top 10 Batsmen
top_batsmen = df.groupby('batsman_name')['runs'].sum().sort_values(ascending=False).head(10)
plt.figure(figsize=(10,5))
sns.barplot(x=top_batsmen.values, y=top_batsmen.index, palette='Blues_d')
plt.title('Top 10 Batsmen by Total Runs')
plt.xlabel('Runs')
plt.ylabel('Batsman')
plt.show()

# Top 10 Bowlers
top_bowlers = df.groupby('bowler_name')['wickets'].sum().sort_values(ascending=False).head(10)
plt.figure(figsize=(10,5))
sns.barplot(x=top_bowlers.values, y=top_bowlers.index, palette='Reds')
plt.title('Top 10 Bowlers by Total Wickets')
plt.xlabel('Wickets')
plt.ylabel('Bowler')
plt.show()


  df = pd.read_csv('ODI_Match_Data.csv')


   match_id   season  start_date                           venue  innings  \
0   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   
1   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   
2   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   
3   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   
4   1389389  2023/24  2023-09-24  Holkar Cricket Stadium, Indore        1   

   ball batting_team bowling_team     striker   non_striker  ... wides  \
0   0.1        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   
1   0.2        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   
2   0.3        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   
3   0.4        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   
4   0.5        India    Australia  RD Gaikwad  Shubman Gill  ...   NaN   

   noballs  byes  legbyes  penalty  wicket_type  player_dismissed  \
0      NaN   NaN      N

KeyError: "None of [Index(['current_score', 'wickets', 'overs', 'venue_id', 'batting_team_id'], dtype='object')] are in the [columns]"

In [5]:
import pandas as pd

df = pd.read_csv('ODI_Match_Data.csv')  # or your local file path

# Show the real column names
print(df.columns.tolist())


  df = pd.read_csv('ODI_Match_Data.csv')  # or your local file path


['match_id', 'season', 'start_date', 'venue', 'innings', 'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type', 'other_player_dismissed', 'cricsheet_id']
