In [6]:
import pandas as pd 
import json 
import os 

# Things which can't be tracked from this data set =  

In [7]:
def getPlayerStats(innings):
    
    # Store player stats in a dictionary
    player_stats = {}
    
    for inning in innings:
        team = inning.get('team')
        overs = inning.get('overs', [])
        
        for over in overs:
            over_number = over.get('over')
            deliveries = over.get('deliveries', [])  # Ensure deliveries is a list
            
            for delivery in deliveries:
                batter = delivery.get('batter')
                runs = delivery.get('runs', {}).get('batter', 0)
                bowler = delivery.get('bowler')
                non_striker = delivery.get('non_striker')
                
                # Initialize stats for the batter if not already present
                if batter not in player_stats:
                    player_stats[batter] = {
                        'team' : team,
                        'total_runs': 0,
                        'wickets': None  # Set as None initially
                    }

                # Update batter's total runs
                player_stats[batter]['total_runs'] += runs
                
                # Check if batter got out on this delivery
                if "wickets" in delivery:
                    # Get the wicket information (if present)
                    dismissal_info = delivery.get("wickets", [])
    
                    # Extract wicket details
                    player_out = dismissal_info[0]["player_out"]
                    dismissal_type = dismissal_info[0]['kind']
                   
                    # Only update if this batter is the one who got out
                    if player_out == batter:
                        player_stats[batter]["wickets"] = {
                            "over": over_number,
                            "bowler": bowler,
                            "dismissal_type": dismissal_type,
                            "player_out": player_out
                        }
    
    return player_stats



In [8]:
#function which flattens json data 
def flatten_json(json_data):
    
    # Extract the basic match info
    match_info = json_data.get('info', {})
    match_date = match_info.get('dates', [None])[0]
    venue = match_info.get('venue')
    teams = match_info.get('teams')

    # Toss details
    toss = match_info.get('toss', {})
    toss_winner = toss.get('winner')
    toss_decision = toss.get('decision')

    # Outcome details
    outcome = match_info.get('outcome', {})
    winner = outcome.get('winner')
    margin = outcome.get('by', None)

    # Get the team which won the toss
    if teams[0] == toss_winner:
        toss_loser = teams[1]
    else:
        toss_loser = teams[0]

    # Get the team which is chasing
    if toss_decision == "field":
        chasing_team = toss_winner
        target_team = toss_loser
    else:
        chasing_team = toss_loser
        target_team = toss_winner

    target_runs = None

    # Iterate through all innings to find the target runs
    for inning in json_data.get('innings', []):
        if inning.get('team') == chasing_team:
            target_info = inning.get('target', {})
            target_runs = target_info.get('runs', None)
            break  # Exit the loop once the correct inning is found
        
        
    # get first 2 strikers from each team 
    players = match_info.get('players')
    
    #For chasing team
    chasing_striker = players.get(chasing_team, [None,None])[0]
    chasing_non_striker = players.get(chasing_team,[None,None])[1]
    
    #For team setting the target 
    target_striker = players.get(target_team,[None,None])[0]
    target_non_striker = players.get(target_team,[None,None])[1]
    
    # return over they got out in and number of runs each / highest partnership
    #find when partnership ends based upon when the striker gets out
    innings = json_data.get('innings', [])
    player_stats = getPlayerStats(innings)
    
        
    #access the team list - get first two names - trace that down to innings - loop through overs until name for striker and or non-striker changes
    # get the over number , combined runs
    # subtract that from total runs to see their impact [if losing opening batsmen has impact on team performance]

    # bowlers - opening bowler how long it takes them to get a wicket on average 
    # track it based upon the changing of either batsmen 
    
    #if the same bowlers are reintroduced later on - impact on team or constantly changin bowlers 
    
    #look at player of the match 
    
    #check how players are most likely to get out
    

    return {
        'match_date': match_date,
        'venue': venue,
        'winner': winner,
        'margin': margin,
        'chasing_team': chasing_team,
        'toss_winner': toss_winner,
        'toss_loser': toss_loser,
        'toss_decision': toss_decision,
        'target_runs': target_runs,
        'target_striker': target_striker,
        'target_non_striker':target_non_striker,
        'chasing_striker': chasing_striker,
        'chasing_non_striker': chasing_non_striker,
        # 'teams':player_stats['team'],
        # 'total_runs':player_stats['total_runs'],
        # 'wickets':player_stats['wickets']
}

    
    
    

In [9]:
folder_path = '/Users/dankhan/Documents/cricketPredictor/ipl_json'
flattened_data_list = []

for filename in os.listdir(folder_path):
    if filename.endswith('.json'):  # Only process JSON files
        with open(os.path.join(folder_path, filename), 'r') as file:
            json_data = json.load(file)
            flattened_data = flatten_json(json_data)
            flattened_data_list.append(flattened_data)
            
            
matches = pd.DataFrame(flattened_data_list)
matches


KeyError: 'team'

In [None]:
matches['toss_winner'].value_counts()

In [None]:
matches["toss_loser"].value_counts()

In [None]:
#pune warriors, rising pune supergiant/s
#deccan chargers
#gujurat lions 
#rcb is times 2 
#dehli capitals and daredevils = same team
#Kochi Tuskers Kerala 



# Chennai Super Kings

# Delhi Capitals

# Gujarat Titans

# Kolkata Knight Riders

# Lucknow Super Giants

# Mumbai Indians

# Punjab Kings

# Rajasthan Royals

# Royal Challengers Bangalore

# Sunrisers Hyderabad


In [None]:
matches.dtypes

# Cleaning Data for ML 

In [None]:
matches["match_date"] = pd.to_datetime(matches['match_date']) 
matches

In [None]:
matches.dtypes

In [None]:
# base winning a match on the toss decision - whether they chose to bowl or bat
matches['decision_code'] = matches["toss_decision"].astype("category").cat.codes

In [None]:
matches.dtypes

In [None]:
matches["opp_code"] = matches["toss_loser"].astype("category").cat.codes

In [None]:
#each opponent now has their own code 
matches

In [None]:
matches["day_code"] = matches["match_date"].dt.day_of_week
matches

In [None]:
import numpy as np

matches["result"] = np.where(matches["toss_winner"] == matches["winner"], 'W', 'L')
matches

In [None]:
matches["target"] = (matches["result"] == "W").astype("int")
matches

# Creating initial machine learning model

In [None]:
from sklearn.ensemble import RandomForestClassifier
#ml model picks up non linearirties in data 

In [None]:
#series of decision trees, each decision different parameters
rf = RandomForestClassifier(n_estimators=500, min_samples_split=10,random_state=1)

In [None]:
train = matches[matches["match_date"] < '2024-01-01']
test = matches[matches["match_date"] > '2024-01-01'] 

In [None]:
predictors = ["decision_code","opp_code","day_code"]

In [None]:
#train a random forest model - with the predictors trying to reach the targer
rf.fit(train[predictors], train["target"])

In [None]:
preds = rf.predict(test[predictors])

In [None]:
#determine accuracy of the model
#metric what % of time did team acc win and vice versa
#what % was prediction acurate
from sklearn.metrics import accuracy_score

In [None]:
acc = accuracy_score(test["target"],preds)

In [None]:
acc

In [None]:
#see which situation our accuracy was high or low
combined = pd.DataFrame(dict(actual = test["target"],prediction = preds))

In [None]:
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

In [None]:
from sklearn.metrics import precision_score

In [None]:
#when we predicted win what % of time did team acc win
precision_score(test["target"], preds)

In [None]:
#team won 38% of the time

# improving precision with rolling averages

In [None]:
grouped_matches = matches.groupby("winner")


In [None]:
group = grouped_matches.get_group("Sunrisers Hyderabad")

In [None]:
group

In [None]:
def rolling_averages(group, cols , new_cols):
    group = group.sort_values("date")
    #removes current match date
    rolling_stats = group[cols].rolling(3,closed = 'left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols) #removes rows with missing bals 
    
    return group 

In [None]:
#distinguishes the margin between number of wickets and runs 
#cols we want to compute rolling averagte for:
#average runs per over 
# first 2 batsmans performance 
#cols i want to computer olling averages for 
cols = [""]



In [None]:
def convert_margin(margin_dict):
    if isinstance(margin_dict, dict):
        if 'runs' in margin_dict:
            return margin_dict['runs'], margin_dict['runs']  # No change if it's runs
        elif 'wickets' in margin_dict:
            return margin_dict['wickets'], margin_dict['wickets'] / 10  # Divide by 10 if it's wickets
    return None, None  # Return None if margin_dict is not a valid dictionary

# Apply the function to the DataFrame
matches[['margin_value', 'needed']] = matches['margin'].apply(lambda x: pd.Series(convert_margin(x)))

# Display the DataFrame with the new columns
matches

In [None]:
cols = []