**The Dataset used below Contains Players Bowling and Batting Stats from IPL 2016-2022.**



**SOURCE** -- https://www.kaggle.com/datasets/anandkumarsahu09/ipl-player-stats-20162022



**Batting Stats**



POS -- Player's rank based on most runs.

Player -- Player's name

Mat -- Matches played

Inns -- Innings Played

NO -- Number of Not Out in innings

Runs -- Total Runs scored by a player

HS -- Highest Score in innings [* -- Not Out in that Innings]

Avg -- Average

BF -- Balls faced

SR -- Strike Rate

100 -- No of times 100 scored

50 -- No of the times 50 scored

4s -- Total Fours Scored

6s -- Total Sixes Scored



**Bowling Stats**



POS -- Player's rank based on most wickets.

Player -- Player's name

Mat -- Matches played

Inns -- Innings Played

Ov -- Overs

Runs -- Total runs given by bowler

Wkts -- Total Wickets taken

BBI -- Best Bowling in Innings

Avg -- Bowling Average

Econ -- Economy rate

SR -- Strike Rate

4w -- 4 wickets haul

5w -- 5 wickets haul


In [None]:
# Importing necessary libraries
import os
import csv
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

In [None]:
# Function to convert a CSV file data to a list of dictionaries
def csv_to_list_of_dicts(file_path, year):
    """Read a CSV file into a list of row dictionaries tagged with a year.

    Args:
        file_path: Path of the CSV file; its first line supplies the keys.
        year: Value stored under the 'Year' key of every row.

    Returns:
        One dictionary per data row, in file order.
    """
    with open(file_path, newline='') as handle:
        records = list(csv.DictReader(handle))
    # Tag the rows after reading; the file is already closed at this point.
    for record in records:
        record['Year'] = year
    return records


In [None]:
# Function to get a list of CSV files in a folder
def get_csv_files_in_folder(folder_path):
    """Return the paths of the entries in *folder_path* whose name ends with '.csv'.

    Only the folder itself is listed (no recursion); each returned path is
    *folder_path* joined with the entry name, in os.listdir() order.
    """
    return [
        os.path.join(folder_path, entry)
        for entry in os.listdir(folder_path)
        if entry.endswith('.csv')
    ]


In [None]:
# Folder paths for batting and bowling data (absolute Windows paths; adjust for your machine)
folder_path_batting = f"E:\\Movies\\Summer Training Project 2023\\kaggle data set\\IPL Player Stats\\Batting Stats"
folder_path_bowling = f"E:\\Movies\\Summer Training Project 2023\\kaggle data set\\IPL Player Stats\\Bowling Stats"

# Collect the CSV file paths of the batting and bowling folders into
# csv_batting_files and csv_bowling_files respectively
csv_batting_files = get_csv_files_in_folder(folder_path_batting)
csv_bowling_files = get_csv_files_in_folder(folder_path_bowling)

# Initialize empty lists to store all batting and bowling data
all_bowling_data = []
all_batting_data = []

# Iterate through each batting CSV file, extract data, and append to all_batting_data.
# The year is parsed from the file name: the last backslash-separated path component,
# minus its final four characters, split on '_' and converted to int (last part here).
for file_path in csv_batting_files:
    year = int(file_path.split('\\')[-1][:-4].split('_')[-1])
    data = csv_to_list_of_dicts(file_path, year)
    all_batting_data.extend(data)

# Iterate through each bowling CSV file, extract data, and append to all_bowling_data.
# NOTE(review): this loop takes the second '_'-separated part ([1]) while the batting
# loop takes the last ([-1]); confirm both file-naming schemes put the year there.
for file_path in csv_bowling_files:
    year = int(file_path.split('\\')[-1][:-4].split("_")[1])
    data = csv_to_list_of_dicts(file_path, year)
    all_bowling_data.extend(data)


In [None]:
# Display the data in all_batting data
all_batting_data[:]

In [None]:
# Function to display information about the dataset
def info(df):
    """Print a summary table for a non-empty list of row dictionaries.

    Column names and the reported data types come from the first row only.
    The non-null count of a column is the number of rows whose value for
    that column is not None. Nothing is returned.
    """
    first_row = df[0]
    columns = list(first_row.keys())
    type_names = {name: type(first_row[name]).__name__ for name in columns}
    filled = {}
    for name in columns:
        filled[name] = sum(1 for record in df if record[name] is not None)

    # One template keeps the header and the data rows aligned.
    row_template = "{: <20} {: <20} {: <20}"

    print("Data Frame Info:")
    print("Number of Rows:", len(df))
    print("Number of Columns:", len(columns))
    print()

    print(row_template.format("Column Name", "Data Type", "Non-Null Count"))
    print("-" * 60)
    for name in columns:
        print(row_template.format(name, type_names[name], filled[name]))


In [None]:
# Display information about all_batting_data
# NOTE: info() only prints its report and has no return statement, so batting_info is None
batting_info = info(all_batting_data)

In [None]:
info(all_bowling_data)

In [None]:
# Count how many rows each player name has in the batting and in the bowling data
# (rows come from the per-year CSV files, so presumably one row per season played)
player_bat_name_counts = Counter(entry['Player'] for entry in all_batting_data)
player_bowl_name_counts = Counter(entry['Player'] for entry in all_bowling_data)

# Keep only the rows of players with at least 6 occurrences; the batting and
# bowling data are filtered independently, each with its own counts
all_batting_data = [entry for entry in all_batting_data if player_bat_name_counts[entry['Player']] >= 6]
all_bowling_data = [entry for entry in all_bowling_data if player_bowl_name_counts[entry['Player']] >= 6]


In [None]:
# Display the player name of every batting row that was kept by the filter
# (one line per remaining row, so a name is printed once per row it has)
for row in all_batting_data:
    print(row['Player'])

In [None]:
# Display the player name of every bowling row that was kept by the filter
# (one line per remaining row, so a name is printed once per row it has)
for row in all_bowling_data:
    print(row['Player'])

In [None]:
# Display the data in all_bowling data
all_bowling_data[:]

In [None]:
# Display information about all_bowling_data
# NOTE: info() only prints its report and has no return statement, so bowling_info is None
bowling_info = info(all_bowling_data)

In [None]:
# Data Cleaning and Transformation
# NOTE: written so the cell can be re-run on already-cleaned rows: floats survive
# the str() round-trip and keys are dropped with pop(), which tolerates a missing key.

# Batting 'Avg': a lone '-' (presumably "no average available") is stored as 0.0;
# only an exact '-' is replaced, so other values are parsed unchanged
for row in all_batting_data:
    avg_text = str(row['Avg']).strip()
    row['Avg'] = 0.0 if avg_text == "-" else float(avg_text)

# Batting 'HS': remove the '*' not-out marker and store the highest score as a float
for row in all_batting_data:
    row['HS'] = float(str(row['HS']).replace("*", ""))

# Drop the 'POS' key from both datasets and 'BBI' from the bowling rows
for row in all_bowling_data:
    row.pop('POS', None)
    row.pop('BBI', None)

for row in all_batting_data:
    row.pop('POS', None)

In [None]:
info(all_batting_data)

In [None]:
# Function to convert data types to int or float as needed

def convert_data_types(data):
    """Convert numeric-looking string values to int or float, in place.

    For every row dictionary in *data*, a string value containing '.' is
    parsed with float() and any other string with int(). Strings that fail
    to parse keep their original value.

    Values that are not strings are left untouched. That covers numbers
    converted earlier as well as placeholders such as None, which
    csv.DictReader stores for short rows; previously those raised TypeError.

    Args:
        data: List of row dictionaries; modified in place.

    Returns:
        None.
    """
    for row in data:
        # Only existing keys are reassigned, so the dict size never changes
        # while it is being iterated.
        for key, value in row.items():
            if not isinstance(value, str):
                continue
            parser = float if '.' in value else int
            try:
                row[key] = parser(value)
            except ValueError:
                pass  # Not numeric (e.g. a player name): keep the string


In [None]:
# Convert data types in batting and bowling data
convert_data_types(all_batting_data)
convert_data_types(all_bowling_data)

In [None]:
info(all_batting_data)

In [None]:
info(all_bowling_data)

In [None]:
# Function to calculate mean, median, min, max, and count for a given column

def calculate_mean(column):
    """Return the arithmetic mean of *column* (sum divided by length).

    Raises ZeroDivisionError when *column* is empty.
    """
    total = sum(column)
    count = len(column)
    return total / count

def calculate_median(column):
    """Return the median of *column*.

    The values are sorted first. An odd-length input yields its middle
    element; an even-length input yields the mean of its two middle elements.
    """
    ordered = sorted(column)
    middle, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[middle]
    return (ordered[middle - 1] + ordered[middle]) / 2

def calculate_min(column):
    """Return the smallest value in *column* (delegates to the built-in min())."""
    return min(column)

def calculate_max(column):
    """Return the largest value in *column* (delegates to the built-in max())."""
    return max(column)

def calculate_count(column):
    """Return the number of values in *column* (delegates to the built-in len())."""
    return len(column)

In [None]:
# Function to describe the data
def describe(data):
    """Compute summary statistics for every numeric column of *data*.

    Args:
        data: List of row dictionaries. The keys of the first row are taken
            as the column names.

    Returns:
        A dictionary mapping each column name to its 'mean', 'median',
        'min', 'max' and 'count'. A column is left out when its values
        cannot be summed or compared (e.g. strings), or when some row lacks
        the key. Empty input returns an empty dictionary.
    """
    if not data:
        return {}

    statistics = {}
    for column in data[0].keys():  # As all dictionaries have the same keys
        try:
            values = [item[column] for item in data]
            statistics[column] = {
                'mean': calculate_mean(values),
                'median': calculate_median(values),
                'min': calculate_min(values),
                'max': calculate_max(values),
                'count': calculate_count(values)
            }
        except (TypeError, KeyError):
            # Skip only the expected failures (non-numeric column, missing
            # key) so that any other error is no longer silently swallowed.
            continue
    return statistics


In [None]:
# Displaying the stats
statistics_batting = describe(all_batting_data)
print(f"statistics of batting \n{statistics_batting}")

In [None]:
statistics_bowling = describe(all_bowling_data)
print(f"statistics of bowling \n{statistics_bowling}")


The 'Avg' column holds the batting average in the batting dataset and the bowling average in the bowling dataset, defined as follows:

Batting Average = Runs Scored ÷ Times Out                                   

“Runs Scored” – The number of runs scored by the batter.    

“Times Out” – The number of times the batter has been dismissed (by any mode of dismissal, not only caught).
   
---------------------------------------------------
Bowling Average = Runs Conceded ÷ Wickets Taken

“Runs Conceded” is the number of runs conceded by the bowler.

“Wickets Taken” is the number of wickets taken by the bowler.

In [None]:
# Compute Avg_score: the batsman's average runs per innings (total runs divided by innings played),
# and store it in every batting row under the new key 'Avg_score'
def cal_batting_avg_score(player_data):
    """Add an 'Avg_score' entry (runs per innings) to every row, in place.

    Each row must provide numeric 'Runs' and 'Inns' values. A row with zero
    innings gets an 'Avg_score' of 0 instead of triggering a division by zero.
    """
    for record in player_data:
        total_runs, innings_played = record['Runs'], record['Inns']
        record['Avg_score'] = total_runs / innings_played if innings_played != 0 else 0

In [None]:
cal_batting_avg_score(all_batting_data)

In [None]:
info(all_batting_data)

In [None]:
info(all_bowling_data)

In [None]:
# Function to find all rounders and aggregate them.
def aggregate_allrounder_statistics(batdata, bowldata):
    """Join batting and bowling rows of the same player and year.

    Args:
        batdata: Batting rows with numeric 'Avg', 'SR' and 'Avg_score' plus
            'Player' and 'Year'.
        bowldata: Bowling rows with numeric 'Avg' and 'Econ' plus 'Player'
            and 'Year'.

    Returns:
        A list with one dictionary per matching (batting row, bowling row)
        pair, in batting-row order. 'Bat_score' is the mean of batting
        average, strike rate and runs per innings; 'Bowl_score' is the mean
        of bowling average and economy rate.
    """
    # Index the bowling rows by (player, year) once, so each batting row is
    # matched by a dictionary lookup instead of a scan over all of bowldata.
    bowling_by_season = {}
    for bowl_row in bowldata:
        season = (bowl_row["Player"], bowl_row["Year"])
        bowling_by_season.setdefault(season, []).append(bowl_row)

    player_stats = []
    for row in batdata:
        for x in bowling_by_season.get((row["Player"], row["Year"]), ()):
            player_stats.append({
                "Year": row["Year"],
                "Player": row["Player"],
                "Batting_avg": row["Avg"],
                "Strike_rate": row["SR"],
                "Batting_avg_score": row["Avg_score"],
                "Bowling_avg": x["Avg"],
                "Economy_rate": x["Econ"],
                "Bat_score": (row["Avg"] + row["SR"] + row["Avg_score"]) / 3,
                "Bowl_score": (x["Avg"] + x["Econ"]) / 2,
            })
    return player_stats
# Evaluated as the last expression of the cell, presumably to preview the
# aggregated rows; the result is not stored here
aggregate_allrounder_statistics(all_batting_data , all_bowling_data)

In [None]:
all_rounder_data = aggregate_allrounder_statistics(all_batting_data , all_bowling_data)

In [None]:
print(all_rounder_data)

A new column, Overall_score, is added to both datasets for every player and year. It is defined as:

Batting data: the mean of the batting average (Avg), the runs per innings (Avg_score) and the strike rate (SR).

Bowling data: the mean of the bowling average (Avg) and the economy rate (Econ).

In [None]:
# Adding a new column in batting data named as overall_score
def cal_overall_bat_score(player_data):
    """Add an 'Overall_score' entry to every batting row, in place.

    The score is the plain mean of the row's 'Avg', 'SR' and 'Avg_score'.
    """
    for record in player_data:
        record['Overall_score'] = (record['Avg'] + record['SR'] + record['Avg_score']) / 3
        

In [None]:
# Adding a new column in bowling data named as overall_score
def cal_overall_bowl_score(player_data):
    """Add an 'Overall_score' entry to every bowling row, in place.

    The score is the plain mean of the row's 'Avg' and 'Econ'.
    """
    for record in player_data:
        record['Overall_score'] = (record['Avg'] + record['Econ']) / 2
        

In [None]:
cal_overall_bat_score(all_batting_data)
info(all_batting_data)
print("\n")
cal_overall_bowl_score(all_bowling_data)
info(all_bowling_data)

In [None]:
info(all_rounder_data)

In [None]:
for row in all_batting_data:
    print(row['Overall_score'])

In [None]:
# Function to aggregate player statistics over the years, group by players
def aggregate_player_batting_statistics(data):
    """Group the batting metrics of every row by player.

    Args:
        data: Batting rows with 'Player', 'Avg', 'Avg_score', 'SR' and
            'Overall_score'.

    Returns:
        A dictionary mapping each player name to lists under 'Batting Avg',
        'Batting Avg Score', 'Batting Strike Rate' and 'Overall Score', with
        one element per complete row of that player. A row missing any
        metric contributes nothing, so the four lists of a player always
        have the same length; the player still gets an entry.
    """
    # Output label -> key of the source row.
    field_map = {
        'Batting Avg': 'Avg',
        'Batting Avg Score': 'Avg_score',
        'Batting Strike Rate': 'SR',
        'Overall Score': 'Overall_score',
    }
    player_stats = {}
    for player in data:
        collected = player_stats.setdefault(
            player['Player'], {label: [] for label in field_map})
        try:
            # Read every metric before appending any, so an incomplete row
            # cannot leave the lists partially updated.
            values = {label: player[key] for label, key in field_map.items()}
        except KeyError:
            continue
        for label, value in values.items():
            collected[label].append(value)
    return player_stats

def aggregate_player_bowling_statistics(data):
    """Group the bowling metrics of every row by player.

    Args:
        data: Bowling rows with 'Player', 'Econ', 'Avg' and 'Overall_score'.

    Returns:
        A dictionary mapping each player name to lists under
        'Bowling Econ Rate', 'Bowling Avg' and 'Overall Score', with one
        element per complete row of that player. A row missing any metric
        contributes nothing, so the three lists of a player always have the
        same length; the player still gets an entry.
    """
    # Output label -> key of the source row.
    field_map = {
        'Bowling Econ Rate': 'Econ',
        'Bowling Avg': 'Avg',
        'Overall Score': 'Overall_score',
    }
    player_stats = {}
    for player in data:
        collected = player_stats.setdefault(
            player['Player'], {label: [] for label in field_map})
        try:
            # Read every metric before appending any, so an incomplete row
            # cannot leave the lists partially updated.
            values = {label: player[key] for label, key in field_map.items()}
        except KeyError:
            continue
        for label, value in values.items():
            collected[label].append(value)
    return player_stats


def aggregate_player_allrounder_statistics(data):
    """Group the all-rounder metrics of every row by player.

    Args:
        data: All-rounder rows with 'Player', 'Batting_avg',
            'Batting_avg_score', 'Strike_rate', 'Economy_rate',
            'Bowling_avg', 'Bat_score' and 'Bowl_score'.

    Returns:
        A dictionary mapping each player name to lists under 'Batting Avg',
        'Batting Avg Score', 'Batting Strike Rate', 'Bowling Econ Rate',
        'Bowling Avg', 'Bat Score' and 'Bowl Score', with one element per
        complete row of that player. A row missing any metric contributes
        nothing, so the lists of a player always have the same length; the
        player still gets an entry.
    """
    # Output label -> key of the source row.
    field_map = {
        'Batting Avg': 'Batting_avg',
        'Batting Avg Score': 'Batting_avg_score',
        'Batting Strike Rate': 'Strike_rate',
        'Bowling Econ Rate': 'Economy_rate',
        'Bowling Avg': 'Bowling_avg',
        'Bat Score': 'Bat_score',
        'Bowl Score': 'Bowl_score',
    }
    player_stats = {}
    for player in data:
        collected = player_stats.setdefault(
            player['Player'], {label: [] for label in field_map})
        try:
            # Read every metric before appending any, so an incomplete row
            # cannot leave the lists partially updated.
            values = {label: player[key] for label, key in field_map.items()}
        except KeyError:
            continue
        for label, value in values.items():
            collected[label].append(value)
    return player_stats

In [None]:
# Aggregate player statistics for batting and bowling
aggregated_batting_player_stats = aggregate_player_batting_statistics(all_batting_data)
aggregated_bowling_player_stats = aggregate_player_bowling_statistics(all_bowling_data)
aggregated_allrounder_player_stats = aggregate_player_allrounder_statistics(all_rounder_data)

In [None]:
print(aggregated_bowling_player_stats)

In [None]:
print(aggregated_batting_player_stats)

In [None]:
print(aggregated_allrounder_player_stats)

In [None]:
# Function to calculate player rankings based on a given performance metric
def calculate_player_rankings(player_stats, metric):
    """Rank players by the mean of one metric, highest mean first.

    Args:
        player_stats: Dictionary mapping a player name to a dictionary of
            metric lists.
        metric: Name of the metric list to average.

    Returns:
        A list of (player, mean) tuples sorted by descending mean. Players
        whose list for *metric* is empty are left out; ties keep the
        iteration order of *player_stats*.
    """
    averaged = []
    for player, values in player_stats.items():
        samples = values[metric]
        if len(samples) == 0:
            continue
        averaged.append((player, sum(samples) / len(samples)))

    averaged.sort(key=lambda pair: pair[1], reverse=True)
    return averaged

In [None]:
# Calculate rankings for batting and bowling metrics
batting_avg_rankings = calculate_player_rankings(aggregated_batting_player_stats, 'Batting Avg')
batting_avg_score_rankings = calculate_player_rankings(aggregated_batting_player_stats, 'Batting Avg Score')
batting_sr_rankings = calculate_player_rankings(aggregated_batting_player_stats, 'Batting Strike Rate')

bowling_econ_rankings = calculate_player_rankings(aggregated_bowling_player_stats, 'Bowling Econ Rate')
bowling_avg_rankings = calculate_player_rankings(aggregated_bowling_player_stats, 'Bowling Avg')

allrounder_bat_rankings = calculate_player_rankings(aggregated_allrounder_player_stats , 'Bat Score')
allrounder_bowl_rankings = calculate_player_rankings(aggregated_allrounder_player_stats , 'Bowl Score')


In [None]:
print(bowling_econ_rankings)

In [None]:
# Function to get top N players from the rankings
def get_top_players(rankings, N = 40):
    """Return the first *N* entries of *rankings* (all of them if it has fewer)."""
    return rankings[:N]
def get_bottom_players(rankings, N = 20):
    """Return the last *N* entries of *rankings* (all of them if it has fewer).

    A non-positive *N* returns an empty slice. Without this guard, N = 0
    would return the whole sequence, because rankings[-0:] is the same as
    rankings[0:].
    """
    if N <= 0:
        return rankings[:0]
    return rankings[-N:]

In [None]:
# Build the leaderboards from the descending rankings.
# Batting metrics: the first 40 entries (the highest values), listed highest first.
top_batting_avg = sorted(set(get_top_players(batting_avg_rankings)), key=lambda x: x[1], reverse=True)
top_batting_avg_score = sorted(set(get_top_players(batting_avg_score_rankings)), key=lambda x: x[1], reverse=True)
top_batting_sr = sorted(set(get_top_players(batting_sr_rankings)), key=lambda x: x[1], reverse=True)
# Bowling metrics: the last 20 entries (the lowest values), listed lowest first.
top_bowling_econ = sorted(set(get_bottom_players(bowling_econ_rankings)), key=lambda x: x[1], reverse=False)
top_bowling_avg = sorted(set(get_bottom_players(bowling_avg_rankings)), key=lambda x: x[1], reverse=False)

In [None]:
# All-rounder leaderboards (up to 40 players each). The rankings hold one unique
# (player, value) tuple per player, so no set() de-duplication is needed; dropping
# it also keeps the order of tied values the same on every run.
# Bat score: the first entries of the descending ranking, highest first.
top_allrounder_batting = sorted(get_top_players(allrounder_bat_rankings), key=lambda x: x[1], reverse=True)
# Bowl score: listed lowest first, so take the tail of the descending ranking
# (as the bowler leaderboards do) rather than its head, which holds the highest scores.
top_allrounder_bowling = sorted(get_bottom_players(allrounder_bowl_rankings, 40), key=lambda x: x[1], reverse=False)

In [None]:
print(top_batting_avg)

In [None]:
print(top_batting_avg_score)

In [None]:
print(top_bowling_econ)

In [None]:
# Print every leaderboard built above: heading with the number of entries,
# then one "Player: ..., <label>: <value>" line per entry.
leaderboards = [
    ("Batting Averages", "Batting Avg", top_batting_avg),
    ("Batting Average Scores", "Batting Avg Score", top_batting_avg_score),
    ("Batting Strike Rates", "Batting Strike Rate", top_batting_sr),
    ("Bowling Economy Rates", "Bowling Econ Rate", top_bowling_econ),
    ("Bowling Averages", "Bowling Avg", top_bowling_avg),
    ("All Rounder Batting Score Averages", "Batting Score Avg", top_allrounder_batting),
    ("All Rounder Bowling Score Averages", "Bowling Score Avg", top_allrounder_bowling),
]

for heading, label, leaderboard in leaderboards:
    print(f"\n----- Top {len(leaderboard)} {heading} -----\n")
    for player, value in leaderboard:
        print(f"Player: {player}, {label}: {value:.2f}")



In [None]:
# sum1 = 0
# allrounder_count_bat = 0
# allrounder_count_bowl = 0
# for name , value in top_allrounder_batting :
#     allrounder_count_bat += 1
#     sum1 += value
# mean_allrounder_bat = sum1/ allrounder_count_bat
# sum1 = 0
# for name , value in top_allrounder_bowling :
#     allrounder_count_bowl += 1
#     sum1 += value
# mean_allrounder_bowl = sum1 / allrounder_count_bowl

# print(mean_allrounder_bat)
# print(allrounder_count_bat)
# print(allrounder_count_bowl)
# print(mean_allrounder_bowl)

In [None]:
# Extract only the player names from each (name, value) leaderboard;
# for the all-rounder leaderboards only the first 12 entries are considered
players_set1 = {name for name, _ in top_batting_avg}
players_set2 = {name for name, _ in top_batting_avg_score}
players_set3 = {name for name, _ in top_batting_sr}
players_set4 = {name for name, _ in top_bowling_econ}
players_set5 = {name for name, _ in top_bowling_avg}
players_set6 = {name for name, _ in top_allrounder_batting[:12]}
players_set7 = {name for name, _ in top_allrounder_bowling[:12]}

# Intersect the name sets per role: a batsman must appear in all three batting
# leaderboards, a bowler in both bowling leaderboards, and an all-rounder in
# both truncated all-rounder leaderboards
overall_top_batsman = players_set1.intersection(players_set2, players_set3)
overall_top_bowler = players_set4.intersection(players_set5)
overall_top_allrounder = players_set6.intersection(players_set7)

print(overall_top_batsman)
print(len(overall_top_batsman))

In [None]:
print(overall_top_allrounder)
print(len(overall_top_allrounder))

In [None]:
print(overall_top_bowler)
print(len(overall_top_bowler))

In [None]:
# Function to plot a bar chart of one metric (averaged over each player's seasons) against the players
def plot_performance_allplayers(player_names, performance_metric, data, ylabel):
    """Draw a bar chart of one metric's per-player average.

    Args:
        player_names: Players to plot, in x-axis order.
        performance_metric: Key of the value list to average for each player.
        data: Dictionary mapping a player name to a dictionary of value lists.
        ylabel: Label of the y-axis.
    """
    plt.figure(figsize=(12, 8))  # Larger figure for better visibility

    # Mean of the metric over each player's recorded values
    average_values = []
    for player in player_names:
        samples = data[player][performance_metric]
        average_values.append(sum(samples) / len(samples))

    bars = plt.bar(player_names, average_values, alpha=0.7, width=0.8)
    plt.xlabel('Player Names')
    plt.ylabel(ylabel)
    plt.title(f'Average of 7 Years of {performance_metric} for Players')
    plt.xticks(rotation=45, ha='right')  # Tilt x-axis labels

    # Write each value just above the top of its bar
    for bar, value in zip(bars, average_values):
        label_x = bar.get_x() + bar.get_width() / 2
        plt.text(label_x, bar.get_height(), f'{value:.2f}', ha='center', va='bottom', fontsize=6)

    plt.show()

# Turn the name sets into sorted lists: the iteration order of a set of strings can
# differ between interpreter runs, so sorting keeps the plot and regression order
# reproducible
batsman_names_list = sorted(overall_top_batsman)
bowler_names_list = sorted(overall_top_bowler)
allrounder_names_list = sorted(overall_top_allrounder)

In [None]:
# No. of Top common Batsman in all metric
len(batsman_names_list)

In [None]:
# Graph in Batting Average vs Players
plot_performance_allplayers(batsman_names_list, "Batting Avg", aggregated_batting_player_stats, 'Average Batting Average')

In [None]:
plot_performance_allplayers(batsman_names_list, "Overall Score", aggregated_batting_player_stats, 'Overall Batting Score')

In [None]:
# Graph of Batting Average Score vs Players
plot_performance_allplayers(batsman_names_list, "Batting Avg Score", aggregated_batting_player_stats, 'Average Batting Average Score')

In [None]:
# Graph in Strike Rate vs Players
plot_performance_allplayers(batsman_names_list, "Batting Strike Rate", aggregated_batting_player_stats, 'Average Batting Strike Rate')

In [None]:
plot_performance_allplayers(bowler_names_list, "Overall Score", aggregated_bowling_player_stats, 'Overall Bowling Score')

In [None]:
# Graph of Bowling Average vs Players
plot_performance_allplayers(bowler_names_list, "Bowling Avg", aggregated_bowling_player_stats, 'Average Bowling Average')

In [None]:
# Graph in Economy Rate vs Players
plot_performance_allplayers(bowler_names_list, "Bowling Econ Rate", aggregated_bowling_player_stats, 'Average Bowling Economy Rate')

In [None]:
plot_performance_allplayers(allrounder_names_list, "Bat Score", aggregated_allrounder_player_stats, 'Average Batting Average - All Rounders')

In [None]:
plot_performance_allplayers(allrounder_names_list, "Bowl Score", aggregated_allrounder_player_stats, 'Average Bowling Average - All Rounders')

In [None]:
# Function to plot player's performance over the years for multiple metrics
# or Historical Performance
def plot_player_performance(player_name, performance_metrics, data, ylabel):
    """Plot one player's metrics against the year, one line per metric.

    Args:
        player_name: Value of 'Player' selecting the rows to plot.
        performance_metrics: Row keys to plot; each becomes one labelled line.
        data: List of row dictionaries with 'Player', 'Year' and the metric keys.
        ylabel: Label of the y-axis.
    """
    player_rows = [entry for entry in data if entry['Player'] == player_name]
    years = [entry['Year'] for entry in player_rows]
    metric_values = {
        metric: [entry[metric] for entry in player_rows]
        for metric in performance_metrics
    }

    plt.figure(figsize=(12, 8))  # Larger figure for better visibility

    for metric, values in metric_values.items():
        plt.plot(years, values, marker='o', label=metric)
        # Annotate every data point with its value
        for year, value in zip(years, values):
            plt.text(year, value, f'{value:.2f}', ha='center', va='bottom', fontsize=10, color='black')

    plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for readability
    plt.xlabel('Year')
    plt.ylabel(ylabel)
    plt.title(f'{player_name} Performance over the Years')
    plt.legend()  # One legend entry per metric
    plt.grid(True)
    plt.show()


# Row keys to plot for each kind of player; they are passed to
# plot_player_performance as performance_metrics and looked up in every row
performance_batting_metrics = ["Avg", "SR", "Avg_score" ]
performance_bowling_metrics = ["Econ", "Avg"]
performance_allrounder_metrics = ["Bat_score" , "Bowl_score"]

In [None]:
# Plot the graphs for batsman using the actual data directly
for player_name in overall_top_batsman:
    plot_player_performance(player_name, performance_batting_metrics, all_batting_data, 'Performance')

In [None]:
# Plot the graphs in bowlers using the actual data directly
for player_name in overall_top_bowler:
    plot_player_performance(player_name, performance_bowling_metrics, all_bowling_data, 'Performance')

In [None]:
for player_name in overall_top_allrounder:
    plot_player_performance(player_name, performance_allrounder_metrics, all_rounder_data, 'Performance')

In [None]:
batsman_names_list[:]

In [None]:
# Per-player linear trend (least-squares line of metric vs. year) for each batting
# metric, with one scatter + trend-line figure per player and metric
# Create a dictionary to store player-wise data
batsman_data_dict = {}

# Collect each selected batsman's years and metric values from the batting rows
for player_name in batsman_names_list:
    years = [entry['Year'] for entry in all_batting_data if entry['Player'] == player_name]
    batting_avg_values = [entry['Avg'] for entry in all_batting_data if entry['Player'] == player_name]
    sr_values = [entry['SR'] for entry in all_batting_data if entry['Player'] == player_name]
    batting_avg_score = [entry['Avg_score'] for entry in all_batting_data if entry['Player'] == player_name]
    
    batsman_data_dict[player_name] = {'Year': years, 'Batting Avg': batting_avg_values, 'Strike Rate': sr_values, 'Batting Avg Score': batting_avg_score}

# Perform linear regression for each player and each metric
positive_trend_batsman = []
for player_name, player_data in batsman_data_dict.items():
    X = np.array(player_data['Year'])
    metrics = ['Batting Avg', 'Strike Rate', 'Batting Avg Score']
    
    for metric in metrics:
        Y = np.array(player_data[metric])

        # Calculate the mean of X and Y
        mean_x = np.mean(X)
        mean_y = np.mean(Y)

        # Calculate the slope (coefficient): sum of co-deviations over sum of squared X deviations
        # NOTE: den is 0 when all of a player's years are identical (e.g. a single row)
        num = np.sum((X - mean_x) * (Y - mean_y))
        den = np.sum((X - mean_x) ** 2)
        slope = num / den

        # Record the player when the slope exceeds -1. The name is appended once per
        # passing metric, so it can appear up to three times in the list.
        # NOTE(review): the threshold is -1, not 0, so mildly declining trends also
        # count as "positive" - confirm this tolerance is intended.
        if slope > -1:
            positive_trend_batsman.append(player_name)

        # Calculate the intercept
        intercept = mean_y - slope * mean_x

        # Print the slope and intercept for each player and metric
        print(f"Player: {player_name}, Metric: {metric}, Slope (Coefficient): {slope}, Intercept: {intercept}")

        # Create the linear regression line
        regression_line = slope * X + intercept

        # Plot the actual data points and the linear regression line for each player and metric
        plt.figure(figsize=(12, 8))
        plt.scatter(X, Y, label=metric)
        plt.plot(X, regression_line, color='red', label='Linear Regression Line')
        plt.xlabel('Year')
        plt.ylabel(metric)
        plt.title(f'{metric} Trend for {player_name}')
        plt.legend()
        plt.grid(True)
        plt.show()

In [None]:
# Find the unique batsmen whose trend check passed for at least two metrics.
# positive_trend_batsman holds one entry per passing metric, so counting the
# entries once with Counter replaces a list.count() call per element.
trend_hits = Counter(positive_trend_batsman)
batsman_with_positive_trend = {player for player, hits in trend_hits.items() if hits >= 2}
print("Batsman with a Positive Trend in at least Two Metrics:")
print("\n" , len(batsman_with_positive_trend) , "\n")
for i in batsman_with_positive_trend:
    print(i)

In [None]:
# Per-player linear trend (least-squares line of metric vs. year) for each bowling
# metric, with one scatter + trend-line figure per player and metric
# Create a dictionary to store player-wise data
bowler_data_dict = {}

# Collect each selected bowler's years and metric values from the bowling rows
for player_name in bowler_names_list:
    years = [entry['Year'] for entry in all_bowling_data if entry['Player'] == player_name]
    bowling_avg_values = [entry['Avg'] for entry in all_bowling_data if entry['Player'] == player_name]
    econ_rates = [entry['Econ'] for entry in all_bowling_data if entry['Player'] == player_name]

    bowler_data_dict[player_name] = {'Year': years, 'Bowling Avg': bowling_avg_values, 'Economy Rate': econ_rates}

# Perform linear regression for each player and each metric
positive_trend_bowler = []
for player_name, player_data in bowler_data_dict.items():
    X = np.array(player_data['Year'])
    metrics = ['Bowling Avg', 'Economy Rate']

    for metric in metrics:
        Y = np.array(player_data[metric])

        # Calculate the mean of X and Y
        mean_x = np.mean(X)
        mean_y = np.mean(Y)

        # Calculate the slope (coefficient): sum of co-deviations over sum of squared X deviations
        num = np.sum((X - mean_x) * (Y - mean_y))
        den = np.sum((X - mean_x) ** 2)
        slope = num / den

        # For bowling average and economy rate a lower value is better, so a good
        # trend is a falling (or at most mildly rising) line. This uses the same
        # rule as the bowlers' overall-score trend check (slope < 1); the previous
        # test, slope > -1, was the batting rule and excluded the bowlers who
        # improved the most.
        if slope < 1:
            positive_trend_bowler.append(player_name)

        # Calculate the intercept
        intercept = mean_y - slope * mean_x

        # Print the slope and intercept for each player and metric
        print(f"Player: {player_name}, Metric: {metric}, Slope (Coefficient): {slope}, Intercept: {intercept}")

        # Create the linear regression line
        regression_line = slope * X + intercept

        # Plot the actual data points and the linear regression line for each player and metric
        plt.figure(figsize=(12, 8))
        plt.scatter(X, Y, label=metric)
        plt.plot(X, regression_line, color='red', label='Linear Regression Line')
        plt.xlabel('Year')
        plt.ylabel(metric)
        plt.title(f'{metric} Trend for {player_name}')
        plt.legend()
        plt.grid(True)
        plt.show()

In [None]:
# Find the unique bowlers whose trend check passed for both bowling metrics.
# positive_trend_bowler holds one entry per passing metric, so counting the
# entries once with Counter replaces a list.count() call per element.
trend_hits = Counter(positive_trend_bowler)
bowler_with_positive_trend = {player for player, hits in trend_hits.items() if hits >= 2}
print("Bowler with a Positive Trend in both the Two Metrics:")
print("\n" , len(bowler_with_positive_trend))
for i in bowler_with_positive_trend:
    print(i)

In [None]:
bowler_with_positive_trend.intersection(overall_top_allrounder)

In [None]:
# Function to calculate overall score for a batsman based on selected metrics
def calculate_batsman_score(player_data):
    """Return a batsman's overall score: the equal-weight mean of three metric means.

    *player_data* must provide value lists under 'Batting Avg Score',
    'Strike Rate' and 'Batting Avg'; each list is averaged with np.mean first.
    """
    metric_means = [
        np.mean(player_data[key])
        for key in ('Batting Avg Score', 'Strike Rate', 'Batting Avg')
    ]
    # Equal weights; adjust this expression to prioritise a metric
    return (metric_means[0] + metric_means[1] + metric_means[2]) / 3

# Calculate overall score for each batsman and create a list of tuples (player_name, overall_score)
batsman_scores = [(player_name, calculate_batsman_score(player_data)) for player_name, player_data in batsman_data_dict.items() if player_name in batsman_with_positive_trend]

# Sort the batsmen based on their overall scores in descending order
sorted_batsmen = sorted(batsman_scores, key=lambda x: x[1], reverse=True)

# Display the combined batsman sorted list
print("Combined Overall Score of 7 Years Batsman Sorted List:")
for player_name, score in sorted_batsmen:
    print(f"Player: {player_name}, Overall Score: {score:.2f}")

In [None]:
print(sorted_batsmen)

In [None]:
# Linear trend of the per-season Overall_score for the shortlisted batsmen
# NOTE: this cell rebinds batsman_data_dict and positive_trend_batsman, replacing the
# per-metric data built earlier; cells that read the old keys (e.g. 'Batting Avg Score')
# must be run before this one
batsman_data_dict = {}

# Collect the years and Overall_score values of every player in sorted_batsmen
for player_name , player_score in sorted_batsmen:
    years = [entry['Year'] for entry in all_batting_data if entry['Player'] == player_name]
    overall_score = [entry['Overall_score'] for entry in all_batting_data if entry['Player'] == player_name]
    
    batsman_data_dict[player_name] = {'Year': years, 'Overall_score': overall_score}

# Perform linear regression of Overall_score against the year for each player
positive_trend_batsman = []
for player_name, player_data in batsman_data_dict.items():
        X = np.array(player_data['Year'])
        metric = 'Overall_score'
    
        Y = np.array(player_data[metric])

        # Calculate the mean of X and Y
        mean_x = np.mean(X)
        mean_y = np.mean(Y)

        # Calculate the slope (coefficient): sum of co-deviations over sum of squared X deviations
        num = np.sum((X - mean_x) * (Y - mean_y))
        den = np.sum((X - mean_x) ** 2)
        slope = num / den

        # Record the player when the slope exceeds -1
        # NOTE(review): the threshold is -1, not 0, so mildly declining trends also
        # count as "positive" - confirm this tolerance is intended
        if slope > -1:
            positive_trend_batsman.append(player_name)

        # Calculate the intercept
        intercept = mean_y - slope * mean_x

        # Print the slope and intercept for each player and metric
        print(f"Player: {player_name}, Metric: {metric}, Slope (Coefficient): {slope}, Intercept: {intercept}")

        # Create the linear regression line
        regression_line = slope * X + intercept

        # Plot the actual data points and the linear regression line for each player
        plt.figure(figsize=(12, 8))
        plt.scatter(X, Y, label=metric)
        plt.plot(X, regression_line, color='red', label='Linear Regression Line')
        plt.xlabel('Year')
        plt.ylabel(metric)
        plt.title(f'{metric} Batting Trend for {player_name}')
        plt.legend()
        plt.grid(True)
        plt.show()

In [None]:
# Combine a bowler's economy rate and bowling average into a single score
def calculate_bowler_score(player_data):
    """Return the midpoint of the mean economy rate and the mean bowling average.

    player_data must provide 'Economy Rate' and 'Bowling Avg' entries whose
    values np.mean can average.
    """
    component_means = [
        np.mean(player_data[key]) for key in ('Economy Rate', 'Bowling Avg')
    ]
    return sum(component_means) / 2

# Score every bowler listed in bowler_with_positive_trend, producing
# (player_name, overall_score) pairs
bowler_scores = [
    (player_name, calculate_bowler_score(player_data))
    for player_name, player_data in bowler_data_dict.items()
    if player_name in bowler_with_positive_trend
]

# Rank the bowlers in ascending order of score (smallest score first)
sorted_bowlers = list(bowler_scores)
sorted_bowlers.sort(key=lambda pair: pair[1])

# Display the ranked bowler list
print("Combined Bowler Sorted List:")
for player_name, score in sorted_bowlers:
    print(f"Player: {player_name}, Overall Score: {score:.2f}")

In [None]:
# Gather, for every ranked bowler, the seasons played and the matching
# per-season overall scores. sorted_bowlers yields (player_name, score) pairs;
# only the name is needed here.
bowler_data_dict = {}

for player_name, player_score in sorted_bowlers:
    # Filter the season records once so that years and scores stay aligned
    player_rows = [entry for entry in all_bowling_data if entry['Player'] == player_name]
    years = [entry['Year'] for entry in player_rows]
    overall_score = [entry['Overall_score'] for entry in player_rows]

    bowler_data_dict[player_name] = {'Year': years, 'Overall_score': overall_score}

# Fit a least-squares line (Overall_score against Year) for each player
positive_trend_bowler = []
metric = 'Overall_score'
for player_name, player_data in bowler_data_dict.items():
    # A line cannot be fitted through fewer than two distinct seasons: the
    # denominator of the slope would be 0 and the result would be NaN.
    if len(set(player_data['Year'])) < 2:
        print(f"Player: {player_name}, Metric: {metric}, skipped: fewer than two distinct seasons")
        continue

    X = np.array(player_data['Year'])
    Y = np.array(player_data[metric])

    # Calculate the mean of X and Y
    mean_x = np.mean(X)
    mean_y = np.mean(Y)

    # Slope of the least-squares line: covariance(X, Y) / variance(X)
    num = np.sum((X - mean_x) * (Y - mean_y))
    den = np.sum((X - mean_x) ** 2)
    slope = num / den

    # Keep the player when the slope is below 1.
    # NOTE(review): the threshold is 1 rather than 0, so mildly rising scores
    # are also recorded in positive_trend_bowler - confirm this is intended.
    if slope < 1:
        positive_trend_bowler.append(player_name)

    # The fitted line passes through (mean_x, mean_y)
    intercept = mean_y - slope * mean_x

    # Print the slope and intercept for each player
    print(f"Player: {player_name}, Metric: {metric}, Slope (Coefficient): {slope}, Intercept: {intercept}")

    # Evaluate the fitted line at the observed years
    regression_line = slope * X + intercept

    # Plot the actual data points together with the fitted line
    plt.figure(figsize=(12, 8))
    plt.scatter(X, Y, label=metric)
    plt.plot(X, regression_line, color='red', label='Linear Regression Line')
    plt.xlabel('Year')
    plt.ylabel(metric)
    plt.title(f'{metric} Bowling Trend for {player_name}')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# Extract only the player names from the ranked batsman list
batsman_names_sorted = [player_name for player_name, _ in sorted_batsmen]

# Extract only the player names from the ranked bowler list
bowler_names_sorted = [player_name for player_name, _ in sorted_bowlers]


def _take_unpicked(candidates, count, already_picked):
    """Return the first `count` names of `candidates` not yet in `already_picked`.

    Every returned name is added to `already_picked`, so a player who appears
    in more than one ranked list is selected only once.
    """
    chosen = []
    for candidate in candidates:
        if len(chosen) == count:
            break
        if candidate not in already_picked:
            chosen.append(candidate)
            already_picked.add(candidate)
    return chosen


# Names selected so far, shared by the team and the substitutes
_selected_names = set()

# Playing team: the top 6 batsmen followed by the top 5 bowlers
team = (
    _take_unpicked(batsman_names_sorted, 6, _selected_names)
    + _take_unpicked(bowler_names_sorted, 5, _selected_names)
)

# Substitutes: the next 4 batsmen, the next 3 bowlers, and 'MS Dhoni'
# (hard-coded; added only if he has not been selected already)
substitute = (
    _take_unpicked(batsman_names_sorted, 4, _selected_names)
    + _take_unpicked(bowler_names_sorted, 3, _selected_names)
    + _take_unpicked(['MS Dhoni'], 1, _selected_names)
)

# Display the names of the players in the team
print("Team Selection:")
for i, player_name in enumerate(team, 1):
    print(f"{i}. {player_name}")

# Display the names of the substitute players
print("\nSubstitute Selection:")
for i, player_name in enumerate(substitute, 1):
    print(f"{i}. {player_name}")

In [None]:
# Names present in both top_batting_avg and top_batting_avg_score
y = {name for name, _ in top_batting_avg} & {name for name, _ in top_batting_avg_score}

In [None]:
# Show the set y of names common to top_batting_avg and top_batting_avg_score
print(y)

In [None]:
# Print every team member whose name is a key of aggregated_allrounder_player_stats
for row in aggregated_allrounder_player_stats:
    for name in team:
        if row != name:
            continue
        print(name)

In [None]:
# Team members that also appear in overall_top_allrounder
set(team) & set(overall_top_allrounder)

In [None]:
# Substitutes that also appear in overall_top_allrounder
set(substitute) & set(overall_top_allrounder)

In [None]:
# Ranked bowlers that also appear in overall_top_allrounder
set(bowler_names_sorted) & set(overall_top_allrounder)

In [None]:
# Ranked batsmen that also appear in overall_top_allrounder
set(batsman_names_sorted) & set(overall_top_allrounder)

In [None]:
# Inspect the full contents of overall_top_allrounder
print(overall_top_allrounder)

In [None]:
# Inspect the full contents of aggregated_allrounder_player_stats
print(aggregated_allrounder_player_stats)

In [None]:
# Show the first five entries of top_batting_sr
print(top_batting_sr[:5])