In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
# Custom transformer to preserve column names with SimpleImputer
class NamedSimpleImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy="mean", fill_value=None):
        self.strategy = strategy
        self.fill_value = fill_value
        self.imputer = SimpleImputer(strategy=self.strategy, fill_value=self.fill_value)
    
    def fit(self, X, y=None):
        self.imputer.fit(X, y)
        return self
    
    def transform(self, X):
        X_filled = self.imputer.transform(X)
        return pd.DataFrame(X_filled, columns=X.columns)

In [3]:
# Load the datasets
batsman_data = pd.read_csv('C:/Users/Balu/Desktop/Python learning/cricket/Batsman_Data.csv')
bowler_data = pd.read_csv('C:/Users/Balu/Desktop/Python learning/cricket/Bowler_data.csv')
ground_averages = pd.read_csv('C:/Users/Balu/Desktop/Python learning/cricket/Ground_Averages.csv')
match_results = pd.read_csv('C:/Users/Balu/Desktop/Python learning/cricket/ODI_Match_Results.csv')
match_totals = pd.read_csv('C:/Users/Balu/Desktop/Python learning/cricket/ODI_Match_Totals.csv')
wc_players = pd.read_csv('C:/Users/Balu/Desktop/Python learning/cricket/WC_players.csv')

In [4]:
print(batsman_data)
print("_ _ _ _ _ _ _ _ _ _ _ _  _ _ _ _  _  _ _ _   _ _ _ _ _  _ _ _  __ _ _ _ ___ _ _ _ _ _ _ _ _ _ _ _ _ __  _ _  _ _  _")
print(bowler_data)
print("_ _ _ _ _ _ _ _ _ _ _ _  _ _ _ _  _  _ _ _   _ _ _ _ _  _ _ _  __ _ _ _ ___ _ _ _ _ _ _ _ _ _ _ _ _ __  _ _  _ _  _")
print(ground_averages)
print("_ _ _ _ _ _ _ _ _ _ _ _  _ _ _ _  _  _ _ _   _ _ _ _ _  _ _ _  __ _ _ _ ___ _ _ _ _ _ _ _ _ _ _ _ _ __  _ _  _ _  _")
print(match_results)
print("_ _ _ _ _ _ _ _ _ _ _ _  _ _ _ _  _  _ _ _   _ _ _ _ _  _ _ _  __ _ _ _ ___ _ _ _ _ _ _ _ _ _ _ _ _ __  _ _  _ _  _")
print(match_totals)
print("_ _ _ _ _ _ _ _ _ _ _ _  _ _ _ _  _  _ _ _   _ _ _ _ _  _ _ _  __ _ _ _ ___ _ _ _ _ _ _ _ _ _ _ _ _ __  _ _  _ _  _")
print(wc_players)
print("_ _ _ _ _ _ _ _ _ _ _ _  _ _ _ _  _  _ _ _   _ _ _ _ _  _ _ _  __ _ _ _ ___ _ _ _ _ _ _ _ _ _ _ _ _ __  _ _  _ _  _")

       Unnamed: 0  Bat1 Runs  BF     SR 4s 6s    Opposition     Ground  \
0               1   DNB    -   -      -  -  -       v India     Nagpur   
1               2   DNB    -   -      -  -  -       v India    Kolkata   
2               3   DNB    -   -      -  -  -       v India      Delhi   
3               4   DNB    -   -      -  -  -  v Bangladesh      Dhaka   
4               5   DNB    -   -      -  -  -       v India      Dhaka   
...           ...   ...  ...  ..    ... .. ..           ...        ...   
11144       11145    46   46  61  75.40  6  1     v Ireland   Dehradun   
11145       11146     3    3   6  50.00  0  0     v Ireland   Dehradun   
11146       11147     1    1   8  12.50  0  0     v Ireland   Dehradun   
11147       11148   DNB    -   -      -  -  -    v Scotland  Edinburgh   
11148       11149  TDNB    -   -      -  -  -     v Ireland    Belfast   

        Start Date    Match_ID         Batsman  Player_ID  
0      18 Dec 2009  ODI # 2933   Oshane Thomas     

In [9]:
# Preprocess the data
# Batsman Data
batsman_data = batsman_data[['Bat1', 'Runs', 'BF', 'SR', '4s', '6s']]
batsman_data['Bat1'] = pd.to_numeric(batsman_data['Bat1'], errors='coerce')
batsman_data['Bat1'] = batsman_data['Bat1'].fillna(0)
batsman_data['SR'] = pd.to_numeric(batsman_data['SR'], errors='coerce')

# Bowler Data
bowler_data = bowler_data[['Overs', 'Mdns', 'Runs', 'Wkts', 'Econ', 'Ave', 'SR']]

# Ground Averages Data
ground_averages = ground_averages[['Ground', 'Span', 'Mat', 'Won', 'Tied', 'NR']]

# ODI Match Results Data
match_results = match_results[['Result', 'Margin', 'BR', 'Toss', 'Bat']]

# ODI Match Totals Data
match_totals = match_totals[['Score', 'Overs', 'RPO', 'Target', 'Inns']]

# WC Players Data
wc_players = wc_players[['Player', 'ID', 'Country']]

In [10]:
print(batsman_data)
print("_ _ _ _ _ _ _ _ _ _ _ _  _ _ _ _  _  _ _ _   _ _ _ _ _  _ _ _  __ _ _ _ ___ _ _ _ _ _ _ _ _ _ _ _ _ __  _ _  _ _  _")
print(bowler_data)
print("_ _ _ _ _ _ _ _ _ _ _ _  _ _ _ _  _  _ _ _   _ _ _ _ _  _ _ _  __ _ _ _ ___ _ _ _ _ _ _ _ _ _ _ _ _ __  _ _  _ _  _")
print(ground_averages)
print("_ _ _ _ _ _ _ _ _ _ _ _  _ _ _ _  _  _ _ _   _ _ _ _ _  _ _ _  __ _ _ _ ___ _ _ _ _ _ _ _ _ _ _ _ _ __  _ _  _ _  _")
print(match_results)
print("_ _ _ _ _ _ _ _ _ _ _ _  _ _ _ _  _  _ _ _   _ _ _ _ _  _ _ _  __ _ _ _ ___ _ _ _ _ _ _ _ _ _ _ _ _ __  _ _  _ _  _")
print(match_totals)
print("_ _ _ _ _ _ _ _ _ _ _ _  _ _ _ _  _  _ _ _   _ _ _ _ _  _ _ _  __ _ _ _ ___ _ _ _ _ _ _ _ _ _ _ _ _ __  _ _  _ _  _")
print(wc_players)
print("_ _ _ _ _ _ _ _ _ _ _ _  _ _ _ _  _  _ _ _   _ _ _ _ _  _ _ _  __ _ _ _ ___ _ _ _ _ _ _ _ _ _ _ _ _ __  _ _  _ _  _")

       Bat1  Runs   BF    SR  4s  6s
0       0.0     0    0   NaN   0   0
1       0.0     0    0   NaN   0   0
2       0.0     0    0   NaN   0   0
3       0.0     0    0   NaN   0   0
4       0.0     0    0   NaN   0   0
...     ...   ...  ...   ...  ..  ..
11144  46.0   121  121  75.4  22   2
11145   3.0   103  119  50.0   1   1
11146   1.0     2  141  12.5   1   1
11147   0.0     0    0   NaN   0   0
11148   0.0     0    0   NaN   0   0

[11149 rows x 6 columns]
_ _ _ _ _ _ _ _ _ _ _ _  _ _ _ _  _  _ _ _   _ _ _ _ _  _ _ _  __ _ _ _ ___ _ _ _ _ _ _ _ _ _ _ _ _ __  _ _  _ _  _
       Overs  Mdns  Runs  Wkts  Econ  Ave  SR
0         49     1    59     1   412    0   0
1         12     1    57     3   316  103  77
2          0     0     0     0     0    0   0
3         55     2    66     3   406  120  70
4         49     2    49     1   349    0   0
...      ...   ...   ...   ...   ...  ...  ..
11113     25     1    42     1    35    0   0
11114     45     1    19     3   112    5  60


In [12]:
# Convert non-numeric columns to numeric using label encoding
label_encoder = LabelEncoder()
for dataset in [batsman_data, bowler_data, ground_averages, match_results, match_totals, wc_players]:
    for column in dataset.select_dtypes(include='object').columns:
        dataset[column] = label_encoder.fit_transform(dataset[column])

In [13]:
# Split the data and train the model for each dataset
datasets = {
    'Batsman Data': batsman_data,
    'Bowler Data': bowler_data,
    'Ground Averages': ground_averages,
    'Match Results': match_results,
    'Match Totals': match_totals,
    'WC Players': wc_players
}

In [14]:
common_columns = None  # To store common columns

for dataset_name, dataset in datasets.items():
    print(f"--- {dataset_name} ---\n")

    # Remove rows with missing values
    dataset = dataset.dropna()

    # Remove unnecessary columns
    if 'Runs' in dataset.columns:
        dataset = dataset.drop('Runs', axis=1)
        dataset = dataset.applymap(lambda x: x.strip() if isinstance(x, str) else x)

    X = dataset.dropna().dropna()
    y = dataset.dropna().dropna()
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = make_pipeline(NamedSimpleImputer(strategy='mean'), LinearRegression())
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print("Mean Squared Error:", mse)
    print("Mean Absolute Error:", mae)
    print()

    # Make predictions on new data
    new_data = pd.DataFrame({'Bat1': [50], 'BF': [80], 'SR': [125], '4s': [10], '6s': [3]})
    new_data = new_data.reindex(columns=X.columns, fill_value=0)

    y_new_pred = model.predict(new_data)

    print(f"Predicted Performance for {dataset_name}:")
    performance_names = list(X.columns)
    performance_values = list(y_new_pred[0])
    for name, value in zip(performance_names, performance_values):
        print(f"{name}: {value}")
    print()

print("Note: The Mean Squared Error (MSE) and Mean Absolute Error (MAE) indicate the model's performance in terms of accuracy. A lower value indicates better performance. The 'Predicted Performance' section provides the predicted values for the given input data.")

--- Batsman Data ---

Mean Squared Error: 2.330995698222678e-26
Mean Absolute Error: 8.102229317064312e-14

Predicted Performance for Batsman Data:
Bat1: 50.000000000000284
BF: 80.0
SR: 125.00000000000004
4s: 9.999999999999938
6s: 3.000000000000002

--- Bowler Data ---

Mean Squared Error: 3.6945225075085785e-24
Mean Absolute Error: 9.173495566073661e-13

Predicted Performance for Bowler Data:
Overs: 5.903752928095797e-13
Mdns: 5.5039306445792135e-14
Wkts: 6.054184931159057e-16
Econ: 3.491068545358189e-12
Ave: -1.0571404862602662e-13
SR: 124.99999999999429

--- Ground Averages ---

Mean Squared Error: 1.7370219705130533e-28
Mean Absolute Error: 6.61841155823588e-15

Predicted Performance for Ground Averages:
Ground: -4.973799150320701e-14
Span: 1.0658141036401503e-14
Mat: 5.329070518200751e-15
Won: -6.217248937900877e-15
Tied: 4.85722573273506e-17
NR: 2.220446049250313e-16

--- Match Results ---

Mean Squared Error: 3.0106511630027942e-28
Mean Absolute Error: 9.339899168549616e-15

Pre

In [1]:
print("Thank You🙏")

Thank You🙏
