In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the raw batting records. The path is relative to the notebook's working directory.
data = pd.read_csv('Batsman_Data.csv')

# Columns the model needs: the target ('Runs') plus the four predictors.
selected_features = ['BF', 'SR', '4s', '6s']
numeric_columns = ['Runs', 'BF', 'SR', '4s', '6s']

# Data Preprocessing (handle missing values)
# '-' is used in the file as a "no value" placeholder; turn it into a real missing marker.
# A new frame is assigned (no inplace=True) so re-running the cell starts from the CSV again.
data = data.replace('-', pd.NA)

# Convert relevant columns to numeric type.
# errors='coerce' maps any remaining non-numeric token to NaN instead of raising, so one
# malformed entry cannot abort the whole cell; those rows are removed by the dropna below.
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Drop only the rows that lack a value the model actually uses. A blanket dropna() would
# also discard rows whose only gap is in a column that is never fed to the model.
data = data.dropna(subset=numeric_columns)

# Feature Selection
X = data[selected_features]
y = data['Runs']
# NOTE(review): if 'SR' is the strike rate (runs per 100 balls faced), 'Runs' is a
# deterministic function of 'BF' and 'SR', which would inflate R-squared — confirm that
# these features are legitimately available at prediction time.

# Data Splitting (fixed seed so the 80/20 split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Selection and Training
model = LinearRegression()  # You can try other regression models as well
model.fit(X_train, y_train)

# Model Evaluation on the held-out 20%
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R-squared (R2): {r2}')

# Prediction
# The trained model can now score unseen rows. The frame must carry the same column
# names, in the same order, as the features used for fitting.
new_data = pd.DataFrame([[100, 120.0, 12, 2]], columns=selected_features)  # Replace with actual input statistics
predicted_runs = model.predict(new_data)
print(f'Predicted Runs: {predicted_runs[0]}')


Mean Squared Error (MSE): 29.70125398653453
Mean Absolute Error (MAE): 3.8900591361790227
R-squared (R2): 0.9700316215750563
Predicted Runs: 106.30179729029805


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the raw bowling records. The path is relative to the notebook's working directory.
data = pd.read_csv('Bowler_Data.csv')

# Convert '-' to NaN.
# A new frame is assigned (no inplace=True) so re-running the cell starts from the CSV again.
data = data.replace('-', np.nan)

# Convert columns to numeric types in one pass.
# errors='coerce' maps any non-numeric token to NaN, so this step can itself create
# missing values; that is why rows are filtered only after it.
bowler_numeric_columns = ['Overs', 'Mdns', 'Runs', 'Wkts', 'Econ', 'Ave', 'SR']
data[bowler_numeric_columns] = data[bowler_numeric_columns].apply(pd.to_numeric, errors='coerce')
# NOTE(review): 'Overs' is parsed as a plain decimal. If the file uses overs.balls
# notation (e.g. 9.3 meaning 9 overs and 3 balls), this misstates fractional overs — confirm.

# Drop rows missing the target (Ave) or any model feature. This runs after coercion so
# that NaNs introduced by to_numeric cannot reach LinearRegression.fit, which rejects NaN.
bowler_feature_columns = ['Overs', 'Mdns', 'Runs', 'Wkts', 'Econ']
data = data.dropna(subset=bowler_feature_columns + ['Ave'])

# Drop unnecessary columns (identifiers and match metadata that are not used as features)
data = data.drop(columns=['Opposition', 'Ground', 'Start Date', 'Match_ID', 'Bowler', 'Player_ID'])

# Feature selection
X = data[bowler_feature_columns]
y = data['Ave']

# Split the data into 80% training and 20% testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)

# Example: To predict the average for a new player with features [8, 0, 57, 0, 7.12]
# NOTE(review): this example has Wkts = 0. If rows without wickets carry '-' in 'Ave',
# they were removed above and the model never saw Wkts = 0, making this an
# extrapolation — confirm the example is meaningful.
new_data = pd.DataFrame({'Overs': [8], 'Mdns': [0], 'Runs': [57], 'Wkts': [0], 'Econ': [7.12]})
predicted_average = model.predict(new_data)
print("Predicted Average:", predicted_average[0])

Mean Squared Error (MSE): 48.2826294270873
Mean Absolute Error (MAE): 5.509262588546401
R-squared (R2): 0.8224616357654679
Predicted Average: 58.345912291933566
