In [2]:
# Code: Data Collection and Synthetic Data Generation

# Import necessary libraries
import pandas as pd
import numpy as np

# Load the original dataset
original_file_path = "path_to_original_file.xlsx"  # Replace with your file path
original_data = pd.ExcelFile(r"C:\Users\91877\Desktop\cricket_ML\Complete_Mohammed_Shami_Data.xlsx").parse(sheet_name=0)  # Load the first sheet

# Function to generate synthetic data based on statistical patterns in the dataset
def generate_synthetic_data(original_data, num_samples=50):
    synthetic_data = []
    
    for _ in range(num_samples):
        # Randomly sample from existing data distributions for key metrics
        overs = np.random.uniform(2, 10)  # Overs typically range from 2 to 10
        maidens = np.random.randint(0, 2) if overs > 5 else 0  # Maiden overs are rare
        runs = np.random.uniform(overs * 4, overs * 12)  # Runs depend on overs
        wickets = np.random.choice([0, 1, 2, 3, 4])  # Wickets distribution
        economy = runs / overs  # Calculate economy rate
        position = np.random.randint(1, 5)  # Position in the bowling order
        innings = np.random.choice([1, 2])  # First or second innings
        opposition = np.random.choice(original_data["Opposition"].unique())  # Random opponent
        ground = np.random.choice(original_data["Ground"].unique())  # Random ground
        start_date = pd.Timestamp(np.random.choice(pd.date_range("2015-01-01", "2023-01-01")))  # Random date
        format_type = np.random.choice(original_data["Format"].unique())  # T20I or ODI

        synthetic_data.append([
            "Mohammed_shami", overs, maidens, round(runs), wickets, round(economy, 2),
            position, innings, opposition, ground, start_date.date(), format_type
        ])
    
    return pd.DataFrame(synthetic_data, columns=original_data.columns)

# Generate synthetic data
num_synthetic_samples = 50
synthetic_data = generate_synthetic_data(original_data, num_samples=num_synthetic_samples)

# Combine original and synthetic data
expanded_data = pd.concat([original_data, synthetic_data], ignore_index=True)

# Save the expanded dataset to a file
output_file_path = "Expanded_Mohammed_Shami_Data.xlsx"  # Output file name
expanded_data.to_excel(output_file_path, index=False)

print(f"Expanded dataset saved to {output_file_path}")


Expanded dataset saved to Expanded_Mohammed_Shami_Data.xlsx


In [3]:
from IPython.display import FileLink

# Provide a downloadable link for the output file
output_file_path = "Expanded_Mohammed_Shami_Data.xlsx"  # Replace with the actual path if different
display(FileLink(output_file_path))


In [11]:
# Import necessary libraries
import pandas as pd

# Load the expanded dataset
file_path = "Expanded_Mohammed_Shami_Data.xlsx"  # Replace with the actual file path
data = pd.read_excel(r"C:\Users\91877\Desktop\cricket_ML\Complete_Mohammed_Shami_Data.xlsx")


# Display the first few rows to inspect the dataset
data.head()



Unnamed: 0,Player_name,Overs,Maidens,Runs,Wickets,Economy,Position,Innings,Opposition,Ground,Start Date,Format
0,Mohammed_shami,4.0,0,46,0,11.5,3,2,v Australia,Canberra,4 Dec 2020,T20I
1,Mohammed_shami,4.0,0,25,1,6.25,3,1,v Pakistan,Melbourne,23 Oct 2022,T20I
2,Mohammed_shami,4.0,0,27,1,6.75,3,2,v Netherlands,Sydney,27 Oct 2022,T20I
3,Mohammed_shami,4.0,0,13,1,3.25,3,2,v South Africa,Perth,30 Oct 2022,T20I
4,Mohammed_shami,3.0,0,25,1,8.33,3,2,v Bangladesh,Adelaide,2 Nov 2022,T20I


In [14]:
import numpy as np

# Add random noise to numerical features
# Adding Gaussian noise (mean=0, std=10% of the column's standard deviation) to these columns
columns_to_add_noise = ["Overs", "Runs", "Wickets", "Economy"]

for col in columns_to_add_noise:
    std_dev = data[col].std() * 0.1  # 10% of standard deviation
    noise = np.random.normal(0, std_dev, size=len(data))
    data[col] += noise  # Add noise to the column

# Round columns that should remain integers (like Wickets)
data["Wickets"] = data["Wickets"].round()

# Display the data to verify noise has been added
data.head()
# Add some blank rows and introduce missing values randomly
# Introduce missing values in key columns: Overs, Runs, Wickets, Economy
rows_with_noise = np.random.choice(data.index, size=5, replace=False)  # Select 5 random rows
for row in rows_with_noise:
    data.loc[row, "Overs"] = np.nan
    data.loc[row, "Wickets"] = np.nan
    data.loc[row, "Economy"] = np.nan

# Add 3 blank rows by appending NaN rows
blank_rows = pd.DataFrame(np.nan, index=range(3), columns=data.columns)  # Create 3 blank rows
data_with_blanks = pd.concat([data, blank_rows], ignore_index=True)

# Display the dataset with missing values and blanks
print("Dataset with introduced noise and blank rows:")
print(data_with_blanks.tail(10))  # Show the last 10 rows for verification


Dataset with introduced noise and blank rows:
       Player_name      Overs  Maidens       Runs  Wickets   Economy  \
13  Mohammed_shami   7.786827      1.0  34.619091      2.0  4.623962   
14  Mohammed_shami  10.097761      0.0  65.712845      0.0  6.516996   
15  Mohammed_shami        NaN      0.0  42.666868      NaN       NaN   
16  Mohammed_shami  10.496455      0.0  58.256555      3.0  5.790962   
17  Mohammed_shami        NaN      0.0  49.577114      NaN       NaN   
18  Mohammed_shami        NaN      0.0  58.382108      NaN       NaN   
19  Mohammed_shami        NaN      0.0  74.193452      NaN       NaN   
20             NaN        NaN      NaN        NaN      NaN       NaN   
21             NaN        NaN      NaN        NaN      NaN       NaN   
22             NaN        NaN      NaN        NaN      NaN       NaN   

    Position  Innings    Opposition     Ground   Start Date Format  
13       2.0      2.0  v Bangladesh  Melbourne  19 Mar 2015    ODI  
14       1.0      1.0  

In [17]:
# Check for missing values in the dataset
missing_values = data_with_blanks.isnull().sum()

# Display columns with missing values
print("Missing values in each column:")
print(missing_values)

# Display the total number of missing values
total_missing = missing_values.sum()
print(f"\nTotal missing values in the dataset: {total_missing}")


Missing values in each column:
Player_name     3
Overs          16
Maidens         3
Runs            3
Wickets        16
Economy        16
Position        3
Innings         3
Opposition      3
Ground          3
Start Date      3
Format          3
dtype: int64

Total missing values in the dataset: 75


In [25]:
# Fill numerical columns with mean or median
data["Overs"] = data["Overs"].fillna(data["Overs"].mean())
data["Wickets"] = data["Wickets"].fillna(data["Wickets"].median())
data["Economy"] = data["Economy"].fillna(data["Economy"].mean())
data["Position"] = data["Position"].fillna(data["Position"].mode()[0])
data["Innings"] = data["Innings"].fillna(data["Innings"].mode()[0])


In [26]:
# Fill categorical columns with mode
data["Opposition"] = data["Opposition"].fillna(data["Opposition"].mode()[0])
data["Ground"] = data["Ground"].fillna(data["Ground"].mode()[0])
data["Format"] = data["Format"].fillna(data["Format"].mode()[0])


In [27]:
# Fill missing dates with the median date
data["Start Date"] = data["Start Date"].fillna(data["Start Date"].median())


TypeError: Cannot convert ['4 Dec 2020' '23 Oct 2022' '27 Oct 2022' '30 Oct 2022' '2 Nov 2022'
 '6 Nov 2022' '10 Nov 2022' '18 Jan 2015' '20 Jan 2015' '30 Jan 2015'
 '15 Feb 2015' '22 Feb 2015' '6 Mar 2015' '19 Mar 2015' '26 Mar 2015'
 '12 Jan 2019' '15 Jan 2019' '18 Jan 2019' '27 Nov 2020' '29 Nov 2020'] to numeric