# Let's clean the innings_df dataset

In [3]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import re


# Data Cleaning and Outlier Handling
This section documents the steps taken to clean the dataset and handle outliers.
First, we will import the innings file.

In [4]:
# Load the raw innings data. The path is a raw string so the backslashes of the
# Windows path are taken literally ("\E", "\I" and "\i" are not valid escape
# sequences and trigger a warning in a normal string literal).
innings_df = pd.read_csv(r'E:\EDA\ICC Cricket World Cup\innings.csv')
innings_df.head()


Unnamed: 0,id,matchId,inningsNumber,runRate,overProgress,runs,wkts,ballsFaced,fours,sixes,allOut,noBallRuns,wideRuns,byeRuns,legByeRuns,penaltyRuns,battingTeamId,bowlingTeamId
0,1,3124,1,4.82,50.0,241,9,0,0,0,False,3,6,0,2,0,,
1,2,3124,2,3.95,47.2,187,10,0,0,0,True,0,2,0,2,0,,
2,3,3125,1,6.68,50.0,334,6,0,0,0,False,1,5,1,2,0,,
3,4,3125,2,3.26,40.1,131,9,0,0,0,True,1,6,0,9,0,,
4,5,3126,1,3.98,50.0,199,10,0,0,0,True,6,3,0,5,0,,


In [5]:
# Count the nulls in every column of innings_df
missing_values = innings_df.isna().sum()

# Express each count as a percentage of the total number of rows
row_count = innings_df.shape[0]
missing_percentage = missing_values.div(row_count).mul(100)

# Assemble both measures into one labelled table and print it
missing_info = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_percentage,
})
print(missing_info)

               Missing Values  Percentage
id                          0     0.00000
matchId                     0     0.00000
inningsNumber               0     0.00000
runRate                     0     0.00000
overProgress                0     0.00000
runs                        0     0.00000
wkts                        0     0.00000
ballsFaced                  0     0.00000
fours                       0     0.00000
sixes                       0     0.00000
allOut                      0     0.00000
noBallRuns                  0     0.00000
wideRuns                    0     0.00000
byeRuns                     0     0.00000
legByeRuns                  0     0.00000
penaltyRuns                 0     0.00000
battingTeamId            2081    77.59135
bowlingTeamId            2081    77.59135


We will drop the columns battingTeamId and bowlingTeamId, since almost 80% of their values are missing.

In [7]:
# Remove the two team-id columns, which are mostly null (about 78% missing).
# errors="ignore" keeps this cell safe to re-run: once the columns are gone,
# a second execution is a no-op instead of raising KeyError.
innings_df = innings_df.drop(
    columns=['battingTeamId', 'bowlingTeamId'],
    errors='ignore',
)


In [8]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence of each; the frame is modified in place.
innings_df.drop_duplicates(subset=None, keep='first', inplace=True)

In [9]:
# Preview the first five rows (head() defaults to n=5)
innings_df.head()

Unnamed: 0,id,matchId,inningsNumber,runRate,overProgress,runs,wkts,ballsFaced,fours,sixes,allOut,noBallRuns,wideRuns,byeRuns,legByeRuns,penaltyRuns
0,1,3124,1,4.82,50.0,241,9,0,0,0,False,3,6,0,2,0
1,2,3124,2,3.95,47.2,187,10,0,0,0,True,0,2,0,2,0
2,3,3125,1,6.68,50.0,334,6,0,0,0,False,1,5,1,2,0
3,4,3125,2,3.26,40.1,131,9,0,0,0,True,1,6,0,9,0
4,5,3126,1,3.98,50.0,199,10,0,0,0,True,6,3,0,5,0


Our innings.csv data now looks clean: it has no missing or blank cells. Let's move on to another file in our dataset.

In [10]:
# Show the first five rows again (head() defaults to n=5)
innings_df.head()

Unnamed: 0,id,matchId,inningsNumber,runRate,overProgress,runs,wkts,ballsFaced,fours,sixes,allOut,noBallRuns,wideRuns,byeRuns,legByeRuns,penaltyRuns
0,1,3124,1,4.82,50.0,241,9,0,0,0,False,3,6,0,2,0
1,2,3124,2,3.95,47.2,187,10,0,0,0,True,0,2,0,2,0
2,3,3125,1,6.68,50.0,334,6,0,0,0,False,1,5,1,2,0
3,4,3125,2,3.26,40.1,131,9,0,0,0,True,1,6,0,9,0
4,5,3126,1,3.98,50.0,199,10,0,0,0,True,6,3,0,5,0


In [11]:
# Summarize the frame: index range, per-column non-null counts and dtypes, memory usage
innings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2682 entries, 0 to 2681
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             2682 non-null   int64  
 1   matchId        2682 non-null   int64  
 2   inningsNumber  2682 non-null   int64  
 3   runRate        2682 non-null   float64
 4   overProgress   2682 non-null   float64
 5   runs           2682 non-null   int64  
 6   wkts           2682 non-null   int64  
 7   ballsFaced     2682 non-null   int64  
 8   fours          2682 non-null   int64  
 9   sixes          2682 non-null   int64  
 10  allOut         2682 non-null   bool   
 11  noBallRuns     2682 non-null   int64  
 12  wideRuns       2682 non-null   int64  
 13  byeRuns        2682 non-null   int64  
 14  legByeRuns     2682 non-null   int64  
 15  penaltyRuns    2682 non-null   int64  
dtypes: bool(1), float64(2), int64(13)
memory usage: 317.0 KB


Let's save this file to a specific folder.

In [12]:
# Folder that receives the cleaned CSV files
folder_path = r'E:\EDA\ICC Cricket World Cup\cleanedFiles'

# Create the folder if it does not exist yet; to_csv does not create missing directories
os.makedirs(folder_path, exist_ok=True)

# Write innings_df to innings.csv inside that folder, without the index column
innings_df.to_csv(os.path.join(folder_path, 'innings.csv'), index=False)