This notebook reads the WWE match history dataset, cleans it, and saves the cleaned copy.

In [11]:
import pandas as pd
import matplotlib.pyplot as plt

# Location of the raw match-history CSV, relative to this notebook
RAW_CSV_PATH = '../data/WWE_History_1000.csv'

# Load the raw CSV into the working DataFrame `df`
df = pd.read_csv(RAW_CSV_PATH)

# Preview the first rows
df.head()


Unnamed: 0,Match ID,Date,Event,Winner,Loser,Title Match
0,1,01-01-2023,Royal Rumble,John Cena,AJ Styles,Yes
1,2,08-01-2023,Elimination Chamber,Roman Reigns,Seth Rollins,No
2,3,15-01-2023,WrestleMania,Brock Lesnar,Roman Reigns,Yes
3,4,22-01-2023,Backlash,Seth Rollins,Brock Lesnar,No
4,5,29-01-2023,Money in the Bank,Randy Orton,John Cena,Yes


In [12]:
# Parse 'Date' from day-month-year text (e.g. 01-01-2023) into datetimes;
# errors='coerce' turns any value that does not match the format into NaT
parsed_dates = pd.to_datetime(df['Date'], format='%d-%m-%Y', errors='coerce')
df['Date'] = parsed_dates

# Number of dates that are missing or could not be parsed
df['Date'].isna().sum()


np.int64(0)

Standardize Text Columns (Trim Spaces + Casing)

In [13]:
# List of text columns to clean
text_columns = ['Event', 'Winner', 'Loser']

# NOTE: str.title() also lowercases interior capitals
# (e.g. "AJ Styles" -> "Aj Styles", "WrestleMania" -> "Wrestlemania").
for col in text_columns:
    raw_values = df[col]
    cleaned = (
        raw_values
        .astype(str)        # Ensure string type
        .str.strip()        # Remove leading/trailing spaces
        .str.title()        # Standardize casing (e.g., john cena -> John Cena)
    )
    # astype(str) turns a missing value into the literal text 'nan', which
    # isna()/dropna() can no longer detect. Restore NaN wherever the original
    # value was missing, and treat values that are blank after stripping as
    # missing as well.
    is_present = raw_values.notna() & cleaned.ne('')
    df[col] = cleaned.where(is_present)

df.head()


Unnamed: 0,Match ID,Date,Event,Winner,Loser,Title Match
0,1,2023-01-01,Royal Rumble,John Cena,Aj Styles,Yes
1,2,2023-01-08,Elimination Chamber,Roman Reigns,Seth Rollins,No
2,3,2023-01-15,Wrestlemania,Brock Lesnar,Roman Reigns,Yes
3,4,2023-01-22,Backlash,Seth Rollins,Brock Lesnar,No
4,5,2023-01-29,Money In The Bank,Randy Orton,John Cena,Yes


Check & Fix Inconsistent Event Names
First: Inspect event name frequency

In [14]:
# Tally how often each event name occurs, most frequent first
event_counts = df['Event'].value_counts()

# Show at most the 20 most common event names
event_counts.head(20)

Event
Royal Rumble           111
Elimination Chamber    111
Wrestlemania           111
Backlash               111
Money In The Bank      111
Summerslam              89
Survivor Series         89
Extreme Rules           89
Hell In A Cell          89
Fastlane                89
Name: count, dtype: int64

Map inconsistent names to standard names

In [15]:
# Standard event name -> variant spellings that should be rewritten to it
_event_variants = {
    "Wrestlemania": ["Wrestle Mania", "Wm"],
    "Royal Rumble": ["Royal Rumble Event"],
}

# Flatten into the variant -> standard name lookup used for replacement
event_mapping = {
    variant: standard_name
    for standard_name, variants in _event_variants.items()
    for variant in variants
}

# Rewrite known variants; every other event name is left untouched
standardized_events = df['Event'].replace(event_mapping)
df['Event'] = standardized_events


Check for missing Winner or Loser names and remove rows where either is missing

In [16]:
# Text placeholders that stand in for a missing name: blank strings and 'Nan'
# (what a NaN becomes if it is cast to str and then title-cased). These are
# invisible to isna()/dropna(), so they are checked for explicitly.
placeholder_names = ['', 'Nan']

names = df[['Winner', 'Loser']]
is_missing = names.isna() | names.isin(placeholder_names)

# Check missing values in Winner and Loser. Only a cell's last expression is
# displayed automatically, so print the counts explicitly.
print(is_missing.sum())

# Drop rows with missing winner or loser
df = df[~is_missing.any(axis=1)].copy()

Remove Invalid Matches (Winner == Loser)

In [17]:
# Identify invalid rows: a wrestler cannot both win and lose the same match
same_wrestler = df['Winner'] == df['Loser']
invalid_matches = df[same_wrestler]

# Number of invalid matches. Only a cell's last expression is displayed
# automatically, so print the count explicitly.
print(f"Invalid matches (Winner == Loser): {len(invalid_matches)}")

# Remove invalid matches; take an explicit copy so later column assignments
# operate on an independent DataFrame rather than a filtered selection
df = df[~same_wrestler].copy()
df.head()

Unnamed: 0,Match ID,Date,Event,Winner,Loser,Title Match
0,1,2023-01-01,Royal Rumble,John Cena,Aj Styles,Yes
1,2,2023-01-08,Elimination Chamber,Roman Reigns,Seth Rollins,No
2,3,2023-01-15,Wrestlemania,Brock Lesnar,Roman Reigns,Yes
3,4,2023-01-22,Backlash,Seth Rollins,Brock Lesnar,No
4,5,2023-01-29,Money In The Bank,Randy Orton,John Cena,Yes


Convert Title Match to Binary

In [18]:
# Convert Title Match to binary (1 = title match, 0 = non-title match).
# Values are compared case-insensitively with surrounding whitespace removed.
# '1'/'0' are accepted as well, so re-running this cell after the column has
# already been converted leaves it unchanged instead of resetting every row to 0.
title_match_codes = {'yes': 1, 'no': 0, '1': 1, '0': 0}

title_match_text = df['Title Match'].astype(str).str.strip().str.lower()
title_match_flags = title_match_text.map(title_match_codes)

# Report values that were present but not recognized, so that defaulting them
# to 0 below is visible rather than silent
unrecognized = df.loc[
    title_match_flags.isna() & df['Title Match'].notna(), 'Title Match'
]
if not unrecognized.empty:
    print(
        "Unrecognized Title Match values treated as 0:",
        sorted(unrecognized.astype(str).unique()),
    )

# Fill missing title match values with 0 (non-title match)
df['Title Match'] = title_match_flags.fillna(0).astype(int)


Final Data Quality Check

In [19]:
# Check final structure (df.info() prints its report directly)
df.info()

# Confirm no missing critical values. Only a cell's last expression is
# displayed automatically, so print the per-column missing counts explicitly.
print(df.isna().sum())

# Preview the cleaned data
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 855 entries, 0 to 999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Match ID     855 non-null    int64         
 1   Date         855 non-null    datetime64[ns]
 2   Event        855 non-null    object        
 3   Winner       855 non-null    object        
 4   Loser        855 non-null    object        
 5   Title Match  855 non-null    int64         
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 46.8+ KB


Unnamed: 0,Match ID,Date,Event,Winner,Loser,Title Match
0,1,2023-01-01,Royal Rumble,John Cena,Aj Styles,1
1,2,2023-01-08,Elimination Chamber,Roman Reigns,Seth Rollins,0
2,3,2023-01-15,Wrestlemania,Brock Lesnar,Roman Reigns,1
3,4,2023-01-22,Backlash,Seth Rollins,Brock Lesnar,0
4,5,2023-01-29,Money In The Bank,Randy Orton,John Cena,1


In [20]:
# Destination for the cleaned dataset, inside the data folder
CLEANED_CSV_PATH = "../data/WWE_History_1000_Cleaned.csv"

# Write the cleaned frame without the DataFrame index column
df.to_csv(CLEANED_CSV_PATH, index=False)