In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw dog-bite records from the CSV that sits next to the notebook.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column that mixes numbers and text is typed
# consistently and the mixed-type DtypeWarning of the chunked parser is avoided.
# NOTE(review): the earlier run echoed this line as a warning, presumably that
# DtypeWarning - confirm against the live notebook output.
df = pd.read_csv('Dog_Bite_Dataset.csv', low_memory=False)

  df = pd.read_csv('Dog_Bite_Dataset.csv')


In [3]:
# Print a schema summary of the raw frame: column names, non-null counts,
# dtypes and approximate memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76431 entries, 0 to 76430
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Bite Number          76431 non-null  object 
 1   Bite Type            76431 non-null  object 
 2   Incident Date        76431 non-null  object 
 3   Victim Age           50756 non-null  object 
 4   Victim Relationship  70373 non-null  object 
 5   Bite Location        65905 non-null  object 
 6   Bite Severity        66597 non-null  object 
 7   Bite Circumstance    64885 non-null  object 
 8   Controlled By        43675 non-null  object 
 9   Treatment Cost       56308 non-null  float64
 10  Date Reported        75513 non-null  object 
 11  Incident Location    74152 non-null  object 
dtypes: float64(1), object(11)
memory usage: 7.0+ MB


In [4]:
# Display the first five rows (the default for head()) to eyeball the raw values
df.head()

Unnamed: 0,Bite Number,Bite Type,Incident Date,Victim Age,Victim Relationship,Bite Location,Bite Severity,Bite Circumstance,Controlled By,Treatment Cost,Date Reported,Incident Location
0,B17-009452,CONTACT,2020 Sep 08 07:45:00 AM,,,,,,,0.0,2017 Sep 09 12:00:00 AM,"2405 HARTLINE DR ,DALLAS TX 75228"
1,B17-009452,CONTACT,2020 Sep 08 07:45:00 AM,,,,,,,0.0,2017 Sep 09 12:00:00 AM,"2405 HARTLINE DR ,DALLAS TX 75228"
2,B17-009452,CONTACT,2020 Sep 08 07:45:00 AM,50.0,NEIGHBOR,R HAND,MINOR,BITE/SCRAT,,0.0,2017 Sep 09 12:00:00 AM,"2405 HARTLINE DR ,DALLAS TX 75228"
3,B18-010201,BITE,2020 Feb 18 06:00:00 PM,,OWNED,HANDS,PUNCTURE,HUGGING,OWNER,0.0,2018 Feb 19 12:00:00 AM,"11737 FERNALD AVE ,DALLAS TX 75218"
4,B20-014629,BITE,2020 Jun 10 01:00:00 PM,,STRANGER,L HAND,PUNCTURE,PETTING,OTHER,0.0,2020 Jun 10 12:00:00 AM,"8100 DORAN CIR ,DALLAS TX 75238"


In [5]:
# Share of missing entries per column, expressed as a percentage of all rows
null_counts = df.isnull().sum()
missing_percent = null_counts.div(len(df)).mul(100)

# Report only the columns that actually have gaps, worst offenders first
columns_with_gaps = missing_percent.gt(0)
print("Missing Value Percentage: ")
print(missing_percent.loc[columns_with_gaps].sort_values(ascending=False))

Missing Value Percentage: 
Controlled By          42.856956
Victim Age             33.592391
Treatment Cost         26.328322
Bite Circumstance      15.106436
Bite Location          13.771899
Bite Severity          12.866507
Victim Relationship     7.926103
Incident Location       2.981774
Date Reported           1.201083
dtype: float64


In [6]:
# Date Conversion and Feature Creation
# Timestamps are stored as text such as "2020 Sep 08 07:45:00 AM":
# year, abbreviated month, day, then a 12-hour clock time with an AM/PM marker.
DATE_FORMAT = '%Y %b %d %I:%M:%S %p'

# NOTE(review): 'Date Reported ' is spelled with a trailing space, presumably to
# match the CSV header exactly - confirm against df.columns.
date_cols = ['Incident Date', 'Date Reported ']
for date_col in date_cols:
    raw_text = df[date_col]
    # errors='coerce' turns anything that does not fit DATE_FORMAT into NaT
    # instead of raising
    df[date_col] = pd.to_datetime(raw_text, format=DATE_FORMAT, errors='coerce')

In [7]:
# New feature: Report Delay (Days)
# Whole days elapsed between the incident and its report (report minus incident).
# The result is NaN wherever either timestamp is NaT.
report_delay_days = (df['Date Reported '] - df['Incident Date']).dt.days

# Clamp negative delays (a report dated before the incident, i.e. a data error)
# to 0. clip() is used rather than apply(lambda x: max(0, x)) because Python's
# max(0, NaN) evaluates to 0, which would silently record a missing date as a
# same-day report; clip() leaves missing delays as NaN.
df['Report Delay (Days)'] = report_delay_days.clip(lower=0)

In [8]:
# New Features: Time-based features (from Incident Date)
incident_ts = df['Incident Date']
# Weekday name of the incident (e.g. 'Monday')
df['Day of Week'] = incident_ts.dt.day_name()
# Hour of the incident on a 24-hour clock; helper input for the time-of-day label
df['Incident Hour'] = incident_ts.dt.hour

# Function to categorize time of day
def categorize_time(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Parameters
    ----------
    hour : int or float
        Hour on a 24-hour clock. May be missing (NaN / None), which is what
        ``.dt.hour`` yields for rows whose timestamp is NaT.

    Returns
    -------
    str or float
        'Morning' for 5-11, 'Afternoon' for 12-16, 'Evening' for 17-20 and
        'Night' for every other hour. A missing hour returns NaN so that an
        unknown time is not mislabelled as 'Night'.
    """
    # Every range test below is False for NaN, so without this guard a missing
    # hour would fall through to the final 'Night' branch.
    if pd.isna(hour):
        return np.nan
    if 5 <= hour < 12:
        return 'Morning'
    if 12 <= hour < 17:
        return 'Afternoon'
    if 17 <= hour < 21:
        return 'Evening'
    return 'Night'

# Label every incident with its time-of-day bucket
time_of_day = df['Incident Hour'].map(categorize_time)
df['Time of Day'] = time_of_day
# 'Incident Hour' only existed to derive the label, so remove it again
df.drop(columns='Incident Hour', inplace=True)

In [9]:
# Victim Age Cleaning
# Each value is stringified and its first run of digits is kept
# (e.g. '50.0' -> '50', '2 yrs' -> '2'). Values with no digits at all,
# including missing entries (which stringify to 'nan'), become NaN.
# NOTE(review): units are ignored, so a value such as '6 MONTHS' would be read
# as 6 - confirm whether such values occur in the data.
age_text = df['Victim Age'].astype(str)
first_digit_run = age_text.str.extract(r'(\d+)')[0]
numeric_age = pd.to_numeric(first_digit_run, errors='coerce')

# Fill the gaps with the median of the successfully parsed ages
median_age = numeric_age.median()

# Ages are whole numbers, so store them as integers once no NaN remains
df['Victim Age'] = numeric_age.fillna(median_age).astype(int)

In [10]:
# Incident Location Extraction
# Addresses look like "2405 HARTLINE DR ,DALLAS TX 75228": the street, a comma,
# then "CITY ST ZIP". Missing locations become '' so the pattern below simply
# fails to match them instead of raising.
df['Incident Location'] = df['Incident Location'].fillna('')

# Parse City and State from the segment AFTER the last comma.
# Taking the last word before the comma would return the street suffix
# ("DR", "AVE", "CIR"), and searching the whole address for the first 2-letter
# token would pick up suffixes such as "DR" or "ST" before reaching the state.
#   ,\s*            the comma that separates street from city
#   (?P<City>...)   city name, one or more words, never crossing another comma
#   (?P<State>...)  2-letter uppercase state code
#   \s*[\d-]*\s*$   optional ZIP (with or without a space before it), then end
location_parts = df['Incident Location'].str.extract(
    r',\s*(?P<City>[^,]+?)\s+(?P<State>[A-Z]{2})\s*[\d-]*\s*$'
)

# Rows that do not follow the "street ,CITY ST ZIP" layout get 'UNKNOWN'
df['City'] = location_parts['City'].fillna('UNKNOWN')
df['State'] = location_parts['State'].fillna('UNKNOWN')

# Drop the original location column as City/State are extracted
df.drop(columns=['Incident Location'], inplace=True)

In [11]:
# Text Standardization and Imputation
text_cols = ['Victim Relationship', 'Bite Location', 'Bite Severity', 'Bite Circumstance', 'Controlled By', 'Bite Type']

for text_col in text_cols:
    # astype(str) also stringifies missing values, which then read 'nan'
    as_text = df[text_col].astype(str)
    # Upper-case first, then trim surrounding whitespace
    normalized = as_text.str.upper().str.strip()
    # The stringified missing values are now exactly 'NAN'; label them 'UNKNOWN'
    df[text_col] = normalized.replace('NAN', 'UNKNOWN')

In [12]:
# Treatment Cost and Missing Values
# A missing cost is assumed to mean that no official cost was logged, so it is
# stored as 0 (the same value the data already uses for cost-free records)
treatment_cost = df['Treatment Cost']
df['Treatment Cost'] = treatment_cost.where(treatment_cost.notna(), 0)

In [13]:
# Keep only the rows whose Incident Date parsed to a real timestamp;
# rows left as NaT by the date coercion are discarded
has_incident_date = df['Incident Date'].notna()
df = df[has_incident_date]

In [14]:
# Export to csv (index=False keeps the row index out of the file)
output_csv = 'Preprocessed_Dog_Bite_Dataset.csv'
df.to_csv(output_csv, index=False)