In [1]:
# Third-party data libraries used throughout the notebook, plus stdlib warnings control.
import pandas as pd
import numpy as np
import warnings
# Installs a catch-all 'ignore' filter: every warning category is suppressed from here on.
# NOTE(review): this also hides pandas/numpy deprecation and dtype warnings; consider
# narrowing the filter to specific categories once the noisy source is identified.
warnings.filterwarnings('ignore')

# Read the raw CSV into the notebook-wide frame `df` and echo its basic layout.
print("Loading dataset...")
df = pd.read_csv('../data/Infectious Disease 2001-2014.csv')
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"\n{'=' * 50}\n")

# 1. INITIAL DATA INSPECTION
print("1. INITIAL DATA INSPECTION", "=" * 30, sep="\n")

# Show the column dtypes, then the first and last rows, each under its own heading.
for heading, preview in (
    ("Data Types:", df.dtypes),
    ("\nFirst 5 rows:", df.head()),
    ("\nLast 5 rows:", df.tail()),
):
    print(heading)
    print(preview)

# Number of rows flagged by DataFrame.duplicated() (repeats of an earlier row).
duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")


Loading dataset...
Dataset shape: (141777, 10)
Columns: ['Disease', 'County', 'Year', 'Sex', 'Count', 'Population', 'Rate', 'CI.lower', 'CI.upper', 'Unstable']


1. INITIAL DATA INSPECTION
Data Types:
Disease        object
County         object
Year            int64
Sex            object
Count           int64
Population      int64
Rate          float64
CI.lower      float64
CI.upper      float64
Unstable       object
dtype: object

First 5 rows:
     Disease      County  Year     Sex  Count  Population   Rate  CI.lower  \
0  Amebiasis  California  2001   Total    571    34514777  1.654     1.521   
1  Amebiasis  California  2001  Female    176    17340743  1.015     0.871   
2  Amebiasis  California  2001    Male    365    17174034  2.125     1.913   
3  Amebiasis  California  2002   Total    442    34940334  1.265     1.150   
4  Amebiasis  California  2002  Female    145    17555714  0.826     0.697   

   CI.upper Unstable  
0     1.796           
1     1.176           
2     2.355 

In [2]:
# 2. MISSING VALUE ANALYSIS
print("\n2. MISSING VALUE ANALYSIS", "=" * 30, sep="\n")

# Per-column null count and the share of all rows it represents (in percent).
missing_data = df.isna().sum()
missing_percent = missing_data.div(len(df)).mul(100)
missing_summary = missing_data.to_frame('Missing Values').assign(
    Percentage=missing_percent
)

# Only columns that actually contain nulls are listed.
has_missing = missing_summary['Missing Values'].gt(0)
print("Missing values per column:")
print(missing_summary.loc[has_missing])


2. MISSING VALUE ANALYSIS
Missing values per column:
Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


In [3]:
# 3. DATA TYPE VALIDATION AND CORRECTION
print("\n3. DATA TYPE VALIDATION")
print("="*30)

# Check current dtypes
print("Current data types:")
print(df.dtypes)

# Columns that must hold numeric data. Year is coerced as well; it stays a plain
# numeric column (no datetime conversion is performed).
numeric_cols = ['Count', 'Population', 'Rate', 'CI.lower', 'CI.upper']

for col in ['Year'] + numeric_cols:
    # errors='coerce' replaces unparseable entries with NaN instead of raising.
    # Count the NaNs this introduces so they are reported rather than slipping in
    # silently after the missing-value analysis has already been run.
    nulls_before = df[col].isna().sum()
    df[col] = pd.to_numeric(df[col], errors='coerce')
    coerced = df[col].isna().sum() - nulls_before
    if coerced > 0:
        print(f"WARNING: {coerced} unparseable value(s) in '{col}' were coerced to NaN")

print("\nUpdated data types:")
print(df.dtypes)



3. DATA TYPE VALIDATION
Current data types:
Disease        object
County         object
Year            int64
Sex            object
Count           int64
Population      int64
Rate          float64
CI.lower      float64
CI.upper      float64
Unstable       object
dtype: object

Updated data types:
Disease        object
County         object
Year            int64
Sex            object
Count           int64
Population      int64
Rate          float64
CI.lower      float64
CI.upper      float64
Unstable       object
dtype: object


In [4]:
# 4. UNIQUE VALUE ANALYSIS
print("\n4. UNIQUE VALUE ANALYSIS", "=" * 30, sep="\n")

# For each categorical-like column: distinct-value count, the first ten distinct
# values, and the full list when there are fewer than 20 of them.
for column in ('Disease', 'County', 'Year', 'Sex'):
    unique_vals = df[column].unique()
    n_unique = len(unique_vals)
    report = [
        f"\n{column}:",
        f"  Unique values: {n_unique}",
        f"  Sample: {unique_vals[:10]}",
    ]
    if n_unique < 20:
        report.append(f"  All values: {unique_vals}")
    print("\n".join(report))


4. UNIQUE VALUE ANALYSIS

Disease:
  Unique values: 65
  Sample: ['Amebiasis' 'Anaplasmosis and Ehrlichiosis' 'Babesiosis' 'Cholera'
 'Botulism, Other' 'Botulism, Foodborne' 'Botulism, Wound' 'Brucellosis'
 'Campylobacteriosis' 'Chlamydia']

County:
  Unique values: 59
  Sample: ['California' 'Alameda' 'Alpine' 'Amador' 'Butte' 'Calaveras' 'Colusa'
 'Contra Costa' 'Del Norte' 'El Dorado']

Year:
  Unique values: 14
  Sample: [2001 2002 2003 2004 2005 2006 2007 2008 2009 2010]
  All values: [2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014]

Sex:
  Unique values: 3
  Sample: ['Total' 'Female' 'Male']
  All values: ['Total' 'Female' 'Male']


In [5]:
# 5. CONSISTENCY CHECKS
print("\n5. DATA CONSISTENCY CHECKS", "=" * 30, sep="\n")

# Recompute the rate as cases per 100000 population and store, on `df`, both the
# recomputed value and its absolute gap to the published Rate column.
df['Calculated_Rate'] = df['Count'].div(df['Population']).mul(100000)
df['Rate_Diff'] = df['Rate'].sub(df['Calculated_Rate']).abs()

# A row counts as inconsistent when the gap exceeds 0.01.
rate_mismatch = df['Rate_Diff'].gt(0.01)
inconsistent_rates = len(df.loc[rate_mismatch])
print(f"Rows with inconsistent rate calculation: {inconsistent_rates}")

# The published Rate should sit inside [CI.lower, CI.upper]; count rows where it
# falls below the lower bound or above the upper bound.
rate_below_lower = df['Rate'].lt(df['CI.lower'])
rate_above_upper = df['Rate'].gt(df['CI.upper'])
ci_inconsistent = len(df.loc[rate_below_lower | rate_above_upper])
print(f"Rows with CI bounds inconsistent with Rate: {ci_inconsistent}")


5. DATA CONSISTENCY CHECKS
Rows with inconsistent rate calculation: 56
Rows with CI bounds inconsistent with Rate: 0


In [6]:
# 6. OUTLIER DETECTION
# Section banner: title followed by a 30-character rule.
print("\n6. OUTLIER DETECTION", "=" * 30, sep="\n")

# Using IQR method for numeric columns
def detect_outliers_iqr(data, column, multiplier=1.5):
    """Return the rows of ``data`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of ``data[column]`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scan.
    column : str
        Name of the numeric column to test.
    multiplier : float, default 1.5
        Width of the fences in units of IQR. The default reproduces the
        conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` (original index preserved) strictly below the
        lower fence or strictly above the upper fence. Rows whose value is
        missing compare False against both fences and are not returned.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = multiplier * (q3 - q1)
    lower_bound = q1 - spread
    upper_bound = q3 + spread
    return data[(values < lower_bound) | (values > upper_bound)]

# Report, per measure, how many rows detect_outliers_iqr flags.
numeric_columns = ['Count', 'Population', 'Rate']
for col in numeric_columns:
    outliers = detect_outliers_iqr(df, col)
    n_outliers = outliers.shape[0]
    print(f"{col}: {n_outliers} outliers detected")



6. OUTLIER DETECTION
Count: 25242 outliers detected
Population: 16425 outliers detected
Rate: 29139 outliers detected


In [None]:
# 7. HANDLING ZERO AND NEAR-ZERO POPULATIONS
print("\n7. SMALL POPULATION ANALYSIS", "=" * 30, sep="\n")

# Rows whose Population value is below 1000.
is_small_pop = df['Population'].lt(1000)
small_pop = df.loc[is_small_pop]
print(f"Rows with population < 1000: {len(small_pop)}")

# Rows that report a positive Rate even though Count is zero.
rate_without_cases = df['Count'].eq(0) & df['Rate'].gt(0)
zero_count_issues = df.loc[rate_without_cases]
print(f"Rows with zero count but positive rate: {len(zero_count_issues)}")