# Universal Data Cleaning & Preparation Template

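This Jupyter Notebook provides a step-by-step template for cleaning and preparing a tabular dataset stored as a CSV file for machine learning. It loads the data, summarizes it, fills in missing values, and one-hot encodes text columns.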

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

### Step 1: Configuration & Data Loading

**Action Required:** Update the `file_path` variable below with the name of your CSV file. Make sure the file is in the same folder as this notebook.

In [None]:
# --- CONFIGURATION ---
# Path to the input CSV file; a bare file name is resolved relative to the
# notebook's current working directory.
file_path = 'your_dataset.csv'

# --- DATA LOADING ---
# Only the read itself sits inside the `try`, so errors raised while printing
# the preview are not mistaken for a missing file.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"ERROR: The file '{file_path}' was not found.")
    # Re-raise so that "Run All" stops here. Without this, `df` would be
    # undefined (or a stale frame left over from an earlier run of this
    # kernel) and later cells would fail confusingly or process old data.
    raise
else:
    print("Data loaded successfully!")
    print("First 5 rows of your data:")
    print(df.head())

### Step 2: Data Exploration
Let's get a summary of the dataset to understand its structure, data types, and missing values.

In [None]:
# Print a summary of `df` (per-column dtype and non-null count); the
# non-null counts are what reveal which columns contain missing values.
df.info()

### Step 3: Data Cleaning & Preprocessing

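This section contains automated steps to clean the data: missing numeric values are filled with the column mean, missing text values are filled with the column mode, and text columns are then one-hot encoded. The two fill steps modify `df` in place, while the encoding step writes its result to a new DataFrame, `df_processed`. Review the output of each cell to understand the changes made to your DataFrame.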

In [None]:
# --- Fill Missing Numeric Values ---
# Replace missing entries in every numeric column of `df` with that column's
# mean. `df` is updated in place, one column at a time.
print("--- Handling Missing Numeric Data ---")
numeric_cols = df.select_dtypes(include=np.number).columns

for col in numeric_cols:
    # Nothing to do for columns that are already complete.
    if not df[col].isnull().any():
        continue

    mean_value = df[col].mean()

    # A column with no observed values has a NaN mean; filling with NaN would
    # change nothing, so report it instead of claiming the column was filled.
    if pd.isna(mean_value):
        print(f"WARNING: Numeric column '{col}' has no non-missing values; left unfilled.")
        continue

    df[col] = df[col].fillna(mean_value)
    print(f"INFO: Missing values in numeric column '{col}' filled with mean.")

In [None]:
# --- Fill Missing Categorical Values ---
# Replace missing entries in every object-dtype column of `df` with that
# column's most frequent value. `df` is updated in place, one column at a time.
print("\n--- Handling Missing Categorical Data ---")
categorical_cols = df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    # Nothing to do for columns that are already complete.
    if not df[col].isnull().any():
        continue

    # `mode()` ignores missing values, so a column with no observed values
    # yields an empty Series; indexing it with [0] would raise.
    modes = df[col].mode()
    if modes.empty:
        print(f"WARNING: Categorical column '{col}' has no non-missing values; left unfilled.")
        continue

    # When several values tie for most frequent, the first one returned is used.
    mode_value = modes.iloc[0]
    df[col] = df[col].fillna(mode_value)
    print(f"INFO: Missing values in categorical column '{col}' filled with mode.")

In [None]:
# --- Encode All Categorical Data to Numbers ---
print("\n--- Encoding All Text Data to Numbers ---")
# Encode a copy so that `df` itself is left untouched by this step.
df_processed = df.copy()
object_cols_to_encode = df_processed.select_dtypes(include=['object']).columns
if object_cols_to_encode.empty:
    print("INFO: No text columns to encode.")
else:
    # One-hot encode every object-dtype column; drop_first=True omits the
    # first level of each column from the generated indicator columns.
    df_processed = pd.get_dummies(
        df_processed,
        columns=object_cols_to_encode,
        drop_first=True,
    )
    print("SUCCESS: All text columns have been one-hot encoded.")

### Step 4: Preprocessing Complete!
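The `df_processed` DataFrame now contains your data with missing values filled in and text columns one-hot encoded. Review it before modeling: steps such as removing duplicates, handling outliers, scaling features, and splitting into training and test sets are not covered by this template.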

In [None]:
# Preview the result: a heading line followed by the first 5 rows of
# `df_processed`, emitted with a single print call.
print("Head of the final, processed DataFrame:", df_processed.head(), sep="\n")