In [None]:
# Comprehensive Data Preprocessing
import pandas as pd
import numpy as np

# Load the dataset
file_path = 'hist_nv.csv'
df = pd.read_csv(file_path)

# Show initial info
display(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() would only add a stray "None" to the cell output.
df.info()

# 1. Handle missing values
# Count the missing entries in each column before any imputation happens.
missing_summary = df.isnull().sum()
print('Missing values per column:')
print(missing_summary)

# Fill missing numeric values with median, categorical with mode
def fill_missing(df):
    """Impute missing values column by column, modifying ``df`` in place.

    Numeric columns of any width (including pandas nullable dtypes) are
    filled with the column median; every other column, booleans included,
    is filled with its most frequent value. Columns without missing values,
    and columns with no observed value to derive a fill from, are left
    untouched.

    Args:
        df: DataFrame to clean. Its columns are overwritten in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    for col in df.columns:
        series = df[col]
        if not series.isna().any():
            continue  # nothing to impute

        is_numeric = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )
        if is_numeric:
            fill_value = series.median()
            if pd.isna(fill_value):
                continue  # column is entirely missing: no median exists
            if pd.api.types.is_integer_dtype(series):
                # Nullable integer columns cannot hold a fractional median.
                fill_value = int(round(fill_value))
        else:
            modes = series.mode()
            if modes.empty:
                continue  # column is entirely missing: no mode exists
            fill_value = modes.iloc[0]

        df[col] = series.fillna(fill_value)
    return df

# Run the imputation step; pipe() simply calls fill_missing(df).
df = df.pipe(fill_missing)

# 2. Convert data types if needed
def convert_types(df):
    """Convert every column that parses cleanly as numbers to a numeric dtype.

    Each column is passed through ``pd.to_numeric``. Columns containing any
    value that cannot be parsed are left exactly as they were. The frame is
    modified in place.

    Args:
        df: DataFrame whose columns should be converted where possible.

    Returns:
        The same DataFrame object that was passed in.
    """
    for col in df.columns:
        # errors='ignore' is deprecated since pandas 2.2; catching the
        # failure explicitly keeps the "leave unconvertible columns alone"
        # behaviour without relying on it.
        try:
            converted = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            continue
        df[col] = converted
    return df

# Run the numeric-conversion step; pipe() simply calls convert_types(df).
df = df.pipe(convert_types)

# 3. Strip whitespace from string columns
# Done before de-duplication so that rows differing only by stray padding
# are recognised as duplicates.
for col in df.select_dtypes(include=['object']).columns:
    # Only real strings are stripped: the .str accessor would turn the
    # non-string values of a mixed object column into NaN.
    df[col] = df[col].map(lambda v: v.strip() if isinstance(v, str) else v)

# 4. Remove duplicates
df = df.drop_duplicates()

# 5. Standardize column names
# str() guards against non-string labels (e.g. integer headers).
df.columns = [str(c).strip().lower().replace(' ', '_') for c in df.columns]

# Show cleaned data
display(df.head())
display(df.describe(include='all'))