In [2]:
# ==========================
# PREPROCESSING - CELL 1
# ==========================

import pandas as pd
pd.set_option("display.max_columns", None)

# Read the raw faculty attrition table from disk.
df = pd.read_csv("../data/Faculty_Attrition_Dataset.csv")
print("Initial shape:", df.shape)

# 1. Count rows that are identical across every column, then keep only the
#    first occurrence of each and renumber the index from zero.
exact_dupes = df.duplicated().sum()
print("Exact duplicate rows:", exact_dupes)
df = df.drop_duplicates(ignore_index=True)

# 2. Count repeated faculty_id values (reported only; nothing is removed here).
id_dupes = df.duplicated(subset=["faculty_id"]).sum()
print("Duplicate faculty_id:", id_dupes)

# faculty_id stays in the frame for now; it is dropped in a later cell.
display(df.head(5))


Initial shape: (15000, 17)
Exact duplicate rows: 0
Duplicate faculty_id: 0


Unnamed: 0,faculty_id,academic_rank,tenure_status,years_at_institution,base_salary,teaching_load,research_funding,institution_type,department_size,admin_support,work_life_balance,department_collaboration,promotion_opportunities,publications_last_3_years,student_evaluation_avg,job_market_alternatives,left_institution
0,1,Associate Professor,Tenure-Track,3,70974.758663,6,9960.082453,Community College,Medium,1,3,8,1,7,3.520074,9,0
1,2,Full Professor,Non-Tenure,16,69482.774557,3,108309.53613,Liberal Arts College,Medium,3,3,2,9,4,3.811163,4,0
2,3,Lecturer,Non-Tenure,19,91434.392986,2,10044.46801,Research University,Medium,9,5,9,3,4,3.539667,7,0
3,4,Associate Professor,Non-Tenure,24,65850.710135,6,4159.726828,Research University,Small,5,7,6,1,5,4.351417,9,1
4,5,Associate Professor,Non-Tenure,39,30000.0,7,10718.111809,Community College,Small,6,6,8,8,5,3.932108,2,0


In [3]:
# ==========================
# PREPROCESSING - CELL 2
# Train-test split BEFORE preprocessing
# ==========================

from sklearn.model_selection import train_test_split

target = "left_institution"

# Separate the label column from the feature columns.
y = df[target]
X = df.drop(columns=[target])

# Stratified 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

# Remove faculty_id from both partitions so it is never used as a feature.
X_train, X_test = (
    part.drop(columns=["faculty_id"]) for part in (X_train, X_test)
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("\nTrain target distribution:")
print(y_train.value_counts(normalize=True))



Train shape: (12000, 15)
Test shape: (3000, 15)

Train target distribution:
left_institution
0    0.726667
1    0.273333
Name: proportion, dtype: float64


In [4]:
# ==========================
# PREPROCESSING - CELL 3
# Identify numeric & categorical columns
# ==========================

# Partition the training feature names by dtype; only X_train is inspected,
# so the test partition plays no part in deciding the column groups.
numeric_cols = X_train.select_dtypes(include=["number"]).columns.to_list()
categorical_cols = (
    X_train.select_dtypes(include=["object", "category"]).columns.to_list()
)

print("Categorical columns:", categorical_cols)
print("Numeric columns:", numeric_cols)


Categorical columns: ['academic_rank', 'tenure_status', 'institution_type', 'department_size']
Numeric columns: ['years_at_institution', 'base_salary', 'teaching_load', 'research_funding', 'admin_support', 'work_life_balance', 'department_collaboration', 'promotion_opportunities', 'publications_last_3_years', 'student_evaluation_avg', 'job_market_alternatives']


In [5]:
# ==========================
# PREPROCESSING - CELL 4
# Ensure no previous flags exist
# ==========================

# Abort early if the raw features already carry engineered flag columns;
# the preprocessors defined below generate "<col>_missing_flag" names themselves.
if any(
    name.endswith(("_missing_flag", "_outlier_flag")) for name in X_train.columns
):
    raise ValueError("Dataset contains leftover flag columns — remove them from raw data.")


In [6]:
# ==========================
# PREPROCESSING - CELL 5
# Custom numeric preprocessing transformer
# ==========================

from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class NumericPreprocessor(BaseEstimator, TransformerMixin):
    """Median-impute, quantile-clip and missing-flag a set of numeric columns.

    Parameters
    ----------
    columns : sequence of str or None, default=None
        Names of the columns to process. When ``None`` the notebook-level
        ``numeric_cols`` list is read once, at ``fit`` time.
    lower_quantile : float, default=0.01
        Quantile of the fitting data used as the lower clipping bound.
    upper_quantile : float, default=0.99
        Quantile of the fitting data used as the upper clipping bound.

    Attributes
    ----------
    columns_ : list
        Column names captured by ``fit``; ``transform`` reuses exactly these,
        so redefining ``numeric_cols`` after fitting cannot desynchronise them.
    medians : dict
        Column name -> median of that column in the fitting data.
    clip_vals : dict
        Column name -> ``(low, high)`` clipping bounds from the fitting data.
    """

    def __init__(self, columns=None, lower_quantile=0.01, upper_quantile=0.99):
        # Constructor arguments are stored untouched (scikit-learn convention,
        # required for get_params / clone to work).
        self.columns = columns
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile
        # Learned state; replaced wholesale on every fit().
        self.medians = {}
        self.clip_vals = {}

    def fit(self, X, y=None):
        """Learn per-column medians and clipping bounds from ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Raises ``ValueError`` if the quantiles are not ordered within [0, 1].
        Returns ``self``.
        """
        if not 0.0 <= self.lower_quantile <= self.upper_quantile <= 1.0:
            raise ValueError(
                "Expected 0 <= lower_quantile <= upper_quantile <= 1, got "
                f"{self.lower_quantile!r} and {self.upper_quantile!r}."
            )

        cols = list(numeric_cols if self.columns is None else self.columns)
        X = pd.DataFrame(X, columns=cols)

        # Build fresh dicts so a refit never keeps entries from an earlier fit.
        self.medians = {col: X[col].median() for col in cols}
        self.clip_vals = {
            col: (
                X[col].quantile(self.lower_quantile),
                X[col].quantile(self.upper_quantile),
            )
            for col in cols
        }
        self.columns_ = cols
        return self

    def transform(self, X):
        """Return the imputed and clipped columns followed by their missing flags.

        The result is a DataFrame holding the fitted columns and, after them,
        one ``<col>_missing_flag`` 0/1 column per fitted column.
        """
        cols = self.columns_

        # pd.DataFrame(...) does not copy DataFrame/ndarray input by default;
        # copy explicitly so the assignments below cannot modify the caller's data.
        X = pd.DataFrame(X, columns=cols).copy()

        # Flags must be taken before imputation removes the NaNs.
        missing_flags = X.isna().astype(int).add_suffix("_missing_flag")

        for col in cols:
            low, high = self.clip_vals[col]
            X[col] = X[col].fillna(self.medians[col]).clip(low, high)

        return pd.concat([X, missing_flags], axis=1)


In [7]:
# ==========================
# PREPROCESSING - CELL 6
# Categorical Preprocessor
# ==========================

class CategoricalPreprocessor(BaseEstimator, TransformerMixin):
    """Mode-impute a set of categorical columns and append missing flags.

    Parameters
    ----------
    columns : sequence of str or None, default=None
        Names of the columns to process. When ``None`` the notebook-level
        ``categorical_cols`` list is read once, at ``fit`` time.

    Attributes
    ----------
    columns_ : list
        Column names captured by ``fit``; ``transform`` reuses exactly these.
    fill_values : dict
        Column name -> most frequent value of that column in the fitting data.
    """

    def __init__(self, columns=None):
        # Constructor argument is stored untouched (scikit-learn convention).
        self.columns = columns
        # Learned state; replaced wholesale on every fit().
        self.fill_values = {}

    def fit(self, X, y=None):
        """Learn the most frequent value of every column in ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Raises ``ValueError`` if a column has no non-missing values, because
        no mode exists to impute with. Returns ``self``.
        """
        cols = list(categorical_cols if self.columns is None else self.columns)
        X = pd.DataFrame(X, columns=cols)

        fill_values = {}
        for col in cols:
            modes = X[col].mode()
            if modes.empty:
                raise ValueError(
                    f"Cannot compute a mode for column {col!r}: "
                    "it has no non-missing values."
                )
            # mode() lists ties in sorted order; take the first, as before.
            fill_values[col] = modes.iloc[0]

        # Assign only after every column succeeded, replacing any earlier fit.
        self.fill_values = fill_values
        self.columns_ = cols
        return self

    def transform(self, X):
        """Return the imputed columns followed by their missing flags.

        The result is a DataFrame holding the fitted columns and, after them,
        one ``<col>_missing_flag`` 0/1 column per fitted column.
        """
        cols = self.columns_

        # pd.DataFrame(...) does not copy DataFrame/ndarray input by default;
        # copy explicitly so the assignments below cannot modify the caller's data.
        X = pd.DataFrame(X, columns=cols).copy()

        # Flags must be taken before imputation removes the NaNs.
        missing_flags = X.isna().astype(int).add_suffix("_missing_flag")

        for col in cols:
            X[col] = X[col].fillna(self.fill_values[col])

        return pd.concat([X, missing_flags], axis=1)


In [8]:
# ==========================
# PREPROCESSING - CELL 7
# Build the FULL ColumnTransformer
# ==========================

from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric branch: custom impute/clip/flag step followed by RobustScaler.
# NOTE(review): the *_missing_flag columns emitted by NumericPreprocessor are
# passed through RobustScaler as well — confirm that is intended.
numeric_pipeline = Pipeline(
    steps=[
        ("num_pre", NumericPreprocessor()),
        ("scale", RobustScaler()),
    ]
)

# Categorical branch: custom impute/flag step followed by one-hot encoding.
# NOTE(review): the *_missing_flag columns emitted by CategoricalPreprocessor
# are one-hot encoded too, with unseen values ignored — confirm that is intended.
categorical_pipeline = Pipeline(
    steps=[
        ("cat_pre", CategoricalPreprocessor()),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# All statistics are learned from the training partition only.
preprocessor.fit(X_train)

print("Preprocessor fitted successfully!")


Preprocessor fitted successfully!


In [9]:
# ==========================
# PREPROCESSING - CELL 8
# Test preprocessing transformation
# ==========================

# Apply the already-fitted preprocessor to the training features.
X_train_processed = preprocessor.transform(X_train)

# Sanity check: preprocessing must neither drop nor duplicate rows, otherwise
# the processed matrix would no longer line up with y_train.
assert X_train_processed.shape[0] == len(X_train), (
    "row count changed during preprocessing"
)

print("Processed shape:", X_train_processed.shape)


Processed shape: (12000, 40)
