In [5]:
# Preprocessing — STEP 1: remove duplicates & drop identifier
import pandas as pd
pd.set_option('display.max_columns', None)

# Read the raw CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/Faculty_Attrition_Dataset.csv")

# Report the state of the raw frame before any row is removed.
print("Before cleaning:")
print("Shape:", df.shape)
print("Duplicate rows:", df.duplicated().sum())

# Fully identical rows are removed first (while the identifier is still
# present), and the index is renumbered 0..n-1 in the same call.
df = df.drop_duplicates(ignore_index=True)

# The identifier column is dropped when present; errors="ignore" makes this
# a no-op when the column is absent.
df = df.drop(columns=["faculty_id"], errors="ignore")

# Report the state after cleaning; duplicates are re-counted on the frame
# without the identifier column.
print("\nAfter removing duplicates and dropping faculty_id:")
print("Shape:", df.shape)
print("Duplicate rows (now):", df.duplicated().sum())

# Preview the first five rows.
display(df.head())


Before cleaning:
Shape: (15050, 17)
Duplicate rows: 50

After removing duplicates and dropping faculty_id:
Shape: (15000, 16)
Duplicate rows (now): 0


Unnamed: 0,academic_rank,tenure_status,years_at_institution,base_salary,teaching_load,research_funding,institution_type,department_size,admin_support,work_life_balance,department_collaboration,promotion_opportunities,publications_last_3_years,student_evaluation_avg,job_market_alternatives,left_institution
0,Associate Professor,Tenure-Track,18,117428.12,2,33793.7,Research University,Small,3.5,6.9,7.0,8.1,6,4.178,3.2,0
1,Lecturer,Tenured,4,,9,45723.79,Research University,Large,7.0,7.5,6.4,5.9,2,3.422,1.0,0
2,Full Professor,Tenured,25,155047.59,1,55494.33,Research University,Medium,3.8,6.5,5.6,5.4,10,4.056,2.2,0
3,Associate Professor,Tenured,10,96573.37,5,5637.4,Community College,Large,5.1,5.8,5.8,3.4,7,4.135,3.5,0
4,Lecturer,Non-Tenure,3,49735.05,7,2592.35,Liberal Arts College,Small,5.0,5.3,4.6,5.7,1,3.907,6.1,1


In [6]:
# Preprocessing — STEP 2: missing-value analysis, indicators, and imputation
from sklearn.impute import SimpleImputer

# NOTE(review): both imputers below are fitted on the full dataset, i.e. before
# the train/test split performed in STEP 5, so the medians/modes are computed
# with test rows included. Consider moving imputation into the pipeline that is
# fitted on the training split only.

target_col = "left_institution"
flag_suffixes = ("_missing_flag", "_outlier_flag")

# 1) Missing counts before (all columns, target included, for visibility)
print("Missing counts BEFORE:")
missing_before = df.isna().sum().sort_values(ascending=False)
display(missing_before[missing_before > 0])

# 2) Create missing-indicator columns for any FEATURE column that has missing
#    values. The target is skipped: an indicator derived from the label would
#    be a leaked feature.
cols_with_missing = [
    c for c in missing_before[missing_before > 0].index.tolist()
    if c != target_col
]
for col in cols_with_missing:
    df[f"{col}_missing_flag"] = df[col].isna().astype(int)

print(f"\nCreated missing indicator flags for: {cols_with_missing}")

# 3) Separate numeric and categorical (exclude target).
#    errors="ignore" keeps this working when the target column is absent.
features = df.drop(columns=[target_col], errors="ignore")

# Indicator columns (0/1) are numeric but must not be imputed; both flag
# suffixes are excluded so the cell also behaves if re-run after STEP 3.
num_cols = [
    c for c in features.select_dtypes(include=['number']).columns.tolist()
    if not c.endswith(flag_suffixes)
]
cat_cols = [
    c for c in features.select_dtypes(include=['object', 'category']).columns.tolist()
    if not c.endswith(flag_suffixes)
]

print("\nNumeric columns detected:", num_cols)
print("Categorical columns detected:", cat_cols)

# 4) Impute numeric with median, categorical with mode (in-place on df)
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

if num_cols:
    df[num_cols] = num_imputer.fit_transform(df[num_cols])

if cat_cols:
    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# 5) Verify no missing remain in feature columns (target should not be missing).
#    Uses the same tolerant drop as step 3, so a frame without the target
#    column no longer raises KeyError here.
print("\nMissing counts AFTER imputation (features only):")
missing_after = (
    df.drop(columns=[target_col], errors="ignore")
      .isna()
      .sum()
      .sort_values(ascending=False)
)
display(missing_after[missing_after > 0])



Missing counts BEFORE:


work_life_balance         938
student_evaluation_avg    933
research_funding          901
base_salary               895
dtype: int64


Created missing indicator flags for: ['work_life_balance', 'student_evaluation_avg', 'research_funding', 'base_salary']

Numeric columns detected: ['years_at_institution', 'base_salary', 'teaching_load', 'research_funding', 'admin_support', 'work_life_balance', 'department_collaboration', 'promotion_opportunities', 'publications_last_3_years', 'student_evaluation_avg', 'job_market_alternatives']
Categorical columns detected: ['academic_rank', 'tenure_status', 'institution_type', 'department_size']

Missing counts AFTER imputation (features only):


Series([], dtype: int64)

In [7]:
# Preprocessing — STEP 3: outlier detection, flags, winsorization (1/99 clipping), and scaling
from sklearn.preprocessing import RobustScaler

# NOTE(review): the clipping percentiles and the RobustScaler in this cell are
# fitted on the full dataset, before the train/test split in STEP 5, so test
# rows influence these statistics. STEP 5's pipeline also applies RobustScaler
# to the same columns again. Consider moving clipping/scaling into the pipeline
# that is fitted on the training split only.


def _iqr_outlier_mask(values, k=1.5):
    """Mark the entries of a numeric Series that fall outside the IQR fences.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to inspect.
    k : float, default 1.5
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.Series of bool
        True where a value is below Q1 - k*IQR or above Q3 + k*IQR.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return (values < q1 - k * iqr) | (values > q3 + k * iqr)


target = 'left_institution'

# Numeric feature columns only: the target and every 0/1 indicator column are
# excluded. Excluding "_outlier_flag" as well (as STEP 4/5 do) prevents a
# re-run of this cell from clipping/scaling the flags created below.
flag_suffixes = ("_missing_flag", "_outlier_flag")
num_cols = [
    c for c in df.select_dtypes(include=['number']).columns.tolist()
    if c != target and not c.endswith(flag_suffixes)
]

print("Numeric columns to process:", num_cols, "\n")

# 1) Show initial summary. After the transpose, rows are the numeric columns
#    and .iloc[:, :6] keeps the first six statistics.
print("Initial numeric summary (count, mean, std, min, 25%, 50%):")
display(df[num_cols].describe().T.iloc[:, :6])

# 2) IQR-based outlier detection: compute each mask once and reuse it for both
#    the counts and the flag columns.
outlier_masks = {col: _iqr_outlier_mask(df[col]) for col in num_cols}
outlier_counts = {col: int(mask.sum()) for col, mask in outlier_masks.items()}

outlier_counts_sorted = dict(sorted(outlier_counts.items(), key=lambda x: x[1], reverse=True))
print("Outlier counts (IQR rule) per numeric column:")
for k, v in outlier_counts_sorted.items():
    print(f"  {k}: {v}")

# 3) Create outlier flags for columns that have >0 outliers. Flags are taken
#    from the pre-clipping values, so they record what step 4 will cap.
outlier_flag_cols = []
for col, cnt in outlier_counts.items():
    if cnt > 0:
        flag_col = f"{col}_outlier_flag"
        df[flag_col] = outlier_masks[col].astype(int)
        outlier_flag_cols.append(flag_col)

print("\nCreated outlier flag columns for:", outlier_flag_cols)

# 4) Winsorize / cap values at 1st and 99th percentiles (adjustable)
lower_p = 0.01
upper_p = 0.99
caps = {}  # column -> (low, high) bounds actually applied
for col in num_cols:
    low = df[col].quantile(lower_p)
    high = df[col].quantile(upper_p)
    caps[col] = (low, high)
    df[col] = df[col].clip(lower=low, upper=high)

print("\nApplied clipping at 1st and 99th percentiles for numeric columns.")

# 5) Scaling with RobustScaler (in-place on df)
scaler = RobustScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

print("\nNumeric summary after clipping + Robust scaling (count, mean, std, min, 25%, 50%):")
display(df[num_cols].describe().T.iloc[:, :6])

# 6) verify outlier flags still meaningful
if outlier_flag_cols:
    print("\nSample rows where outlier flags are 1 (first 10):")
    display(df[df[outlier_flag_cols].any(axis=1)].head(10))
else:
    print("\nNo outlier flags created (no IQR outliers).")

# 7) Report the feature lists for later pipeline use. Nothing is written to
#    disk here; `num_cols`, `outlier_flag_cols`, `caps` and `scaler` simply
#    remain available as notebook variables.
print("\nSaved lists for pipeline:")
print("  numeric_features:", num_cols)
print("  outlier_flag_columns:", outlier_flag_cols)


Numeric columns to process: ['years_at_institution', 'base_salary', 'teaching_load', 'research_funding', 'admin_support', 'work_life_balance', 'department_collaboration', 'promotion_opportunities', 'publications_last_3_years', 'student_evaluation_avg', 'job_market_alternatives'] 

Initial numeric summary (first 8 cols):


Unnamed: 0,count,mean,std,min,25%,50%
years_at_institution,15000.0,9.807667,6.67638,0.0,5.0,8.0
base_salary,15000.0,88129.336899,30601.001873,14498.580132,66565.3125,82789.7
teaching_load,15000.0,4.435533,2.711642,0.0,2.0,4.0
research_funding,15000.0,22466.102224,27554.471576,0.0,5882.3225,12470.71
admin_support,15000.0,6.00076,1.502356,1.0,5.0,6.0
work_life_balance,15000.0,6.0047,1.71766,1.0,4.9,6.0
department_collaboration,15000.0,5.999293,1.499626,1.0,5.0,6.0
promotion_opportunities,15000.0,5.497807,1.961775,1.0,4.1,5.5
publications_last_3_years,15000.0,3.259133,2.508041,0.0,1.0,3.0
student_evaluation_avg,15000.0,3.924373,0.320363,2.499,3.719,3.923


Outlier counts (IQR rule) per numeric column:
  research_funding: 824
  years_at_institution: 499
  student_evaluation_avg: 183
  base_salary: 117
  work_life_balance: 116
  publications_last_3_years: 71
  department_collaboration: 55
  admin_support: 48
  teaching_load: 0
  promotion_opportunities: 0
  job_market_alternatives: 0

Created outlier flag columns for: ['years_at_institution_outlier_flag', 'base_salary_outlier_flag', 'research_funding_outlier_flag', 'admin_support_outlier_flag', 'work_life_balance_outlier_flag', 'department_collaboration_outlier_flag', 'publications_last_3_years_outlier_flag', 'student_evaluation_avg_outlier_flag']

Applied clipping at 1st and 99th percentiles for numeric columns.

Numeric summary after clipping + Robust scaling (first 8 cols):


Unnamed: 0,count,mean,std,min,25%,50%
years_at_institution,15000.0,0.223467,0.826199,-1.0,-0.375,0.0
base_salary,15000.0,0.124868,0.719768,-1.109425,-0.413473,0.0
teaching_load,15000.0,0.108883,0.67791,-1.0,-0.5,0.0
research_funding,15000.0,0.380111,0.915716,-0.508032,-0.268398,0.0
admin_support,15000.0,0.001207,0.741215,-1.75,-0.5,0.0
work_life_balance,15000.0,0.004148,0.775436,-1.909091,-0.5,0.0
department_collaboration,15000.0,0.000605,0.739422,-1.7505,-0.5,0.0
promotion_opportunities,15000.0,-0.000812,0.726583,-1.666667,-0.518519,0.0
publications_last_3_years,15000.0,0.0604,0.612823,-0.75,-0.5,0.0
student_evaluation_avg,15000.0,0.003484,0.763376,-1.881092,-0.495146,0.0



Sample rows where outlier flags are 1 (first 10):


Unnamed: 0,academic_rank,tenure_status,years_at_institution,base_salary,teaching_load,research_funding,institution_type,department_size,admin_support,work_life_balance,department_collaboration,promotion_opportunities,publications_last_3_years,student_evaluation_avg,job_market_alternatives,left_institution,work_life_balance_missing_flag,student_evaluation_avg_missing_flag,research_funding_missing_flag,base_salary_missing_flag,years_at_institution_outlier_flag,base_salary_outlier_flag,research_funding_outlier_flag,admin_support_outlier_flag,work_life_balance_outlier_flag,department_collaboration_outlier_flag,publications_last_3_years_outlier_flag,student_evaluation_avg_outlier_flag
36,Full Professor,Tenured,2.25,1.964577,-0.75,2.096292,Research University,Large,-1.0,-0.318182,-0.4,0.888889,1.5,0.300971,-1.0,0,0,0,0,0,1,1,0,0,0,0,0,0
41,Full Professor,Tenured,2.5,1.575267,-0.5,1.66354,Research University,Medium,-1.05,-0.545455,-1.2,0.0,0.5,0.473301,1.714286,0,0,0,0,0,1,0,0,0,0,0,0,0
50,Associate Professor,Tenure-Track,0.875,0.0,-0.75,3.155891,Research University,Small,-0.3,0.454545,1.65,-0.037037,0.5,0.019417,-0.428571,0,0,0,0,1,0,0,1,0,0,0,0,0
56,Full Professor,Non-Tenure,2.125,-0.659365,-0.25,3.853506,Research University,Medium,-0.7,-0.227273,1.05,0.851852,0.0,0.029126,0.035714,0,0,0,0,0,0,0,1,0,0,0,0,0
72,Assistant Professor,Tenure-Track,-0.625,0.030915,-0.75,2.482802,Research University,Small,1.5,0.545455,-1.15,0.111111,0.5,0.004854,-1.178571,0,0,0,0,0,0,0,1,0,0,0,0,0
77,Full Professor,Tenured,2.375,1.693708,0.0,0.0,Research University,Small,-0.1,0.545455,0.8,1.666667,0.5,0.276699,-0.142857,0,0,0,1,0,1,0,0,0,0,0,0,0
95,Lecturer,Non-Tenure,-0.375,-0.759786,0.5,-0.411844,Liberal Arts College,Medium,0.1,-1.727273,-1.7505,-0.074074,-0.25,-0.48301,-0.785714,0,0,0,0,0,0,0,0,0,0,1,0,0
139,Full Professor,Tenured,2.625,1.237682,0.0,0.111032,Technical Institute,Small,-0.3,0.0,-0.05,-1.222222,0.25,-0.558252,0.071429,0,1,0,0,0,1,0,0,0,0,0,0,0
148,Full Professor,Tenured,2.25,1.390242,0.25,-0.016479,Liberal Arts College,Medium,-0.95,-0.409091,-0.1,-0.62963,1.25,-0.788835,0.285714,0,0,0,0,0,1,0,0,0,0,0,0,0
154,Assistant Professor,Tenure-Track,0.0,-0.221615,-0.25,3.886999,Research University,Medium,-0.2,1.045455,0.95,0.740741,-0.25,0.0,1.25,0,0,1,0,0,0,0,1,0,0,0,0,0



Saved lists for pipeline:
  numeric_features: ['years_at_institution', 'base_salary', 'teaching_load', 'research_funding', 'admin_support', 'work_life_balance', 'department_collaboration', 'promotion_opportunities', 'publications_last_3_years', 'student_evaluation_avg', 'job_market_alternatives']
  outlier_flag_columns: ['years_at_institution_outlier_flag', 'base_salary_outlier_flag', 'research_funding_outlier_flag', 'admin_support_outlier_flag', 'work_life_balance_outlier_flag', 'department_collaboration_outlier_flag', 'publications_last_3_years_outlier_flag', 'student_evaluation_avg_outlier_flag']


In [8]:
# Preprocessing — STEP 4: Identify and prepare categorical columns for OneHotEncoding
from sklearn.preprocessing import OneHotEncoder

target = "left_institution"


def _columns_ending_with(frame, suffix):
    """Return the column names of `frame` that end with `suffix`, in frame order."""
    return [name for name in frame.columns if name.endswith(suffix)]


# Text-like columns (object / category dtype) are the ones to be encoded.
categorical_cols = list(df.select_dtypes(include=["object", "category"]).columns)

print("Categorical columns detected:")
print(categorical_cols, "\n")

# List the distinct levels of every categorical column.
separator = "-" * 50
for cat_name in categorical_cols:
    print(f"Column: {cat_name}")
    print(" Unique values:", df[cat_name].unique())
    print(separator)

# Encoding itself is deferred to the ColumnTransformer built in the next step.
print("\nThese columns will be OneHotEncoded as part of the pipeline.")

# Indicator columns are pulled out by suffix ...
missing_flag_cols = _columns_ending_with(df, "_missing_flag")
outlier_flag_cols = _columns_ending_with(df, "_outlier_flag")

# ... and the remaining numeric columns, minus the target, are the numeric
# features proper.
not_a_feature = set(missing_flag_cols) | set(outlier_flag_cols) | {target}
numeric_cols = [
    name
    for name in df.select_dtypes(include=["number"]).columns
    if name not in not_a_feature
]

print("\nFinal feature groups:")
print(" Numeric columns:", numeric_cols)
print(" Missing-flag columns:", missing_flag_cols)
print(" Outlier-flag columns:", outlier_flag_cols)
print(" Categorical columns:", categorical_cols)


Categorical columns detected:
['academic_rank', 'tenure_status', 'institution_type', 'department_size'] 

Column: academic_rank
 Unique values: ['Associate Professor' 'Lecturer' 'Full Professor' 'Assistant Professor']
--------------------------------------------------
Column: tenure_status
 Unique values: ['Tenure-Track' 'Tenured' 'Non-Tenure']
--------------------------------------------------
Column: institution_type
 Unique values: ['Research University' 'Community College' 'Liberal Arts College'
 'Technical Institute']
--------------------------------------------------
Column: department_size
 Unique values: ['Small' 'Large' 'Medium']
--------------------------------------------------

These columns will be OneHotEncoded as part of the pipeline.

Final feature groups:
 Numeric columns: ['years_at_institution', 'base_salary', 'teaching_load', 'research_funding', 'admin_support', 'work_life_balance', 'department_collaboration', 'promotion_opportunities', 'publications_last_3_years', 

In [9]:
# STEP 5 — Train/Test Split + Build Preprocessor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
target = "left_institution"

# Feature matrix and label vector.
y = df[target]
X = df.drop(columns=[target])

print("X shape:", X.shape)
print("y shape:", y.shape)

# 80/20 hold-out split, stratified on the label so both parts keep the same
# class balance; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("\nTraining set:", X_train.shape)
print("Test set:", X_test.shape)
for split_name, labels in (("train", y_train), ("test", y_test)):
    print(f"Target distribution ({split_name}):")
    print(labels.value_counts(normalize=True))


# Group the training columns by role. Indicator columns are numeric by dtype,
# so they are identified by suffix and kept out of the scaled numeric group.
missing_flag_cols = [name for name in X_train.columns if name.endswith("_missing_flag")]
outlier_flag_cols = [name for name in X_train.columns if name.endswith("_outlier_flag")]
flag_cols = set(missing_flag_cols) | set(outlier_flag_cols)

categorical_cols = list(X_train.select_dtypes(include=["object", "category"]).columns)
numeric_cols = [
    name
    for name in X_train.select_dtypes(include=["number"]).columns
    if name not in flag_cols
]

print("\nCategorical columns:", categorical_cols)
print("Numeric columns:", numeric_cols)
print("Missing-flag columns:", missing_flag_cols)
print("Outlier-flag columns:", outlier_flag_cols)


# Numeric features are robust-scaled; categoricals are one-hot encoded, with
# categories unseen at fit time ignored at transform time.
numeric_pipeline = Pipeline(steps=[("scaler", RobustScaler())])
categorical_pipeline = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

# Flag columns are forwarded unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols),
        ("flags", "passthrough", missing_flag_cols + outlier_flag_cols),
    ]
)

# Fit on the training split only.
preprocessor.fit(X_train)

print("\nPreprocessor fitted on training data.")


X shape: (15000, 27)
y shape: (15000,)

Training set: (12000, 27)
Test set: (3000, 27)
Target distribution (train):
left_institution
0    0.867917
1    0.132083
Name: proportion, dtype: float64
Target distribution (test):
left_institution
0    0.868
1    0.132
Name: proportion, dtype: float64

Categorical columns: ['academic_rank', 'tenure_status', 'institution_type', 'department_size']
Numeric columns: ['years_at_institution', 'base_salary', 'teaching_load', 'research_funding', 'admin_support', 'work_life_balance', 'department_collaboration', 'promotion_opportunities', 'publications_last_3_years', 'student_evaluation_avg', 'job_market_alternatives']
Missing-flag columns: ['work_life_balance_missing_flag', 'student_evaluation_avg_missing_flag', 'research_funding_missing_flag', 'base_salary_missing_flag']
Outlier-flag columns: ['years_at_institution_outlier_flag', 'base_salary_outlier_flag', 'research_funding_outlier_flag', 'admin_support_outlier_flag', 'work_life_balance_outlier_flag',