In [1]:
import pandas as pd
import numpy as np

In [2]:
TRAINING = pd.read_csv("train.csv")
TESTING = pd.read_csv("test.csv") # Without Label
AUGMENTED = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv") # Additional data to augment training

In [3]:
# Drop ID columns
TRAINING = TRAINING.drop(columns=['id'])

# Drop duplplicates if it exists
TRAINING = TRAINING.drop_duplicates(subset=None, keep='first', inplace=False)
AUGMENTED = AUGMENTED.drop_duplicates(subset=None, keep='first', inplace=False)

# Make Attrition for Augmented be 0 and 1
AUGMENTED['Attrition'] = AUGMENTED['Attrition'].replace({'Yes':1,'No':0})

In [4]:
def dropOneDim(data: pd.DataFrame, Label: str) -> (pd.DataFrame, list):
    one_dimensional_cols = [col for col in data.columns if data[col].nunique() == 1]
    print("Dropped columns:", one_dimensional_cols)  # Print dropped columns

    # Filter out one-dimensional columns
    remaining_columns = [col for col in data.columns if col not in one_dimensional_cols]
    
    # Exclude the label column from remaining columns
    remaining_columns = [col for col in remaining_columns if col != Label]

    # Return updated DataFrame and remaining columns
    return data[remaining_columns + [Label]], remaining_columns

In [5]:
# Drop one-dimensional features
TRAINING, remaining_columns = dropOneDim(TRAINING, "Attrition")
TESTING = TESTING[remaining_columns]
AUGMENTED, remaining_columns = dropOneDim(AUGMENTED, "Attrition")

Dropped columns: ['EmployeeCount', 'Over18', 'StandardHours']
Dropped columns: ['EmployeeCount', 'Over18', 'StandardHours']


In [6]:
ORDER = ["Non-Travel", "Travel_Rarely", "Travel_Frequently"]
TRAINING["BusinessTravel"] = pd.Categorical(TRAINING["BusinessTravel"], categories=ORDER, ordered=True)
AUGMENTED["BusinessTravel"] = pd.Categorical(AUGMENTED["BusinessTravel"], categories=ORDER, ordered=True)
TESTING["BusinessTravel"] = pd.Categorical(TESTING["BusinessTravel"], categories=ORDER, ordered=True)
# Encode as integers
TRAINING["BusinessTravel"] = TRAINING["BusinessTravel"].cat.codes.replace(-1, None)
AUGMENTED["BusinessTravel"] = AUGMENTED["BusinessTravel"].cat.codes.replace(-1, None)
TESTING["BusinessTravel"] = TESTING["BusinessTravel"].cat.codes.replace(-1, None)


In [7]:
NUMERICS = TRAINING.select_dtypes(include=["number"]).columns.tolist()
CATEGORICALS = TRAINING.select_dtypes(include=["object", "category"]).columns.tolist()
print (f'Categorical features: {CATEGORICALS}')
print (f'Numeric Features: {NUMERICS}')

Categorical features: ['Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
Numeric Features: ['Age', 'BusinessTravel', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Attrition']


In [8]:
def impute_median(df, exclude_columns=[]):
    numeric_features = df.select_dtypes(include=['number']).columns
    numeric_features = [col for col in numeric_features if col not in exclude_columns]
    for column in numeric_features:
        median_value = df[column].median()
        df[column].fillna(median_value, inplace=True)

# Apply on df
impute_median(TRAINING, exclude_columns=['Attrition'])
impute_median(TESTING)
impute_median(AUGMENTED, exclude_columns=['Attrition'])

In [9]:
def cap_outliers(df, exclude_columns=[], lower_quantile=0.01, upper_quantile=0.99):
    numeric_features = df.select_dtypes(include=['number']).columns
    numeric_features = [col for col in numeric_features if col not in exclude_columns]
    for column in numeric_features:
        lower_bound = df[column].quantile(lower_quantile)
        upper_bound = df[column].quantile(upper_quantile)
        df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)

# Apply on DF
cap_outliers(TRAINING, exclude_columns=['Label'])
cap_outliers(TESTING)
cap_outliers(AUGMENTED, exclude_columns=['Label'])

In [10]:
# Dropping Employee number from Augmented dataframe
AUGMENTED = AUGMENTED.drop(columns=['EmployeeNumber'])

In [11]:
# Derive new features
def feature_extraction(df: pd.DataFrame) -> pd.DataFrame:
    # df['AgeGroup'] = pd.cut(df['Age'], bins=[20, 30, 40, 50], labels=['20-30', '31-40', '41-50'])
    df['Tenure'] = df['Age'] - df['YearsAtCompany']
    df['IncomePerYear'] = df['MonthlyIncome'] / df['YearsAtCompany']
    df['JobSatisfactionRatio'] = df['JobSatisfaction'] / df['YearsAtCompany']
    # df['DistanceCategory'] = pd.cut(df['DistanceFromHome'], bins=[0, 10, 20, 30], labels=['0-10 km', '11-20 km', '21-30 km'])
    df['TotalTrainingTime'] = df['TrainingTimesLastYear'] * df['YearsAtCompany']
    df['AverageMonthlyRate'] = df['MonthlyRate'] / df['YearsAtCompany']
    df['JobRoleTenure'] = df['YearsInCurrentRole'] / df['YearsAtCompany']
    df['PromotionFrequency'] = df['YearsSinceLastPromotion'] / df['YearsAtCompany']

    return df

In [12]:
# Apply the function on dataset
NEW_TRAINING = feature_extraction(TRAINING)
NEW_TESTING = feature_extraction(TESTING)
NEW_AUGMENTED = feature_extraction(AUGMENTED)

In [13]:
# Perform one-hot encoding on categorical features
NEW_TRAINING = pd.get_dummies(NEW_TRAINING, columns=CATEGORICALS)
NEW_TESTING = pd.get_dummies(NEW_TESTING, columns=CATEGORICALS)
NEW_AUGMENTED = pd.get_dummies(NEW_AUGMENTED, columns=CATEGORICALS)

In [14]:
NEW_TRAINING = NEW_TRAINING.dropna()
NEW_AUGMENTED = NEW_AUGMENTED.dropna()

In [42]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, mean_squared_error
import tensorflow as ts
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers  import Adam, SGD
from scikeras.wrappers import KerasClassifier

In [38]:
def create_model(optimizer='adam', activation='relu', hidden_units=64):
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],)))
    model.add(Dense(hidden_units, activation=activation))
    model.add(Dense(hidden_units, activation=activation))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['AUC'])
    return model



In [39]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'optimizer': ['adam', 'sgd'],
    'model__activation': ['relu', 'tanh'],
    # 'dropout_rate': [0.2, 0.3],
    'model__hidden_units': [32, 64, 128]
}

In [43]:
# Initialize classifier
model = KerasClassifier(model= create_model, verbose = 0)

# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store results
results = []

In [44]:
# Perform K-fold stratified sampling and augment the data
for fold, (train_index, test_index) in enumerate(skf.split(NEW_TRAINING, NEW_TRAINING['Attrition'])):
    print(f"Processing fold {fold + 1}...")
    # Split the training data into train and validation sets
    train_fold = NEW_TRAINING.iloc[train_index]
    val_fold = NEW_TRAINING.iloc[test_index]
    
    # Augment the training data with the augmentation data
    train_fold_augmented = pd.concat([train_fold, NEW_AUGMENTED], ignore_index=True)
    
    X_train = train_fold_augmented.drop('Attrition', axis=1)
    y_train = train_fold_augmented['Attrition']
    X_val = val_fold.drop('Attrition', axis=1)
    y_val = val_fold['Attrition']
    
       
    # Perform grid search
    grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='roc_auc', cv=skf)
    grid_result = grid.fit(X_train, y_train)
    
    # Get the best model
    best_model = grid_result.best_estimator_
    
    # Evaluate the best model on the validation set
    y_pred_val = best_model.predict(X_val)
    y_pred_proba_val = best_model.predict_proba(X_val)[:, 1]
    
    auc_val = roc_auc_score(y_val, y_pred_proba_val)
    f1_val = f1_score(y_val, y_pred_val)
    accuracy_val = accuracy_score(y_val, y_pred_val)
    mse_val = mean_squared_error(y_val, y_pred_val)
    
    results.append({
        'Fold': fold + 1,
        'Model': 'Neural Network',
        'Best Hyperparameters': grid_result.best_params_,
        'AUC': auc_val,
        'F1': f1_val,
        'Accuracy': accuracy_val,
        'MSE': mse_val
    })

Processing fold 1...
Processing fold 2...
Processing fold 3...
Processing fold 4...
Processing fold 5...


In [45]:
# Create a DataFrame to display the results
results_df = pd.DataFrame(results)
print(results_df)

   Fold           Model                               Best Hyperparameters  \
0     1  Neural Network  {'model__activation': 'tanh', 'model__hidden_u...   
1     2  Neural Network  {'model__activation': 'tanh', 'model__hidden_u...   
2     3  Neural Network  {'model__activation': 'tanh', 'model__hidden_u...   
3     4  Neural Network  {'model__activation': 'tanh', 'model__hidden_u...   
4     5  Neural Network  {'model__activation': 'tanh', 'model__hidden_u...   

        AUC   F1  Accuracy       MSE  
0  0.682286  0.0  0.889231  0.110769  
1  0.602172  0.0  0.889231  0.110769  
2  0.565785  0.0  0.886154  0.113846  
3  0.698929  0.0  0.888889  0.111111  
4  0.644097  0.0  0.888889  0.111111  


In [None]:
# Retrain the model on the entire training data using the best hyperparameters from the best fold
best_params = results_df.loc[results_df['AUC'].idxmax(), 'Best Hyperparameters']
full_train_data = pd.concat([NEW_TRAINING, NEW_AUGMENTED], ignore_index=True)
X_full_train = full_train_data.drop('Attrition', axis=1)
y_full_train = full_train_data['Attrition']

In [None]:
# Create and train the final model
final_model = create_model(learning_rate=best_params['learning_rate'], dropout_rate=best_params['dropout_rate'])
final_model.fit(X_full_train, y_full_train, epochs=best_params['epochs'], batch_size=best_params['batch_size'], verbose=0)


In [None]:
# Evaluate on the test set
y_pred_test = final_model.predict(X_test)
y_pred_proba_test = final_model.predict_proba(X_test)

In [None]:
test = pd.read_csv("test.csv")

# Combine the id column with the predicted probabilities and save as CSV
id_column = test['id']

predictions_df = pd.DataFrame({
    'id': id_column,
    'predicted_probability': y_pred_proba_test
})

predictions_df.to_csv('neural_network_predictions.csv', index=False)
print("Predictions saved to neural_network_predictions.csv")