In [2]:
def feature_engineering_dataset(data, *, retirement_age=65, reference_date=None):
    """Add engineered per-employee features to ``data`` and return it.

    The frame is modified in place (date columns are converted and new
    columns are appended); the same object is returned for convenience.

    Args:
        data: DataFrame containing at least the columns 'Employee ID', 'Age',
            'Start Date', 'Termination Date', 'Course Start Date',
            'Course Completion Date', 'Timesheet Date', 'Course Title',
            'Course Completion Rate', 'Duration of Leave', 'Overtime Logged'
            and 'Time Logged'.
        retirement_age: Age used to compute 'Years Until Retirement'.
        reference_date: The moment treated as "now" for the date-based
            features. Anything ``pd.Timestamp`` accepts; defaults to the
            current local time. Should be timezone-naive unless the date
            columns are timezone-aware.

    Returns:
        ``data`` with these columns added: 'Years Until Retirement',
        'Completed Courses Count', 'Average Course Completion Rate',
        'Days Since Last Course', 'Average Leave Duration',
        'Total Leave Taken', 'Leave Frequency Last Year', 'Average Overtime',
        'Total Time Logged' and 'Attrition'.

    Raises:
        KeyError: If a required column is missing.
    """
    import pandas as pd
    from datetime import datetime, timedelta

    # Read the clock once so every date-based feature shares the same "now".
    now = pd.Timestamp(datetime.now() if reference_date is None else reference_date)

    def per_employee(column, how):
        """Aggregate ``column`` per employee and broadcast back to every row."""
        return data.groupby('Employee ID')[column].transform(how)

    # Convert date columns to datetime; unparseable values become NaT.
    date_cols = ['Start Date', 'Termination Date', 'Course Start Date',
                 'Course Completion Date', 'Timesheet Date']
    for col in date_cols:
        data[col] = pd.to_datetime(data[col], errors='coerce')

    # Time until retirement (negative once past the retirement age).
    data['Years Until Retirement'] = retirement_age - data['Age']

    # Number of non-null course titles recorded for the employee.
    data['Completed Courses Count'] = per_employee('Course Title', 'count')

    # Average Course Completion Rate
    data['Average Course Completion Rate'] = per_employee('Course Completion Rate', 'mean')

    # Whole days between "now" and this row's course completion date
    # (NaN where the completion date is missing).
    data['Days Since Last Course'] = (now - data['Course Completion Date']).dt.days

    # Average and total leave
    data['Average Leave Duration'] = per_employee('Duration of Leave', 'mean')
    data['Total Leave Taken'] = per_employee('Duration of Leave', 'sum')

    # Timesheet entries per employee within the last 365 days. The counts are
    # mapped back by Employee ID so that every row of an employee carries the
    # same value, and employees without recent entries get 0 instead of NaN.
    cutoff = now - timedelta(days=365)
    recent_counts = (
        data.loc[data['Timesheet Date'] >= cutoff]
        .groupby('Employee ID')['Timesheet Date']
        .count()
    )
    data['Leave Frequency Last Year'] = (
        data['Employee ID'].map(recent_counts).fillna(0).astype(int)
    )

    # Average overtime and total time logged
    data['Average Overtime'] = per_employee('Overtime Logged', 'mean')
    data['Total Time Logged'] = per_employee('Time Logged', 'sum')

    # 1 when a termination date is present, 0 otherwise.
    data['Attrition'] = data['Termination Date'].notna().astype(int)

    return data

In [None]:
def data_preprocessing(data):
    """Return a cleaned, fully numeric copy of ``data``.

    Steps, in order:
      1. Copy the frame, replace missing values with 0 and drop duplicate rows.
      2. Drop a fixed set of identifier, date and raw-measurement columns.
      3. Replace every remaining object-dtype column with a float code equal
         to the sum of the UTF-8 bytes of each value's string form.

    The input frame is left unmodified. Progress messages and the name of
    each encoded column are printed to stdout.

    NOTE: the byte-sum code is not unique -- different strings whose bytes
    add up to the same total receive the same code.

    Raises:
        KeyError: If any of the columns to drop is absent.
    """
    import pandas as pd

    columns_to_remove = [
        'Employee ID',
        'Start Date',
        'Termination Date',
        'Timesheet Date',
        'Course Completion Date',
        'Type of Leave',
        'Time Since Last Leave',
        'Leave Frequency',
        'Type of Course',
        'Course Start Date',
        'Gender',
        'Time Logged',
        'Overtime Logged',
        'Manager ID',
        'Previous Manager ID',
        'Duration of Leave',
    ]
    separator = "*" * 100

    def byte_sum(value):
        """Sum of the UTF-8 bytes of ``str(value)``, as a float."""
        return float(sum(str(value).encode('utf-8')))

    # Duplicates are detected on the full column set, after filling NaNs,
    # so this must happen before any column is removed.
    cleaned = data.copy().fillna(0).drop_duplicates()

    print("Duplicated values dropped succesfully")
    print(separator)

    cleaned = cleaned.drop(columns=columns_to_remove)

    print("converting columns")
    print(separator)

    text_columns = [
        name for name, dtype in cleaned.dtypes.items() if dtype == 'object'
    ]
    for name in text_columns:
        print(name)
        cleaned[name] = cleaned[name].apply(byte_sum)

    return cleaned