<a href="https://colab.research.google.com/github/DSGP-Group-1-EAPS/SL-Apparel-Dataset/blob/Kavindu-Deshanjana/EAPS_Data_Preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from google.colab import drive

# Make Google Drive visible inside the Colab runtime so the workbook can be read.
drive.mount('/content/drive')

# Step 1: Read the Data
# The raw attendance workbook is loaded from Drive into a single DataFrame.
raw_workbook_path = '/content/drive/MyDrive/DSGP_COURSEWORK/SL Apparel Dataset model/SL Apparel Industry Dataset.xlsx'
df = pd.read_excel(raw_workbook_path)

def _is_blank(column):
    """Return a boolean mask that is True where `column` is missing or whitespace-only.

    Empty spreadsheet cells are loaded as NaN, not as '', so a plain `!= ''`
    comparison never matches them; this helper treats both cases as blank.
    """
    return column.isna() | column.astype(str).str.strip().eq('')


# Steps 2-5 all drop rows (or columns) independently of one another, so every
# mask below is built on the freshly loaded frame and applied in one pass.

# Step 2: Filter Rows Based on Reasons
# The list carries both 'Funeral ' and 'Funeral', i.e. the sheet contains
# whitespace variants; compare on stripped text so every variant is caught.
reasons_to_remove = ['Resignation', 'VOP', 'Funeral ', 'Funeral']
excluded_reasons = {reason.strip() for reason in reasons_to_remove}
has_excluded_reason = df['Reason'].astype(str).str.strip().isin(excluded_reasons)

# Step 3: Remove Rows with Leave Type 0.5
is_half_day_leave = df['Leave Type'] == 0.5

# Step 4: Remove Rows with 'Present' or blank in Absent/Present Column
presence = df['Absent/Present']
is_present_or_blank = (presence == 'Present') | _is_blank(presence)

# Step 5: Remove Rows with '1' and blanks in Absenteeism Type Column
# Blankness must be checked on the raw values: astype(str) turns a missing
# cell into the literal text 'nan', which would otherwise survive the filter.
# NOTE(review): if this column is loaded as floats, 1 becomes '1.0' and is
# not matched by the '1' comparison below - confirm against the workbook.
absenteeism_raw = df['Absenteeism Type']
is_type_one_or_blank = (absenteeism_raw.astype(str) == '1') | _is_blank(absenteeism_raw)

rows_to_keep = ~(
    has_excluded_reason
    | is_half_day_leave
    | is_present_or_blank
    | is_type_one_or_blank
)

# Store the absenteeism type as text, as the later steps expect.
df['Absenteeism Type'] = absenteeism_raw.astype(str)

# Step 6: Apply the row filter and remove Department and Reason Mapping Columns
df = df.loc[rows_to_keep].drop(columns=['Department', 'Reason Mapping'])

# Step 6: Label-encode the categorical columns.
# Print the dtypes first so the columns about to be encoded can be inspected.
print(df.dtypes)

# A single encoder instance is refitted for each column in turn, so after this
# step it only holds the classes of the last column processed ('Shift').
label_encoder = LabelEncoder()
for source_column, encoded_column in (
    ('Reason', 'Encoded_Reason'),
    ('Absent/Present', 'Encoded_Absent_Present'),
    ('Status', 'Encoded_Status'),
    ('Absenteeism Type', 'Encoded_Absenteeism_Type'),
    ('Shift', 'Encoded_Shift'),
):
    df[encoded_column] = label_encoder.fit_transform(df[source_column])

print("Column names:", df.columns)

# Step 7: Create a Mapping Dictionary for Encoded Values and Original Labels
# category name -> (encoded column, original column)
legend_columns = {
    'Reason': ('Encoded_Reason', 'Reason'),
    'Absent_Present': ('Encoded_Absent_Present', 'Absent/Present'),
    'Status': ('Encoded_Status', 'Status'),
    'Absenteeism_Type': ('Encoded_Absenteeism_Type', 'Absenteeism Type'),
    'Shift': ('Encoded_Shift', 'Shift'),
}

reason_mapping = {}
for category, (encoded_column, original_column) in legend_columns.items():
    # Keep one row per (code, label) pair and order by code, so the legend is
    # built from the distinct pairs only and reads in ascending code order.
    distinct_pairs = (
        df[[encoded_column, original_column]]
        .drop_duplicates()
        .sort_values(encoded_column)
    )
    reason_mapping[category] = dict(
        zip(distinct_pairs[encoded_column], distinct_pairs[original_column])
    )

# Step 8: Save the Mapping Dictionary to a Text File
# The encoding is stated explicitly so the file does not depend on the
# runtime's locale default when labels contain non-ASCII characters.
with open('/content/drive/MyDrive/DSGP_COURSEWORK/SL Apparel Dataset model/reason_mapping.txt', 'w', encoding='utf-8') as f:
    for category, mappings in reason_mapping.items():
        f.write(f"{category}:\n")
        for encoded_value, original_value in mappings.items():
            f.write(f"  {encoded_value}: {original_value}\n")

# Step 9: Save the Processed Data to a New Excel File
df.to_excel('/content/drive/MyDrive/DSGP_COURSEWORK/SL Apparel Dataset model/preprocessed_data.xlsx', index=False)
print("Done")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Date                datetime64[ns]
Shift                       object
Code                        object
Absenteeism Type            object
Status                      object
Leave Type                 float64
Absent/Present              object
Reason                      object
dtype: object
Column names: Index(['Date', 'Shift', 'Code', 'Absenteeism Type', 'Status', 'Leave Type',
       'Absent/Present', 'Reason', 'Encoded_Reason', 'Encoded_Absent_Present',
       'Encoded_Status', 'Encoded_Absenteeism_Type', 'Encoded_Shift'],
      dtype='object')
Done
