<a href="https://colab.research.google.com/github/DSGP-Group-1-EAPS/SL-Apparel-Dataset/blob/Kavindu-Deshanjana/EAPS_Data_Preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from google.colab import drive

# Mounting the Google Drive
drive.mount('/content/drive')

# Reading the data from the excel file and removing leading/trailing blank spaces in each text cell.
# Only cells that are actual strings are stripped: calling `.str.strip()` on an object column
# that also holds numbers returns NaN for every non-string cell, which would silently wipe
# those values before the filters further down get to see them.
df = pd.read_excel('/content/drive/MyDrive/DSGP_COURSEWORK/SL Apparel Dataset model/SL Apparel Industry Dataset.xlsx').apply(
    lambda column: column.map(lambda cell: cell.strip() if isinstance(cell, str) else cell)
    if column.dtype == "object" else column
)

# Removing rows with 'Resignation', 'VOP', 'Funeral' from the 'Reason' column
reasons_to_remove = ['Resignation', 'VOP', 'Funeral']
df = df[~df['Reason'].isin(reasons_to_remove)]

# Removing rows where 'Leave Type' is 0.5
df = df[df['Leave Type'] != 0.5]

# Keeping only the rows for 'Shift A' and 'Shift B' in the 'Shift' column.
# A whitelist is used instead of listing every junk value ('A', 'Shift', 'Other Teams',
# 'Shift e', ...) so that any other unexpected label is removed as well.
# NOTE(review): assumes 'Shift A' and 'Shift B' are the exact valid labels - confirm against the data.
shifts_to_keep = ['Shift A', 'Shift B']
df = df[df['Shift'].isin(shifts_to_keep)]

# Removing 'Department' and 'Reason Mapping' columns
df = df.drop(['Department', 'Reason Mapping'], axis=1)

# Removing rows where 'Absent/Present' is 'Present'
df = df[df['Absent/Present'] != 'Present']

# Removing rows where 'Absenteeism Type' is 1.
# The comparison is done numerically so that 1, 1.0 and '1' are all matched; a plain string
# comparison against '1' misses the value when the column is stored as float ('1.0').
# Non-numeric labels are coerced to NaN here and therefore never match.
is_type_one = pd.to_numeric(df['Absenteeism Type'], errors='coerce') == 1
df = df[~is_type_one]

# Removing the rows if it has a blank cell in it.
# This must happen BEFORE 'Absenteeism Type' is converted to string: astype(str) turns a
# missing value into the literal text 'nan', which dropna() would no longer recognise.
# .copy() gives an independent frame so the column assignments that follow do not trigger
# chained-assignment warnings.
df = df.dropna().copy()

# Making the 'Absenteeism Type' considered as String
df['Absenteeism Type'] = df['Absenteeism Type'].astype(str)

# Using LabelEncoder to encode the values in columns.
# Maps each source column to the key used for it in the mapping dictionary
# (the keys replace '/' and ' ' with '_').
columns_to_encode = {
    'Reason': 'Reason',
    'Absent/Present': 'Absent_Present',
    'Status': 'Status',
    'Absenteeism Type': 'Absenteeism_Type',
    'Shift': 'Shift',
}

# One fitted encoder is kept per column so the encoding can be inverted later;
# refitting a single shared encoder would only retain the last column's classes.
label_encoders = {}

# Creating a mapping dictionary with the original values for clear understanding.
# LabelEncoder assigns code i to classes_[i], so enumerating classes_ yields the full
# {encoded value: original value} mapping in code order without scanning every row.
reason_mapping = {}

for column_name, mapping_key in columns_to_encode.items():
    label_encoder = LabelEncoder()
    df[f'Encoded {column_name}'] = label_encoder.fit_transform(df[column_name])
    label_encoders[column_name] = label_encoder
    reason_mapping[mapping_key] = dict(enumerate(label_encoder.classes_))

# Writing the mapping dictionary to a text file: one header line per category,
# followed by an indented "encoded: original" line for each value
mapping_path = '/content/drive/MyDrive/DSGP_COURSEWORK/SL Apparel Dataset model/reason_mapping.txt'
with open(mapping_path, 'w') as mapping_file:
    for category, mappings in reason_mapping.items():
        mapping_file.write(f"{category}:\n")
        mapping_file.writelines(
            f"  {encoded_value}: {original_value}\n"
            for encoded_value, original_value in mappings.items()
        )

# Saving the preprocessed data to an Excel file in Google Drive (without the index column)
preprocessed_path = '/content/drive/MyDrive/DSGP_COURSEWORK/SL Apparel Dataset model/preprocessed_data.xlsx'
df.to_excel(preprocessed_path, index=False)
print("Done")


Mounted at /content/drive
Done
