<a href="https://colab.research.google.com/github/DSGP-Group-1-EAPS/SL-Apparel-Dataset/blob/main/RandomForestClassificationUpdated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import pandas as pd
!pip install scikit-learn==1.4.1.post1

Collecting scikit-learn==1.4.1.post1
  Downloading scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.4.1.post1


##Loading the data

In [2]:
# Mount Google Drive
drive.mount('/content/drive')

# Load the preprocessed data from Excel file
file_path = '/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/Dataset/preprocessed_data_new.xlsx'
data = pd.read_excel(file_path)

# Display the loaded data
print(data.head())

Mounted at /content/drive
        Date    Shift   Code          Department Absenteeism Type    Status  \
0 2021-12-01  Shift A  AA369  Team - MAT 4A - BD         Informed  Notified   
1 2021-12-01  Shift A  AA362  Team - MAT 4A - BD         Informed  Notified   
2 2021-12-01  Shift A  AA359  Team - MAT 4A - BD         Informed  Notified   
3 2021-12-01  Shift A  AA541  Team - MAT 4A - BD         Informed  Notified   
4 2021-12-01  Shift A  AA398  Team - MAT 3A - BD         Informed  Notified   

   Leave Type Absent/Present     Reason Joined Date  ...  LeaveMonth  \
0           1            MAT  Maternity  2015-09-11  ...          12   
1           1            MAT  Maternity  2018-11-14  ...          12   
2           1            MAT  Maternity  2019-10-09  ...          12   
3           1            MAT  Maternity  2020-10-02  ...          12   
4           1            MAT  Maternity  2008-02-29  ...          12   

   Encoded Reason  Encoded Status  Encoded Absenteeism Type  Encod

In [4]:
leave_days_by_month = data.groupby(['LeaveYear', 'LeaveMonth']).size().reset_index(name='TotalLeaveDays')
print(leave_days_by_month)

monthly_totals = {}

for index, row in leave_days_by_month.iterrows():
    year = row['LeaveYear']
    month = row['LeaveMonth']
    total_leave_days = row['TotalLeaveDays']
    monthly_totals[(year, month)] = total_leave_days

# Create a new column 'MonthlyTotal' by mapping the (year, month) tuple to its corresponding total leave count
data['MonthlyTotal'] = data.apply(lambda row: monthly_totals.get((row['LeaveYear'], row['LeaveMonth']), 0), axis=1)

# Display the updated DataFrame
print(data)


    LeaveYear  LeaveMonth  TotalLeaveDays
0        2021          12            3184
1        2022           1            2181
2        2022           2            2353
3        2022           3            2471
4        2022           4            1770
5        2022           5            3351
6        2022           6            2834
7        2022           7            1580
8        2022           8            1724
9        2022           9            1575
10       2022          10            1905
11       2022          11            1664
12       2022          12            1155
13       2023           1            1104
14       2023           2             412
15       2023           3            1264
16       2023           4             469
17       2023           5            1758
18       2023           6            1260
19       2023           7            1098
20       2023           8            1082
21       2023           9            1030
22       2023          10         

In [5]:
def onehot_encode(df, column, prefix):
    df = df.copy()

    dummies = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)

    return df

In [8]:
from sklearn.model_selection import train_test_split
def preprocess_inputs(df):
    df = df.copy()
    rf_selected_features = ['Date','DaysWorked', 'DayOfWeek', 'Encoded Code', 'LeaveMonth', 'Encoded Status',
                        'Encoded Absenteeism Type', 'Encoded Shift', 'LeaveYear', 'NumOfLeaveDays', 'MonthlyTotal',
                        'Reason_0', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Reason_5',
                        'Reason_6', 'Reason_7', 'Reason_8', 'Reason_9', 'Reason_10', 'Reason_11',
                        'Reason_12', 'Reason_13', 'Reason_14', 'Reason_15', 'Reason_16', 'Reason_17',
                        'Reason_18', 'TargetCategory']

    # One-hot encode the reason column
    df = onehot_encode(
        df,
        column='Encoded Reason',
        prefix='Reason'
    )

    df = df[rf_selected_features]

    # Filter the data for November 2023
    testing_data_november = df[(df['LeaveYear'] == 2023) & (df['LeaveMonth'] >= 11)]
    print("Nov", testing_data_november.shape)
    df = df[df['Date'] < '2023-11-01']
    testing_data_october = df[(df['LeaveYear'] == 2023) & (df['LeaveMonth'] >= 10)]
    print("Oct", testing_data_october.shape)
    df = df[df['Date'] < '2023-10-01']
    testing_data_september = df[(df['LeaveYear'] == 2023) & (df['LeaveMonth'] >= 9)]
    print("Sept", testing_data_september.shape)



    testing_data = df[df['Date'] > '2023-09-01']
    df = df[df['Date'] < '2023-10-01']
    df = df.drop('Date', axis=1)
    testing_data = testing_data.drop('Date', axis=1)
    testing_data_october_copy = testing_data_october.copy()
    testing_data_october_copy = testing_data_october_copy.drop('Date', axis=1)



    # Split df into X and y
    Y_train = df['TargetCategory'].copy()
    X_train = df.drop('TargetCategory', axis=1)
    X_test = testing_data_october_copy.drop('TargetCategory', axis=1)
    Y_test = testing_data_october_copy['TargetCategory'].copy()
    print(X_test['LeaveMonth'])

    return testing_data_september, testing_data_october, testing_data_november, X_train, X_test, Y_train, Y_test

In [9]:
# Split the remaining data into training and testing sets
testing_data_september, testing_data_october, testing_data_november, X_train, X_test, y_train, y_test = preprocess_inputs(data)


print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
print(X_test)
testing_data_september.to_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/Dataset/testing_data_september.xlsx', index=False)
testing_data_october.to_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/Dataset/testing_data_october.xlsx', index=False)
testing_data_november.to_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/Dataset/testing_data_november.xlsx', index=False)

Nov (2273, 31)
Oct (1169, 31)
Sept (1030, 31)
37190    10
37191    10
37192    10
37193    10
37194    10
         ..
39175    10
39176    10
39177    10
39178    10
39179    10
Name: LeaveMonth, Length: 1169, dtype: int64
(37224, 29)
(37224,)
(1169, 29)
(1169,)
       DaysWorked  DayOfWeek  Encoded Code  LeaveMonth  Encoded Status  \
37190       10146          1           717          10               1   
37191        7831          1          3420          10               1   
37192       10016          1          1585          10               1   
37193        7780          1          3155          10               1   
37194        9947          1          1374          10               1   
...           ...        ...           ...         ...             ...   
39175        2718          5          1506          10               1   
39176         968          5           275          10               1   
39177         738          5          2989          10               1 