<a href="https://colab.research.google.com/github/DSGP-Group-1-EAPS/SL-Apparel-Dataset/blob/Ranidu-Gurusinghe/Binary_Classification_Model_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Preprocessing

In [22]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from google.colab import drive

# Mounting the Google Drive
drive.mount('/content/drive')

# Reading the data from the excel file and removing blank spaces in each cell
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/SL Apparel Industry Dataset.xlsx').apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Reading the JoinDates.xlsx file
join_dates_df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/JoinedDates.xlsx')

# Merging the SL Apparel Industry Dataset with the Join Dates based on the 'Code' column
df = pd.merge(df, join_dates_df, on='Code', how='left')

df['Joined Date'] = pd.to_datetime(df['Joined Date'])
# Calculating the number of years each employee has worked
df['DaysWorked'] = (df['Date'] - df['Joined Date']).dt.days

# Making the 'Absenteeism Type' considered as String
df['Absenteeism Type'] = df['Absenteeism Type'].astype(str)

# Add a column for the day of the week (1-7)
df['DayOfWeek'] = df['Date'].dt.dayofweek + 1
# Convert 'Code' column to numeric by removing 'AA' prefix
df['Encoded Code'] = pd.to_numeric(df['Code'].str.replace('AA', ''), errors='coerce')
# Add a column for the month (1-12)
df['LeaveMonth'] = df['Date'].dt.month
# Removing rows with 'Resignation', 'VOP', 'Funeral' from the 'Reason' column
reasons_to_remove = ['Resignation', 'VOP', 'Funeral']
df = df[~df['Reason'].isin(reasons_to_remove)]

# Removing rows where 'Leave Type' is 0.5
df = df[df['Leave Type'] != 0.5]

# Removing all the rows except for 'Shift A' and 'Shift B' in the 'Shift' column
df = df[df['Shift'] != 'A']
df = df[df['Shift'] != 'Shift']
df = df[df['Shift'] != 'Other Teams']
df = df[df['Shift'] != 'Shift e']

# Removing 'Department' and 'Reason Mapping' columns
df = df.drop(['Reason Mapping'], axis=1)

# Removing rows where 'Absenteeism Type' is 1
df = df[df['Absenteeism Type'] != '1']

# Using LabelEncoder to encode the values in columns
label_encoder = LabelEncoder()
df['Encoded Reason'] = label_encoder.fit_transform(df['Reason'])
df['Encoded Status'] = label_encoder.fit_transform(df['Status'])
df['Encoded Absenteeism Type'] = label_encoder.fit_transform(df['Absenteeism Type'])
df['Encoded Shift'] = label_encoder.fit_transform(df['Shift'])

# Create a dictionary to store leave dates for each employee
employee_leave_dates = {}

# Iterate through the DataFrame to populate the dictionary
for index, row in df.iterrows():
    employee_code = row['Code']
    leave_date = row['Date']

    # Check if the employee code is already in the dictionary
    if employee_code in employee_leave_dates:
        # Append the leave date to the existing array
        employee_leave_dates[employee_code].append(leave_date)
    else:
        # Create a new array for the employee code
        employee_leave_dates[employee_code] = [leave_date]


# Create new columns for year and month
df['LeaveYear'] = df['Date'].dt.year

# Create a dictionary to store leave dates and reasons for each employee
employee_leave_info = {}

# Iterate through the DataFrame to populate the dictionary
for index, row in df.iterrows():
    employee_code = row['Code']
    leave_date = row['Date']
    leave_reason = row['Reason']

    # Check if the employee code is already in the dictionary
    if employee_code in employee_leave_info:
        # Append the leave date and reason to the existing 2D array
        employee_leave_info[employee_code].append([leave_date, leave_reason])
    else:
        # Create a new 2D array for the employee code
        employee_leave_info[employee_code] = [[leave_date, leave_reason]]

# Count consecutive leaves for each employee
consecutive_leave_counts = {}
df['NumOfLeaveDays'] = 1
for employee_code, leave_info in employee_leave_info.items():
    consecutive_count = 1  # Initialize count for consecutive leaves
    for i in range(1, len(leave_info)):
        current_leave_date, current_leave_reason = leave_info[i]
        prev_leave_date, prev_leave_reason = leave_info[i -1]

        # Check if the leave dates are consecutive and have the same reason
        if (current_leave_date - prev_leave_date).days == 1 and current_leave_reason == prev_leave_reason:
            consecutive_count += 1
        else:
            # Reset count if consecutive sequence is broken
            consecutive_count = 1
        df['NumOfLeaveDays'] = consecutive_count

    # Store the count in the dictionary
    consecutive_leave_counts[employee_code] = consecutive_count

# Create a new column to indicate if an employee took leave the next month
df['TookLeaveNextMonth'] = 0

# Iterate through the DataFrame to update the new column
for index, row in df.iterrows():
    employee_code = row['Code']
    leave_year = row['LeaveYear']
    leave_month = row['LeaveMonth']

    # Check if the employee code is in the dictionary
    if employee_code in employee_leave_dates:
        # Extract the year and month from the leave dates
        leave_years_months = [(date.year, date.month) for date in employee_leave_dates[employee_code]]

        # Check if the leave was taken in the next month and year
        if leave_month == 12:
            next_month = 1  # January of the next year
            next_year = leave_year + 1
        else:
            next_month = leave_month + 1
            next_year = leave_year

        if (next_year, next_month) in leave_years_months:
            df.at[index, 'TookLeaveNextMonth'] = 1

# Create a new column to store the number of leaves taken by each employee in the next month
df['LeavesNextMonth'] = 0

# Iterate through the DataFrame to update the new column
for index, row in df.iterrows():
    employee_code = row['Code']
    leave_year = row['LeaveYear']
    leave_month = row['LeaveMonth']

    # Check if the employee code is in the dictionary
    if employee_code in employee_leave_dates:
        # Extract the year and month from the leave dates
        leave_years_months = [(date.year, date.month) for date in employee_leave_dates[employee_code]]

        # Check if the leave was taken in the next month and year
        if leave_month == 12:
            next_month = 1  # January of the next year
            next_year = leave_year + 1
        else:
            next_month = leave_month + 1
            next_year = leave_year

        if (next_year, next_month) in leave_years_months:
            # Count the number of leaves taken in the next month
            leaves_next_month = leave_years_months.count((next_year, next_month))
            df.at[index, 'LeavesNextMonth'] = leaves_next_month

# Create a new column to represent the target variable categories
df['TargetCategory'] = pd.cut(df['LeavesNextMonth'], bins=[-1, 5, 10, 15, float('inf')],
                              labels=['A', 'B', 'C', 'D'], right=False)

# Drop rows with NaN values
df = df.dropna()

# Print intermediate results for debugging
print("Columns after preprocessing:")
print(df.head())

# Creating a mapping dictionary with the original values for clear understanding
reason_mapping = {
    'Reason': dict(zip(df['Encoded Reason'], df['Reason'])),
    'Status': dict(zip(df['Encoded Status'], df['Status'])),
    'Absenteeism_Type': dict(zip(df['Encoded Absenteeism Type'], df['Absenteeism Type'])),
    'Shift': dict(zip(df['Encoded Shift'], df['Shift']))
}

# Save the mapping dictionary to a text file
with open('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/Ranidu_reason_mapping.txt', 'w') as f:
    for category, mappings in reason_mapping.items():
        f.write(f"{category}:\n")
        for encoded_value, original_value in mappings.items():
            f.write(f"  {encoded_value}: {original_value}\n")



# Save the new preprocessed data to a new excel file in Google Drive
df.to_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/preprocessed_data_new.xlsx', index=False)
print("Done")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
        Date    Shift   Code          Department Absenteeism Type    Status  \
0 2021-12-01  Shift A  AA369  Team - MAT 4A - BD         Informed  Notified   
1 2021-12-01  Shift A  AA362  Team - MAT 4A - BD         Informed  Notified   
2 2021-12-01  Shift A  AA359  Team - MAT 4A - BD         Informed  Notified   
3 2021-12-01  Shift A  AA541  Team - MAT 4A - BD         Informed  Notified   
4 2021-12-01  Shift A  AA398  Team - MAT 3A - BD         Informed  Notified   

   Leave Type Absent/Present     Reason Joined Date  ...  LeaveMonth  \
0         1.0            MAT  Maternity  2015-09-11  ...          12   
1         1.0            MAT  Maternity  2018-11-14  ...          12   
2         1.0            MAT  Maternity  2019-10-09  ...          12   
3         1.0            MAT  Maternity  2020-10-02  ...          12   
4         1.0            MAT  Matern

##Making model using manual spliting dataset

In [21]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Mount Google Drive
drive.mount('/content/drive')

# Load the preprocessed data
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/preprocessed_data_new.xlsx')
# print(df)
# Features and target variable

features = ['Encoded Code', 'Encoded Reason', 'Encoded Status', 'Encoded Absenteeism Type', 'Encoded Shift', 'DaysWorked', 'DayOfWeek', 'LeaveMonth', 'LeaveYear']

# Filter data until December 2022 for training
training_data = df[df['Date'] <= '2023-10-30']

# Features and target variable for training
X_train = training_data[features]
y_train = training_data['TookLeaveNextMonth']

# Initialize the model
model = RandomForestClassifier()

# Train the model on the entire dataset until December 2022
model.fit(X_train, y_train)

# Make predictions for January 2023
testing_data = df[(df['Date'] > '2023-10-30') & (df['Date'] < '2023-12-01')]
X_test = testing_data[features]
y_test = testing_data['TookLeaveNextMonth']
# y_leave_prob_jan_2023 = model.predict_proba(X_nov_2023)[:, 1]
# y_noLeave_prob_jan_2023 = model.predict_proba(X_nov_2023)[:, 0]
y_pred_test = model.predict(X_test)

# # Extract rows with positive predictions
# positive_predictions_indices = (y_pred_jan_2023 == 1)
# positive_predictions_data = jan_2023_data.loc[positive_predictions_indices]

# # Extract employee codes with positive predictions
# employee_codes_positive_predictions = positive_predictions_data['Code']
# # Print the employee codes with positive predictions
# print("num Employee codes with positive predictions:")
# print(len(employee_codes_positive_predictions.unique()))

# # Define probability threshold
# threshold = 0.80  # Adjust as needed

# # Filter employees with probability above the threshold
# predicted_leave_employees_jan_2023 = jan_2023_data.loc[y_leave_prob_jan_2023 > threshold, 'Code'].unique()

# # Display the result
# num_employees = len(predicted_leave_employees_jan_2023)
# print(f"Number of employees predicted to take leave in Dec 2023 with {threshold*100}% probability: {num_employees}")
# print(predicted_leave_employees_jan_2023)
# # print(predicted_leave_employees_jan_2023)



# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_test)
print("\nConfusion Matrix:")
print(conf_matrix)

# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))

# # Identify and display employee codes where the prediction was wrong
# wrong_predictions = jan_2023_data.loc[y_jan_2023 != y_pred_jan_2023, 'Code'].unique()
# print(f'Number of wrong predictions{len(wrong_predictions)}')
# print("\nEmployee codes with wrong predictions:")
# print(wrong_predictions)

# Evaluate the model on the test set
accuracy = accuracy_score(y_test, y_pred_test)
print(f"\nAccuracy for predicting nov 2023: {accuracy*100}%")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Confusion Matrix:
[[ 724  381]
 [ 500 1348]]

Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.66      0.62      1105
           1       0.78      0.73      0.75      1848

    accuracy                           0.70      2953
   macro avg       0.69      0.69      0.69      2953
weighted avg       0.71      0.70      0.70      2953


Accuracy for predicting nov 2023: 70.16593294954284%


##Model after randomly spliting train test

In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Mount Google Drive
drive.mount('/content/drive')

# Load the preprocessed data
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/preprocessed_data_new.xlsx')
print(df)

# Features and target variable
features = ['Encoded Code', 'Encoded Reason', 'Encoded Status', 'Encoded Absenteeism Type', 'Encoded Shift', 'DaysWorked', 'DayOfWeek', 'LeaveMonth', 'LeaveYear']
X = df[features]
y = df['TookLeaveNextMonth']

# Split data into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model on the training dataset
model.fit(X_train, y_train)

# Make predictions for the testing dataset
y_pred_test = model.predict(X_test)

# Display confusion matrix
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
print("\nConfusion Matrix (Test Set):")
print(conf_matrix_test)

# Display classification report
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))

# Evaluate the model on the test set
accuracy = accuracy_score(y_test, y_pred_test)
print(f"\nAccuracy : {accuracy*100}%")

# # Evaluate the model on the entire dataset until December 2022
# jan_2023_data = df[df['Date'].dt.to_period("M") == '2023-11']
# X_jan_2023 = jan_2023_data[features]
# y_leave_prob_jan_2023 = model.predict_proba(X_jan_2023)[:, 1]
# y_noLeave_prob_jan_2023 = model.predict_proba(X_jan_2023)[:, 0]
# y_pred_jan_2023 = model.predict(X_jan_2023)

# # Extract rows with positive predictions
# positive_predictions_indices = (y_pred_jan_2023 == 1)
# positive_predictions_data = jan_2023_data.loc[positive_predictions_indices]

# # Extract employee codes with positive predictions
# employee_codes_positive_predictions = positive_predictions_data['Code']
# # Print the employee codes with positive predictions
# print("\nNumber of Employee codes with positive predictions:")
# print(len(employee_codes_positive_predictions.unique()))

# # Define probability threshold
# threshold = 0.80  # Adjust as needed

# # Filter employees with probability above the threshold
# predicted_leave_employees_jan_2023 = jan_2023_data.loc[y_leave_prob_jan_2023 > threshold, 'Code'].unique()

# # Display the result
# num_employees = len(predicted_leave_employees_jan_2023)
# print(f"\nNumber of employees predicted to take leave in Dec 2023 with {threshold*100}% probability: {num_employees}")
# print(predicted_leave_employees_jan_2023)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
            Date    Shift    Code                   Department  \
0     2021-12-01  Shift A   AA369           Team - MAT 4A - BD   
1     2021-12-01  Shift A   AA362           Team - MAT 4A - BD   
2     2021-12-01  Shift A   AA359           Team - MAT 4A - BD   
3     2021-12-01  Shift A   AA541           Team - MAT 4A - BD   
4     2021-12-01  Shift A   AA398           Team - MAT 3A - BD   
...          ...      ...     ...                          ...   
90552 2023-12-22  Shift A  AA3992  Sewing Team - 142A - I - BD   
90553 2023-12-22  Shift A  AA3659  Sewing Team - 140A - I - BD   
90554 2023-12-22  Shift A  AA3845           Team - MAT 1A - BD   
90555 2023-12-22  Shift A  AA3984           Team - MAT 4A - BD   
90556 2023-12-22  Shift A  AA4098           Team - MAT 4A - BD   

      Absenteeism Type    Status  Leave Type Absent/Present           Reason  