<a href="https://colab.research.google.com/github/DSGP-Group-1-EAPS/SL-Apparel-Dataset/blob/Ranidu-Gurusinghe/Binary_Classification_Model_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Preprocessing

In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from google.colab import drive

# Mounting the Google Drive
drive.mount('/content/drive')

# Reading the data from the excel file and removing blank spaces in each cell
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/SL Apparel Industry Dataset.xlsx').apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Reading the JoinDates.xlsx file
join_dates_df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/JoinedDates.xlsx')

# Merging the SL Apparel Industry Dataset with the Join Dates based on the 'Code' column
df = pd.merge(df, join_dates_df, on='Code', how='left')
print(df.head())

df['Joined Date'] = pd.to_datetime(df['Joined Date'])
# Calculating the number of years each employee has worked
df['DaysWorked'] = (df['Date'] - df['Joined Date']).dt.days

# Making the 'Absenteeism Type' considered as String
df['Absenteeism Type'] = df['Absenteeism Type'].astype(str)

# Add a column for the day of the week (1-7)
df['DayOfWeek'] = df['Date'].dt.dayofweek + 1

# Add a column for the month (1-12)
df['Month'] = df['Date'].dt.month
print(df.head())
# Removing rows with 'Resignation', 'VOP', 'Funeral' from the 'Reason' column
reasons_to_remove = ['Resignation', 'VOP', 'Funeral']
df = df[~df['Reason'].isin(reasons_to_remove)]

# Removing rows where 'Leave Type' is 0.5
df = df[df['Leave Type'] != 0.5]

# Removing all the rows except for 'Shift A' and 'Shift B' in the 'Shift' column
df = df[df['Shift'] != 'A']
df = df[df['Shift'] != 'Shift']
df = df[df['Shift'] != 'Other Teams']
df = df[df['Shift'] != 'Shift e']

# Removing 'Department' and 'Reason Mapping' columns
df = df.drop(['Reason Mapping'], axis=1)

# Removing rows where 'Absenteeism Type' is 1
df = df[df['Absenteeism Type'] != '1']

# Using LabelEncoder to encode the values in columns
label_encoder = LabelEncoder()
df['Encoded Reason'] = label_encoder.fit_transform(df['Reason'])
df['Encoded Status'] = label_encoder.fit_transform(df['Status'])
df['Encoded Absenteeism Type'] = label_encoder.fit_transform(df['Absenteeism Type'])
df['Encoded Shift'] = label_encoder.fit_transform(df['Shift'])
df['TookLeaveNextMonth'] = (df['Absent/Present'].eq('Absent') & (df['Date'].dt.month != (df['Date'] + pd.DateOffset(months=1)).dt.month)).astype(int)

# Removing the rows if it has a blank cell in it
df = df.dropna()
# Print intermediate results for debugging
print("Columns after preprocessing:")
print(df.head())

# Creating a mapping dictionary with the original values for clear understanding
reason_mapping = {
    'Reason': dict(zip(df['Encoded Reason'], df['Reason'])),
    'Status': dict(zip(df['Encoded Status'], df['Status'])),
    'Absenteeism_Type': dict(zip(df['Encoded Absenteeism Type'], df['Absenteeism Type'])),
    'Shift': dict(zip(df['Encoded Shift'], df['Shift']))
}

# Save the mapping dictionary to a text file
with open('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/Ranidu_reason_mapping.txt', 'w') as f:
    for category, mappings in reason_mapping.items():
        f.write(f"{category}:\n")
        for encoded_value, original_value in mappings.items():
            f.write(f"  {encoded_value}: {original_value}\n")

# Assuming 'Date' is a datetime column
# df['TookLeaveNextMonth'] = df.apply(lambda row: 1 if any((df['Code'] == row['Code']) & (df['Date'] == (row['Date'] + pd.DateOffset(months=1)))) else 0, axis=1)
# Add a column indicating whether the employee took a leave in the next month



# Print intermediate results for debugging
print("Columns before saving to Excel:")
print(df.head())

# Save the new preprocessed data to a new excel file in Google Drive
df.to_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/preprocessed_data_new.xlsx', index=False)
print("Done")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
        Date    Shift   Code          Department Absenteeism Type    Status  \
0 2021-12-01  Shift A  AA369  Team - MAT 4A - BD         Informed  Notified   
1 2021-12-01  Shift A  AA362  Team - MAT 4A - BD         Informed  Notified   
2 2021-12-01  Shift A  AA359  Team - MAT 4A - BD         Informed  Notified   
3 2021-12-01  Shift A  AA541  Team - MAT 4A - BD         Informed  Notified   
4 2021-12-01  Shift A  AA398  Team - MAT 3A - BD         Informed  Notified   

   Reason Mapping  Leave Type Absent/Present     Reason Joined Date  
0             NaN         1.0            MAT  Maternity  2015-09-11  
1             NaN         1.0            MAT  Maternity  2018-11-14  
2             NaN         1.0            MAT  Maternity  2019-10-09  
3             NaN         1.0            MAT  Maternity  2020-10-02  
4             NaN         1.0            MAT  

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from google.colab import drive

# Mounting the Google Drive
drive.mount('/content/drive')

# Reading the data from the excel file and removing blank spaces in each cell
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/SL Apparel Industry Dataset.xlsx').apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Making the 'Absenteeism Type' considered as String
df['Absenteeism Type'] = df['Absenteeism Type'].astype(str)

# Removing rows with 'Resignation', 'VOP', 'Funeral' from the 'Reason' colunm
reasons_to_remove = ['Resignation', 'VOP', 'Funeral']
df = df[~df['Reason'].isin(reasons_to_remove)]

# Removing rows where 'Leave Type' is 0.5
df = df[df['Leave Type'] != 0.5]

# Removing all the rows except for 'Shift A' and 'Shift B' in the 'Shift' colunm
df = df[df['Shift'] != 'A']
df = df[df['Shift'] != 'Shift']
df = df[df['Shift'] != 'Other Teams']
df = df[df['Shift'] != 'Shift e']

# Removing 'Department' and 'Reason Mapping' columns
df = df.drop(['Department', 'Reason Mapping'], axis=1)

# Removing rows where 'Absent/Present' is 'Present'
df = df[df['Absent/Present'] != 'Present']

# Removing rows where 'Absenteeism Type' is 1
df = df[df['Absenteeism Type'] != '1']

# Removing the rows if it has a blank cell in it
df = df.dropna()

# Using LabelEncoder to encode the values in columns
label_encoder = LabelEncoder()
df['Encoded Reason'] = label_encoder.fit_transform(df['Reason'])
df['Encoded Absent/Present'] = label_encoder.fit_transform(df['Absent/Present'])
df['Encoded Status'] = label_encoder.fit_transform(df['Status'])
df['Encoded Absenteeism Type'] = label_encoder.fit_transform(df['Absenteeism Type'])
df['Encoded Shift'] = label_encoder.fit_transform(df['Shift'])

# Creating a mapping dictionary with the original values for clear understanding
reason_mapping = {
    'Reason': dict(zip(df['Encoded Reason'], df['Reason'])),
    'Absent_Present': dict(zip(df['Encoded Absent/Present'], df['Absent/Present'])),
    'Status': dict(zip(df['Encoded Status'], df['Status'])),
    'Absenteeism_Type': dict(zip(df['Encoded Absenteeism Type'], df['Absenteeism Type'])),
    'Shift': dict(zip(df['Encoded Shift'], df['Shift']))
}

# Saving the mapping dictionary to a text file
with open('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/reason_mapping.txt', 'w') as f:
    for category, mappings in reason_mapping.items():
        f.write(f"{category}:\n")
        for encoded_value, original_value in mappings.items():
            f.write(f"  {encoded_value}: {original_value}\n")

# Save the new preprocessed data to a text file in google drive
df.to_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/preprocessed_data.xlsx', index=False)
print("Done")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Done


##Modelling

##Separating the datset into induvidual dataframes for each month

In [None]:
import pandas as pd

# Load the preprocessed data
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/Ranidu_preprocessed_dataset.xlsx')

# Convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Separate the dataset into monthly sets
unique_months = pd.to_datetime(df['Date']).dt.to_period("M").unique()
monthly_datasets = {month: df[df['Date'].dt.to_period("M") == month] for month in unique_months}

# Define a function to get the encoded columns for a given month
def get_encoded_data(month_data):
    encoded_columns = ['Code', 'Encoded Reason', 'Encoded Status', 'Encoded Absenteeism Type', 'Encoded Shift']
    return month_data[encoded_columns]

# Example: Access the DataFrame for December 2022 with only encoded columns
dec_2022_data = get_encoded_data(monthly_datasets[pd.Period('2021-12')])

# Repeat the above for every month
jan_2022_data = get_encoded_data(monthly_datasets[pd.Period('2022-01')])
feb_2022_data = get_encoded_data(monthly_datasets[pd.Period('2022-02')])
mar_2022_data = get_encoded_data(monthly_datasets[pd.Period('2022-03')])
apr_2022_data = get_encoded_data(monthly_datasets[pd.Period('2022-04')])
may_2022_data = get_encoded_data(monthly_datasets[pd.Period('2022-05')])
jun_2022_data = get_encoded_data(monthly_datasets[pd.Period('2022-06')])
jul_2022_data = get_encoded_data(monthly_datasets[pd.Period('2022-07')])
aug_2022_data = get_encoded_data(monthly_datasets[pd.Period('2022-08')])
sep_2022_data = get_encoded_data(monthly_datasets[pd.Period('2022-09')])
oct_2022_data = get_encoded_data(monthly_datasets[pd.Period('2022-10')])
nov_2022_data = get_encoded_data(monthly_datasets[pd.Period('2022-11')])
dec_2022_data = get_encoded_data(monthly_datasets[pd.Period('2022-12')])
jan_2023_data = get_encoded_data(monthly_datasets[pd.Period('2023-01')])
feb_2023_data = get_encoded_data(monthly_datasets[pd.Period('2023-02')])
mar_2023_data = get_encoded_data(monthly_datasets[pd.Period('2023-03')])
apr_2023_data = get_encoded_data(monthly_datasets[pd.Period('2023-04')])
may_2023_data = get_encoded_data(monthly_datasets[pd.Period('2023-05')])
jun_2023_data = get_encoded_data(monthly_datasets[pd.Period('2023-06')])
jul_2023_data = get_encoded_data(monthly_datasets[pd.Period('2023-07')])
aug_2023_data = get_encoded_data(monthly_datasets[pd.Period('2023-08')])
sep_2023_data = get_encoded_data(monthly_datasets[pd.Period('2023-09')])
oct_2023_data = get_encoded_data(monthly_datasets[pd.Period('2023-10')])
nov_2023_data = get_encoded_data(monthly_datasets[pd.Period('2023-11')])
dec_2023_data = get_encoded_data(monthly_datasets[pd.Period('2023-12')])

# Identify employees who didn't take any leave in January 2023
no_jan_leave_employees = df[~df['Code'].isin(jan_2023_data['Code'])]['Code'].unique()

# Display the result
print("Employees who didn't take any leave in January 2023:")
print(no_jan_leave_employees)
print(len(no_jan_leave_employees))

Employees who didn't take any leave in January 2023:
['AA369' 'AA362' 'AA359' ... 'AA2313' 'AA3449' 'AA4244']
2963


##Making model

In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Mount Google Drive
drive.mount('/content/drive')

# Load the preprocessed data
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/preprocessed_data_new.xlsx')

# Features and target variable
features = ['Encoded Reason', 'Encoded Status', 'Encoded Absenteeism Type', 'Encoded Shift', 'DaysWorked', 'DayOfWeek', 'Month']
X = df[features]
y = df['TookLeaveNextMonth']

# Filter data until December 2022 for training
training_data = df[df['Date'] <= '2022-12-31']

# Features and target variable for training
X_train = training_data[features]
y_train = training_data['TookLeaveNextMonth']

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model on the entire dataset until December 2022
model.fit(X_train, y_train)

# Make predictions for January 2023
jan_2023_data = df[df['Date'].dt.to_period("M") == '2023-01']
X_jan_2023 = jan_2023_data[features]
y_prob_jan_2023 = model.predict_proba(X_jan_2023)[:, 1]

# Define probability threshold
threshold = 0.999  # Adjust as needed

# Filter employees with probability above the threshold
predicted_leave_employees_jan_2023 = jan_2023_data.loc[y_prob_jan_2023 > threshold, 'Code']

# Display the result
num_employees = len(predicted_leave_employees_jan_2023)
print(f"Number of employees predicted to take leave in January 2023 with {threshold*100}% probability: {num_employees}")
print(predicted_leave_employees_jan_2023)

# Add the data of January 2023 to the training dataset
training_data = pd.concat([training_data, jan_2023_data], ignore_index=True)

# Features and target variable for updated training dataset
X_train_updated = training_data[features]
y_train_updated = training_data['TookLeaveNextMonth']

# Retrain the model on the updated training dataset
model.fit(X_train_updated, y_train_updated)

# Evaluate the model on the test set (January 2023) to check accuracy
y_test_jan_2023 = jan_2023_data['TookLeaveNextMonth']
X_test_jan_2023 = jan_2023_data[features]
y_pred_jan_2023 = model.predict(X_test_jan_2023)

# Display confusion matrix
conf_matrix = confusion_matrix(y_test_jan_2023, y_pred_jan_2023)
print("\nConfusion Matrix:")
print(conf_matrix)

# Display classification report
print("\nClassification Report:")
print(classification_report(y_test_jan_2023, y_pred_jan_2023))

# Identify and display employee codes where the prediction was wrong
wrong_predictions = jan_2023_data.loc[y_test_jan_2023 != y_pred_jan_2023, 'Code']
print("\nEmployee codes with wrong predictions:")
print(wrong_predictions)

# Evaluate the model on the test set
accuracy_jan_2023 = accuracy_score(y_test_jan_2023, y_pred_jan_2023)
print(f"\nAccuracy for predicting January 2023: {accuracy_jan_2023}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Number of employees predicted to take leave in January 2023 with 99.9% probability: 2455
64470    AA2316
64473    AA1465
64474    AA3790
64476    AA2943
64477    AA2362
          ...  
68230     AA940
68231    AA2000
68232    AA3980
68233     AA325
68234     AA326
Name: Code, Length: 2455, dtype: object

Confusion Matrix:
[[ 211    4]
 [   1 3550]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       215
           1       1.00      1.00      1.00      3551

    accuracy                           1.00      3766
   macro avg       1.00      0.99      0.99      3766
weighted avg       1.00      1.00      1.00      3766


Employee codes with wrong predictions:
65066    AA2070
67421    AA2279
67688    AA1177
68035    AA1177
68124    AA3122
Name: Code, dtype: object

Accuracy for predictin

In [None]:
# Predict probabilities
y_probabilities = model.predict_proba(X_test)[:, 1]

# Identify highly likely employees (probability > 0.8)
threshold = 1
highly_likely_employees = df.loc[y_test.index][y_probabilities == threshold]

# Display highly likely employees
print("Highly Likely Employees:")
print(highly_likely_employees)

Highly Likely Employees:
            Date    Shift    Code Absenteeism Type        Status  Leave Type  \
86828 2023-11-17  Shift A  AA1276       Uninformed      Notified           1   
83451 2023-09-21  Shift A  AA1964       Uninformed      Notified           1   
46915 2022-08-16  Shift B  AA2668         Informed      Notified           1   
83229 2023-10-03  Shift B  AA1989       Uninformed      Notified           1   
11069 2022-01-27  Shift B   AA238       Uninformed  Not Notified           1   
...          ...      ...     ...              ...           ...         ...   
45719 2022-08-02  Shift B  AA3722         Informed      Notified           1   
87328 2023-11-23  Shift A  AA3948       Uninformed      Notified           1   
3757  2021-12-19  Shift B   AA609         Informed      Notified           1   
12418 2022-02-02  Shift B  AA2782         Informed      Notified           1   
72360 2023-04-21  Shift B  AA1138         Informed      Notified           1   

      Absent/P

##Different code

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the preprocessed data
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/preprocessed_data.xlsx')

# Create a target variable (e.g., took leave in the next month)
df['TookLeaveNextMonth'] = df['Absent/Present'].eq('Absent').shift(-1)

# Features and target variable
features = ['Encoded Reason', 'Encoded Status', 'Encoded Absenteeism Type', 'Encoded Shift']
X = df[features]
y = df['TookLeaveNextMonth']

# Handle missing values in the target variable
y = y.fillna(0)  # Fill missing values with 0 or choose an appropriate strategy

# Ensure binary classes in the target variable
y = y.astype(int)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_prob = model.predict_proba(X_test)[:, 1]

# Define probability threshold
threshold = 0.8

# Filter employees with probability above the threshold
highly_likely_employees = df.loc[X_test.index, ['Code', 'Encoded Absent/Present', 'Leave Type', 'Reason']]
highly_likely_employees['Probability'] = y_prob

# Display the result
result = highly_likely_employees[highly_likely_employees['Probability'] > threshold]
print("Highly Likely Employees:")
print(result[['Code', 'Probability']])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Highly Likely Employees:
         Code  Probability
51273  AA1020     0.995360
75297  AA2889     0.981685
49296   AA745     0.989119
40999   AA933     0.993470
70357  AA1214     0.968826
...       ...          ...
32027  AA2810     0.995579
87640  AA2925     0.987327
57120   AA603     0.974451
32529  AA3645     0.995579
72360  AA1138     1.000000

[14668 rows x 2 columns]


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the preprocessed data
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/preprocessed_data.xlsx')

# Create a target variable (e.g., took leave in the next month)
df['TookLeaveNextMonth'] = df['Absent/Present'].eq('Absent').shift(-1)

# Features and target variable
features = ['Encoded Reason', 'Encoded Status', 'Encoded Absenteeism Type', 'Encoded Shift']
X = df[features]
y = df['TookLeaveNextMonth']

# Handle missing values in the target variable
y = y.fillna(0)  # Fill missing values with 0 or choose an appropriate strategy
print(y)
# Ensure binary classes in the target variable
y = y.astype(int)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Define probability threshold
threshold = 0.8

# Filter employees with probability above the threshold
highly_likely_employees = df.loc[X_test.index, ['Code', 'Encoded Absent/Present', 'Leave Type', 'Reason']]
highly_likely_employees['Probability'] = y_prob

# Display the result
result = highly_likely_employees[highly_likely_employees['Probability'] > threshold]

# Display accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display highly likely employees
print("\nHighly Likely Employees:")
print(result[['Code', 'Probability']])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
0        False
1        False
2        False
3        False
4        False
         ...  
90069     True
90070     True
90071     True
90072     True
90073        0
Name: TookLeaveNextMonth, Length: 90074, dtype: object
Accuracy: 0.91
Confusion Matrix:
[[    8  1606]
 [   10 16391]]
Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.00      0.01      1614
           1       0.91      1.00      0.95     16401

    accuracy                           0.91     18015
   macro avg       0.68      0.50      0.48     18015
weighted avg       0.87      0.91      0.87     18015


Highly Likely Employees:
         Code  Probability
51273  AA1020     0.995360
75297  AA2889     0.981685
49296   AA745     0.989119
40999   AA933     0.993470
70357  AA1214     0.968826
...       ...          ...
32027  AA2810     0.99