<a href="https://colab.research.google.com/github/DSGP-Group-1-EAPS/SL-Apparel-Dataset/blob/Ranidu-Gurusinghe/Neural_Network_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Model Code by spliting dataset manually

In [3]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
from google.colab import drive

drive.mount('/content/drive')
# Load the preprocessed data
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/Dataset/preprocessed_data_new.xlsx')
df = df[df['Date'] <= '2023-12-01']

# Features and target variable
features = ['Encoded Code', 'Encoded Reason', 'Encoded Status', 'Encoded Absenteeism Type', 'Encoded Shift', 'DaysWorked', 'DayOfWeek', 'LeaveMonth']

# Split the data until November 2023 for training and use December 2023 data for testing
training_data = df[df['Date'] <= '2023-01-31']
testing_data = df[df['Date'] > '2023-02-28']

# Features and target variable for training
X_train = training_data[features]
y_train = training_data['TargetCategory']

# Features and target variable for testing
X_test = testing_data[features]
y_test = testing_data['TargetCategory']

# Standardize the features (important for neural networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the neural network classifier with 3 hidden layers
clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=200, activation='relu',  alpha=0.01, learning_rate='constant', random_state=42)

# Train the neural network on the training data
clf.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test_scaled)
y_probs = clf.predict_proba(X_test_scaled)[:, 3]  # Probability of class 'D'

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Probability': y_probs  # Probability of class 'D'
})

print(results_df)
print(f"\nAccuracy: {accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)

# Display all instances where the predicted class is 'D'
predicted_D_instances = results_df[results_df['Predicted'] == 'D']

# Display the employee codes for those instances
employee_codes_predicted_D = testing_data.loc[predicted_D_instances.index, 'Code'].unique()
print("Employee codes with predicted class 'D':")
print(employee_codes_predicted_D)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


      Actual Predicted  Probability
69536      A         A     0.227473
69537      A         A     0.067187
69538      A         A     0.080638
69539      A         A     0.012784
69540      A         C     0.205757
...      ...       ...          ...
88721      A         A     0.044295
88722      A         A     0.069784
88723      A         A     0.037748
88724      A         A     0.043802
88725      A         D     0.719395

[19119 rows x 3 columns]

Accuracy: 0.7286991997489408

Confusion Matrix:
[[12173     0   122  2068]
 [  590     0     7   512]
 [  303     0     2   622]
 [  949     0    14  1757]]

Classification Report:
              precision    recall  f1-score   support

           A       0.87      0.85      0.86     14363
           B       0.00      0.00      0.00      1109
           C       0.01      0.00      0.00       927
           D       0.35      0.65      0.46      2720

    accuracy                           0.73     19119
   macro avg       0.31      0.37 

  _warn_prf(average, modifier, msg_start, len(result))


##Hyperparameter tunning

In [2]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the preprocessed data
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/Dataset/preprocessed_data_new.xlsx')
df = df[df['Date'] <= '2023-12-01']
# Features and target variable
features = ['Encoded Code', 'Encoded Reason', 'Encoded Status', 'Encoded Absenteeism Type', 'Encoded Shift', 'DaysWorked', 'DayOfWeek', 'LeaveMonth']

# Split the data until November 2023 for training and use December 2023 data for testing
training_data = df[df['Date'] <= '2023-10-30']

# Features and target variable for training
X_train = training_data[features]
y_train = training_data['TargetCategory']

# Features and target variable for testing
testing_data = df[(df['Date'] > '2023-10-30') & (df['Date'] < '2023-12-01')]
X_test = testing_data[features]
y_test = testing_data['TargetCategory']

# Standardize the features (important for neural networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    # 'hidden_layer_sizes': [(50,), (100, 50), (50, 25, 10)],
    # 'activation': ['relu', 'tanh'],
    # 'solver': ['adam', 'sgd'],
    # 'alpha': [0.0001, 0.001, 0.01],
    # 'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'max_iter': [200, 500, 1000],


}

# Initialize the neural network classifier
clf = MLPClassifier(random_state=42)

# Perform grid search with cross-validation
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_clf = MLPClassifier(random_state=42, **best_params)
best_clf.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = best_clf.predict(X_test_scaled)
y_probs = best_clf.predict_proba(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)




print("\nBest Hyperparameters:")
print(best_params)
print(f"\nAccuracy: {accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).





Best Hyperparameters:
{'max_iter': 200}

Accuracy: 0.6996274974602099

Confusion Matrix:
[[1540    0    2  268]
 [ 102    0    0   99]
 [ 130    0    0   67]
 [ 218    0    1  526]]

Classification Report:
              precision    recall  f1-score   support

           A       0.77      0.85      0.81      1810
           B       0.00      0.00      0.00       201
           C       0.00      0.00      0.00       197
           D       0.55      0.71      0.62       745

    accuracy                           0.70      2953
   macro avg       0.33      0.39      0.36      2953
weighted avg       0.61      0.70      0.65      2953



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the preprocessed data
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/preprocessed_data_new.xlsx')
df = df[df['Date'] <= '2023-12-01']
# Features and target variable
features = ['Encoded Code', 'Encoded Reason', 'Encoded Status', 'Encoded Absenteeism Type', 'Encoded Shift', 'DaysWorked', 'DayOfWeek', 'LeaveMonth']

# Split the data until November 2023 for training and use December 2023 data for testing
training_data = df[df['Date'] <= '2023-10-30']

# Features and target variable for training
X_train = training_data[features]
y_train = training_data['TargetCategory']

# Features and target variable for testing
testing_data = df[(df['Date'] > '2023-10-30') & (df['Date'] < '2023-12-01')]
X_test = testing_data[features]
y_test = testing_data['TargetCategory']

# Standardize the features (important for neural networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the neural network classifier with 3 hidden layers
clf = MLPClassifier(hidden_layer_sizes=(100, 75, 50, 25), max_iter=500)

# Train the neural network on the training data
clf.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test_scaled)
y_probs = clf.predict_proba(X_test_scaled)[:, 1]  # Probability of class 1 (positive class)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Create a DataFrame to display predictions and probabilities
result_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Probability': y_probs
})

print(result_df)
print(f"\nAccuracy: {accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)


Mounted at /content/drive
      Actual Predicted   Probability
85510      A         A  1.332678e-05
85511      A         A  3.213108e-05
85512      A         A  2.718067e-02
85513      A         A  1.016040e-08
85514      A         A  1.475455e-02
...      ...       ...           ...
88519      A         A  1.436458e-02
88520      D         D  3.537994e-07
88521      D         D  1.547219e-06
88522      D         D  2.423821e-05
88523      A         D  4.133448e-05

[2953 rows x 3 columns]

Accuracy: 0.6203860480866915

Confusion Matrix:
[[1467   90   22  231]
 [ 124    8   17   52]
 [ 108    2    0   87]
 [ 337    5   46  357]]

Classification Report:
              precision    recall  f1-score   support

           A       0.72      0.81      0.76      1810
           B       0.08      0.04      0.05       201
           C       0.00      0.00      0.00       197
           D       0.49      0.48      0.49       745

    accuracy                           0.62      2953
   macro avg 

In [None]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the preprocessed data
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/DSGP_COURSEWORK/SL Apparel Dataset model/preprocessed_data_new.xlsx')
print(df)
df = df[df['Date'] <= '2023-12-01']

# Features and target variable
features = ['Encoded Code', 'Encoded Reason', 'Encoded Status', 'Encoded Absenteeism Type', 'Encoded Shift', 'DaysWorked', 'DayOfWeek', 'LeaveMonth']
X = df[features]
y = df['TargetCategory']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the neural network classifier with 3 hidden layers
clf = MLPClassifier(hidden_layer_sizes=(100,75, 50, 25), max_iter=500, random_state=42)

# Train the model
clf.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display the results
print(f'Accuracy: {accuracy * 100}%')
print('\nConfusion Matrix:')
print(conf_matrix)
print('\nClassification Report:')
print(classification_rep)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
            Date    Shift    Code                   Department  \
0     2021-12-01  Shift A   AA369           Team - MAT 4A - BD   
1     2021-12-01  Shift A   AA362           Team - MAT 4A - BD   
2     2021-12-01  Shift A   AA359           Team - MAT 4A - BD   
3     2021-12-01  Shift A   AA541           Team - MAT 4A - BD   
4     2021-12-01  Shift A   AA398           Team - MAT 3A - BD   
...          ...      ...     ...                          ...   
90552 2023-12-22  Shift A  AA3992  Sewing Team - 142A - I - BD   
90553 2023-12-22  Shift A  AA3659  Sewing Team - 140A - I - BD   
90554 2023-12-22  Shift A  AA3845           Team - MAT 1A - BD   
90555 2023-12-22  Shift A  AA3984           Team - MAT 4A - BD   
90556 2023-12-22  Shift A  AA4098           Team - MAT 4A - BD   

      Absenteeism Type    Status  Leave Type Absent/Present           Reason  

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Features and target variable
features = ['Encoded Code', 'Encoded Reason', 'Encoded Status', 'Encoded Absenteeism Type', 'Encoded Shift', 'DaysWorked', 'DayOfWeek', 'LeaveMonth']
df = df[df['Date'] <= '2023-12-01']
# Split the data until November 2023 for training and use December 2023 data for testing
training_data = df[df['Date'] <= '2023-10-30']
testing_data = df[df['Date'] > '2023-10-30']

# Features and target variable for training
X_train = training_data[features]
y_train = training_data['TargetCategory']

# Features and target variable for testing
X_test = testing_data[features]
y_test = testing_data['TargetCategory']

# Standardize the features (important for neural networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the neural network classifier with 3 hidden layers
clf = MLPClassifier(hidden_layer_sizes=(100, 75, 50, 25), max_iter=500)

# Define the hyperparameters to tune
param_grid = {
    'hidden_layer_sizes': [(100,), (100, 50), (100, 75, 50)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print(f"Best hyperparameters: {grid_search.best_params_}")


NameError: name 'df' is not defined