<a href="https://colab.research.google.com/github/DSGP-Group-1-EAPS/SL-Apparel-Dataset/blob/Sandun-Karunatilleke/Logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Predicting using Logistic Regression**

# Data cleaning and preprocessing


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from google.colab import drive

# --- Configuration -----------------------------------------------------------
# Location of the preprocessed dataset on the mounted Google Drive.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/Ranidu_preprocessed_dataset.xlsx'
# Columns used as model inputs and the column used as the prediction target.
FEATURE_COLUMNS = [
    'YearsWorked',
    'Encoded Reason',
    'Encoded Absent/Present',
    'Encoded Status',
    'Encoded Absenteeism Type',
    'Encoded Shift',
]
TARGET_COLUMN = 'TookLeaveNextMonth'
# Fraction of rows held out for testing, and the seed that makes the split repeatable.
TEST_SIZE = 0.1
RANDOM_STATE = 42

# Mounting the Google Drive
drive.mount('/content/drive')

# Reading the data from the Excel file. The file is loaded as-is; no whitespace
# stripping or other cleaning is applied in this cell.
# Data provenance:
#   https://docs.google.com/spreadsheets/d/1-0cJu4Wt_S_8UaaTJ_w7go1nE9kONRG5/edit?usp=drive_link&ouid=105208527154871137997&rtpof=true&sd=true
#   https://drive.google.com/drive/folders/1xz11vVUTE22tZMv4wlXLBjeaPeUQV_ri?usp=drive_link
data = pd.read_excel(DATA_PATH)

# Display the first few rows of the dataset (the header alone was previously
# printed with nothing underneath it).
print("Sample of the dataset:")
print(data.head())

# Extract features and target variable
X = data[FEATURE_COLUMNS]
y = data[TARGET_COLUMN]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Keep only numeric columns before scaling (StandardScaler cannot handle
# non-numeric data).
X_train_numeric = X_train.select_dtypes(include=['number'])
X_test_numeric = X_test.select_dtypes(include=['number'])

# Feature scaling (optional but can be beneficial for logistic regression).
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_numeric)
X_test_scaled = scaler.transform(X_test_numeric)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Sample of the dataset:

Model Evaluation:
Accuracy: 0.9854


# Creating the logistic regression model

In [None]:
# Create and train the logistic regression model.
# NOTE(review): the model is fitted on the unscaled X_train; the X_train_scaled /
# X_test_scaled arrays built in the preprocessing cell are not used here.
# Switching to the scaled arrays would require every later predict /
# predict_proba call on this model to use scaled inputs as well.
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set: hard class labels and per-class probabilities
y_pred = model.predict(X_test)
probabilities = model.predict_proba(X_test)

# Evaluation metrics computed against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display results. The confusion matrix and classification report are shown as
# well as accuracy: accuracy alone can hide poor performance on a minority class.
print("\nModel Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)



# Comparing individual accuracy with the prediction likelihood

In [11]:
# Probability of class 1 above which a test record is reported.
LIKELIHOOD_THRESHOLD = 0.99

# Make predictions on the test set
y_pred = model.predict(X_test)
probabilities = model.predict_proba(X_test)

# Initialize a counter for the high likelihood records
high_likelihood_count = 0

# Report every test record whose predicted probability for class 1 exceeds the
# threshold, together with whether the prediction was correct.
for i in range(len(X_test)):
    # Second column of predict_proba output, as in the original cell
    # (presumably the positive class; confirm against model.classes_).
    likelihood_class_1 = probabilities[i][1]

    # Print code number and date if likelihood for class 1 is greater than the threshold
    if likelihood_class_1 > LIKELIHOOD_THRESHOLD:
        actual_class = y_test.iloc[i]
        predicted_class = y_pred[i]

        # 1 when the prediction matches the true label, otherwise 0. Kept in its
        # own name so the overall `accuracy` score is not overwritten.
        is_correct = 1 if actual_class == predicted_class else 0

        # X_test.index holds index *labels* of `data`, so look the row up by
        # label (.loc); positional lookup (.iloc) is only right for a default
        # 0..n-1 index.
        row_label = X_test.index[i]
        code_number = data.loc[row_label, 'Code']
        record_date = data.loc[row_label, 'Date']

        print(f"Record {i+1}: Code {code_number}, Date: {record_date}, Actual: {actual_class}, Predicted: {predicted_class}, Accuracy: {is_correct}, probability: {likelihood_class_1:.4f}")
        high_likelihood_count += 1

# Print total count of records with high likelihood
print(f"\nTotal Records with Likelihood > {LIKELIHOOD_THRESHOLD}: {high_likelihood_count}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Record 3309: Code AA3067, Date: 2022-11-08 00:00:00, Actual: 1, Predicted: 1, Accuracy: 1, probability: 0.9939
Record 3310: Code AA2515, Date: 2022-10-11 00:00:00, Actual: 1, Predicted: 1, Accuracy: 1, probability: 0.9915
Record 3311: Code AA868, Date: 2021-12-03 00:00:00, Actual: 1, Predicted: 1, Accuracy: 1, probability: 0.9955
Record 3312: Code AA645, Date: 2022-06-01 00:00:00, Actual: 1, Predicted: 1, Accuracy: 1, probability: 0.9926
Record 3313: Code AA1428, Date: 2022-02-18 00:00:00, Actual: 1, Predicted: 1, Accuracy: 1, probability: 0.9914
Record 3314: Code AA1710, Date: 2022-01-14 00:00:00, Actual: 1, Predicted: 1, Accuracy: 1, probability: 0.9943
Record 3315: Code AA3617, Date: 2022-06-08 00:00:00, Actual: 1, Predicted: 1, Accuracy: 1, probability: 0.9928
Record 3316: Code AA1771, Date: 2022-09-15 00:00:00, Actual: 1, Predicted: 1, Accuracy: 1, probability: 0.9933
Record 3317: Code AA3603, Date: 2022-07-04 00:00: