In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = 'https://raw.githubusercontent.com/FaisalAbid11/Permutation-Entropy-vs-Modified-TOPSIS/refs/heads/main/Permutation-based-entropy/new%20neumerical.csv'
data = pd.read_csv(file_path)

# Define a function to transform all categorical columns to numerical values
def transform_categorical_to_numerical(df):
    # Identify categorical columns
    categorical_columns = df.select_dtypes(include=['object']).columns

    # Apply Label Encoding to all categorical columns
    label_encoder = LabelEncoder()
    for col in categorical_columns:
        df[col + '_Encoded'] = label_encoder.fit_transform(df[col].astype(str))

    return df

# Transform the dataset
data_transformed = transform_categorical_to_numerical(data)

# Save or display the transformed dataset
print(data_transformed.head())


   Age  Gender  Work tenure  Education  Job position  working hour  \
0    2       1            2          3             1             2   
1    2       1            2          3             2             1   
2    4       2            3          3             1             3   
3    3       1            3          4             2             2   
4    3       1            3          4             3             1   

   satisfaction with workload  satisfied  compensation   \
0                           4                         3   
1                           2                         2   
2                           1                         2   
3                           2                         2   
4                           2                         3   

    good relationship with peers    satisfied with career and  opportunity  \
0                               4                                        4   
1                               2                                   

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load dataset
data = pd.read_csv('https://raw.githubusercontent.com/FaisalAbid11/Permutation-Entropy-vs-Modified-TOPSIS/refs/heads/main/Permutation-based-entropy/new%20neumerical.csv')  # Replace with your file path


# Separate features and target
X = data.drop(columns=["TOI (turnover intention)"])  # Replace "target_column" with your target variable
y = data["TOI (turnover intention)"]

# Identify categorical and numeric features
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# Apply one-hot encoding to categorical variables
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop="first"), categorical_features)
    ],
    remainder="passthrough"
)

# Transform features
X_processed = preprocessor.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

# Train a RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Compute permutation importance
perm_importance = permutation_importance(rf_model, X_test, y_test, n_repeats=10, random_state=42)

# Map feature importance back to feature names
feature_names = preprocessor.get_feature_names_out()
importance_scores = pd.DataFrame({
    "Feature": feature_names,
    "Importance": perm_importance.importances_mean
}).sort_values(by="Importance", ascending=False)

# Display the top features
print(importance_scores.head(17))  # Top 5 features


                                              Feature  Importance
6               remainder__satisfaction with workload    0.051887
9   remainder__ satisfied with career and  opportu...    0.021698
7                 remainder__satisfied  compensation     0.019811
11            remainder__ monthly average expenditure    0.009434
15  remainder__ mentally well and do not have anxi...    0.006604
14                        remainder__family supports     0.002830
2                              remainder__Work tenure    0.001887
8           remainder__ good relationship with peers     0.001887
10          remainder__satisfied with job profession     0.000943
4                             remainder__Job position    0.000000
12      remainder__ satisfied with  work-life balance    0.000000
0                                      remainder__Age   -0.000943
13                    remainder__ work is meaningful    -0.000943
3                                remainder__Education   -0.001887
5         

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load dataset
data = pd.read_csv('https://raw.githubusercontent.com/FaisalAbid11/Permutation-Entropy-vs-Modified-TOPSIS/refs/heads/main/Permutation-based-entropy/new%20neumerical.csv')  # Replace with your file path


# Separate features and target
X = data.drop(columns=["TOI (turnover intention)"])  # Replace "target_column" with your target variable
y = data["TOI (turnover intention)"]


# Identify categorical and numeric features
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# Apply one-hot encoding to categorical variables
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop="first"), categorical_features)
    ],
    remainder="passthrough"
)

# Transform features
X_processed = preprocessor.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

# Train a RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Compute permutation importance
perm_importance = permutation_importance(rf_model, X_test, y_test, n_repeats=10, random_state=42)

# Map feature importance back to feature names
feature_names = preprocessor.get_feature_names_out()
importance_scores = pd.DataFrame({
    "Feature": feature_names,
    "Importance": perm_importance.importances_mean
}).sort_values(by="Importance", ascending=False)

# Handle negative importance values by explanation
importance_scores["Remark"] = importance_scores["Importance"].apply(
    lambda x: "May introduce noise" if x < 0 else "Useful"
)

# Display the results
print(importance_scores)

# Optionally, save the feature importance rankings to a CSV
importance_scores.to_csv('feature_importance_ranking.csv', index=False)


                                              Feature  Importance  \
6               remainder__satisfaction with workload    0.051887   
9   remainder__ satisfied with career and  opportu...    0.021698   
7                 remainder__satisfied  compensation     0.019811   
11            remainder__ monthly average expenditure    0.009434   
15  remainder__ mentally well and do not have anxi...    0.006604   
14                        remainder__family supports     0.002830   
2                              remainder__Work tenure    0.001887   
8           remainder__ good relationship with peers     0.001887   
10          remainder__satisfied with job profession     0.000943   
4                             remainder__Job position    0.000000   
12      remainder__ satisfied with  work-life balance    0.000000   
0                                      remainder__Age   -0.000943   
13                    remainder__ work is meaningful    -0.000943   
3                                r

In [5]:
import numpy as np
import pandas as pd

# Dataset from the user
data = np.array([
    [2, 3, 1, 3, 0, 1, 4, 1, 1, 8],
    [3, 2, 2, 3, 1, 0, 1, 0, 2, 5],
    [3, 2, 2, 0, 3, 0, 2, 0, 2, 6],
    [3, 2, 2, 2, 3, 0, 2, 0, 2, 6],
    [1, 2, 2, 0, 0, 0, 1, 0, 3, 4],
    [1, 3, 2, 0, 3, 0, 4, 0, 2, 5],
    [2, 2, 1, 0, 0, 0, 1, 0, 2, 7],
    [1, 3, 2, 0, 3, 1, 2, 1, 3, 2],
    [2, 3, 1, 0, 3, 0, 1, 0, 2, 7],
    [1, 1, 2, 0, 3, 0, 2, 0, 2, 5]
])

# Column names
columns = [
    "satisfied  compensation",
    "satisfaction with workload",
    "satisfied with job profession",
    "mentally well and do not have anxity",
    "family supports",
    "satisfied with career and  opportunity",
    "Work tenure",
    "monthly average expenditure ",
    "good relationship with peers",
    "Job position"
]
# Step 1: Normalize the Decision Matrix
def normalize(X):
    norm_factors = np.sqrt(np.sum(X**2, axis=0))
    return X / norm_factors

normalized_data = normalize(data)

# Step 2: Calculate Proportions (p_ij)
p_ij = normalized_data / np.sum(normalized_data, axis=0)

# Step 3: Calculate Entropy (e_j)
def calculate_entropy(p_ij, n):
    p_ij = np.where(p_ij == 0, 1e-10, p_ij)  # Avoid log(0)
    entropy = -np.sum(p_ij * np.log(p_ij), axis=0) / np.log(n)
    return entropy

n, m = data.shape  # n: rows, m: columns
e_j = calculate_entropy(p_ij, n)

# Step 4: Calculate Weights
w_e1 = (1 - e_j) / np.sum(1 - e_j)
w_e2 = (1 / e_j) / np.sum(1 / e_j)

# Step 5: Combine Weights
alpha, beta = 0.5, 0.5  # Equal weight contributions
w_e = alpha * w_e1 + beta * w_e2

# Step 6: Compile Results into a DataFrame
results = pd.DataFrame({
    "Entropy (e_j)": e_j,
    "Weight w_e1(j)": w_e1,
    "Weight w_e2(j)": w_e2,
    "Final Weight w_e(j)": w_e
}, index=columns)

# Display the results
print("Entropy Weight Results:")
print(results)


Entropy Weight Results:
                                        Entropy (e_j)  Weight w_e1(j)  \
satisfied  compensation                      0.957687        0.018597   
satisfaction with workload                   0.981912        0.007950   
satisfied with job profession                0.982542        0.007673   
mentally well and do not have anxity         0.469992        0.232939   
family supports                              0.826744        0.076146   
satisfied with career and  opportunity       0.301030        0.307198   
Work tenure                                  0.939794        0.026461   
monthly average expenditure                  0.301030        0.307198   
good relationship with peers                 0.985212        0.006499   
Job position                                 0.978749        0.009340   

                                        Weight w_e2(j)  Final Weight w_e(j)  
satisfied  compensation                       0.064606             0.041601  
satisfaction wit

In [6]:
pip install python-docx


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/244.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [7]:
# Re-importing required libraries
import numpy as np
import pandas as pd
from docx import Document

# Dataset from the user
data = np.array([
    [2, 3, 1, 3, 0, 1, 4, 1, 1, 8],
    [3, 2, 2, 3, 1, 0, 1, 0, 2, 5],
    [3, 2, 2, 0, 3, 0, 2, 0, 2, 6],
    [3, 2, 2, 2, 3, 0, 2, 0, 2, 6],
    [1, 2, 2, 0, 0, 0, 1, 0, 3, 4],
    [1, 3, 2, 0, 3, 0, 4, 0, 2, 5],
    [2, 2, 1, 0, 0, 0, 1, 0, 2, 7],
    [1, 3, 2, 0, 3, 1, 2, 1, 3, 2],
    [2, 3, 1, 0, 3, 0, 1, 0, 2, 7],
    [1, 1, 2, 0, 3, 0, 2, 0, 2, 5]
])

# Column names
columns = [
    "satisfied  compensation",
    "satisfaction with workload",
    "satisfied with job profession",
    "mentally well and do not have anxity",
    "family supports",
    "satisfied with career and  opportunity",
    "Work tenure",
    "monthly average expenditure ",
    "good relationship with peers",
    "Job position"
]

# Normalize the data
normalized_data = data / np.sqrt(np.sum(data**2, axis=0))

# Calculate proportions
p_ij = normalized_data / np.sum(normalized_data, axis=0)

# Calculate entropy (e_j)
def calculate_entropy(p_ij, n):
    p_ij = np.where(p_ij == 0, 1e-10, p_ij)  # Avoid log(0)
    entropy = -np.sum(p_ij * np.log(p_ij), axis=0) / np.log(n)
    return entropy

n, m = data.shape  # n: rows, m: columns
e_j = calculate_entropy(p_ij, n)

# Calculate weights
w_e1 = (1 - e_j) / np.sum(1 - e_j)
w_e2 = (1 / e_j) / np.sum(1 / e_j)
alpha, beta = 0.5, 0.5
w_e = alpha * w_e1 + beta * w_e2

# Recreate the results DataFrame
results = pd.DataFrame({
    "Entropy (e_j)": e_j,
    "Weight w_e1(j)": w_e1,
    "Weight w_e2(j)": w_e2,
    "Final Weight w_e(j)": w_e
}, index=columns)

# Recreate the document
doc = Document()
doc.add_heading("Entropy Weight Results", level=1)

# Add a table with the results
table = doc.add_table(rows=1, cols=5)
table.style = 'Table Grid'

# Add headers to the table
headers = ["Column Name", "Entropy (e_j)", "Weight w_e1(j)", "Weight w_e2(j)", "Final Weight w_e(j)"]
header_cells = table.rows[0].cells
for i, header in enumerate(headers):
    header_cells[i].text = header

# Add rows to the table
for idx, row in results.iterrows():
    row_cells = table.add_row().cells
    row_cells[0].text = idx  # Column Name
    row_cells[1].text = f"{row['Entropy (e_j)']:.6f}"
    row_cells[2].text = f"{row['Weight w_e1(j)']:.6f}"
    row_cells[3].text = f"{row['Weight w_e2(j)']:.6f}"
    row_cells[4].text = f"{row['Final Weight w_e(j)']:.6f}"

# Save the document
file_path = "Entropy_Weight_Results.docx"
doc.save(file_path)

file_path


'Entropy_Weight_Results.docx'

In [9]:
import pandas as pd
import numpy as np

# Load the dataset
file_path = 'https://raw.githubusercontent.com/FaisalAbid11/Permutation-Entropy-vs-Modified-TOPSIS/refs/heads/main/Permutation-based-entropy/new%20neumerical.csv'  # Replace with your file path
data = pd.read_csv(file_path)
 # Drop irrelevant/low-importance columns
columns_to_drop = [
    ' satisfied with  work-life balance',
    'Age',
    ' work is meaningful ',
    'Education',
    'working hour',
    'Gender'
]
data = data.drop(columns=columns_to_drop)
# Extract the decision matrix (assuming all numerical columns after the first one)
decision_matrix = data.iloc[:, 1:].to_numpy()  # Adjust index based on column arrangement

# Updated weights for TOPSIS based on the new columns and final weights
weights = np.array([
    0.041601,  # satisfied compensation
    0.035481,  # satisfaction with workload
    0.035322,  # satisfied with job profession
    0.182293,  # mentally well and do not have anxity
    0.075492,  # family supports
    0.256367,  # satisfied with career and  opportunity
    0.046148,  # Work tenure
    0.256367,  # monthly average expenditure
    0.034650,  # good relationship with peers
    0.036278   # Job position
])
# Define the TOPSIS method
def topsis(decision_matrix, weights):
    # Normalize the decision matrix
    norm_factors = np.sqrt(np.sum(decision_matrix**2, axis=0))
    normalized_matrix = decision_matrix / norm_factors

    # Apply weights
    weighted_matrix = normalized_matrix * weights

    # Determine positive and negative ideals
    positive_ideal = np.max(weighted_matrix, axis=0)
    negative_ideal = np.min(weighted_matrix, axis=0)

    # Calculate distances to the positive and negative ideals
    distance_positive = np.sqrt(np.sum((weighted_matrix - positive_ideal) ** 2, axis=1))
    distance_negative = np.sqrt(np.sum((weighted_matrix - negative_ideal) ** 2, axis=1))

    # Calculate closeness coefficients
    closeness_coefficients = distance_negative / (distance_positive + distance_negative)
    return closeness_coefficients

# Compute TOPSIS closeness coefficients
closeness_coefficients = topsis(decision_matrix, weights)

# Categorize employees based on closeness coefficients
maxR = closeness_coefficients.max()
minR = closeness_coefficients.min()
Nclass = 3  # Number of categories
D = (maxR - minR) / Nclass

distressed_range = (minR, minR + D)
behavioral_range = (minR + D, minR + 2 * D)
enthusiastic_range = (minR + 2 * D, maxR)

# Assign categories based on the ranges
categories_named = []
for cc in closeness_coefficients:
    if distressed_range[0] <= cc <= distressed_range[1]:
        categories_named.append("Distressed")
    elif behavioral_range[0] < cc <= behavioral_range[1]:
        categories_named.append("Behavioral")
    elif enthusiastic_range[0] < cc <= enthusiastic_range[1]:
        categories_named.append("Enthusiastic")

# Add results to the original dataset
data['Closeness Coefficient'] = closeness_coefficients
data['Category'] = categories_named

# Save the updated dataset
output_path = 'updated_employee_categorization_new_weights.xlsx'
data.to_excel(output_path, index=False)

print(f"Updated dataset saved to {output_path}")


Updated dataset saved to updated_employee_categorization_new_weights.xlsx


In [10]:
import pandas as pd

# Load the uploaded dataset with categorization
file_path = 'updated_employee_categorization_new_weights.xlsx'
data = pd.read_excel(file_path)

# Split the data into three categories
distressed_data = data[data['Category'] == 'Distressed']
behavioral_data = data[data['Category'] == 'Behavioral']
enthusiastic_data = data[data['Category'] == 'Enthusiastic']

# Save each category into separate files
distressed_file = 'distressed_employees.xlsx'
behavioral_file = 'behavioral_employees.xlsx'
enthusiastic_file = 'enthusiastic_employees.xlsx'

distressed_data.to_excel(distressed_file, index=False)
behavioral_data.to_excel(behavioral_file, index=False)
enthusiastic_data.to_excel(enthusiastic_file, index=False)

distressed_file, behavioral_file, enthusiastic_file


('distressed_employees.xlsx',
 'behavioral_employees.xlsx',
 'enthusiastic_employees.xlsx')

In [11]:
import pandas as pd
from sklearn.utils import resample

# Load the dataset
file_path = 'distressed_employees.xlsx'
data = pd.read_excel(file_path)

# Separate majority and minority classes
class_0 = data[data['TOI (turnover intention)'] == 0]
class_1 = data[data['TOI (turnover intention)'] == 1]

# Print class distribution before balancing
print("Class Distribution Before Balancing:")
print(data['TOI (turnover intention)'].value_counts())

# Target sample sizes
target_class_0 = 2764
target_class_1 = 2763

# Oversample Class 0
if len(class_0) > 0:
    class_0_balanced = resample(
        class_0,
        replace=True,
        n_samples=target_class_0,
        random_state=42
    )
else:
    print("Class 0 has zero samples. Skipping resampling.")
    class_0_balanced = pd.DataFrame()

# Oversample Class 1
if len(class_1) > 0:
    class_1_balanced = resample(
        class_1,
        replace=True,
        n_samples=target_class_1,
        random_state=42
    )
else:
    print("Class 1 has zero samples. Skipping resampling.")
    class_1_balanced = pd.DataFrame()

# Combine the two classes to form the balanced dataset
balanced_data = pd.concat([class_0_balanced, class_1_balanced])

# Shuffle the dataset to mix the classes
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset to a CSV file
output_path = 'balanced_dataset-distressed.csv'
balanced_data.to_csv(output_path, index=False)

# Display the new class distribution
print("Class Distribution After Balancing:")
print(balanced_data['TOI (turnover intention)'].value_counts())
print(f"Balanced dataset saved to {output_path}")


Class Distribution Before Balancing:
TOI (turnover intention)
1    121
0      9
Name: count, dtype: int64
Class Distribution After Balancing:
TOI (turnover intention)
0    2764
1    2763
Name: count, dtype: int64
Balanced dataset saved to balanced_dataset-distressed.csv


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from imblearn.under_sampling import RandomUnderSampler

# Load the balanced dataset
file_path = 'balanced_dataset-distressed.csv'  # Adjust file name if necessary
data = pd.read_csv(file_path)

# Separate features (X) and target (y)
X = data.drop(columns=['TOI (turnover intention)', ' Closeness Coefficient', 'Category'], errors='ignore')
y = data['TOI (turnover intention)']

# Add noise to the features
X_noisy = X + np.random.normal(0, 0.01, X.shape)  # Higher noise level

# Randomly flip some target labels to introduce noise
flip_fraction = 0.01  # Flip 10% of labels
indices = np.random.choice(y.index, size=int(len(y) * flip_fraction), replace=False)
y_noisy = y.copy()
y_noisy.loc[indices] = 1 - y.loc[indices]  # Flip labels

# Resample the dataset (optional)
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_noisy, y_noisy)

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Initialize the Random Forest Classifier with reduced complexity
rf = RandomForestClassifier(
    n_estimators=200,           # Fewer trees
    max_depth=3,               # Limit depth
    min_samples_split=10,      # Larger minimum samples to split
    min_samples_leaf=10,       # Larger leaf size
    random_state=42
)

# Train the model on the training set
rf.fit(X_train, y_train)

# Predict on the testing set
y_pred = rf.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

# Precision
precision = precision_score(y_test, y_pred, average='binary')
print(f"Precision: {precision:.4f}")

# Recall
recall = recall_score(y_test, y_pred, average='binary')
print(f"Recall: {recall:.4f}")

# F1-Score
f1 = f1_score(y_test, y_pred, average='binary')
print(f"F1-Score: {f1:.4f}")

# Matthews Correlation Coefficient (MCC)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"MCC: {mcc:.4f}")

# Detailed Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[401 152]
 [ 35 518]]

Accuracy: 0.8309
Precision: 0.7731
Recall: 0.9367
F1-Score: 0.8471
MCC: 0.6772

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.73      0.81       553
           1       0.77      0.94      0.85       553

    accuracy                           0.83      1106
   macro avg       0.85      0.83      0.83      1106
weighted avg       0.85      0.83      0.83      1106



In [13]:
import pandas as pd
from sklearn.utils import resample

# Load the dataset
file_path = 'behavioral_employees.xlsx'
data = pd.read_excel(file_path)

# Separate majority and minority classes
class_0 = data[data['TOI (turnover intention)'] == 0]
class_1 = data[data['TOI (turnover intention)'] == 1]

# Oversample both classes to the target count (2764 for Class 0 and 2763 for Class 1)
class_0_balanced = resample(class_0,
                            replace=True,     # Sample with replacement
                            n_samples=2764,   # Target number of samples
                            random_state=42)  # Reproducibility

class_1_balanced = resample(class_1,
                            replace=True,     # Sample with replacement
                            n_samples=2763,   # Target number of samples
                            random_state=42)  # Reproducibility

# Combine the two classes to form the balanced dataset
balanced_data = pd.concat([class_0_balanced, class_1_balanced])

# Shuffle the dataset to mix the classes
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset to a CSV file (optional)
balanced_data.to_csv('balanced_dataset-behavioral.csv', index=False)

# Display the new class distribution
print(balanced_data['TOI (turnover intention)'].value_counts())


TOI (turnover intention)
0    2764
1    2763
Name: count, dtype: int64


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from imblearn.under_sampling import RandomUnderSampler

# Load the balanced dataset
file_path = 'balanced_dataset-behavioral.csv'  # Adjust file name if necessary
data = pd.read_csv(file_path)

# Separate features (X) and target (y)
X = data.drop(columns=['TOI (turnover intention)', ' Closeness Coefficient', 'Category'], errors='ignore')
y = data['TOI (turnover intention)']

# Add noise to the features
X_noisy = X + np.random.normal(0, 0.01, X.shape)  # Higher noise level

# Randomly flip some target labels to introduce noise
flip_fraction = 0.01  # Flip 10% of labels
indices = np.random.choice(y.index, size=int(len(y) * flip_fraction), replace=False)
y_noisy = y.copy()
y_noisy.loc[indices] = 1 - y.loc[indices]  # Flip labels

# Resample the dataset (optional)
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_noisy, y_noisy)

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Initialize the Random Forest Classifier with reduced complexity
rf = RandomForestClassifier(
    n_estimators=100,           # Fewer trees
    max_depth=5,               # Limit depth
    min_samples_split=20,      # Larger minimum samples to split
    min_samples_leaf=10,       # Larger leaf size
    random_state=42
)

# Train the model on the training set
rf.fit(X_train, y_train)

# Predict on the testing set
y_pred = rf.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

# Precision
precision = precision_score(y_test, y_pred, average='binary')
print(f"Precision: {precision:.4f}")

# Recall
recall = recall_score(y_test, y_pred, average='binary')
print(f"Recall: {recall:.4f}")

# F1-Score
f1 = f1_score(y_test, y_pred, average='binary')
print(f"F1-Score: {f1:.4f}")

# Matthews Correlation Coefficient (MCC)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"MCC: {mcc:.4f}")

# Detailed Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[538  13]
 [ 43 507]]

Accuracy: 0.9491
Precision: 0.9750
Recall: 0.9218
F1-Score: 0.9477
MCC: 0.8996

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.95       551
           1       0.97      0.92      0.95       550

    accuracy                           0.95      1101
   macro avg       0.95      0.95      0.95      1101
weighted avg       0.95      0.95      0.95      1101



In [15]:
import pandas as pd
from sklearn.utils import resample

# Load the dataset
file_path = 'enthusiastic_employees.xlsx'
data = pd.read_excel(file_path)

# Separate majority and minority classes
class_0 = data[data['TOI (turnover intention)'] == 0]
class_1 = data[data['TOI (turnover intention)'] == 1]

# Oversample both classes to the target count (2764 for Class 0 and 2763 for Class 1)
class_0_balanced = resample(class_0,
                            replace=True,     # Sample with replacement
                            n_samples=2764,   # Target number of samples
                            random_state=42)  # Reproducibility

class_1_balanced = resample(class_1,
                            replace=True,     # Sample with replacement
                            n_samples=2763,   # Target number of samples
                            random_state=42)  # Reproducibility

# Combine the two classes to form the balanced dataset
balanced_data = pd.concat([class_0_balanced, class_1_balanced])

# Shuffle the dataset to mix the classes
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset to a CSV file (optional)
balanced_data.to_csv('balanced_dataset-enthusiastic.csv', index=False)

# Display the new class distribution
print(balanced_data['TOI (turnover intention)'].value_counts())

TOI (turnover intention)
0    2764
1    2763
Name: count, dtype: int64


In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from imblearn.under_sampling import RandomUnderSampler

# Load the balanced dataset
file_path = 'balanced_dataset-enthusiastic.csv'  # Adjust file name if necessary
data = pd.read_csv(file_path)

# Separate features (X) and target (y)
X = data.drop(columns=['TOI (turnover intention)', ' Closeness Coefficient', 'Category'], errors='ignore')
y = data['TOI (turnover intention)']

# Add noise to the features
X_noisy = X + np.random.normal(0, 0.01, X.shape)  # Higher noise level

# Randomly flip some target labels to introduce noise
flip_fraction = 0.01  # Flip 10% of labels
indices = np.random.choice(y.index, size=int(len(y) * flip_fraction), replace=False)
y_noisy = y.copy()
y_noisy.loc[indices] = 1 - y.loc[indices]  # Flip labels

# Resample the dataset (optional)
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_noisy, y_noisy)

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Initialize the Random Forest Classifier with reduced complexity
rf = RandomForestClassifier(
    n_estimators=100,           # Fewer trees
    max_depth=5,               # Limit depth
    min_samples_split=20,      # Larger minimum samples to split
    min_samples_leaf=10,       # Larger leaf size
    random_state=42
)

# Train the model on the training set
rf.fit(X_train, y_train)

# Predict on the testing set
y_pred = rf.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

# Precision
precision = precision_score(y_test, y_pred, average='binary')
print(f"Precision: {precision:.4f}")

# Recall
recall = recall_score(y_test, y_pred, average='binary')
print(f"Recall: {recall:.4f}")

# F1-Score
f1 = f1_score(y_test, y_pred, average='binary')
print(f"F1-Score: {f1:.4f}")

# Matthews Correlation Coefficient (MCC)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"MCC: {mcc:.4f}")

# Detailed Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[505  48]
 [ 39 513]]

Accuracy: 0.9213
Precision: 0.9144
Recall: 0.9293
F1-Score: 0.9218
MCC: 0.8426

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       553
           1       0.91      0.93      0.92       552

    accuracy                           0.92      1105
   macro avg       0.92      0.92      0.92      1105
weighted avg       0.92      0.92      0.92      1105

