In [15]:
import pandas as pd
from sklearn.utils import resample

# Load the dataset
file_path = 'https://raw.githubusercontent.com/FaisalAbid11/Permutation-Entropy-vs-Modified-TOPSIS/refs/heads/main/AHP-MODIFIED-TOPSIS-EVALUATION/distressed.csv'
data = pd.read_csv(file_path)

# Separate majority and minority classes
class_0 = data[data['TOI'] == 0]
class_1 = data[data['TOI'] == 1]

# Oversample both classes to the target count (2764 for Class 0 and 2763 for Class 1)
class_0_balanced = resample(class_0,
                            replace=True,     # Sample with replacement
                            n_samples=2764,   # Target number of samples
                            random_state=42)  # Reproducibility

class_1_balanced = resample(class_1,
                            replace=True,     # Sample with replacement
                            n_samples=2763,   # Target number of samples
                            random_state=42)  # Reproducibility

# Combine the two classes to form the balanced dataset
balanced_data = pd.concat([class_0_balanced, class_1_balanced])

# Shuffle the dataset to mix the classes
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset to a CSV file (optional)
balanced_data.to_csv('balanced_dataset-modified-20.csv', index=False)

# Display the new class distribution
print(balanced_data['TOI'].value_counts())


TOI
0    2764
1    2763
Name: count, dtype: int64


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

## Load the dataset
file_path = 'balanced_dataset-modified-20.csv'  # Update with the correct path if needed
data = pd.read_csv(file_path)

# Separate features and target variable
X = data.drop(columns=['TOI'])  # Features
y = data['TOI']  # Target

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize the Random Forest Classifier with reduced complexity
rf = RandomForestClassifier(
   n_estimators=100,           # Fewer trees
    max_depth=10,               # Limit depth
    min_samples_split=20,      # Larger minimum samples to split
    min_samples_leaf=20,       # Larger leaf size
    random_state=42
)

# Train the model on the training set
rf.fit(X_train, y_train)

# Predict on the testing set
y_pred = rf.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

# Precision
precision = precision_score(y_test, y_pred, average='binary')
print(f"Precision: {precision:.4f}")

# Recall
recall = recall_score(y_test, y_pred, average='binary')
print(f"Recall: {recall:.4f}")

# F1-Score
f1 = f1_score(y_test, y_pred, average='binary')
print(f"F1-Score: {f1:.4f}")

# Matthews Correlation Coefficient (MCC)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"MCC: {mcc:.4f}")

# Detailed Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importance)

# Save feature importance to a CSV file
feature_importance.to_csv('feature_importance.csv', index=False)
print("Feature importance saved to 'feature_importance.csv'")


Confusion Matrix:
[[553   0]
 [ 13 540]]

Accuracy: 0.9882
Precision: 1.0000
Recall: 0.9765
F1-Score: 0.9881
MCC: 0.9768

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       553
           1       1.00      0.98      0.99       553

    accuracy                           0.99      1106
   macro avg       0.99      0.99      0.99      1106
weighted avg       0.99      0.99      0.99      1106


Feature Importances:
                                   Feature  Importance
7                             working hour    0.254539
9                      work is meaningful     0.145825
6   satisfied with career and  opportunity    0.128667
2   mentally well and do not have anxiety     0.108528
5                         family supports     0.094616
4                 satisfied  compensation     0.081118
8            good relationship with peers     0.075846
1        satisfied with  work-life balance    0.057259
3          

In [12]:
import pandas as pd
from sklearn.utils import resample

# Load the dataset
file_path = 'https://raw.githubusercontent.com/FaisalAbid11/Permutation-Entropy-vs-Modified-TOPSIS/refs/heads/main/AHP-MODIFIED-TOPSIS-EVALUATION/behavioral.csv'
data = pd.read_csv(file_path)

# Separate majority and minority classes
class_0 = data[data['TOI'] == 0]
class_1 = data[data['TOI'] == 1]

# Oversample both classes to the target count (2764 for Class 0 and 2763 for Class 1)
class_0_balanced = resample(class_0,
                            replace=True,     # Sample with replacement
                            n_samples=2764,   # Target number of samples
                            random_state=42)  # Reproducibility

class_1_balanced = resample(class_1,
                            replace=True,     # Sample with replacement
                            n_samples=2763,   # Target number of samples
                            random_state=42)  # Reproducibility

# Combine the two classes to form the balanced dataset
balanced_data = pd.concat([class_0_balanced, class_1_balanced])

# Shuffle the dataset to mix the classes
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset to a CSV file (optional)
balanced_data.to_csv('balanced_dataset_modified-30.csv', index=False)

# Display the new class distribution
print(balanced_data['TOI'].value_counts())

TOI
0    2764
1    2763
Name: count, dtype: int64


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

## Load the dataset
file_path = 'balanced_dataset_modified-30.csv'  # Update with the correct path if needed
data = pd.read_csv(file_path)

# Separate features and target variable
X = data.drop(columns=['TOI'])  # Features
y = data['TOI']  # Target

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize the Random Forest Classifier with reduced complexity
rf = RandomForestClassifier(
   n_estimators=100,           # Fewer trees
    max_depth=10,               # Limit depth
    min_samples_split=20,      # Larger minimum samples to split
    min_samples_leaf=20,       # Larger leaf size
    random_state=42
)

# Train the model on the training set
rf.fit(X_train, y_train)

# Predict on the testing set
y_pred = rf.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

# Precision
precision = precision_score(y_test, y_pred, average='binary')
print(f"Precision: {precision:.4f}")

# Recall
recall = recall_score(y_test, y_pred, average='binary')
print(f"Recall: {recall:.4f}")

# F1-Score
f1 = f1_score(y_test, y_pred, average='binary')
print(f"F1-Score: {f1:.4f}")

# Matthews Correlation Coefficient (MCC)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"MCC: {mcc:.4f}")

# Detailed Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importance)

# Save feature importance to a CSV file
feature_importance.to_csv('feature_importance.csv', index=False)
print("Feature importance saved to 'feature_importance.csv'")

Confusion Matrix:
[[535  18]
 [ 30 523]]

Accuracy: 0.9566
Precision: 0.9667
Recall: 0.9458
F1-Score: 0.9561
MCC: 0.9134

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       553
           1       0.97      0.95      0.96       553

    accuracy                           0.96      1106
   macro avg       0.96      0.96      0.96      1106
weighted avg       0.96      0.96      0.96      1106


Feature Importances:
                                   Feature  Importance
1        satisfied with  work-life balance    0.169405
7                             working hour    0.113097
4                 satisfied  compensation     0.111073
2   mentally well and do not have anxiety     0.109101
0               satisfaction with workload    0.101017
6   satisfied with career and  opportunity    0.100761
8            good relationship with peers     0.094142
5                         family supports     0.077942
9          

In [16]:
import pandas as pd
from sklearn.utils import resample

# Load the dataset
file_path = 'https://raw.githubusercontent.com/FaisalAbid11/Permutation-Entropy-vs-Modified-TOPSIS/refs/heads/main/AHP-MODIFIED-TOPSIS-EVALUATION/enthusiastic.csv'
data = pd.read_csv(file_path)

# Separate majority and minority classes
class_0 = data[data['TOI'] == 0]
class_1 = data[data['TOI'] == 1]

# Oversample both classes to the target count (2764 for Class 0 and 2763 for Class 1)
class_0_balanced = resample(class_0,
                            replace=True,     # Sample with replacement
                            n_samples=2764,   # Target number of samples
                            random_state=42)  # Reproducibility

class_1_balanced = resample(class_1,
                            replace=True,     # Sample with replacement
                            n_samples=2763,   # Target number of samples
                            random_state=42)  # Reproducibility

# Combine the two classes to form the balanced dataset
balanced_data = pd.concat([class_0_balanced, class_1_balanced])

# Shuffle the dataset to mix the classes
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset to a CSV file (optional)
balanced_data.to_csv('balanced_dataset-modified-50.csv', index=False)

# Display the new class distribution
print(balanced_data['TOI'].value_counts())


TOI
0    2764
1    2763
Name: count, dtype: int64


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

## Load the dataset
file_path = 'balanced_dataset-modified-50.csv'  # Update with the correct path if needed
data = pd.read_csv(file_path)

# Separate features and target variable
X = data.drop(columns=['TOI'])  # Features
y = data['TOI']  # Target

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize the Random Forest Classifier with reduced complexity
rf = RandomForestClassifier(
   n_estimators=100,           # Fewer trees
    max_depth=20,               # Limit depth
    min_samples_split=10,      # Larger minimum samples to split
    min_samples_leaf=10,       # Larger leaf size
    random_state=42
)

# Train the model on the training set
rf.fit(X_train, y_train)

# Predict on the testing set
y_pred = rf.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

# Precision
precision = precision_score(y_test, y_pred, average='binary')
print(f"Precision: {precision:.4f}")

# Recall
recall = recall_score(y_test, y_pred, average='binary')
print(f"Recall: {recall:.4f}")

# F1-Score
f1 = f1_score(y_test, y_pred, average='binary')
print(f"F1-Score: {f1:.4f}")

# Matthews Correlation Coefficient (MCC)
mcc = matthews_corrcoef(y_test, y_pred)
print(f"MCC: {mcc:.4f}")

# Detailed Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importance)

# Save feature importance to a CSV file
feature_importance.to_csv('feature_importance.csv', index=False)
print("Feature importance saved to 'feature_importance.csv'")

Confusion Matrix:
[[553   0]
 [ 18 535]]

Accuracy: 0.9837
Precision: 1.0000
Recall: 0.9675
F1-Score: 0.9835
MCC: 0.9680

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       553
           1       1.00      0.97      0.98       553

    accuracy                           0.98      1106
   macro avg       0.98      0.98      0.98      1106
weighted avg       0.98      0.98      0.98      1106


Feature Importances:
                                   Feature  Importance
0               satisfaction with workload    0.289388
4                 satisfied  compensation     0.141941
1        satisfied with  work-life balance    0.132315
2   mentally well and do not have anxiety     0.125013
3           satisfied with job profession     0.072487
6   satisfied with career and  opportunity    0.061776
7                             working hour    0.055250
8            good relationship with peers     0.050855
5          