In [None]:
# import libs
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Read the cleaned dataset and separate the target column from the predictors.
data_cleaned = pd.read_csv('/content/cleaned_dataset.csv')
y = data_cleaned['Place']  # target
X = data_cleaned.drop(columns='Place')  # every remaining column is a feature

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

# Logistic regression classifier (liblinear solver, fixed seed).
logmodel = LogisticRegression(random_state=0, solver='liblinear')

# genetic algorithm functions
def initilization_of_population(size, n_feat, drop_fraction=0.3):
    """Create the initial population of random boolean feature masks.

    Each chromosome is a 1-D boolean array of length ``n_feat`` in which
    ``int(drop_fraction * n_feat)`` positions are ``False`` (feature excluded)
    and the rest are ``True``; the positions are shuffled independently for
    every chromosome with NumPy's global random generator.

    Args:
        size: Number of chromosomes to create.
        n_feat: Number of features, i.e. the length of each chromosome.
        drop_fraction: Fraction of features switched off in each chromosome.
            Defaults to 0.3.

    Returns:
        A list of ``size`` boolean NumPy arrays.
    """
    n_dropped = int(drop_fraction * n_feat)
    population = []
    for _ in range(size):
        # Builtin ``bool`` replaces the ``np.bool`` alias, which was removed in
        # NumPy 1.24 and raises AttributeError there.
        chromosome = np.ones(n_feat, dtype=bool)
        chromosome[:n_dropped] = False
        np.random.shuffle(chromosome)
        population.append(chromosome)
    return population

def fitness_score(population):
    """Score every chromosome by hold-out accuracy and rank the population.

    For each boolean mask, the module-level ``logmodel`` is fit on the masked
    columns of the module-level ``X_train``/``y_train`` and its predictions on
    the same columns of ``X_test`` are compared with ``y_test`` via
    ``accuracy_score``.

    A mask that selects no feature at all is given a score of 0.0 instead of
    being handed to the model, so one degenerate chromosome cannot abort the
    whole search.

    Args:
        population: Sequence of equal-length boolean masks, one per chromosome.

    Returns:
        A tuple ``(scores, population)`` of two lists, both ordered from the
        highest score to the lowest.
    """
    scores = []
    for chromosome in population:
        if not any(chromosome):
            # No column selected: there is nothing to fit, so rank it last.
            scores.append(0.0)
            continue
        logmodel.fit(X_train.iloc[:, chromosome], y_train)
        predictions = logmodel.predict(X_test.iloc[:, chromosome])
        scores.append(accuracy_score(y_test, predictions))
    scores, population = np.array(scores), np.array(population)
    # argsort is ascending; reversing the index vector gives best-first order.
    order = np.argsort(scores)[::-1]
    return list(scores[order]), list(population[order, :])

def selection(pop_after_fit, n_parents):
    """Return the first ``n_parents`` chromosomes of ``pop_after_fit``, in order.

    The input is taken as already ranked (best first), so the leading entries
    are the ones kept as parents. Asking for more parents than there are
    chromosomes raises ``IndexError``.
    """
    return [pop_after_fit[rank] for rank in range(n_parents)]

def crossover(pop_after_sel):
    """Produce one child per parent and return parents followed by children.

    Child ``i`` is a copy of parent ``i`` whose genes at positions 3..6 are
    replaced by the same positions of the next parent (wrapping around to the
    first parent after the last one).

    Neither the input list nor the parent arrays are modified: the result is
    a new list and every child is an independent copy.

    Args:
        pop_after_sel: List of parent chromosomes (boolean NumPy arrays).

    Returns:
        A new list containing the original parents followed by the children.
    """
    n_parents = len(pop_after_sel)
    # Shallow copy, so appending children does not grow the caller's list.
    population_nextgen = list(pop_after_sel)
    for i in range(n_parents):
        # Copy first: writing into the parent itself would corrupt it and make
        # the "child" the very same object as its parent.
        child = pop_after_sel[i].copy()
        child[3:7] = pop_after_sel[(i + 1) % n_parents][3:7]
        population_nextgen.append(child)
    return population_nextgen

def mutation(pop_after_cross, mutation_rate):
    """Flip each gene of each chromosome with probability ``mutation_rate``.

    Every chromosome is copied before any bit is flipped, so the arrays passed
    in are left untouched. That matters when the same array object appears
    more than once in the input, or when the caller still holds a reference
    to it.

    Args:
        pop_after_cross: List of chromosomes (boolean sequences).
        mutation_rate: Per-gene flip probability; a gene is flipped when
            ``random.random()`` is below this value.

    Returns:
        A new list with one mutated copy per input chromosome, in input order.
    """
    population_nextgen = []
    for original in pop_after_cross:
        chromosome = original.copy()
        for j in range(len(chromosome)):
            if random.random() < mutation_rate:
                chromosome[j] = not chromosome[j]
        population_nextgen.append(chromosome)
    return population_nextgen

def generations(size, n_feat, n_parents, mutation_rate, n_gen, X_train, X_test, y_train, y_test):
    """Run the genetic feature-selection loop for ``n_gen`` generations.

    Each generation is scored with ``fitness_score``, its top ``n_parents``
    chromosomes are kept by ``selection``, and the next population is built
    with ``crossover`` followed by ``mutation``.

    Args:
        size: Number of chromosomes in the initial population.
        n_feat: Number of features (chromosome length).
        n_parents: Number of top-ranked chromosomes kept each generation.
        mutation_rate: Per-gene flip probability handed to ``mutation``.
        n_gen: Number of generations to run.
        X_train, X_test, y_train, y_test: Accepted for interface
            compatibility; this function does not read them, because
            ``fitness_score`` is called with the population only.

    Returns:
        A tuple ``(best_chromo, best_score)``: per generation, the
        best-ranked chromosome and its score.
    """
    best_chromo = []
    best_score = []
    population_nextgen = initilization_of_population(size, n_feat)
    for i in range(n_gen):
        scores, pop_after_fit = fitness_score(population_nextgen)
        print('Generation', i, ': Best score is', scores[0])
        # Snapshot the winner right away, as a copy: the steps below may write
        # into these arrays, and the recorded mask must be the one that
        # actually earned scores[0].
        best_chromo.append(pop_after_fit[0].copy())
        best_score.append(scores[0])
        pop_after_sel = selection(pop_after_fit, n_parents)
        pop_after_cross = crossover(pop_after_sel)
        population_nextgen = mutation(pop_after_cross, mutation_rate)
    return best_chromo, best_score

# start the genetic algorithm
chromo, score = generations(size=200, n_feat=X.shape[1], n_parents=100, mutation_rate=0.01,
                            n_gen=100, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

# Pick the generation whose best chromosome scored highest. The last generation
# is not guaranteed to hold the overall best, because mutation is applied to
# every chromosome, including the selected parents.
best_generation = max(range(len(score)), key=score.__getitem__)
best_features_mask = chromo[best_generation]  # best chromosome
selected_features_indices = np.where(best_features_mask)[0]  # indices of selected features
selected_features = X.columns[selected_features_indices]  # map indices to feature names

# train and predict with selected features
logmodel.fit(X_train.iloc[:, selected_features_indices], y_train)
predictions = logmodel.predict(X_test.iloc[:, selected_features_indices])
print("Accuracy score after genetic algorithm is= "+str(accuracy_score(y_test, predictions)))

# output feature names and the mask
print("Selected features by the genetic algorithm:", selected_features.tolist())
print("Feature selection mask:", best_features_mask)

ValueError: ignored

In [None]:
!pip install pgmpy

Collecting pgmpy
  Downloading pgmpy-0.1.24-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pgmpy
Successfully installed pgmpy-0.1.24


In [None]:
import pandas as pd
import numpy as np
from pgmpy.estimators import HillClimbSearch, BicScore, MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork
from pgmpy.inference import VariableElimination
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix


# Read the cleaned dataset from disk.
data_cleaned = pd.read_csv('/content/cleaned_datasetnew.csv')

# Feature subset taken from the genetic-algorithm selection step.
selected_features = [
    'family_history',
    'work_interfere',
    'benefits',
    'wellness_program',
    'anonymity',
    'phys_health_consequence',
    'coworkers',
    'supervisor',
    'phys_health_interview',
    'obs_consequence',
    'Age_cat',
]

# 70/30 train/test split. The feature frames also carry the 'treatment' column
# alongside the selected features; the label vectors hold 'treatment' alone.
X_train, X_test, y_train, y_test = train_test_split(
    data_cleaned[[*selected_features, 'treatment']],
    data_cleaned['treatment'],
    random_state=42,
    test_size=0.3,
)

# Learn the structure of the Bayesian network with hill-climb search, scored by
# BIC. X_train still contains the 'treatment' column, so the target takes part
# in structure learning together with the selected features.
hc = HillClimbSearch(X_train)
best_model_structure = hc.estimate(scoring_method=BicScore(X_train))
print("Learned structure:", best_model_structure.edges())

# Initialise the Bayesian network from the learned edges and estimate its
# parameters from the training rows by maximum likelihood.
# NOTE(review): the network is built from the edge list only, so a column that
# received no edge presumably does not become a node — confirm.
bn_model = BayesianNetwork(best_model_structure.edges())
bn_model.fit(X_train, estimator=MaximumLikelihoodEstimator)

# inference
inference = VariableElimination(bn_model)

# only use nodes that are present in structure for inference
model_nodes = set(bn_model.nodes())

# predict on the test set using the learned bayesian network
y_pred_proba = []
y_pred = []

for _, row in X_test.iterrows():
    # Evidence is every observed value except the target itself, restricted to
    # variables that are actually nodes of the learned network.
    evidence = {
        node: value
        for node, value in row.drop('treatment').to_dict().items()
        if node in model_nodes
    }
    pred = inference.query(variables=['treatment'], evidence=evidence)
    # Decode through the factor's state names instead of assuming that
    # position i in `values` corresponds to label i.
    states = list(pred.state_names['treatment'])
    y_pred_proba.append(pred.values[states.index(1)])  # probability of 'treatment'=1
    y_pred.append(states[int(np.argmax(pred.values))])  # most probable label

# Compute every metric first, then report them.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
cm = confusion_matrix(y_test, y_pred)

# Summary scores followed by the per-class report.
print(f'Accuracy of the Bayesian Network model: {accuracy:.2f}')
print(f'ROC-AUC Score of the Bayesian Network model: {roc_auc:.2f}')
print(classification_report(y_test, y_pred))

# Confusion matrix.
print("Confusion Matrix:")
print(cm)


  0%|          | 0/1000000 [00:00<?, ?it/s]

Learned structure: [('work_interfere', 'treatment'), ('benefits', 'anonymity'), ('benefits', 'wellness_program'), ('phys_health_consequence', 'phys_health_interview'), ('phys_health_consequence', 'obs_consequence'), ('coworkers', 'supervisor'), ('supervisor', 'phys_health_consequence'), ('treatment', 'family_history'), ('treatment', 'benefits')]
Accuracy of the Bayesian Network model: 0.80
ROC-AUC Score of the Bayesian Network model: 0.82
              precision    recall  f1-score   support

           0       0.78      0.58      0.66        97
           1       0.81      0.92      0.86       193

    accuracy                           0.80       290
   macro avg       0.79      0.75      0.76       290
weighted avg       0.80      0.80      0.79       290

Confusion Matrix:
[[ 56  41]
 [ 16 177]]


In [None]:
# import libs
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Read the cleaned dataset from disk.
data_cleaned = pd.read_csv('/content/cleaned_datasetnew.csv')

# Feature subset taken from the genetic-algorithm selection step.
selected_features = [
    'family_history',
    'work_interfere',
    'benefits',
    'wellness_program',
    'anonymity',
    'phys_health_consequence',
    'coworkers',
    'supervisor',
    'phys_health_interview',
    'obs_consequence',
    'Age_cat',
]

# Feature matrix and one-hot encoded target.
X = data_cleaned[selected_features]
y = to_categorical(data_cleaned['treatment'])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

# Standardise the features: scaling statistics are learned from the training
# rows only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feed-forward network: two ReLU hidden layers (16 and 8 units) and a
# two-unit softmax output matching the one-hot encoded target.
model = Sequential([
    Dense(16, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dense(8, activation='relu'),
    Dense(2, activation='softmax'),
])

# Categorical cross-entropy loss, Adam optimiser, accuracy as tracked metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train silently for 30 epochs in mini-batches of 16.
model.fit(
    X_train_scaled,
    y_train,
    batch_size=16,
    epochs=30,
    verbose=0,
)

# evaluate model
loss, accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)

# Run inference once and derive both the positive-class probability and the
# hard label from the same output. The ndarray `.argmax` method is used so the
# cell does not depend on `np`, which this cell never imports.
test_probabilities = model.predict(X_test_scaled)
y_pred_proba = test_probabilities[:, 1]
y_pred = test_probabilities.argmax(axis=1)

# y_test is one-hot encoded; recover the integer class labels for the reports.
y_true = y_test.argmax(axis=1)

# print score
print(f'Neural Network Model Accuracy: {accuracy:.2f}')
print(f'Neural Network Model ROC-AUC Score: {roc_auc_score(y_test[:, 1], y_pred_proba):.2f}')
print(classification_report(y_true, y_pred))

# calculate and print the confusion matrix
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(cm)


Neural Network Model Accuracy: 0.78
Neural Network Model ROC-AUC Score: 0.81
              precision    recall  f1-score   support

           0       0.69      0.61      0.65        97
           1       0.81      0.87      0.84       193

    accuracy                           0.78       290
   macro avg       0.75      0.74      0.74       290
weighted avg       0.77      0.78      0.78       290

Confusion Matrix:
[[ 59  38]
 [ 26 167]]
