<H1>Filter Method</H1>

<H2>Variance Threshold</H2>

In [31]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping
import numpy as np

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Variance Threshold Function
def variance_threshold(X, threshold=0.1):
    selector = VarianceThreshold(threshold)
    selector.fit(X)
    return X.columns[selector.get_support()]

# Apply Variance Threshold
selected_features = variance_threshold(X)

# Print the features selected by Variance Threshold
print("Features Selected by Variance Threshold:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Features Selected by Variance Threshold:
Breathing Problem
Sore throat
Running Nose
Asthma
Chronic Lung Disease
Headache
Heart Disease
Diabetes
Hyper Tension
Fatigue 
Gastrointestinal 
Abroad travel
Contact with COVID Patient
Attended Large Gathering
Visited Public Exposed Places
Family working in Public Exposed Places


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9897


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9894


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9880


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9894
Average Accuracy across 4 folds: 0.9892


<H2>Chi Squared Test</H2>

In [33]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping
import numpy as np

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Chi-Squared Feature Selection
def chi_squared_selection(X, y, k=10):
    # Select the top k features based on the Chi-Squared test
    selector = SelectKBest(score_func=chi2, k=k)
    X_new = selector.fit_transform(X, y)
    return X.columns[selector.get_support()]

# Apply Chi-Squared Feature Selection
selected_features = chi_squared_selection(X, y, k=10)  # Adjust k as needed

# Print the features selected by Chi-Squared test
print("Features Selected by Chi-Squared Test:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Features Selected by Chi-Squared Test:
Breathing Problem
Sore throat
Heart Disease
Diabetes
Hyper Tension
Gastrointestinal 
Abroad travel
Contact with COVID Patient
Attended Large Gathering
Family working in Public Exposed Places


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9867


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9867


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9808


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9866
Average Accuracy across 4 folds: 0.9852


<H1>Mutual Information</H1>

In [35]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping
import numpy as np

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Mutual Information Feature Selection
def mutual_information_selection(X, y, k=10):
    # Select the top k features based on Mutual Information
    selector = SelectKBest(score_func=mutual_info_classif, k=k)
    X_new = selector.fit_transform(X, y)
    return X.columns[selector.get_support()]

# Apply Mutual Information Feature Selection
selected_features = mutual_information_selection(X, y, k=10)  # Adjust k as needed

# Print the features selected by Mutual Information
print("Features Selected by Mutual Information:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Features Selected by Mutual Information:
Breathing Problem
Sore throat
Heart Disease
Diabetes
Hyper Tension
Gastrointestinal 
Abroad travel
Contact with COVID Patient
Attended Large Gathering
Family working in Public Exposed Places


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9869


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9836


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9812


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9872
Average Accuracy across 4 folds: 0.9847


In [37]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping
import numpy as np

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Information Gain Feature Selection (using Mutual Information)
def information_gain_selection(X, y, k=10):
    # Select the top k features based on Information Gain (Mutual Information)
    selector = SelectKBest(score_func=mutual_info_classif, k=k)
    X_new = selector.fit_transform(X, y)
    return X.columns[selector.get_support()]

# Apply Information Gain Feature Selection
selected_features = information_gain_selection(X, y, k=10)  # Adjust k as needed

# Print the features selected by Information Gain
print("Features Selected by Information Gain:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Features Selected by Information Gain:
Breathing Problem
Sore throat
Running Nose
Heart Disease
Diabetes
Gastrointestinal 
Abroad travel
Contact with COVID Patient
Attended Large Gathering
Family working in Public Exposed Places


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9804


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9805


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9829


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9823
Average Accuracy across 4 folds: 0.9815


<H1>Wrapper</H1>

<H2>Backward Elimination</H2>

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping
import statsmodels.api as sm

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Backward Elimination Function
def backward_elimination(X, y, threshold_in=0.05):
    X_with_const = sm.add_constant(X)  # Add constant term for intercept
    model = sm.OLS(y, X_with_const).fit()  # Fit the model
    p_values = model.pvalues  # Get p-values
    while p_values.max() > threshold_in:  # While the max p-value is greater than the threshold
        remove_feature = p_values.idxmax()  # Get the feature with the highest p-value
        X = X.drop(columns=remove_feature)  # Drop the feature
        X_with_const = sm.add_constant(X)  # Re-add constant term
        model = sm.OLS(y, X_with_const).fit()  # Fit the model again
        p_values = model.pvalues  # Update p-values
    return X.columns.tolist()  # Return the remaining features

# Apply Backward Elimination
selected_features = backward_elimination(X, y, threshold_in=0.05)  # Adjust threshold as needed

# Print the features selected by Backward Elimination
print("Features Selected by Backward Elimination:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Features Selected by Backward Elimination:
Breathing Problem
Fever
Dry Cough
Sore throat
Running Nose
Headache
Heart Disease
Diabetes
Hyper Tension
Fatigue 
Gastrointestinal 
Abroad travel
Contact with COVID Patient
Attended Large Gathering
Visited Public Exposed Places
Family working in Public Exposed Places
Wearing Masks
Sanitization from Market


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9860


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9833


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9837


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9868
Average Accuracy across 4 folds: 0.9849


In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping
import statsmodels.api as sm

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Forward Selection Function
def forward_selection(X, y, threshold_in=0.05):
    selected_features = []
    remaining_features = list(X.columns)
    
    while remaining_features:
        best_feature = None
        best_p_value = float('inf')
        
        for feature in remaining_features:
            # Fit the model with the selected features plus the candidate feature
            X_temp = X[selected_features + [feature]]
            X_temp_with_const = sm.add_constant(X_temp)  # Add constant term for intercept
            model = sm.OLS(y, X_temp_with_const).fit()  # Fit the model
            p_value = model.pvalues[feature]  # Get p-value for the candidate feature
            
            if p_value < best_p_value:
                best_p_value = p_value
                best_feature = feature
        
        if best_p_value < threshold_in:  # If the best feature is significant
            selected_features.append(best_feature)  # Add it to the selected features
            remaining_features.remove(best_feature)  # Remove it from the remaining features
        else:
            break  # Stop if no more significant features can be added
    
    return selected_features  # Return the selected features

# Apply Forward Selection
selected_features = forward_selection(X, y, threshold_in=0.05)  # Adjust threshold as needed

# Print the features selected by Forward Selection
print("Features Selected by Forward Selection:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Features Selected by Forward Selection:
Breathing Problem
Sore throat
Abroad travel
Attended Large Gathering
Contact with COVID Patient
Family working in Public Exposed Places
Headache
Fever
Dry Cough
Fatigue 
Visited Public Exposed Places
Running Nose
Hyper Tension
Diabetes
Gastrointestinal 
Heart Disease


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9836


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9861


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9840


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9818
Average Accuracy across 4 folds: 0.9839


In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# RFE Feature Selection
def rfe_selection(X, y, n_features_to_select=10):
    model = LogisticRegression(max_iter=1000)  # Logistic Regression model
    rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)  # RFE with the model
    rfe.fit(X, y)  # Fit RFE
    return X.columns[rfe.support_].tolist()  # Return selected features

# Apply RFE Feature Selection
selected_features = rfe_selection(X, y, n_features_to_select=10)  # Adjust number of features as needed

# Print the features selected by RFE
print("Features Selected by RFE:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Features Selected by RFE:
Breathing Problem
Sore throat
Chronic Lung Disease
Hyper Tension
Fatigue 
Abroad travel
Contact with COVID Patient
Attended Large Gathering
Visited Public Exposed Places
Family working in Public Exposed Places


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9830


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9797


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9855


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9861
Average Accuracy across 4 folds: 0.9836


<H1>Embedded</H1>

In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Lasso Feature Selection
def lasso_selection(X, y, alpha=0.01):
    model = Lasso(alpha=alpha)  # Lasso model with specified alpha
    model.fit(X, y)  # Fit the model
    selected_features = X.columns[model.coef_ != 0]  # Select features with non-zero coefficients
    return selected_features.tolist()  # Return selected features

# Apply Lasso Feature Selection
selected_features = lasso_selection(X, y, alpha=0.01)  # Adjust alpha as needed

# Print the features selected by Lasso
print("Features Selected by Lasso:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Features Selected by Lasso:
Breathing Problem
Sore throat
Fatigue 
Abroad travel
Contact with COVID Patient
Attended Large Gathering
Visited Public Exposed Places
Family working in Public Exposed Places


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9744


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9805


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9746


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9833
Average Accuracy across 4 folds: 0.9782


In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Ridge Feature Selection
def ridge_selection(X, y, alpha=1.0, threshold=0.01):
    model = Ridge(alpha=alpha)  # Ridge model with specified alpha
    model.fit(X, y)  # Fit the model
    # Select features with coefficients above the threshold
    selected_features = X.columns[np.abs(model.coef_) > threshold]  
    return selected_features.tolist()  # Return selected features

# Apply Ridge Feature Selection
selected_features = ridge_selection(X, y, alpha=1.0, threshold=0.01)  # Adjust alpha and threshold as needed

# Print the features selected by Ridge
print("Features Selected by Ridge:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Features Selected by Ridge:
Breathing Problem
Sore throat
Running Nose
Headache
Heart Disease
Diabetes
Hyper Tension
Fatigue 
Gastrointestinal 
Abroad travel
Contact with COVID Patient
Attended Large Gathering
Visited Public Exposed Places
Family working in Public Exposed Places


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9890


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9862


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9880


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9890
Average Accuracy across 4 folds: 0.9881


In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import ElasticNet
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Elastic Net Feature Selection
def elastic_net_selection(X, y, alpha=1.0, l1_ratio=0.5, threshold=0.01):
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)  # Elastic Net model with specified alpha and l1_ratio
    model.fit(X, y)  # Fit the model
    # Select features with coefficients above the threshold
    selected_features = X.columns[np.abs(model.coef_) > threshold]  
    return selected_features.tolist()  # Return selected features

# Apply Elastic Net Feature Selection
selected_features = elastic_net_selection(X, y, alpha=1.0, l1_ratio=0.5, threshold=0.01)  # Adjust alpha, l1_ratio, and threshold as needed

# Print the features selected by Elastic Net
print("Features Selected by Elastic Net:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Features Selected by Elastic Net:


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: Computed output size would be negative. Received `inputs shape=(None, 0, 1)`, `kernel shape=(3, 1, 64)`, `dilation_rate=[1]`.

In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Tree-Based Feature Selection
def tree_based_selection(X, y, n_estimators=100, threshold=0.01):
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=42)  # Random Forest model
    model.fit(X, y)  # Fit the model
    importances = model.feature_importances_  # Get feature importances
    # Select features with importances above the threshold
    selected_features = X.columns[importances > threshold]  
    return selected_features.tolist()  # Return selected features

# Apply Tree-Based Feature Selection
selected_features = tree_based_selection(X, y, n_estimators=100, threshold=0.01)  # Adjust n_estimators and threshold as needed

# Print the features selected by Tree-Based method
print("Features Selected by Tree-Based Method:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Features Selected by Tree-Based Method:
Breathing Problem
Sore throat
Running Nose
Asthma
Chronic Lung Disease
Headache
Heart Disease
Diabetes
Hyper Tension
Fatigue 
Gastrointestinal 
Abroad travel
Contact with COVID Patient
Attended Large Gathering
Visited Public Exposed Places
Family working in Public Exposed Places


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9890


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9876


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9887


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9914
Average Accuracy across 4 folds: 0.9892


In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import Ridge
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Hybrid Feature Selection Function
def hybrid_feature_selection(X, y, n_features_to_select=5):
    # Step 1: Variance Threshold
    variance_threshold = VarianceThreshold(threshold=0.1)  # Adjust threshold as needed
    X_var = variance_threshold.fit_transform(X)
    var_features = X.columns[variance_threshold.get_support()]

    # Step 2: Ridge Regression
    ridge_model = Ridge(alpha=1.0)
    ridge_model.fit(X[var_features], y)
    ridge_features = var_features[np.abs(ridge_model.coef_) > 0.01]  # Adjust threshold as needed

    # Step 3: Tree-Based Feature Selection
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X[ridge_features], y)
    tree_features = ridge_features[rf_model.feature_importances_ > 0.01]  # Adjust threshold as needed

    # Consolidate selected features
    selected_features = tree_features[:n_features_to_select]  # Ensure we only return the top n features

    return selected_features

# Apply Hybrid Feature Selection
selected_features = hybrid_feature_selection(X, y, n_features_to_select=5)  # Specify the number of features to select

# Print the features selected by Hybrid method
print("Features Selected by Hybrid Method:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Fit the model with early stopping
    model.fit(X_train, y_train, epochs=1000, batch_size=32, validation_split=0.2, callbacks=[early_stop], verbose=0)  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Features Selected by Hybrid Method:
Breathing Problem
Sore throat
Running Nose
Headache
Heart Disease


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9372


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9312


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9352


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9303
Average Accuracy across 4 folds: 0.9335


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Hybrid Feature Selection Function
def hybrid_feature_selection(X, y):
    # Step 1: Variance Threshold
    variance_threshold = VarianceThreshold(threshold=0.1)  # Adjust threshold as needed
    X_var = variance_threshold.fit_transform(X)
    var_features = set(X.columns[variance_threshold.get_support()])

    # Convert var_features to a list for indexing
    var_features_list = list(var_features)

    # Step 2: Ridge Regression
    ridge_model = Ridge(alpha=1.0)
    ridge_model.fit(X[var_features_list], y)  # Use the list for indexing
    
    # Create a boolean mask for features with significant coefficients
    significant_mask = np.abs(ridge_model.coef_) > 0.01  # Adjust threshold as needed
    ridge_features = set(np.array(var_features_list)[significant_mask])  # Use the mask to filter features

    # Step 3: Tree-Based Feature Selection
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X[list(ridge_features)], y)  # Convert set to list for indexing
    tree_features = set(np.array(list(ridge_features))[rf_model.feature_importances_ > 0.01])  # Adjust threshold as needed

    # Intersection of all selected features
    selected_features = var_features | tree_features 

    return list(selected_features)  # Return all selected features based on intersection

# Apply Hybrid Feature Selection
selected_features = hybrid_feature_selection(X, y)  # No limit on the number of features

# Print the features selected by Hybrid method
print("Features Selected by Hybrid Method:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Fit the model with early stopping
    model.fit(X_train, y_train, epochs=1000, batch_size=32, validation_split=0.2, callbacks=[early_stop], verbose=0)  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Features Selected by Hybrid Method:
Breathing Problem
Fatigue 
Contact with COVID Patient
Asthma
Gastrointestinal 
Visited Public Exposed Places
Diabetes
Headache
Attended Large Gathering
Heart Disease
Abroad travel
Hyper Tension
Running Nose
Family working in Public Exposed Places
Sore throat
Chronic Lung Disease


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Fold Accuracy: 0.9848


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Fold Accuracy: 0.9869


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Fold Accuracy: 0.9733


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Fold Accuracy: 0.9726
Average Accuracy across 4 folds: 0.9794


In [74]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Hybrid Feature Selection Function
def hybrid_feature_selection(X, y):
    # Step 1: Variance Threshold
    variance_threshold = VarianceThreshold(threshold=0.1)
    X_var = variance_threshold.fit_transform(X)
    var_features = X.columns[variance_threshold.get_support()]

    # Step 2: Univariate Feature Selection
    chi2_selector = SelectKBest(chi2, k=10)  # Adjust k as needed
    X_chi2 = chi2_selector.fit_transform(X[var_features], y)
    chi2_features = var_features[chi2_selector.get_support()]

    # Step 3: Recursive Feature Elimination (RFE)
    rfe_model = LogisticRegression(max_iter=1000)
    rfe = RFE(estimator=rfe_model, n_features_to_select=10)  # Adjust n_features_to_select as needed
    rfe.fit(X[chi2_features], y)
    rfe_features = chi2_features[rfe.support_]

    # Step 4: Tree-Based Feature Importance
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X[rfe_features], y)
    tree_features = rfe_features[rf_model.feature_importances_ > 0.01]  # Adjust threshold as needed

    # Step 5: L1 Regularization (Lasso)
    lasso_model = Lasso(alpha=0.01)
    lasso_model.fit(X[tree_features], y)
    lasso_features = tree_features[np.abs(lasso_model.coef_) > 0.01]  # Adjust threshold as needed

    return list(lasso_features)

# Apply Hybrid Feature Selection
selected_features = hybrid_feature_selection(X, y)

# Print the features selected by Hybrid method
print("Features Selected by Hybrid Method:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Fit the model with early stopping
    model.fit(X_train, y_train, epochs=1000, batch_size=32, validation_split=0.2, callbacks=[early_stop], verbose=0)  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Features Selected by Hybrid Method:
Breathing Problem
Sore throat
Abroad travel
Contact with COVID Patient
Attended Large Gathering
Family working in Public Exposed Places


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9718


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9714


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9701


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9714
Average Accuracy across 4 folds: 0.9711


In [84]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# PCA Dimensionality Reduction
def pca_selection(X, n_components=10):
    pca = PCA(n_components=n_components)  # Specify the number of components
    X_pca = pca.fit_transform(X)  # Fit and transform the data
    return X_pca  # Return the transformed data

# Apply PCA Dimensionality Reduction
X_selected = pca_selection(X, n_components=10)  # Adjust n_components as needed

# Prepare Data for 1D CNN
X_selected = X_selected.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9944


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9894


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9892


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9894
Average Accuracy across 4 folds: 0.9906


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Autoencoder for Dimensionality Reduction
def build_autoencoder(input_dim, encoding_dim):
    # Define the autoencoder model
    model = Sequential()
    model.add(Dense(encoding_dim, activation='relu', input_shape=(input_dim,)))  # Encoder
    model.add(Dense(input_dim, activation='sigmoid'))  # Decoder
    return model

# Set parameters
input_dim = X.shape[1]  # Number of features
encoding_dim = 10  # Desired dimensionality after reduction

# Build and compile the autoencoder
autoencoder = build_autoencoder(input_dim, encoding_dim)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Fit the autoencoder
autoencoder.fit(X, X, epochs=100, batch_size=32, verbose=0)  # Train on the same data

# Transform the data using the encoder part of the autoencoder
encoder = Sequential()
encoder.add(Dense(encoding_dim, activation='relu', input_shape=(input_dim,)))  # Encoder layer
encoder.layers[0].set_weights(autoencoder.layers[0].get_weights())  # Copy weights from the autoencoder encoder
# No need to set weights for the decoder layer

# Transform the data
X_selected = encoder.predict(X)

# Prepare Data for 1D CNN
X_selected = X_selected.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m156/899[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m0s[0m 649us/step

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m899/899[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 498us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 962us/step
Fold Accuracy: 0.9924


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 959us/step
Fold Accuracy: 0.9925


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Fold Accuracy: 0.9882


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 945us/step
Fold Accuracy: 0.9904
Average Accuracy across 4 folds: 0.9909


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense
from keras.callbacks import EarlyStopping

# Placeholder for HLLE function
def hlle_selection(X, n_components=10, n_neighbors=5):
    # Implement HLLE here or use a library that provides it
    # For now, we will just return the input as a placeholder
    # Replace this with actual HLLE implementation
    from sklearn.manifold import LocallyLinearEmbedding
    lle = LocallyLinearEmbedding(n_components=n_components, n_neighbors=n_neighbors, method='standard')
    return lle.fit_transform(X)

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Drop the 'source' feature if it exists
# df = df.drop(columns=['data_source'], errors='ignore')

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Apply HLLE Dimensionality Reduction
X_selected = hlle_selection(X, n_components=10, n_neighbors=5)  # Adjust n_components and n_neighbors as needed

# Prepare Data for 1D CNN
X_selected = X_selected.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)

accuracies = []

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

for train_index, test_index in kf.split(X_selected):
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_selected.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])  # Adjust epochs and batch size as needed

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

In [10]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

# Load your dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns=['COVID-19']).values
y = df['COVID-19'].values

# Step 1: Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Apply PCA
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

# Step 3: Build Autoencoder
def create_autoencoder(input_dim):
    model = Sequential()
    model.add(Dense(256, activation='relu', input_shape=(input_dim,)))  # Increased neurons
    model.add(Dropout(0.2))  # Dropout layer
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))  # Dropout layer
    model.add(Dense(64, activation='relu'))  # Bottleneck layer
    model.add(Dense(128, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(input_dim, activation='sigmoid'))  # Output layer
    return model

# Step 4: Train the Autoencoder
autoencoder = create_autoencoder(X_pca.shape[1])
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

# Fit the Autoencoder
autoencoder.fit(X_pca, X_pca, epochs=100, batch_size=256, shuffle=True, validation_split=0.2, callbacks=[early_stop])

# Step 5: Use the encoder part of the Autoencoder for further dimensionality reduction
encoder = Sequential()
encoder.add(Dense(256, activation='relu', input_shape=(X_pca.shape[1],)))  # Increased neurons
encoder.add(Dropout(0.2))  # Dropout layer
encoder.add(Dense(128, activation='relu'))
encoder.add(Dropout(0.2))  # Dropout layer
encoder.add(Dense(64, activation='relu'))  # Final reduced representation

# Get the reduced features
X_reduced = encoder.predict(X_pca)

# Step 6: K-Fold Cross-Validation for Model Evaluation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)
accuracies = []

for train_index, test_index in kf.split(X_reduced):
    X_train, X_test = X_reduced[train_index], X_reduced[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the Classifier Model
    model = Sequential()
    model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dropout(0.2))  # Dropout layer
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 1.1245 - val_loss: 0.8096
Epoch 2/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.8109 - val_loss: 0.7926
Epoch 3/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.7803 - val_loss: 0.7841
Epoch 4/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.7641 - val_loss: 0.7789
Epoch 5/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.7570 - val_loss: 0.7781
Epoch 6/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.7598 - val_loss: 0.7781
Epoch 7/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.7594 - val_loss: 0.7775
Epoch 8/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.7557 - val_loss: 0.7770
Epoch 9/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 712us/step
Fold Accuracy: 0.9936


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 681us/step
Fold Accuracy: 0.9946


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 610us/step
Fold Accuracy: 0.9950
Average Accuracy across 4 folds: 0.9946


In [16]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

# Load your dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns=['COVID-19']).values
y = df['COVID-19'].values

# Step 1: Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Apply t-SNE
tsne = TSNE(n_components=2, random_state=42)  # Reduce to 2 dimensions for visualization
X_tsne = tsne.fit_transform(X_scaled)

# Step 3: K-Fold Cross-Validation for Model Evaluation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)
accuracies = []

for train_index, test_index in kf.split(X_tsne):
    X_train, X_test = X_tsne[train_index], X_tsne[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the Classifier Model
    model = Sequential()
    model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))  # Adjusted neurons
    model.add(Dropout(0.3))  # Dropout layer
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Early Stopping
    early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

    model.fit(X_train, y_train, epochs=120, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 613us/step
Fold Accuracy: 0.9481


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


KeyboardInterrupt: 

In [18]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from keras.models import Model
from keras.layers import Input, Dense, Dropout
from keras.callbacks import EarlyStopping
import keras.backend as K

# Load your dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns=['COVID-19']).values
y = df['COVID-19'].values

# Step 1: Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Create Contrastive Loss Function
def contrastive_loss(y_true, y_pred):
    margin = 1.0
    square_pred = K.square(y_pred)
    square_margin = K.square(K.maximum(margin - y_pred, 0))
    return K.mean(y_true * square_pred + (1 - y_true) * square_margin)

# Step 3: Build Contrastive Autoencoder
def create_contrastive_autoencoder(input_dim):
    input_layer = Input(shape=(input_dim,))
    x = Dense(256, activation='relu')(input_layer)
    x = Dropout(0.3)(x)
    x = Dense(128, activation='relu')(x)
    encoded = Dense(64, activation='relu')(x)  # Bottleneck layer

    # Decoder
    x = Dense(128, activation='relu')(encoded)
    x = Dropout(0.3)(x)
    x = Dense(256, activation='relu')(x)
    decoded = Dense(input_dim, activation='sigmoid')(x)

    model = Model(inputs=input_layer, outputs=decoded)
    return model

# Step 4: Train the Contrastive Autoencoder
autoencoder = create_contrastive_autoencoder(X_scaled.shape[1])
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

# Fit the Autoencoder
autoencoder.fit(X_scaled, X_scaled, epochs=120, batch_size=256, shuffle=True, validation_split=0.2, callbacks=[early_stop])

# Step 5: Use the encoder part of the Autoencoder for further dimensionality reduction
encoder = Model(inputs=autoencoder.input, outputs=autoencoder.layers[2].output)  # Get the encoder part

# Get the reduced features
X_reduced = encoder.predict(X_scaled)

# Step 6: K-Fold Cross-Validation for Model Evaluation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)
accuracies = []

for train_index, test_index in kf.split(X_reduced):
    X_train, X_test = X_reduced[train_index], X_reduced[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the Classifier Model
    model = Sequential()
    model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))  # Adjusted neurons
    model.add(Dropout(0.3))  # Dropout layer
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=120, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Epoch 1/120
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.8009 - val_loss: 0.3801
Epoch 2/120
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.4825 - val_loss: 0.3571
Epoch 3/120
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.4442 - val_loss: 0.3541
Epoch 4/120
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.4348 - val_loss: 0.3519
Epoch 5/120
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.4300 - val_loss: 0.3512
Epoch 6/120
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.4288 - val_loss: 0.3506
Epoch 7/120
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.4231 - val_loss: 0.3488
Epoch 8/120
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.4205 - val_loss: 0.3484
Epoch 9/120
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 701us/step
Fold Accuracy: 0.9953


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 684us/step
Fold Accuracy: 0.9937


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 705us/step
Fold Accuracy: 0.9943


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 735us/step
Fold Accuracy: 0.9947
Average Accuracy across 4 folds: 0.9945


In [22]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Dropout, Conv1D, Flatten
from keras.callbacks import EarlyStopping
import keras.backend as K

# Load your dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns=['COVID-19']).values
y = df['COVID-19'].values

# Step 1: Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Apply PCA
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

# Step 3: Create Contrastive Loss Function
def contrastive_loss(y_true, y_pred):
    margin = 1.0
    square_pred = K.square(y_pred)
    square_margin = K.square(K.maximum(margin - y_pred, 0))
    return K.mean(y_true * square_pred + (1 - y_true) * square_margin)

# Step 4: Build Contrastive Autoencoder
def create_contrastive_autoencoder(input_dim):
    input_layer = Input(shape=(input_dim,))
    x = Dense(256, activation='relu')(input_layer)
    x = Dropout(0.3)(x)
    x = Dense(128, activation='relu')(x)
    encoded = Dense(64, activation='relu')(x)  # Bottleneck layer

    # Decoder
    x = Dense(128, activation='relu')(encoded)
    x = Dropout(0.3)(x)
    x = Dense(256, activation='relu')(x)
    decoded = Dense(input_dim, activation='sigmoid')(x)

    model = Model(inputs=input_layer, outputs=decoded)
    return model

# Step 5: Train the Contrastive Autoencoder
autoencoder = create_contrastive_autoencoder(X_pca.shape[1])
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

# Fit the Autoencoder
autoencoder.fit(X_pca, X_pca, epochs=100, batch_size=256, shuffle=True, validation_split=0.2, callbacks=[early_stop])

# Step 6: Use the encoder part of the Autoencoder for further dimensionality reduction
encoder = Model(inputs=autoencoder.input, outputs=autoencoder.layers[2].output)  # Get the encoder part

# Get the reduced features
X_reduced = encoder.predict(X_pca)

# Reshape the data for 1D CNN
X_reduced = X_reduced.reshape(X_reduced.shape[0], X_reduced.shape[1], 1)  # Reshape for 1D CNN

# Step 7: K-Fold Cross-Validation for Model Evaluation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)
accuracies = []

for train_index, test_index in kf.split(X_reduced):
    X_train, X_test = X_reduced[train_index], X_reduced[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and Train the 1D CNN Model
    model = Sequential()
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))  # 1D CNN layer
    model.add(Dropout(0.3))  # Dropout layer
    model.add(Flatten())  # Flatten the output
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))  # Dropout layer
    model.add(Dense(1, activation='sigmoid'))  # Adjust output layer based on your problem

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Early Stopping
    early_stop = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')

    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_split=0.2, callbacks=[early_stop])

    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred_classes)
    accuracies.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

# Calculate and print the average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f'Average Accuracy across {k} folds: {average_accuracy:.4f}')

Epoch 1/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 1.1292 - val_loss: 0.8297
Epoch 2/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.8418 - val_loss: 0.7986
Epoch 3/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7935 - val_loss: 0.7882
Epoch 4/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7779 - val_loss: 0.7817
Epoch 5/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.7698 - val_loss: 0.7799
Epoch 6/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7621 - val_loss: 0.7793
Epoch 7/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7589 - val_loss: 0.7788
Epoch 8/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.7586 - val_loss: 0.7782
Epoch 9/100
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9951


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9940


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9946


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Fold Accuracy: 0.9950
Average Accuracy across 4 folds: 0.9947


<H1>Final Attempt</H1>

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense, Dropout, Input
from keras.callbacks import EarlyStopping
from keras import backend as K

# Load your dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns=['COVID-19']).values
y = df['COVID-19'].values
# Step 1: Feature Selection
# Variance Threshold
var_thresh = VarianceThreshold(threshold=0.1)
X_var = var_thresh.fit_transform(X)

# Tree-based feature selection
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_var, y)
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Select top features based on importance
n_top_features = 10  # Adjust as needed
top_features = indices[:n_top_features]
X_selected = X_var[:, top_features]

# Print selected features
print("Selected features from variance threshold and tree-based selection:", top_features)

# Step 2: Dimensionality Reduction
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)

# PCA
pca = PCA(n_components=5)  # Adjust number of components as needed
X_pca = pca.fit_transform(X_scaled)

# Autoencoder
input_dim = X_pca.shape[1]
autoencoder = Sequential()
autoencoder.add(Input(shape=(input_dim,)))
autoencoder.add(Dense(3, activation='relu'))  # Bottleneck layer
autoencoder.add(Dense(input_dim, activation='sigmoid'))  # Output layer
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.fit(X_pca, X_pca, epochs=50, batch_size=32, shuffle=True, validation_split=0.2)

# Get encoded features
encoder = Sequential()
encoder.add(Input(shape=(input_dim,)))
encoder.add(Dense(3, activation='relu'))  # Bottleneck layer
encoded_features = encoder.predict(X_pca)

# Print encoded features
print("Encoded features from Autoencoder:", encoded_features)

# Step 3: 1D CNN with k-fold validation
kf = KFold(n_splits=4, shuffle=True, random_state=42)
accuracies = []

for train_index, test_index in kf.split(encoded_features):
    X_train, X_test = encoded_features[train_index], encoded_features[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Reshape for 1D CNN
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    # Build 1D CNN model
    model = Sequential()
    model.add(Conv1D(32, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)))
    model.add(Flatten())
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Early stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the model
    model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

    # Evaluate the model
    loss, accuracy = model.evaluate(X_test, y_test)
    accuracies.append(accuracy)

# Print average accuracy across folds
print("Average accuracy across folds:", np.mean(accuracies))

Selected features from variance threshold and tree-based selection: [ 1  0 11 13 15 12  6  9  7  8]
Epoch 1/50
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 1.7511 - val_loss: 1.5385
Epoch 2/50
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 1.3645 - val_loss: 1.4063
Epoch 3/50
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1.2388 - val_loss: 1.3303
Epoch 4/50
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1.1864 - val_loss: 1.2858
Epoch 5/50
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 1.1460 - val_loss: 1.2583
Epoch 6/50
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 1.1161 - val_loss: 1.2405
Epoch 7/50
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 1.1065 - val_loss: 1.2290
Epoch 8/50
[1m720/720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.7778 - loss: 0.4563 - val_accuracy: 0.9625 - val_loss: 0.1447
Epoch 2/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8778 - loss: 0.2955 - val_accuracy: 0.9625 - val_loss: 0.1088
Epoch 3/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8684 - loss: 0.2765 - val_accuracy: 0.9622 - val_loss: 0.0951
Epoch 4/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8723 - loss: 0.2659 - val_accuracy: 0.9625 - val_loss: 0.0888
Epoch 5/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8795 - loss: 0.2567 - val_accuracy: 0.9625 - val_loss: 0.0878
Epoch 6/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8721 - loss: 0.2587 - val_accuracy: 0.9625 - val_loss: 0.0816
Epoch 7/100
[1m540/540[0m [32m━

# Claude Improveed CNN Architecture Variant

In [7]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Conv1D, Flatten, Dense, MaxPooling1D, Dropout, BatchNormalization
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from keras.callbacks import EarlyStopping
import numpy as np

# Load dataset
file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with the correct path to your dataset
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns=['COVID-19'])
y = df['COVID-19']

# Variance Threshold Function
def variance_threshold(X, threshold=0.1):
    selector = VarianceThreshold(threshold)
    selector.fit(X)
    return X.columns[selector.get_support()]

# Apply Variance Threshold
selected_features = variance_threshold(X)

# Print the features selected by Variance Threshold
print("Features Selected by Variance Threshold:")
for feature in selected_features:
    print(feature)

# Filter the dataset to keep only the selected features
X_selected = X[selected_features]

# Prepare Data for 1D CNN
X_selected = X_selected.values.reshape(X_selected.shape[0], X_selected.shape[1], 1)  # Reshape for CNN
y = y.values  # Convert target variable to numpy array

# K-Fold Cross-Validation
k = 4
kf = KFold(n_splits=k, shuffle=True, random_state=42)
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'auc': []
}

# Early Stopping with improved patience
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=15,
    verbose=1,
    mode='min',
    restore_best_weights=True
)

for fold, (train_index, test_index) in enumerate(kf.split(X_selected)):
    print(f"Training fold {fold+1}/{k}")
    
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Build and Train an improved 1D CNN Model
    model = Sequential([
        # First Conv Block
        Conv1D(filters=32, kernel_size=3, activation='relu', padding='same', 
               input_shape=(X_selected.shape[1], 1)),
        BatchNormalization(),
        MaxPooling1D(pool_size=2, padding='same'),
        Dropout(0.2),
        
        # Second Conv Block
        Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2, padding='same'),
        Dropout(0.3),
        
        # Third Conv Block (optional, depending on your feature size)
        Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2, padding='same'),
        Dropout(0.4),
        
        # Flattening and Dense layers
        Flatten(),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dense(1, activation='sigmoid')
    ])
    
    # Compile model with binary classification setup
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    # Print model summary for the first fold
    if fold == 0:
        model.summary()
    
    # Train the model
    history = model.fit(
        X_train, y_train,
        epochs=100,
        batch_size=32,
        verbose=1,
        validation_split=0.2,
        callbacks=[early_stop]
    )
    
    # Evaluate the Model
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Convert probabilities to binary class predictions
    
    # Calculate and store metrics
    metrics['accuracy'].append(accuracy_score(y_test, y_pred_classes))
    metrics['precision'].append(precision_score(y_test, y_pred_classes))
    metrics['recall'].append(recall_score(y_test, y_pred_classes))
    metrics['f1'].append(f1_score(y_test, y_pred_classes))
    metrics['auc'].append(roc_auc_score(y_test, y_pred))
    
    # Print fold results
    print(f"Fold {fold+1} Results:")
    print(f"Accuracy: {metrics['accuracy'][-1]:.4f}")
    print(f"Precision: {metrics['precision'][-1]:.4f}")
    print(f"Recall: {metrics['recall'][-1]:.4f}")
    print(f"F1 Score: {metrics['f1'][-1]:.4f}")
    print(f"AUC: {metrics['auc'][-1]:.4f}")
    print("-" * 40)

# Calculate and print average metrics across all folds
print("\nAverage Metrics across all folds:")
for metric, values in metrics.items():
    print(f"Average {metric.capitalize()}: {np.mean(values):.4f} ± {np.std(values):.4f}")

Features Selected by Variance Threshold:
Breathing Problem
Sore throat
Running Nose
Asthma
Chronic Lung Disease
Headache
Heart Disease
Diabetes
Hyper Tension
Fatigue 
Gastrointestinal 
Abroad travel
Contact with COVID Patient
Attended Large Gathering
Visited Public Exposed Places
Family working in Public Exposed Places
Training fold 1/4


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 13ms/step - accuracy: 0.8645 - loss: 0.3087 - val_accuracy: 0.9956 - val_loss: 0.0239
Epoch 2/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.9569 - loss: 0.1175 - val_accuracy: 0.9963 - val_loss: 0.0148
Epoch 3/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.9695 - loss: 0.0931 - val_accuracy: 0.9961 - val_loss: 0.0143
Epoch 4/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.9746 - loss: 0.0743 - val_accuracy: 0.9965 - val_loss: 0.0135
Epoch 5/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9755 - loss: 0.0784 - val_accuracy: 0.9968 - val_loss: 0.0131
Epoch 6/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.9791 - loss: 0.0638 - val_accuracy: 0.9965 - val_loss: 0.0139
Epoch 7/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 13ms/step - accuracy: 0.8782 - loss: 0.2803 - val_accuracy: 0.9954 - val_loss: 0.0266
Epoch 2/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9656 - loss: 0.0986 - val_accuracy: 0.9968 - val_loss: 0.0125
Epoch 3/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.9717 - loss: 0.0817 - val_accuracy: 0.9965 - val_loss: 0.0121
Epoch 4/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.9723 - loss: 0.0756 - val_accuracy: 0.9977 - val_loss: 0.0085
Epoch 5/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9770 - loss: 0.0671 - val_accuracy: 0.9979 - val_loss: 0.0077
Epoch 6/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9799 - loss: 0.0598 - val_accuracy: 0.9977 - val_loss: 0.0098
Epoch 7/100
[

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - accuracy: 0.8691 - loss: 0.2992 - val_accuracy: 0.9972 - val_loss: 0.0187
Epoch 2/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9599 - loss: 0.1148 - val_accuracy: 0.9965 - val_loss: 0.0117
Epoch 3/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9696 - loss: 0.0845 - val_accuracy: 0.9977 - val_loss: 0.0089
Epoch 4/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.9738 - loss: 0.0721 - val_accuracy: 0.9975 - val_loss: 0.0101
Epoch 5/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.9760 - loss: 0.0699 - val_accuracy: 0.9979 - val_loss: 0.0091
Epoch 6/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.9784 - loss: 0.0600 - val_accuracy: 0.9972 - val_loss: 0.0094
Epoch 7/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step - accuracy: 0.8913 - loss: 0.2654 - val_accuracy: 0.9949 - val_loss: 0.0260
Epoch 2/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.9655 - loss: 0.1008 - val_accuracy: 0.9961 - val_loss: 0.0178
Epoch 3/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.9717 - loss: 0.0843 - val_accuracy: 0.9942 - val_loss: 0.0170
Epoch 4/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9756 - loss: 0.0768 - val_accuracy: 0.9963 - val_loss: 0.0146
Epoch 5/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9762 - loss: 0.0717 - val_accuracy: 0.9963 - val_loss: 0.0151
Epoch 6/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.9807 - loss: 0.0581 - val_accuracy: 0.9961 - val_loss: 0.0141
Epoch 7/100
[

# Suitable FS comparitive analysis 

In [14]:
# Dependencies installation (run these commands in your terminal)
# pip install pandas numpy scikit-learn tensorflow keras matplotlib seaborn xgboost lightgbm boruta
# pip install imbalanced-learn statsmodels scipy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import os
import warnings
warnings.filterwarnings('ignore')

# Feature Selection Libraries
from sklearn.feature_selection import (
    VarianceThreshold, chi2, f_classif, mutual_info_classif, 
    SelectKBest, RFE, SelectFromModel
)
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from boruta import BorutaPy
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector

# ML and Evaluation
from keras.models import Sequential
from keras.layers import (
    Conv1D, MaxPooling1D, Dropout, Flatten, Dense, BatchNormalization
)
from keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

# Load dataset
def load_data(file_path):
    print("Loading dataset...")
    df = pd.read_csv(file_path)
    X = df.drop(columns=['COVID-19'])
    y = df['COVID-19']
    print(f"Dataset loaded: {X.shape[0]} samples, {X.shape[1]} features")
    print(f"Class distribution: {dict(y.value_counts())}")
    return X, y

# Define all feature selection techniques
def get_feature_selectors(X, y, n_features=10):
    print("Initializing feature selection techniques...")
    feature_selectors = {
        "1. Chi-Square": SelectKBest(chi2, k=n_features),
        "2. Mutual Information": SelectKBest(mutual_info_classif, k=n_features),
        "3. Recursive Feature Elimination": RFE(
            estimator=LogisticRegression(solver='liblinear', max_iter=1000, random_state=42),
            n_features_to_select=n_features
        ),
        "4. Lasso": SelectFromModel(
            Lasso(alpha=0.01, random_state=42), max_features=n_features
        ),
        "5. Random Forest Importance": SelectFromModel(
            RandomForestClassifier(n_estimators=100, random_state=42), max_features=n_features
        ),
        "6. Boruta": BorutaPy(
            RandomForestClassifier(n_estimators=100, random_state=42),
            n_estimators='auto', verbose=0, random_state=42
        ),
        "7. Correlation-based": None,  # Custom implementation
        "8. Sequential Forward Selection": SequentialFeatureSelector(
            RandomForestClassifier(n_estimators=50, random_state=42),
            n_features_to_select=n_features,
            direction='forward'
        ),
        "9. XGBoost Importance": SelectFromModel(
            XGBClassifier(n_estimators=100, random_state=42), max_features=n_features
        ),
        "10. LightGBM Importance": SelectFromModel(
            LGBMClassifier(n_estimators=100, random_state=42), max_features=n_features
        )
    }
    return feature_selectors

# Function to select top features using each technique
def select_features(X, y, technique_name, selector, n_features=10):
    print(f"Selecting features using {technique_name}...")
    feature_names = X.columns.tolist()
    
    # Handle special case for Correlation-based selection
    if technique_name == "7. Correlation-based":
        # Calculate correlation of each feature with target
        correlations = []
        for col in X.columns:
            corr = np.abs(pd.crosstab(X[col], y, normalize='columns').iloc[1, 1] - 
                          pd.crosstab(X[col], y, normalize='columns').iloc[1, 0])
            correlations.append((col, corr))
        
        # Sort by correlation and select top n_features
        correlations.sort(key=lambda x: x[1], reverse=True)
        selected_features = [item[0] for item in correlations[:n_features]]
        feature_importances = [item[1] for item in correlations[:n_features]]
        
    # Handle special case for Boruta
    elif technique_name == "6. Boruta":
        # Boruta requires array input
        X_array = X.values
        selector.fit(X_array, y)
        
        # Get the selected features
        selected_mask = selector.support_
        ranking = selector.ranking_
        
        # Sort by ranking and select top features
        feature_ranking = [(feature, rank) for feature, rank, mask in 
                          zip(feature_names, ranking, selected_mask) if mask]
        feature_ranking.sort(key=lambda x: x[1])
        
        # If Boruta selected fewer than n_features, add more by ranking
        if len(feature_ranking) < n_features:
            additional = [(f, r) for f, r, m in 
                         zip(feature_names, ranking, selected_mask) if not m]
            additional.sort(key=lambda x: x[1])
            feature_ranking.extend(additional[:n_features-len(feature_ranking)])
        
        feature_ranking = feature_ranking[:n_features]
        selected_features = [item[0] for item in feature_ranking]
        feature_importances = [1.0/item[1] for item in feature_ranking]  # Invert ranking for visualization
    
    else:
        # Standard scikit-learn selectors
        try:
            selector.fit(X, y)
            
            # Different selector types have different ways to get selected features
            if hasattr(selector, 'get_support'):
                selected_mask = selector.get_support()
                selected_features = [f for f, selected in zip(feature_names, selected_mask) if selected]
                
                # Get feature importances if available
                if hasattr(selector, 'estimator_') and hasattr(selector.estimator_, 'feature_importances_'):
                    feature_importances = selector.estimator_.feature_importances_[selected_mask]
                elif hasattr(selector, 'scores_'):
                    feature_importances = selector.scores_[selected_mask]
                else:
                    feature_importances = np.ones(len(selected_features))
                    
            elif hasattr(selector, 'coef_'):
                # For models with coefficients like Lasso
                coefs = np.abs(selector.coef_)
                indices = np.argsort(coefs)[::-1][:n_features]
                selected_features = [feature_names[i] for i in indices]
                feature_importances = [coefs[i] for i in indices]
                
            else:
                # Get features from the model itself
                try:
                    importances = getattr(selector, 'feature_importances_', 
                                         getattr(selector, 'coef_', None))
                    if importances is None:
                        importances = np.ones(len(feature_names))
                    
                    # For 2D coefficients (like in multiclass), take the mean
                    if importances.ndim > 1:
                        importances = np.mean(np.abs(importances), axis=0)
                    
                    # Select top features
                    indices = np.argsort(np.abs(importances))[::-1][:n_features]
                    selected_features = [feature_names[i] for i in indices]
                    feature_importances = [np.abs(importances)[i] for i in indices]
                    
                except:
                    # Fallback for other selectors
                    indices = getattr(selector, 'support_', np.arange(min(n_features, len(feature_names))))
                    if len(indices) > n_features:
                        indices = indices[:n_features]
                    selected_features = [feature_names[i] for i in indices]
                    feature_importances = np.ones(len(selected_features))
        
        except Exception as e:
            print(f"Error with {technique_name}: {str(e)}")
            # Default to the first n_features if there's an error
            selected_features = feature_names[:n_features]
            feature_importances = np.ones(n_features)
    
    # Ensure exactly n_features are selected (truncate or pad if necessary)
    if len(selected_features) > n_features:
        selected_features = selected_features[:n_features]
        feature_importances = feature_importances[:n_features]
    elif len(selected_features) < n_features:
        # Add remaining features based on variance
        remaining = [f for f in feature_names if f not in selected_features]
        selected_features.extend(remaining[:n_features-len(selected_features)])
        feature_importances = list(feature_importances) + [0] * (n_features - len(feature_importances))
    
    # Print selected features
    print(f"Top {len(selected_features)} features selected by {technique_name}:")
    for i, (feature, importance) in enumerate(zip(selected_features, feature_importances)):
        print(f"{i+1}. {feature}: {importance:.4f}")
    
    return selected_features, feature_importances

# Build the CNN model for a specific set of features
def build_cnn_model(input_shape):
    model = Sequential([
        # First Conv Block
        Conv1D(filters=32, kernel_size=3, activation='relu', padding='same', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling1D(pool_size=2, padding='same'),
        Dropout(0.2),
        
        # Second Conv Block
        Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2, padding='same'),
        Dropout(0.3),
        
        # Flattening and Dense layers
        Flatten(),
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Train and evaluate model with k-fold cross validation
def train_and_evaluate(X, y, selected_features, technique_name, k=5):
    print(f"\nTraining CNN with features selected by {technique_name}")
    
    # Prepare data for CNN
    X_selected = X[selected_features].values
    X_selected = X_selected.reshape(X_selected.shape[0], X_selected.shape[1], 1)
    y_values = y.values
    
    # K-Fold validation
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    metrics = {
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1': [],
        'auc': []
    }
    
    # Define early stopping
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=10,
        verbose=0,
        mode='min',
        restore_best_weights=True
    )
    
    # Train and evaluate for each fold
    for fold, (train_idx, test_idx) in enumerate(kf.split(X_selected)):
        print(f"Training fold {fold+1}/{k}...")
        
        X_train, X_test = X_selected[train_idx], X_selected[test_idx]
        y_train, y_test = y_values[train_idx], y_values[test_idx]
        
        # Build and train model
        model = build_cnn_model((X_selected.shape[1], 1))
        model.fit(
            X_train, y_train,
            epochs=50,  # Reduced from 100 for faster execution
            batch_size=32,
            verbose=0,
            validation_split=0.2,
            callbacks=[early_stop]
        )
        
        # Evaluate
        y_pred = model.predict(X_test)
        y_pred_classes = (y_pred > 0.5).astype(int)
        
        # Calculate metrics
        metrics['accuracy'].append(accuracy_score(y_test, y_pred_classes))
        metrics['precision'].append(precision_score(y_test, y_pred_classes))
        metrics['recall'].append(recall_score(y_test, y_pred_classes))
        metrics['f1'].append(f1_score(y_test, y_pred_classes))
        try:
            metrics['auc'].append(roc_auc_score(y_test, y_pred))
        except:
            metrics['auc'].append(0.5)  # Default for failed AUC calculation
    
    # Calculate average metrics
    avg_metrics = {metric: np.mean(values) for metric, values in metrics.items()}
    std_metrics = {metric: np.std(values) for metric, values in metrics.items()}
    
    print(f"\nResults for {technique_name}:")
    for metric, value in avg_metrics.items():
        print(f"Average {metric}: {value:.4f} ± {std_metrics[metric]:.4f}")
    
    return avg_metrics

# Plot comparison bar chart
def plot_comparison(all_results):
    metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1', 'auc']
    
    # Sort techniques by accuracy
    sorted_techniques = sorted(
        all_results.keys(),
        key=lambda x: all_results[x]['accuracy'],
        reverse=True
    )
    
    # Set up the figure
    plt.figure(figsize=(15, 10))
    
    # Set width of bars
    bar_width = 0.15
    index = np.arange(len(sorted_techniques))
    
    # Colors for different metrics
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
    
    # Plot bars for each metric
    for i, metric in enumerate(metrics_to_plot):
        values = [all_results[technique][metric] for technique in sorted_techniques]
        plt.bar(
            index + i * bar_width, 
            values, 
            bar_width, 
            label=metric.capitalize(),
            color=colors[i]
        )
    
    # Add labels and legend
    plt.xlabel('Feature Selection Technique', fontsize=12)
    plt.ylabel('Score', fontsize=12)
    plt.title('Comparison of Feature Selection Techniques', fontsize=14)
    plt.xticks(
        index + bar_width * 2, 
        [t.split('. ')[1] if '. ' in t else t for t in sorted_techniques],
        rotation=45,
        ha='right'
    )
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=5)
    plt.ylim(0, 1.0)
    plt.tight_layout()
    
    # Save figure
    plt.savefig('feature_selection_comparison.png', dpi=300, bbox_inches='tight')
    print("Comparison chart saved as 'feature_selection_comparison.png'")
    plt.close()

# Plot feature heatmap
def plot_feature_heatmap(all_features, X):
    # Create a matrix of features vs techniques
    techniques = list(all_features.keys())
    all_unique_features = list(set(feature for features in all_features.values() for feature in features))
    
    # Create a matrix with 1 if feature is selected by technique, 0 otherwise
    matrix = np.zeros((len(techniques), len(all_unique_features)))
    
    for i, technique in enumerate(techniques):
        for j, feature in enumerate(all_unique_features):
            if feature in all_features[technique]:
                matrix[i, j] = 1
    
    # Sort features by frequency of selection
    feature_counts = matrix.sum(axis=0)
    sorted_indices = np.argsort(feature_counts)[::-1]
    sorted_features = [all_unique_features[i] for i in sorted_indices]
    sorted_matrix = matrix[:, sorted_indices]
    
    # Create heatmap
    plt.figure(figsize=(20, 12))
    sns.heatmap(
        sorted_matrix,
        cmap='Blues',
        xticklabels=sorted_features,
        yticklabels=[t.split('. ')[1] if '. ' in t else t for t in techniques],
        cbar_kws={'label': 'Selected'}
    )
    plt.title('Feature Selection by Different Techniques', fontsize=14)
    plt.tight_layout()
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    
    # Save figure
    plt.savefig('feature_selection_heatmap.png', dpi=300, bbox_inches='tight')
    print("Heatmap saved as 'feature_selection_heatmap.png'")
    plt.close()

# Main function to run the whole process
def main():
    # Load data
    file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with your path
    X, y = load_data(file_path)
    
    # Get feature selectors
    feature_selectors = get_feature_selectors(X, y)
    
    # Store results
    all_results = {}
    all_selected_features = {}
    
    # For each technique, select features and train model
    for technique_name, selector in feature_selectors.items():
        print("\n" + "="*50)
        print(f"Processing {technique_name}")
        print("="*50)
        
        # Select features
        selected_features, _ = select_features(X, y, technique_name, selector)
        all_selected_features[technique_name] = selected_features
        
        # Train and evaluate
        results = train_and_evaluate(X, y, selected_features, technique_name)
        all_results[technique_name] = results
    
    # Plot comparison
    plot_comparison(all_results)
    
    # Plot feature heatmap
    plot_feature_heatmap(all_selected_features, X)
    
    # Print final summary
    print("\n" + "="*50)
    print("FINAL SUMMARY")
    print("="*50)
    
    # Sort techniques by accuracy
    sorted_techniques = sorted(
        all_results.keys(),
        key=lambda x: all_results[x]['accuracy'],
        reverse=True
    )
    
    print("\nTechniques ranked by accuracy:")
    for i, technique in enumerate(sorted_techniques):
        print(f"{i+1}. {technique}: {all_results[technique]['accuracy']:.4f}")
    
    best_technique = sorted_techniques[0]
    print(f"\nBest performing technique: {best_technique}")
    print(f"Top 10 features selected by {best_technique}:")
    for i, feature in enumerate(all_selected_features[best_technique]):
        print(f"{i+1}. {feature}")

if __name__ == "__main__":
    main()

Loading dataset...
Dataset loaded: 28766 samples, 20 features
Class distribution: {0: 18514, 1: 10252}
Initializing feature selection techniques...

Processing 1. Chi-Square
Selecting features using 1. Chi-Square...
Top 10 features selected by 1. Chi-Square:
1. Breathing Problem: 9964.0393
2. Sore throat: 13201.1649
3. Heart Disease: 3552.0005
4. Diabetes: 1929.1672
5. Hyper Tension: 3089.5719
6. Gastrointestinal : 1693.0652
7. Abroad travel: 11288.4346
8. Contact with COVID Patient: 8194.1488
9. Attended Large Gathering: 9140.5020
10. Family working in Public Exposed Places: 7781.7278

Training CNN with features selected by 1. Chi-Square
Training fold 1/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Training fold 2/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
Training fold 3/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Training fold 4/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0

IndexError: index 1 is out of bounds for axis 0 with size 1

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import os
import warnings
warnings.filterwarnings('ignore')

# Feature Selection Libraries
from sklearn.feature_selection import (
    VarianceThreshold, chi2, f_classif, mutual_info_classif, 
    SelectKBest, RFE, SelectFromModel
)
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from boruta import BorutaPy
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector

# ML and Evaluation
from keras.models import Sequential
from keras.layers import (
    Conv1D, MaxPooling1D, Dropout, Flatten, Dense, BatchNormalization
)
from keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

# Load dataset
def load_data(file_path):
    print("Loading dataset...")
    df = pd.read_csv(file_path)
    X = df.drop(columns=['COVID-19'])
    y = df['COVID-19']
    print(f"Dataset loaded: {X.shape[0]} samples, {X.shape[1]} features")
    print(f"Class distribution: {dict(y.value_counts())}")
    return X, y

# Define all feature selection techniques
def get_feature_selectors(X, y, n_features=10):
    print("Initializing feature selection techniques...")
    feature_selectors = {
        "7. Correlation-based": None,  # Custom implementation
        "8. Sequential Forward Selection": SequentialFeatureSelector(
            RandomForestClassifier(n_estimators=50, random_state=42),
            n_features_to_select=n_features,
            direction='forward'
        ),
        "9. XGBoost Importance": SelectFromModel(
            XGBClassifier(n_estimators=100, random_state=42), max_features=n_features
        ),
        "10. LightGBM Importance": SelectFromModel(
            LGBMClassifier(n_estimators=100, random_state=42), max_features=n_features
        )
    }
    return feature_selectors

# Function to select top features using each technique
def select_features(X, y, technique_name, selector, n_features=10):
    print(f"Selecting features using {technique_name}...")
    feature_names = X.columns.tolist()
    
    # Handle special case for Correlation-based selection
    if technique_name == "7. Correlation-based":
        # Calculate correlation of each feature with target (safer implementation)
        correlations = []
        for col in X.columns:
            # Create a contingency table
            contingency = pd.crosstab(X[col], y)
            # If binary feature, use direct correlation
            if contingency.shape[0] == 2 and contingency.shape[1] == 2:
                # Calculate correlation coefficient (normalize to have sum=1 for each column)
                normalized = contingency.apply(lambda x: x / x.sum(), axis=0)
                try:
                    corr = abs(normalized.iloc[1, 1] - normalized.iloc[1, 0])
                except:
                    corr = 0
            else:
                # For non-binary features, use chi-squared statistic
                from scipy.stats import chi2_contingency
                try:
                    chi2_stat, p_val, _, _ = chi2_contingency(contingency)
                    corr = chi2_stat
                except:
                    corr = 0
            correlations.append((col, corr))
        
        # Sort by correlation and select top n_features
        correlations.sort(key=lambda x: x[1], reverse=True)
        selected_features = [item[0] for item in correlations[:n_features]]
        feature_importances = [item[1] for item in correlations[:n_features]]
    else:
        # Standard scikit-learn selectors
        try:
            selector.fit(X, y)
            
            # Different selector types have different ways to get selected features
            if hasattr(selector, 'get_support'):
                selected_mask = selector.get_support()
                selected_features = [f for f, selected in zip(feature_names, selected_mask) if selected]
                
                # Get feature importances if available
                if hasattr(selector, 'estimator_') and hasattr(selector.estimator_, 'feature_importances_'):
                    feature_importances = selector.estimator_.feature_importances_[selected_mask]
                elif hasattr(selector, 'scores_'):
                    feature_importances = selector.scores_[selected_mask]
                else:
                    feature_importances = np.ones(len(selected_features))
                    
            elif hasattr(selector, 'coef_'):
                # For models with coefficients like Lasso
                coefs = np.abs(selector.coef_)
                indices = np.argsort(coefs)[::-1][:n_features]
                selected_features = [feature_names[i] for i in indices]
                feature_importances = [coefs[i] for i in indices]
                
            else:
                # Get features from the model itself
                try:
                    importances = getattr(selector, 'feature_importances_', 
                                         getattr(selector, 'coef_', None))
                    if importances is None:
                        importances = np.ones(len(feature_names))
                    
                    # For 2D coefficients (like in multiclass), take the mean
                    if importances.ndim > 1:
                        importances = np.mean(np.abs(importances), axis=0)
                    
                    # Select top features
                    indices = np.argsort(np.abs(importances))[::-1][:n_features]
                    selected_features = [feature_names[i] for i in indices]
                    feature_importances = [np.abs(importances)[i] for i in indices]
                    
                except:
                    # Fallback for other selectors
                    indices = getattr(selector, 'support_', np.arange(min(n_features, len(feature_names))))
                    if len(indices) > n_features:
                        indices = indices[:n_features]
                    selected_features = [feature_names[i] for i in indices]
                    feature_importances = np.ones(len(selected_features))
        
        except Exception as e:
            print(f"Error with {technique_name}: {str(e)}")
            # Default to the first n_features if there's an error
            selected_features = feature_names[:n_features]
            feature_importances = np.ones(n_features)
    
    # Ensure exactly n_features are selected (truncate or pad if necessary)
    if len(selected_features) > n_features:
        selected_features = selected_features[:n_features]
        feature_importances = feature_importances[:n_features]
    elif len(selected_features) < n_features:
        # Add remaining features based on variance
        remaining = [f for f in feature_names if f not in selected_features]
        selected_features.extend(remaining[:n_features-len(selected_features)])
        feature_importances = list(feature_importances) + [0] * (n_features - len(feature_importances))
    
    # Print selected features
    print(f"Top {len(selected_features)} features selected by {technique_name}:")
    for i, (feature, importance) in enumerate(zip(selected_features, feature_importances)):
        print(f"{i+1}. {feature}: {importance:.4f}")
    
    return selected_features, feature_importances

# Build the CNN model for a specific set of features
def build_cnn_model(input_shape):
    model = Sequential([
        # First Conv Block
        Conv1D(filters=32, kernel_size=3, activation='relu', padding='same', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling1D(pool_size=2, padding='same'),
        Dropout(0.2),
        
        # Second Conv Block
        Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2, padding='same'),
        Dropout(0.3),
        
        # Flattening and Dense layers
        Flatten(),
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Train and evaluate model with k-fold cross validation
def train_and_evaluate(X, y, selected_features, technique_name, k=5):
    print(f"\nTraining CNN with features selected by {technique_name}")
    
    # Prepare data for CNN
    X_selected = X[selected_features].values
    X_selected = X_selected.reshape(X_selected.shape[0], X_selected.shape[1], 1)
    y_values = y.values
    
    # K-Fold validation
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    metrics = {
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1': [],
        'auc': []
    }
    
    # Define early stopping
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=10,
        verbose=0,
        mode='min',
        restore_best_weights=True
    )
    
    # Train and evaluate for each fold
    for fold, (train_idx, test_idx) in enumerate(kf.split(X_selected)):
        print(f"Training fold {fold+1}/{k}...")
        
        X_train, X_test = X_selected[train_idx], X_selected[test_idx]
        y_train, y_test = y_values[train_idx], y_values[test_idx]
        
        # Build and train model
        model = build_cnn_model((X_selected.shape[1], 1))
        model.fit(
            X_train, y_train,
            epochs=50,  # Reduced from 100 for faster execution
            batch_size=32,
            verbose=0,
            validation_split=0.2,
            callbacks=[early_stop]
        )
        
        # Evaluate
        y_pred = model.predict(X_test)
        y_pred_classes = (y_pred > 0.5).astype(int)
        
        # Calculate metrics
        metrics['accuracy'].append(accuracy_score(y_test, y_pred_classes))
        metrics['precision'].append(precision_score(y_test, y_pred_classes))
        metrics['recall'].append(recall_score(y_test, y_pred_classes))
        metrics['f1'].append(f1_score(y_test, y_pred_classes))
        try:
            metrics['auc'].append(roc_auc_score(y_test, y_pred))
        except:
            metrics['auc'].append(0.5)  # Default for failed AUC calculation
    
    # Calculate average metrics
    avg_metrics = {metric: np.mean(values) for metric, values in metrics.items()}
    std_metrics = {metric: np.std(values) for metric, values in metrics.items()}
    
    print(f"\nResults for {technique_name}:")
    for metric, value in avg_metrics.items():
        print(f"Average {metric}: {value:.4f} ± {std_metrics[metric]:.4f}")
    
    return avg_metrics

# Plot comparison bar chart
def plot_comparison(all_results):
    metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1', 'auc']
    
    # Sort techniques by accuracy
    sorted_techniques = sorted(
        all_results.keys(),
        key=lambda x: all_results[x]['accuracy'],
        reverse=True
    )
    
    # Set up the figure
    plt.figure(figsize=(15, 10))
    
    # Set width of bars
    bar_width = 0.15
    index = np.arange(len(sorted_techniques))
    
    # Colors for different metrics
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
    
    # Plot bars for each metric
    for i, metric in enumerate(metrics_to_plot):
        values = [all_results[technique][metric] for technique in sorted_techniques]
        plt.bar(
            index + i * bar_width, 
            values, 
            bar_width, 
            label=metric.capitalize(),
            color=colors[i]
        )
    
    # Add labels and legend
    plt.xlabel('Feature Selection Technique', fontsize=12)
    plt.ylabel('Score', fontsize=12)
    plt.title('Comparison of Feature Selection Techniques', fontsize=14)
    plt.xticks(
        index + bar_width * 2, 
        [t.split('. ')[1] if '. ' in t else t for t in sorted_techniques],
        rotation=45,
        ha='right'
    )
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=5)
    plt.ylim(0, 1.0)
    plt.tight_layout()
    
    # Save figure
    plt.savefig('feature_selection_comparison.png', dpi=300, bbox_inches='tight')
    print("Comparison chart saved as 'feature_selection_comparison.png'")
    plt.close()

# Plot feature heatmap
def plot_feature_heatmap(all_features, X):
    # Create a matrix of features vs techniques
    techniques = list(all_features.keys())
    all_unique_features = list(set(feature for features in all_features.values() for feature in features))
    
    # Create a matrix with 1 if feature is selected by technique, 0 otherwise
    matrix = np.zeros((len(techniques), len(all_unique_features)))
    
    for i, technique in enumerate(techniques):
        for j, feature in enumerate(all_unique_features):
            if feature in all_features[technique]:
                matrix[i, j] = 1
    
    # Sort features by frequency of selection
    feature_counts = matrix.sum(axis=0)
    sorted_indices = np.argsort(feature_counts)[::-1]
    sorted_features = [all_unique_features[i] for i in sorted_indices]
    sorted_matrix = matrix[:, sorted_indices]
    
    # Create heatmap
    plt.figure(figsize=(20, 12))
    sns.heatmap(
        sorted_matrix,
        cmap='Blues',
        xticklabels=sorted_features,
        yticklabels=[t.split('. ')[1] if '. ' in t else t for t in techniques],
        cbar_kws={'label': 'Selected'}
    )
    plt.title('Feature Selection by Different Techniques', fontsize=14)
    plt.tight_layout()
    

Loading dataset...
Dataset loaded: 28766 samples, 20 features
Class distribution: {0: 18514, 1: 10252}
Initializing feature selection techniques...

Processing 7. Correlation-based
Selecting features using 7. Correlation-based...
Top 10 features selected by 7. Correlation-based:
1. Sore throat: 0.8628
2. Breathing Problem: 0.7620
3. Attended Large Gathering: 0.6493
4. Family working in Public Exposed Places: 0.6421
5. Abroad travel: 0.6099
6. Contact with COVID Patient: 0.5177
7. Diabetes: 0.4520
8. Heart Disease: 0.4011
9. Gastrointestinal : 0.3945
10. Hyper Tension: 0.3857

Training CNN with features selected by 7. Correlation-based
Training fold 1/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Training fold 2/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Training fold 3/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Training fold 4/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<H1>ANN</H1>

In [23]:
# Dependencies installation (run these commands in your terminal)
# pip install pandas numpy scikit-learn tensorflow keras matplotlib seaborn xgboost lightgbm boruta
# pip install imbalanced-learn statsmodels scipy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import os
import warnings
warnings.filterwarnings('ignore')

# Feature Selection Libraries
from sklearn.feature_selection import (
    VarianceThreshold, chi2, f_classif, mutual_info_classif, 
    SelectKBest, RFE, SelectFromModel
)
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from boruta import BorutaPy
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector

# ML and Evaluation
from keras.models import Sequential
from keras.layers import (
    Dense, Dropout, BatchNormalization
)
from keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

# Load dataset
def load_data(file_path):
    print("Loading dataset...")
    df = pd.read_csv(file_path)
    X = df.drop(columns=['COVID-19'])
    y = df['COVID-19']
    print(f"Dataset loaded: {X.shape[0]} samples, {X.shape[1]} features")
    print(f"Class distribution: {dict(y.value_counts())}")
    return X, y

# Define all feature selection techniques
def get_feature_selectors(X, y, n_features=10):
    print("Initializing feature selection techniques...")
    feature_selectors = {
        "1. Chi-Square": SelectKBest(chi2, k=n_features),
        "2. Mutual Information": SelectKBest(mutual_info_classif, k=n_features),
        "3. Recursive Feature Elimination": RFE(
            estimator=LogisticRegression(solver='liblinear', max_iter=1000, random_state=42),
            n_features_to_select=n_features
        ),
        "4. Lasso": SelectFromModel(
            Lasso(alpha=0.01, random_state=42), max_features=n_features
        ),
        "5. Random Forest Importance": SelectFromModel(
            RandomForestClassifier(n_estimators=100, random_state=42), max_features=n_features
        ),
        "6. Boruta": BorutaPy(
            RandomForestClassifier(n_estimators=100, random_state=42),
            n_estimators='auto', verbose=0, random_state=42
        ),
        "7. Correlation-based": None,  # Custom implementation
        "8. Sequential Forward Selection": SequentialFeatureSelector(
            RandomForestClassifier(n_estimators=50, random_state=42),
            n_features_to_select=n_features,
            direction='forward'
        ),
        "9. XGBoost Importance": SelectFromModel(
            XGBClassifier(n_estimators=100, random_state=42), max_features=n_features
        ),
        "10. LightGBM Importance": SelectFromModel(
            LGBMClassifier(n_estimators=100, random_state=42), max_features=n_features
        )
    }
    return feature_selectors

# Function to select top features using each technique
def select_features(X, y, technique_name, selector, n_features=10):
    print(f"Selecting features using {technique_name}...")
    feature_names = X.columns.tolist()
    
    # Handle special case for Correlation-based selection
    if technique_name == "7. Correlation-based":
        # Calculate correlation of each feature with target
        correlations = []
        for col in X.columns:
            corr = np.abs(X[col].corr(y))
            correlations.append((col, corr))
        
        # Sort by correlation and select top n_features
        correlations.sort(key=lambda x: x[1], reverse=True)
        selected_features = [item[0] for item in correlations[:n_features]]
        feature_importances = [item[1] for item in correlations[:n_features]]
        
    # Handle special case for Boruta
    elif technique_name == "6. Boruta":
        # Boruta requires array input
        X_array = X.values
        selector.fit(X_array, y)
        
        # Get the selected features
        selected_mask = selector.support_
        ranking = selector.ranking_
        
        # Sort by ranking and select top features
        feature_ranking = [(feature, rank) for feature, rank, mask in 
                          zip(feature_names, ranking, selected_mask) if mask]
        feature_ranking.sort(key=lambda x: x[1])
        
        # If Boruta selected fewer than n_features, add more by ranking
        if len(feature_ranking) < n_features:
            additional = [(f, r) for f, r, m in 
                         zip(feature_names, ranking, selected_mask) if not m]
            additional.sort(key=lambda x: x[1])
            feature_ranking.extend(additional[:n_features-len(feature_ranking)])
        
        feature_ranking = feature_ranking[:n_features]
        selected_features = [item[0] for item in feature_ranking]
        feature_importances = [1.0/item[1] for item in feature_ranking]  # Invert ranking for visualization
    
    else:
        # Standard scikit-learn selectors
        try:
            selector.fit(X, y)
            
            # Different selector types have different ways to get selected features
            if hasattr(selector, 'get_support'):
                selected_mask = selector.get_support()
                selected_features = [f for f, selected in zip(feature_names, selected_mask) if selected]
                
                # Get feature importances if available
                if hasattr(selector, 'estimator_') and hasattr(selector.estimator_, 'feature_importances_'):
                    feature_importances = selector.estimator_.feature_importances_[selected_mask]
                elif hasattr(selector, 'scores_'):
                    feature_importances = selector.scores_[selected_mask]
                else:
                    feature_importances = np.ones(len(selected_features))
                    
            elif hasattr(selector, 'coef_'):
                # For models with coefficients like Lasso
                coefs = np.abs(selector.coef_)
                indices = np.argsort(coefs)[::-1][:n_features]
                selected_features = [feature_names[i] for i in indices]
                feature_importances = [coefs[i] for i in indices]
                
            else:
                # Get features from the model itself
                try:
                    importances = getattr(selector, 'feature_importances_', 
                                         getattr(selector, 'coef_', None))
                    if importances is None:
                        importances = np.ones(len(feature_names))
                    
                    # For 2D coefficients (like in multiclass), take the mean
                    if importances.ndim > 1:
                        importances = np.mean(np.abs(importances), axis=0)
                    
                    # Select top features
                    indices = np.argsort(np.abs(importances))[::-1][:n_features]
                    selected_features = [feature_names[i] for i in indices]
                    feature_importances = [np.abs(importances)[i] for i in indices]
                    
                except:
                    # Fallback for other selectors
                    indices = getattr(selector, 'support_', np.arange(min(n_features, len(feature_names))))
                    if len(indices) > n_features:
                        indices = indices[:n_features]
                    selected_features = [feature_names[i] for i in indices]
                    feature_importances = np.ones(len(selected_features))
        
        except Exception as e:
            print(f"Error with {technique_name}: {str(e)}")
            # Default to the first n_features if there's an error
            selected_features = feature_names[:n_features]
            feature_importances = np.ones(n_features)
    
    # Ensure exactly n_features are selected (truncate or pad if necessary)
    if len(selected_features) > n_features:
        selected_features = selected_features[:n_features]
        feature_importances = feature_importances[:n_features]
    elif len(selected_features) < n_features:
        # Add remaining features based on variance
        remaining = [f for f in feature_names if f not in selected_features]
        selected_features.extend(remaining[:n_features-len(selected_features)])
        feature_importances = list(feature_importances) + [0] * (n_features - len(feature_importances))
    
    # Print selected features
    print(f"Top {len(selected_features)} features selected by {technique_name}:")
    for i, (feature, importance) in enumerate(zip(selected_features, feature_importances)):
        print(f"{i+1}. {feature}: {importance:.4f}")
    
    return selected_features, feature_importances

# Build the ANN model for a specific set of features
def build_ann_model(input_shape):
    model = Sequential([
        Dense(64, activation='relu', input_shape=input_shape),
        Dropout(0.2),
        BatchNormalization(),
        Dense(32, activation='relu'),
        Dropout(0.3),
        BatchNormalization(),
        Dense(16, activation='relu'),
        Dropout(0.4),
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Train and evaluate model with k-fold cross validation
def train_and_evaluate(X, y, selected_features, technique_name, k=5):
    print(f"\nTraining ANN with features selected by {technique_name}")
    
    # Prepare data for ANN
    X_selected = X[selected_features].values
    y_values = y.values
    
    # K-Fold validation
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    metrics = {
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1': [],
        'auc': []
    }
    
    # Define early stopping
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=10,
        verbose=0,
        mode='min',
        restore_best_weights=True
    )
    
    # Train and evaluate for each fold
    for fold, (train_idx, test_idx) in enumerate(kf.split(X_selected)):
        print(f"Training fold {fold+1}/{k}...")
        
        X_train, X_test = X_selected[train_idx], X_selected[test_idx]
        y_train, y_test = y_values[train_idx], y_values[test_idx]
        
        # Build and train model
        model = build_ann_model((X_selected.shape[1],))
        model.fit(
            X_train, y_train,
            epochs=50,  # Reduced from 100 for faster execution
            batch_size=32,
            verbose=0,
            validation_split=0.2,
            callbacks=[early_stop]
        )
        
        # Evaluate
        y_pred = model.predict(X_test)
        y_pred_classes = (y_pred > 0.5).astype(int)
        
        # Calculate metrics
        metrics['accuracy'].append(accuracy_score(y_test, y_pred_classes))
        metrics['precision'].append(precision_score(y_test, y_pred_classes))
        metrics['recall'].append(recall_score(y_test, y_pred_classes))
        metrics['f1'].append(f1_score(y_test, y_pred_classes))
        try:
            metrics['auc'].append(roc_auc_score(y_test, y_pred))
        except:
            metrics['auc'].append(0.5)  # Default for failed AUC calculation
    
    # Calculate average metrics
    avg_metrics = {metric: np.mean(values) for metric, values in metrics.items()}
    std_metrics = {metric: np.std(values) for metric, values in metrics.items()}
    
    print(f"\nResults for {technique_name}:")
    for metric, value in avg_metrics.items():
        print(f"Average {metric}: {value:.4f} ± {std_metrics[metric]:.4f}")
    
    return avg_metrics

# Plot comparison bar chart
def plot_comparison(all_results):
    metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1', 'auc']
    
    # Sort techniques by accuracy
    sorted_techniques = sorted(
        all_results.keys(),
        key=lambda x: all_results[x]['accuracy'],
        reverse=True
    )
    
    # Set up the figure
    plt.figure(figsize=(15, 10))
    
    # Set width of bars
    bar_width = 0.15
    index = np.arange(len(sorted_techniques))
    
    # Colors for different metrics
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
    
    # Plot bars for each metric
    for i, metric in enumerate(metrics_to_plot):
        values = [all_results[technique][metric] for technique in sorted_techniques]
        plt.bar(
            index + i * bar_width, 
            values, 
            bar_width, 
            label=metric.capitalize(),
            color=colors[i]
        )
    
    # Add labels and legend
    plt.xlabel('Feature Selection Technique', fontsize=12)
    plt.ylabel('Score', fontsize=12)
    plt.title('Comparison of Feature Selection Techniques', fontsize=14)
    plt.xticks(
        index + bar_width * 2, 
        [t.split('. ')[1] if '. ' in t else t for t in sorted_techniques],
        rotation=45,
        ha='right'
    )
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=5)
    plt.ylim(0, 1.0)
    plt.tight_layout()
    
    # Save figure
    plt.savefig('feature_selection_comparison_ANN.png', dpi=300, bbox_inches='tight')
    print("Comparison chart saved as 'feature_selection_comparison_ANN.png'")
    plt.close()

# Plot feature heatmap
def plot_feature_heatmap(all_features, X):
    # Create a matrix of features vs techniques
    techniques = list(all_features.keys())
    all_unique_features = list(set(feature for features in all_features.values() for feature in features))
    
    # Create a matrix with 1 if feature is selected by technique, 0 otherwise
    matrix = np.zeros((len(techniques), len(all_unique_features)))
    
    for i, technique in enumerate(techniques):
        for j, feature in enumerate(all_unique_features):
            if feature in all_features[technique]:
                matrix[i, j] = 1
    
    # Sort features by frequency of selection
    feature_counts = matrix.sum(axis=0)
    sorted_indices = np.argsort(feature_counts)[::-1]
    sorted_features = [all_unique_features[i] for i in sorted_indices]
    sorted_matrix = matrix[:, sorted_indices]
    
    # Create heatmap
    plt.figure(figsize=(20, 12))
    sns.heatmap(
        sorted_matrix,
        cmap='Blues',
        xticklabels=sorted_features,
        yticklabels=[t.split('. ')[1] if '. ' in t else t for t in techniques],
        cbar_kws={'label': 'Selected'}
    )
    plt.title('Feature Selection by Different Techniques', fontsize=14)
    plt.tight_layout()
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    
    # Save figure
    plt.savefig('feature_selection_heatmap_ANN.png', dpi=300, bbox_inches='tight')
    print("Heatmap saved as 'feature_selection_heatmap_ANN.png'")
    plt.close()

# Main function to run the whole process
def main():
    # Load data
    file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with your path
    X, y = load_data(file_path)
    
    # Get feature selectors
    feature_selectors = get_feature_selectors(X, y)
    
    # Store results
    all_results = {}
    all_selected_features = {}
    
    # For each technique, select features and train model
    for technique_name, selector in feature_selectors.items():
        print("\n" + "="*50)
        print(f"Processing {technique_name}")
        print("="*50)
        
        # Select features
        selected_features, _ = select_features(X, y, technique_name, selector)
        all_selected_features[technique_name] = selected_features
        
        # Train and evaluate
        results = train_and_evaluate(X, y, selected_features, technique_name)
        all_results[technique_name] = results
    
    # Plot comparison
    plot_comparison(all_results)
    
    # Plot feature heatmap
    plot_feature_heatmap(all_selected_features, X)
    
    # Print final summary
    print("\n" + "="*50)
    print("FINAL SUMMARY")
    print("="*50)
    
    # Sort techniques by accuracy
    sorted_techniques = sorted(
        all_results.keys(),
        key=lambda x: all_results[x]['accuracy'],
        reverse=True
    )
    
    print("\nTechniques ranked by accuracy:")
    for i, technique in enumerate(sorted_techniques):
        print(f"{i+1}. {technique}: {all_results[technique]['accuracy']:.4f}")
    
    best_technique = sorted_techniques[0]
    print(f"\nBest performing technique: {best_technique}")
    print(f"Top 10 features selected by {best_technique}:")
    for i, feature in enumerate(all_selected_features[best_technique]):
        print(f"{i+1}. {feature}")

if __name__ == "__main__":
    main()

Loading dataset...
Dataset loaded: 28766 samples, 20 features
Class distribution: {0: 18514, 1: 10252}
Initializing feature selection techniques...

Processing 1. Chi-Square
Selecting features using 1. Chi-Square...
Top 10 features selected by 1. Chi-Square:
1. Breathing Problem: 9964.0393
2. Sore throat: 13201.1649
3. Heart Disease: 3552.0005
4. Diabetes: 1929.1672
5. Hyper Tension: 3089.5719
6. Gastrointestinal : 1693.0652
7. Abroad travel: 11288.4346
8. Contact with COVID Patient: 8194.1488
9. Attended Large Gathering: 9140.5020
10. Family working in Public Exposed Places: 7781.7278

Training ANN with features selected by 1. Chi-Square
Training fold 1/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Training fold 2/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Training fold 3/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Training fold 4/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<H1>Multi Layer Perceptron</H1>

In [26]:
# Dependencies installation (run these commands in your terminal)
# pip install pandas numpy scikit-learn tensorflow keras matplotlib seaborn xgboost lightgbm boruta
# pip install imbalanced-learn statsmodels scipy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import os
import warnings
warnings.filterwarnings('ignore')

# Feature Selection Libraries
from sklearn.feature_selection import (
    VarianceThreshold, chi2, f_classif, mutual_info_classif, 
    SelectKBest, RFE, SelectFromModel
)
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from boruta import BorutaPy
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector

# ML and Evaluation
from keras.models import Sequential
from keras.layers import (
    Dense, Dropout, BatchNormalization
)
from keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

# Load dataset
def load_data(file_path):
    print("Loading dataset...")
    df = pd.read_csv(file_path)
    X = df.drop(columns=['COVID-19'])
    y = df['COVID-19']
    print(f"Dataset loaded: {X.shape[0]} samples, {X.shape[1]} features")
    print(f"Class distribution: {dict(y.value_counts())}")
    return X, y

# Define all feature selection techniques
def get_feature_selectors(X, y, n_features=10):
    print("Initializing feature selection techniques...")
    feature_selectors = {
        "1. Chi-Square": SelectKBest(chi2, k=n_features),
        "2. Mutual Information": SelectKBest(mutual_info_classif, k=n_features),
        "3. Recursive Feature Elimination": RFE(
            estimator=LogisticRegression(solver='liblinear', max_iter=1000, random_state=42),
            n_features_to_select=n_features
        ),
        "4. Lasso": SelectFromModel(
            Lasso(alpha=0.01, random_state=42), max_features=n_features
        ),
        "5. Random Forest Importance": SelectFromModel(
            RandomForestClassifier(n_estimators=100, random_state=42), max_features=n_features
        ),
        "6. Boruta": BorutaPy(
            RandomForestClassifier(n_estimators=100, random_state=42),
            n_estimators='auto', verbose=0, random_state=42
        ),
        "7. Correlation-based": None,  # Custom implementation
        "8. Sequential Forward Selection": SequentialFeatureSelector(
            RandomForestClassifier(n_estimators=50, random_state=42),
            n_features_to_select=n_features,
            direction='forward'
        ),
        "9. XGBoost Importance": SelectFromModel(
            XGBClassifier(n_estimators=100, random_state=42), max_features=n_features
        ),
        "10. LightGBM Importance": SelectFromModel(
            LGBMClassifier(n_estimators=100, random_state=42), max_features=n_features
        )
    }
    return feature_selectors

# Function to select top features using each technique
def select_features(X, y, technique_name, selector, n_features=10):
    print(f"Selecting features using {technique_name}...")
    feature_names = X.columns.tolist()
    
    # Handle special case for Correlation-based selection
    if technique_name == "7. Correlation-based":
        # Calculate correlation of each feature with target
        correlations = []
        for col in X.columns:
            corr = np.abs(X[col].corr(y))
            correlations.append((col, corr))
        
        # Sort by correlation and select top n_features
        correlations.sort(key=lambda x: x[1], reverse=True)
        selected_features = [item[0] for item in correlations[:n_features]]
        feature_importances = [item[1] for item in correlations[:n_features]]
        
    # Handle special case for Boruta
    elif technique_name == "6. Boruta":
        # Boruta requires array input
        X_array = X.values
        selector.fit(X_array, y)
        
        # Get the selected features
        selected_mask = selector.support_
        ranking = selector.ranking_
        
        # Sort by ranking and select top features
        feature_ranking = [(feature, rank) for feature, rank, mask in 
                          zip(feature_names, ranking, selected_mask) if mask]
        feature_ranking.sort(key=lambda x: x[1])
        
        # If Boruta selected fewer than n_features, add more by ranking
        if len(feature_ranking) < n_features:
            additional = [(f, r) for f, r, m in 
                         zip(feature_names, ranking, selected_mask) if not m]
            additional.sort(key=lambda x: x[1])
            feature_ranking.extend(additional[:n_features-len(feature_ranking)])
        
        feature_ranking = feature_ranking[:n_features]
        selected_features = [item[0] for item in feature_ranking]
        feature_importances = [1.0/item[1] for item in feature_ranking]  # Invert ranking for visualization
    
    else:
        # Standard scikit-learn selectors
        try:
            selector.fit(X, y)
            
            # Different selector types have different ways to get selected features
            if hasattr(selector, 'get_support'):
                selected_mask = selector.get_support()
                selected_features = [f for f, selected in zip(feature_names, selected_mask) if selected]
                
                # Get feature importances if available
                if hasattr(selector, 'estimator_') and hasattr(selector.estimator_, 'feature_importances_'):
                    feature_importances = selector.estimator_.feature_importances_[selected_mask]
                elif hasattr(selector, 'scores_'):
                    feature_importances = selector.scores_[selected_mask]
                else:
                    feature_importances = np.ones(len(selected_features))
                    
            elif hasattr(selector, 'coef_'):
                # For models with coefficients like Lasso
                coefs = np.abs(selector.coef_)
                indices = np.argsort(coefs)[::-1][:n_features]
                selected_features = [feature_names[i] for i in indices]
                feature_importances = [coefs[i] for i in indices]
                
            else:
                # Get features from the model itself
                try:
                    importances = getattr(selector, 'feature_importances_', 
                                         getattr(selector, 'coef_', None))
                    if importances is None:
                        importances = np.ones(len(feature_names))
                    
                    # For 2D coefficients (like in multiclass), take the mean
                    if importances.ndim > 1:
                        importances = np.mean(np.abs(importances), axis=0)
                    
                    # Select top features
                    indices = np.argsort(np.abs(importances))[::-1][:n_features]
                    selected_features = [feature_names[i] for i in indices]
                    feature_importances = [np.abs(importances)[i] for i in indices]
                    
                except:
                    # Fallback for other selectors
                    indices = getattr(selector, 'support_', np.arange(min(n_features, len(feature_names))))
                    if len(indices) > n_features:
                        indices = indices[:n_features]
                    selected_features = [feature_names[i] for i in indices]
                    feature_importances = np.ones(len(selected_features))
        
        except Exception as e:
            print(f"Error with {technique_name}: {str(e)}")
            # Default to the first n_features if there's an error
            selected_features = feature_names[:n_features]
            feature_importances = np.ones(n_features)
    
    # Ensure exactly n_features are selected (truncate or pad if necessary)
    if len(selected_features) > n_features:
        selected_features = selected_features[:n_features]
        feature_importances = feature_importances[:n_features]
    elif len(selected_features) < n_features:
        # Add remaining features based on variance
        remaining = [f for f in feature_names if f not in selected_features]
        selected_features.extend(remaining[:n_features-len(selected_features)])
        feature_importances = list(feature_importances) + [0] * (n_features - len(feature_importances))
    
    # Print selected features
    print(f"Top {len(selected_features)} features selected by {technique_name}:")
    for i, (feature, importance) in enumerate(zip(selected_features, feature_importances)):
        print(f"{i+1}. {feature}: {importance:.4f}")
    
    return selected_features, feature_importances

# Build the MLP model for a specific set of features
def build_mlp_model(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=input_shape),  # First hidden layer
        Dropout(0.3),  # Dropout layer for regularization
        BatchNormalization(),  # Batch normalization
        Dense(64, activation='relu'),  # Second hidden layer
        Dropout(0.4),  # Dropout layer for regularization
        BatchNormalization(),  # Batch normalization
        Dense(32, activation='relu'),  # Third hidden layer
        Dropout(0.5),  # Dropout layer for regularization
        Dense(1, activation='sigmoid')  # Output layer for binary classification
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Train and evaluate model with k-fold cross validation
def train_and_evaluate(X, y, selected_features, technique_name, k=5):
    print(f"\nTraining MLP with features selected by {technique_name}")
    
    # Prepare data for MLP
    X_selected = X[selected_features].values
    y_values = y.values
    
    # K-Fold validation
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    metrics = {
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1': [],
        'auc': []
    }
    
    # Define early stopping
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=10,
        verbose=0,
        mode='min',
        restore_best_weights=True
    )
    
    # Train and evaluate for each fold
    for fold, (train_idx, test_idx) in enumerate(kf.split(X_selected)):
        print(f"Training fold {fold+1}/{k}...")
        
        X_train, X_test = X_selected[train_idx], X_selected[test_idx]
        y_train, y_test = y_values[train_idx], y_values[test_idx]
        
        # Build and train model
        model = build_mlp_model((X_selected.shape[1],))
        model.fit(
            X_train, y_train,
            epochs=50,  # Reduced from 100 for faster execution
            batch_size=32,
            verbose=0,
            validation_split=0.2,
            callbacks=[early_stop]
        )
        
        # Evaluate
        y_pred = model.predict(X_test)
        y_pred_classes = (y_pred > 0.5).astype(int)
        
        # Calculate metrics
        metrics['accuracy'].append(accuracy_score(y_test, y_pred_classes))
        metrics['precision'].append(precision_score(y_test, y_pred_classes))
        metrics['recall'].append(recall_score(y_test, y_pred_classes))
        metrics['f1'].append(f1_score(y_test, y_pred_classes))
        try:
            metrics['auc'].append(roc_auc_score(y_test, y_pred))
        except:
            metrics['auc'].append(0.5)  # Default for failed AUC calculation
    
    # Calculate average metrics
    avg_metrics = {metric: np.mean(values) for metric, values in metrics.items()}
    std_metrics = {metric: np.std(values) for metric, values in metrics.items()}
    
    print(f"\nResults for {technique_name}:")
    for metric, value in avg_metrics.items():
        print(f"Average {metric}: {value:.4f} ± {std_metrics[metric]:.4f}")
    
    return avg_metrics

# Plot comparison bar chart
def plot_comparison(all_results):
    metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1', 'auc']
    
    # Sort techniques by accuracy
    sorted_techniques = sorted(
        all_results.keys(),
        key=lambda x: all_results[x]['accuracy'],
        reverse=True
    )
    
    # Set up the figure
    plt.figure(figsize=(15, 10))
    
    # Set width of bars
    bar_width = 0.15
    index = np.arange(len(sorted_techniques))
    
    # Colors for different metrics
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
    
    # Plot bars for each metric
    for i, metric in enumerate(metrics_to_plot):
        values = [all_results[technique][metric] for technique in sorted_techniques]
        plt.bar(
            index + i * bar_width, 
            values, 
            bar_width, 
            label=metric.capitalize(),
            color=colors[i]
        )
    
    # Add labels and legend
    plt.xlabel('Feature Selection Technique', fontsize=12)
    plt.ylabel('Score', fontsize=12)
    plt.title('Comparison of Feature Selection Techniques', fontsize=14)
    plt.xticks(
        index + bar_width * 2, 
        [t.split('. ')[1] if '. ' in t else t for t in sorted_techniques],
        rotation=45,
        ha='right'
    )
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=5)
    plt.ylim(0, 1.0)
    plt.tight_layout()
    
    # Save figure
    plt.savefig('feature_selection_comparison_MLP.png', dpi=300, bbox_inches='tight')
    print("Comparison chart saved as 'feature_selection_comparison_MLP.png'")
    plt.close()

# Plot feature heatmap
def plot_feature_heatmap(all_features, X):
    # Create a matrix of features vs techniques
    techniques = list(all_features.keys())
    all_unique_features = list(set(feature for features in all_features.values() for feature in features))
    
    # Create a matrix with 1 if feature is selected by technique, 0 otherwise
    matrix = np.zeros((len(techniques), len(all_unique_features)))
    
    for i, technique in enumerate(techniques):
        for j, feature in enumerate(all_unique_features):
            if feature in all_features[technique]:
                matrix[i, j] = 1
    
    # Sort features by frequency of selection
    feature_counts = matrix.sum(axis=0)
    sorted_indices = np.argsort(feature_counts)[::-1]
    sorted_features = [all_unique_features[i] for i in sorted_indices]
    sorted_matrix = matrix[:, sorted_indices]
    
    # Create heatmap
    plt.figure(figsize=(20, 12))
    sns.heatmap(
        sorted_matrix,
        cmap='Blues',
        xticklabels=sorted_features,
        yticklabels=[t.split('. ')[1] if '. ' in t else t for t in techniques],
        cbar_kws={'label': 'Selected'}
    )
    plt.title('Feature Selection by Different Techniques', fontsize=14)
    plt.tight_layout()
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    
    # Save figure
    plt.savefig('feature_selection_heatmap_MLP.png', dpi=300, bbox_inches='tight')
    print("Heatmap saved as 'feature_selection_heatmap_MLP.png'")
    plt.close()

# Main function to run the whole process
def main():
    # Load data
    file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with your path
    X, y = load_data(file_path)
    
    # Get feature selectors
    feature_selectors = get_feature_selectors(X, y)
    
    # Store results
    all_results = {}
    all_selected_features = {}
    
    # For each technique, select features and train model
    for technique_name, selector in feature_selectors.items():
        print("\n" + "="*50)
        print(f"Processing {technique_name}")
        print("="*50)
        
        # Select features
        selected_features, _ = select_features(X, y, technique_name, selector)
        all_selected_features[technique_name] = selected_features
        
        # Train and evaluate
        results = train_and_evaluate(X, y, selected_features, technique_name)
        all_results[technique_name] = results
    
    # Plot comparison
    plot_comparison(all_results)
    
    # Plot feature heatmap
    plot_feature_heatmap(all_selected_features, X)
    
    # Print final summary
    print("\n" + "="*50)
    print("FINAL SUMMARY")
    print("="*50)
    
    # Sort techniques by accuracy
    sorted_techniques = sorted(
        all_results.keys(),
        key=lambda x: all_results[x]['accuracy'],
        reverse=True
    )
    
    print("\nTechniques ranked by accuracy:")
    for i, technique in enumerate(sorted_techniques):
        print(f"{i+1}. {technique}: {all_results[technique]['accuracy']:.4f}")
    
    best_technique = sorted_techniques[0]
    print(f"\nBest performing technique: {best_technique}")
    print(f"Top 10 features selected by {best_technique}:")
    for i, feature in enumerate(all_selected_features[best_technique]):
        print(f"{i+1}. {feature}")

if __name__ == "__main__":
    main()

Loading dataset...
Dataset loaded: 28766 samples, 20 features
Class distribution: {0: 18514, 1: 10252}
Initializing feature selection techniques...

Processing 1. Chi-Square
Selecting features using 1. Chi-Square...
Top 10 features selected by 1. Chi-Square:
1. Breathing Problem: 9964.0393
2. Sore throat: 13201.1649
3. Heart Disease: 3552.0005
4. Diabetes: 1929.1672
5. Hyper Tension: 3089.5719
6. Gastrointestinal : 1693.0652
7. Abroad travel: 11288.4346
8. Contact with COVID Patient: 8194.1488
9. Attended Large Gathering: 9140.5020
10. Family working in Public Exposed Places: 7781.7278

Training MLP with features selected by 1. Chi-Square
Training fold 1/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Training fold 2/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Training fold 3/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Training fold 4/5...
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<H1>TabNet</H1>

In [35]:
# Dependencies installation (run these commands in your terminal)
# pip install pandas numpy scikit-learn tensorflow keras matplotlib seaborn xgboost lightgbm boruta
# pip install imbalanced-learn statsmodels scipy pytorch-tabnet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import os
import warnings
import torch  # Importing PyTorch
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

warnings.filterwarnings('ignore')

# Load dataset
def load_data(file_path):
    print("Loading dataset...")
    df = pd.read_csv(file_path)
    X = df.drop(columns=['COVID-19'])
    y = df['COVID-19']
    print(f"Dataset loaded: {X.shape[0]} samples, {X.shape[1]} features")
    print(f"Class distribution: {dict(y.value_counts())}")
    return X, y

# Define all feature selection techniques
def get_feature_selectors(X, y, n_features=10):
    print("Initializing feature selection techniques...")
    feature_selectors = {
        "1. Chi-Square": SelectKBest(chi2, k=n_features),
        "2. Mutual Information": SelectKBest(mutual_info_classif, k=n_features),
        "3. Recursive Feature Elimination": RFE(
            estimator=LogisticRegression(solver='liblinear', max_iter=1000, random_state=42),
            n_features_to_select=n_features
        ),
        "4. Lasso": SelectFromModel(
            Lasso(alpha=0.01, random_state=42), max_features=n_features
        ),
        "5. Random Forest Importance": SelectFromModel(
            RandomForestClassifier(n_estimators=100, random_state=42), max_features=n_features
        ),
        "6. Boruta": BorutaPy(
            RandomForestClassifier(n_estimators=100, random_state=42),
            n_estimators='auto', verbose=0, random_state=42
        ),
        "7. Correlation-based": None,  # Custom implementation
        "8. Sequential Forward Selection": SequentialFeatureSelector(
            RandomForestClassifier(n_estimators=50, random_state=42),
            n_features_to_select=n_features,
            direction='forward'
        ),
        "9. XGBoost Importance": SelectFromModel(
            XGBClassifier(n_estimators=100, random_state=42), max_features=n_features
        ),
        "10. LightGBM Importance": SelectFromModel(
            LGBMClassifier(n_estimators=100, random_state=42), max_features=n_features
        )
    }
    return feature_selectors

# Function to select top features using each technique
def select_features(X, y, technique_name, selector, n_features=10):
    print(f"Selecting features using {technique_name}...")
    feature_names = X.columns.tolist()
    
    # Handle special case for Correlation-based selection
    if technique_name == "7. Correlation-based":
        # Calculate correlation of each feature with target
        correlations = []
        for col in X.columns:
            corr = np.abs(X[col].corr(y))
            correlations.append((col, corr))
        
        # Sort by correlation and select top n_features
        correlations.sort(key=lambda x: x[1], reverse=True)
        selected_features = [item[0] for item in correlations[:n_features]]
        feature_importances = [item[1] for item in correlations[:n_features]]
        
    # Handle special case for Boruta
    elif technique_name == "6. Boruta":
        # Boruta requires array input
        X_array = X.values
        selector.fit(X_array, y)
        
        # Get the selected features
        selected_mask = selector.support_
        ranking = selector.ranking_
        
        # Sort by ranking and select top features
        feature_ranking = [(feature, rank) for feature, rank, mask in 
                          zip(feature_names, ranking, selected_mask) if mask]
        feature_ranking.sort(key=lambda x: x[1])
        
        # If Boruta selected fewer than n_features, add more by ranking
        if len(feature_ranking) < n_features:
            additional = [(f, r) for f, r, m in 
                         zip(feature_names, ranking, selected_mask) if not m]
            additional.sort(key=lambda x: x[1])
            feature_ranking.extend(additional[:n_features-len(feature_ranking)])
        
        feature_ranking = feature_ranking[:n_features]
        selected_features = [item[0] for item in feature_ranking]
        feature_importances = [1.0/item[1] for item in feature_ranking]  # Invert ranking for visualization
    
    else:
        # Standard scikit-learn selectors
        try:
            selector.fit(X, y)
            
            # Different selector types have different ways to get selected features
            if hasattr(selector, 'get_support'):
                selected_mask = selector.get_support()
                selected_features = [f for f, selected in zip(feature_names, selected_mask) if selected]
                
                # Get feature importances if available
                if hasattr(selector, 'estimator_') and hasattr(selector.estimator_, 'feature_importances_'):
                    feature_importances = selector.estimator_.feature_importances_[selected_mask]
                elif hasattr(selector, 'scores_'):
                    feature_importances = selector.scores_[selected_mask]
                else:
                    feature_importances = np.ones(len(selected_features))
                    
            elif hasattr(selector, 'coef_'):
                # For models with coefficients like Lasso
                coefs = np.abs(selector.coef_)
                indices = np.argsort(coefs)[::-1][:n_features]
                selected_features = [feature_names[i] for i in indices]
                feature_importances = [coefs[i] for i in indices]
                
            else:
                # Get features from the model itself
                try:
                    importances = getattr(selector, 'feature_importances_', 
                                         getattr(selector, 'coef_', None))
                    if importances is None:
                        importances = np.ones(len(feature_names))
                    
                    # For 2D coefficients (like in multiclass), take the mean
                    if importances.ndim > 1:
                        importances = np.mean(np.abs(importances), axis=0)
                    
                    # Select top features
                    indices = np.argsort(np.abs(importances))[::-1][:n_features]
                    selected_features = [feature_names[i] for i in indices]
                    feature_importances = [np.abs(importances)[i] for i in indices]
                    
                except:
                    # Fallback for other selectors
                    indices = getattr(selector, 'support_', np.arange(min(n_features, len(feature_names))))
                    if len(indices) > n_features:
                        indices = indices[:n_features]
                    selected_features = [feature_names[i] for i in indices]
                    feature_importances = np.ones(len(selected_features))
        
        except Exception as e:
            print(f"Error with {technique_name}: {str(e)}")
            # Default to the first n_features if there's an error
            selected_features = feature_names[:n_features]
            feature_importances = np.ones(n_features)
    
    # Ensure exactly n_features are selected (truncate or pad if necessary)
    if len(selected_features) > n_features:
        selected_features = selected_features[:n_features]
        feature_importances = feature_importances[:n_features]
    elif len(selected_features) < n_features:
        # Add remaining features based on variance
        remaining = [f for f in feature_names if f not in selected_features]
        selected_features.extend(remaining[:n_features-len(selected_features)])
        feature_importances = list(feature_importances) + [0] * (n_features - len(feature_importances))
    
    # Print selected features
    print(f"Top {len(selected_features)} features selected by {technique_name}:")
    for i, (feature, importance) in enumerate(zip(selected_features, feature_importances)):
        print(f"{i+1}. {feature}: {importance:.4f}")
    
    return selected_features, feature_importances

# Build the TabNet model for a specific set of features
def build_tabnet_model():
    model = TabNetClassifier(
        n_d=8, 
        n_a=8, 
        n_steps=3, 
        gamma=1.3, 
        n_independent=2, 
        n_shared=2, 
        lambda_sparse=1e-5,
        optimizer_fn=torch.optim.Adam,  # Corrected to use the actual optimizer function
        optimizer_params=dict(lr=2e-2),
        mask_type='sparsemax',  # Can be 'sparsemax' or 'entmax'
        scheduler_params={"step_size": 100, "gamma": 0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR  # Use a callable for the scheduler
    )
    return model

# Train and evaluate model with k-fold cross validation
def train_and_evaluate(X, y, selected_features, technique_name, k=5):
    print(f"\nTraining TabNet with features selected by {technique_name}")
    
    # Prepare data for TabNet
    X_selected = X[selected_features].values
    y_values = y.values
    
    # K-Fold validation
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    metrics = {
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1': [],
        'auc': []
    }
    
    # Train and evaluate for each fold
    for fold, (train_idx, test_idx) in enumerate(kf.split(X_selected)):
        print(f"Training fold {fold+1}/{k}...")
        
        X_train, X_test = X_selected[train_idx], X_selected[test_idx]
        y_train, y_test = y_values[train_idx], y_values[test_idx]
        
        # Build and train model
        model = build_tabnet_model()
        model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            eval_name=['test'],
            eval_metric=['accuracy', 'auc'],
            max_epochs=100,
            patience=20,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False
        )
        
        # Evaluate
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        metrics['accuracy'].append(accuracy_score(y_test, y_pred))
        metrics['precision'].append(precision_score(y_test, y_pred))
        metrics['recall'].append(recall_score(y_test, y_pred))
        metrics['f1'].append(f1_score(y_test, y_pred))
        metrics['auc'].append(roc_auc_score(y_test, y_pred))
    
    # Calculate average metrics
    avg_metrics = {metric: np.mean(values) for metric, values in metrics.items()}
    std_metrics = {metric: np.std(values) for metric, values in metrics.items()}
    
    print(f"\nResults for {technique_name}:")
    for metric, value in avg_metrics.items():
        print(f"Average {metric}: {value:.4f} ± {std_metrics[metric]:.4f}")
    
    return avg_metrics

# Plot comparison bar chart
def plot_comparison(all_results):
    metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1', 'auc']
    
    # Sort techniques by accuracy
    sorted_techniques = sorted(
        all_results.keys(),
        key=lambda x: all_results[x]['accuracy'],
        reverse=True
    )
    
    # Set up the figure
    plt.figure(figsize=(15, 10))
    
    # Set width of bars
    bar_width = 0.15
    index = np.arange(len(sorted_techniques))
    
    # Colors for different metrics
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
    
    # Plot bars for each metric
    for i, metric in enumerate(metrics_to_plot):
        values = [all_results[technique][metric] for technique in sorted_techniques]
        plt.bar(
            index + i * bar_width, 
            values, 
            bar_width, 
            label=metric.capitalize(),
            color=colors[i]
        )
    
    # Add labels and legend
    plt.xlabel('Feature Selection Technique', fontsize=12)
    plt.ylabel('Score', fontsize=12)
    plt.title('Comparison of Feature Selection Techniques', fontsize=14)
    plt.xticks(
        index + bar_width * 2, 
        [t.split('. ')[1] if '. ' in t else t for t in sorted_techniques],
        rotation=45,
        ha='right'
    )
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=5)
    plt.ylim(0, 1.0)
    plt.tight_layout()
    
    # Save figure
    plt.savefig('feature_selection_comparison_TabNet.png', dpi=300, bbox_inches='tight')
    print("Comparison chart saved as 'feature_selection_comparison_TabNet.png'")
    plt.close()

# Plot feature heatmap
def plot_feature_heatmap(all_features, X):
    # Create a matrix of features vs techniques
    techniques = list(all_features.keys())
    all_unique_features = list(set(feature for features in all_features.values() for feature in features))
    
    # Create a matrix with 1 if feature is selected by technique, 0 otherwise
    matrix = np.zeros((len(techniques), len(all_unique_features)))
    
    for i, technique in enumerate(techniques):
        for j, feature in enumerate(all_unique_features):
            if feature in all_features[technique]:
                matrix[i, j] = 1
    
    # Sort features by frequency of selection
    feature_counts = matrix.sum(axis=0)
    sorted_indices = np.argsort(feature_counts)[::-1]
    sorted_features = [all_unique_features[i] for i in sorted_indices]
    sorted_matrix = matrix[:, sorted_indices]
    
    # Create heatmap
    plt.figure(figsize=(20, 12))
    sns.heatmap(
        sorted_matrix,
        cmap='Blues',
        xticklabels=sorted_features,
        yticklabels=[t.split('. ')[1] if '. ' in t else t for t in techniques],
        cbar_kws={'label': 'Selected'}
    )
    plt.title('Feature Selection by Different Techniques', fontsize=14)
    plt.tight_layout()
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    
    # Save figure
    plt.savefig('feature_selection_heatmap_TabNet.png', dpi=300, bbox_inches='tight')
    print("Heatmap saved as 'feature_selection_heatmap_TabNet.png'")
    plt.close()

# Main function to run the whole process
def main():
    # Load data
    file_path = "Downloads/preprocessed_merged500_covid_data.csv"  # Update with your path
    X, y = load_data(file_path)
    
    # Get feature selectors
    feature_selectors = get_feature_selectors(X, y)
    
    # Store results
    all_results = {}
    all_selected_features = {}
    
    # For each technique, select features and train model
    for technique_name, selector in feature_selectors.items():
        print("\n" + "="*50)
        print(f"Processing {technique_name}")
        print("="*50)
        
        # Select features
        selected_features, _ = select_features(X, y, technique_name, selector)
        all_selected_features[technique_name] = selected_features
        
        # Train and evaluate
        results = train_and_evaluate(X, y, selected_features, technique_name)
        all_results[technique_name] = results
    
    # Plot comparison
    plot_comparison(all_results)
    
    # Plot feature heatmap
    plot_feature_heatmap(all_selected_features, X)
    
    # Print final summary
    print("\n" + "="*50)
    print("FINAL SUMMARY")
    print("="*50)
    
    # Sort techniques by accuracy
    sorted_techniques = sorted(
        all_results.keys(),
        key=lambda x: all_results[x]['accuracy'],
        reverse=True
    )
    
    print("\nTechniques ranked by accuracy:")
    for i, technique in enumerate(sorted_techniques):
        print(f"{i+1}. {technique}: {all_results[technique]['accuracy']:.4f}")
    
    best_technique = sorted_techniques[0]
    print(f"\nBest performing technique: {best_technique}")
    print(f"Top 10 features selected by {best_technique}:")
    for i, feature in enumerate(all_selected_features[best_technique]):
        print(f"{i+1}. {feature}")

if __name__ == "__main__":
    main()

Loading dataset...
Dataset loaded: 28766 samples, 20 features
Class distribution: {0: 18514, 1: 10252}
Initializing feature selection techniques...

Processing 1. Chi-Square
Selecting features using 1. Chi-Square...
Top 10 features selected by 1. Chi-Square:
1. Breathing Problem: 9964.0393
2. Sore throat: 13201.1649
3. Heart Disease: 3552.0005
4. Diabetes: 1929.1672
5. Hyper Tension: 3089.5719
6. Gastrointestinal : 1693.0652
7. Abroad travel: 11288.4346
8. Contact with COVID Patient: 8194.1488
9. Attended Large Gathering: 9140.5020
10. Family working in Public Exposed Places: 7781.7278

Training TabNet with features selected by 1. Chi-Square
Training fold 1/5...
epoch 0  | loss: 0.24799 | test_accuracy: 0.90615 | test_auc: 0.97363 |  0:00:02s
epoch 1  | loss: 0.0834  | test_accuracy: 0.9178  | test_auc: 0.98947 |  0:00:05s
epoch 2  | loss: 0.07275 | test_accuracy: 0.92353 | test_auc: 0.99245 |  0:00:08s
epoch 3  | loss: 0.05839 | test_accuracy: 0.95447 | test_auc: 0.99543 |  0:00:11s
e