In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from IPython.display import display, HTML
import warnings

# remove warning messages
warnings.filterwarnings("ignore")

# Load the malware dataset from a CSV file
df = pd.read_csv('malware_MultiClass.csv')

# Print the shape (number of rows and columns) of the dataset
print(df.shape)

# Count the occurrences of each class in the 'classification' column
class_counts = df['classification'].value_counts()
print(class_counts)

# Remove leading and trailing spaces from column names using lambda function and rename columns
df = df.rename(columns=lambda x: x.strip())

# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))

(100000, 36)
malware    49871
benign     49858
unknown      271
Name: classification, dtype: int64


Unnamed: 0,hash,millisecond,classification,os,state,usage_counter,prio,static_prio,normal_prio,policy,vm_pgoff,vm_truncate_count,task_size,cached_hole_size,free_area_cache,mm_users,map_count,hiwater_rss,total_vm,shared_vm,exec_vm,reserved_vm,nr_ptes,end_data,last_interval,nvcsw,nivcsw,min_flt,maj_flt,fs_excl_counter,lock,utime,stime,gtime,cgtime,signal_nvcsw
0,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914e223349672eca79ad0,0,malware,CentOS,0,0,3069378560,14274,0,0,0,13173,0,0,24,724,6850,0,150,120,124,210,0,120,3473,341974,0,0,120,0,3204448256,380690,4,0,0,0
1,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914e223349672eca79ad0,1,malware,Windows,0,0,3069378560,14274,0,0,0,13173,0,0,24,724,6850,0,150,120,124,210,0,120,3473,341974,0,0,120,0,3204448256,380690,4,0,0,0
2,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914e223349672eca79ad0,2,malware,Mac,0,0,3069378560,14274,0,0,0,13173,0,0,24,724,6850,0,150,120,124,210,0,120,3473,341974,0,0,120,0,3204448256,380690,4,0,0,0
3,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914e223349672eca79ad0,3,malware,Ubuntu,0,0,3069378560,14274,0,0,0,13173,0,0,24,724,6850,0,150,120,124,210,0,120,3473,341974,0,0,120,0,3204448256,380690,4,0,0,0
4,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914e223349672eca79ad0,4,malware,Mac,0,0,3069378560,14274,0,0,0,13173,0,0,24,724,6850,0,150,120,124,210,0,120,3473,341974,0,0,120,0,3204448256,380690,4,0,0,0
5,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914e223349672eca79ad0,5,malware,Windows,0,0,3069378560,14274,0,0,0,13173,0,0,24,724,6850,0,150,120,124,210,0,120,3473,341974,0,0,120,0,3204448256,380690,4,0,0,0
6,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914e223349672eca79ad0,6,malware,Ubuntu,0,0,3069378560,14274,0,0,0,13173,0,0,24,724,6850,0,150,120,124,210,0,120,3473,341974,0,0,120,0,3204448256,380690,4,0,0,0
7,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914e223349672eca79ad0,7,malware,Mac,0,0,3069378560,14274,0,0,0,13173,0,0,24,724,6850,0,150,120,124,210,0,120,3473,341974,0,0,120,0,3204448256,380690,4,0,0,0
8,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914e223349672eca79ad0,8,malware,CentOS,0,0,3069378560,14274,0,0,0,13173,0,0,24,724,6850,0,150,120,124,210,0,120,3473,341974,0,0,120,0,3204448256,380690,4,0,0,0
9,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914e223349672eca79ad0,9,malware,Mac,0,0,3069378560,14274,0,0,0,13173,0,0,25,724,6852,0,150,120,124,211,0,120,3473,341974,0,130,120,0,3204448256,380690,4,0,0,0


In [2]:
# Select the 19 columns of interest from the DataFrame
selected_columns = [
    'classification', 'os', 'usage_counter', 'prio', 'static_prio', 'normal_prio',
    'vm_pgoff', 'vm_truncate_count', 'task_size', 'map_count', 'hiwater_rss',
    'total_vm', 'shared_vm', 'exec_vm', 'reserved_vm', 'nr_ptes', 'nvcsw',
    'nivcsw', 'signal_nvcsw']

# Create a new DataFrame with only these selected columns
df = df[selected_columns]  

# check whether there are missing values
cols=df.columns
print("Missing values?", df.isnull().any().any())

# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))

# encode labels
y = df['classification'] # define label as nominal values
le = preprocessing.LabelEncoder()
le.fit(y)
y_encoded = le.transform(y) # encode nominal labels to integers
df['classification'] = y_encoded

# Print the datatypes of the columns in the DataFrame
print('Column Datatypes:\n',df.dtypes)
print()

# How to convert all nominal variables to binary variables
df_num=df.copy(deep=True)
df_dummies=pd.get_dummies(df_num[['os']]) # create new binary columns
df_num=df_num.join(df_dummies) # add them to dataframe
df_num=df_num.drop('os', axis=1) # drop original columns

# Check if all columns are numerical
print('Column Datatypes:\n',df_num.dtypes)

# drop extra binary columns, since we only need N-1 binary columns
df_num=df_num.drop('os_CentOS', axis=1)

# print out and display dataframe as tables in HTML
display('df_num:',HTML(df_num.head(10).to_html()))

Missing values? False


Unnamed: 0,classification,os,usage_counter,prio,static_prio,normal_prio,vm_pgoff,vm_truncate_count,task_size,map_count,hiwater_rss,total_vm,shared_vm,exec_vm,reserved_vm,nr_ptes,nvcsw,nivcsw,signal_nvcsw
0,malware,CentOS,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0
1,malware,Windows,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0
2,malware,Mac,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0
3,malware,Ubuntu,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0
4,malware,Mac,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0
5,malware,Windows,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0
6,malware,Ubuntu,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0
7,malware,Mac,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0
8,malware,CentOS,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0
9,malware,Mac,0,3069378560,14274,0,0,13173,0,6852,0,150,120,124,211,0,341974,0,0


Column Datatypes:
 classification        int32
os                   object
usage_counter         int64
prio                  int64
static_prio           int64
normal_prio           int64
vm_pgoff              int64
vm_truncate_count     int64
task_size             int64
map_count             int64
hiwater_rss           int64
total_vm              int64
shared_vm             int64
exec_vm               int64
reserved_vm           int64
nr_ptes               int64
nvcsw                 int64
nivcsw                int64
signal_nvcsw          int64
dtype: object

Column Datatypes:
 classification       int32
usage_counter        int64
prio                 int64
static_prio          int64
normal_prio          int64
vm_pgoff             int64
vm_truncate_count    int64
task_size            int64
map_count            int64
hiwater_rss          int64
total_vm             int64
shared_vm            int64
exec_vm              int64
reserved_vm          int64
nr_ptes              int64
nvcsw     

'df_num:'

Unnamed: 0,classification,usage_counter,prio,static_prio,normal_prio,vm_pgoff,vm_truncate_count,task_size,map_count,hiwater_rss,total_vm,shared_vm,exec_vm,reserved_vm,nr_ptes,nvcsw,nivcsw,signal_nvcsw,os_Debian,os_Mac,os_Ubuntu,os_Windows
0,1,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0,0,0,0,0
1,1,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0,0,0,0,1
2,1,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0,0,1,0,0
3,1,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0,0,0,1,0
4,1,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0,0,1,0,0
5,1,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0,0,0,0,1
6,1,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0,0,0,1,0
7,1,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0,0,1,0,0
8,1,0,3069378560,14274,0,0,13173,0,6850,0,150,120,124,210,0,341974,0,0,0,0,0,0
9,1,0,3069378560,14274,0,0,13173,0,6852,0,150,120,124,211,0,341974,0,0,0,1,0,0


## All Features

In [12]:
# Find the best parameters using the all features
# By using all columns from the df_num

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

# Define the parameter combinations to test
penalties = ['l1', 'l2']
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'saga']
max_iters = [100, 150, 200]
Cs = [0.1, 1.0, 10.0]
fit_intercepts = [True, False]

# Define the feature and target variables
x = df_num.drop('classification', axis=1)
y = df_num['classification']

# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Initialize variables to track the best F1 score and AUC
best_f1 = 0
best_auc = 0
best_params = {}

# Iterate through parameter combinations
for penalty in penalties:
    for solver in solvers:
        # Check if the combination is valid
        if (penalty == 'l1' and solver not in ['liblinear', 'saga']) or \
           (penalty == 'elasticnet' and solver not in ['saga']) or \
           (penalty == 'l2' and solver == 'lbfgs'):
            continue

        for max_iter in max_iters:
            # liblinear solver doesn't support elasticnet penalty
            if solver == 'liblinear' and penalty == 'l1':
                continue
            
            for C in Cs:
                for fit_intercept in fit_intercepts:
                    # Create and fit a logistic regression model
                    clf = LogisticRegression(
                        penalty=penalty,
                        solver=solver,
                        max_iter=max_iter,
                        C=C,
                        fit_intercept=fit_intercept,
                        random_state=42
                    )
                    clf = clf.fit(x_train, y_train)
                    y_pred = clf.predict(x_test)
                    f1 = f1_score(y_test, y_pred, average='micro')

                    # Use LabelBinarizer to handle multiclass classification for AUC
                    lb = LabelBinarizer()
                    y_test_bin = lb.fit_transform(y_test)
                    y_pred_proba = clf.predict_proba(x_test)
                    auc = roc_auc_score(y_test_bin, y_pred_proba, average='macro')
                    
                    # Update best F1 score, AUC, and parameters if a better model is found
                    if f1 > best_f1 and auc > best_auc:
                        best_f1 = f1
                        best_auc = auc
                        best_params = {
                            'penalty': penalty,
                            'solver': solver,
                            'max_iter': max_iter,
                            'C': C,
                            'fit_intercept': fit_intercept
                        }

# Print the best parameters and corresponding F1 score and AUC
print("Best Parameters:")
print(best_params)

Best Parameters:
{'penalty': 'l2', 'solver': 'newton-cg', 'max_iter': 200, 'C': 1.0, 'fit_intercept': False}


In [26]:
# All features model with the best parameters

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, roc_auc_score

# Define the feature and target variables
x = df_num.drop('classification', axis=1)
y = df_num['classification']

# Create a logistic regression model with the best parameters
clf = LogisticRegression(penalty='l2', solver='newton-cg', max_iter=200, C=1.0, fit_intercept=False)

# Perform 5-fold cross-validation and calculate accuracy, F1 score, and AUC
f1_scores = cross_val_score(clf, x, y, cv=5, scoring='f1_micro')
auc_scores = cross_val_score(clf, x, y, cv=5, scoring='roc_auc_ovr')

# Calculate and print the mean values
mean_f1 = np.mean(f1_scores)
mean_auc = np.mean(auc_scores)

print('By 5-fold cross-validation with all features: ')
print("F1 =", mean_f1)
print("AUC =", mean_auc)

By 5-fold cross-validation with all features: 
F1 = 0.66122
AUC = 0.7261120100759924


## Feature Selection (Forward Method)

In [16]:
# Perform feature selection with forward method

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector

# search method (forward)
feature_selector = SequentialFeatureSelector(RandomForestClassifier(n_jobs=-1),
           n_features_to_select='auto',
           direction='forward',
           scoring='accuracy',
           cv=5)

x = df_num.drop('classification', axis=1)
y = df_num['classification']

feature_selector.fit(x, y)
selected_features= feature_selector.get_support()
print(x.columns)
print(selected_features)

# Print the selected feautres after forward method
selected_columns = x.columns[selected_features]
print("\nThe selected features after forward method:\n", selected_columns)

Index(['usage_counter', 'prio', 'static_prio', 'normal_prio', 'vm_pgoff',
       'vm_truncate_count', 'task_size', 'map_count', 'hiwater_rss',
       'total_vm', 'shared_vm', 'exec_vm', 'reserved_vm', 'nr_ptes', 'nvcsw',
       'nivcsw', 'signal_nvcsw', 'os_Debian', 'os_Mac', 'os_Ubuntu',
       'os_Windows'],
      dtype='object')
[ True False  True  True False  True False  True False  True  True  True
 False  True False False False False  True False False]

The selected features after forward method:
 Index(['usage_counter', 'static_prio', 'normal_prio', 'vm_truncate_count',
       'map_count', 'total_vm', 'shared_vm', 'exec_vm', 'nr_ptes', 'os_Mac'],
      dtype='object')


In [17]:
# Define the columns to keep after the feature selection
selected_columns = ['classification', 'usage_counter', 'static_prio', 'normal_prio', 'vm_truncate_count',
       'map_count', 'total_vm', 'shared_vm', 'exec_vm', 'nr_ptes', 'os_Mac']

# Create a new DataFrame with the selected features
df_num_feature = df_num.copy(deep=True)
df_num_feature = df_num[selected_columns]

# Print out and display the updated dataframe as tables in HTML
display('df_num_feature:', HTML(df_num_feature.head(10).to_html()))

'df_num_feature:'

Unnamed: 0,classification,usage_counter,static_prio,normal_prio,vm_truncate_count,map_count,total_vm,shared_vm,exec_vm,nr_ptes,os_Mac
0,1,0,14274,0,13173,6850,150,120,124,0,0
1,1,0,14274,0,13173,6850,150,120,124,0,0
2,1,0,14274,0,13173,6850,150,120,124,0,1
3,1,0,14274,0,13173,6850,150,120,124,0,0
4,1,0,14274,0,13173,6850,150,120,124,0,1
5,1,0,14274,0,13173,6850,150,120,124,0,0
6,1,0,14274,0,13173,6850,150,120,124,0,0
7,1,0,14274,0,13173,6850,150,120,124,0,1
8,1,0,14274,0,13173,6850,150,120,124,0,0
9,1,0,14274,0,13173,6852,150,120,124,0,1


In [18]:
# Find the best parameters using the feature selection (forward)
# By using selected columns which is df_num_feature

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

# Define the parameter combinations to test
penalties = ['l1', 'l2']
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'saga']
max_iters = [100, 150, 200]
Cs = [0.1, 1.0, 10.0]
fit_intercepts = [True, False]

# Define the feature and target variables using df_num_feature
x = df_num_feature.drop('classification', axis=1)
y = df_num_feature['classification']

# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Initialize variables to track the best F1 score and AUC
best_f1 = 0
best_auc = 0
best_params = {}

# Iterate through parameter combinations
for penalty in penalties:
    for solver in solvers:
        # Check if the combination is valid
        if (penalty == 'l1' and solver not in ['liblinear', 'saga']) or \
           (penalty == 'elasticnet' and solver not in ['saga']) or \
           (penalty == 'l2' and solver == 'lbfgs'):
            continue

        for max_iter in max_iters:
            # liblinear solver doesn't support elasticnet penalty
            if solver == 'liblinear' and penalty == 'l1':
                continue
            
            for C in Cs:
                for fit_intercept in fit_intercepts:
                    # Create and fit a logistic regression model
                    clf = LogisticRegression(
                        penalty=penalty,
                        solver=solver,
                        max_iter=max_iter,
                        C=C,
                        fit_intercept=fit_intercept,
                        random_state=42
                    )
                    clf = clf.fit(x_train, y_train)
                    y_pred = clf.predict(x_test)
                    f1 = f1_score(y_test, y_pred, average='micro')

                    # Use LabelBinarizer to handle multiclass classification for AUC
                    lb = LabelBinarizer()
                    y_test_bin = lb.fit_transform(y_test)
                    y_pred_proba = clf.predict_proba(x_test)
                    auc = roc_auc_score(y_test_bin, y_pred_proba, average='macro')
                    
                    # Update best F1 score, AUC, and parameters if a better model is found
                    if f1 > best_f1 and auc > best_auc:
                        best_f1 = f1
                        best_auc = auc
                        best_params = {
                            'penalty': penalty,
                            'solver': solver,
                            'max_iter': max_iter,
                            'C': C,
                            'fit_intercept': fit_intercept
                        }

# Print the best parameters and corresponding F1 score and AUC
print("Best Parameters:")
print(best_params)

Best Parameters:
{'penalty': 'l2', 'solver': 'newton-cg', 'max_iter': 200, 'C': 1.0, 'fit_intercept': True}


In [25]:
# Feature Selection Model with the best parameters

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Define the feature and target variables using df_num_feature
x = df_num_feature.drop('classification', axis=1)
y = df_num_feature['classification']  # Assuming 'classification' is the target variable in df_num

# Create a logistic regression model with the best parameters
clf = LogisticRegression(penalty='l2', solver='newton-cg', max_iter=200, C=1.0, fit_intercept=True)

# Perform 5-fold cross-validation and calculate accuracy, F1 score, and AUC
f1_scores = cross_val_score(clf, x, y, cv=5, scoring='f1_micro')
auc_scores = cross_val_score(clf, x, y, cv=5, scoring='roc_auc_ovr')

# Calculate and print the mean values
mean_f1 = np.mean(f1_scores)
mean_auc = np.mean(auc_scores)

print('By 5-fold cross-validation with feature selection: ')
print("F1 =", mean_f1)
print("AUC =", mean_auc)

By 5-fold cross-validation with feature selection: 
F1 = 0.8502700000000001
AUC = 0.7808820990909792


## Feature Reduction (PCA)

In [27]:
# Perform feature reduction with PCA

from sklearn.decomposition import PCA

x = df_num.drop('classification', axis=1)
y = df_num['classification']

# feature extraction
pca = PCA(n_components=10)
fit = pca.fit(x)

# summarize components
# print("Explained Variance: %s") % fit.explained_variance_ratio_
print('\nExplained variance: ', fit.explained_variance_ratio_)

var_sums = []
var_sums.append(0)

for var in fit.explained_variance_ratio_:
    l = len(var_sums)
    v_sum = var_sums[l-1] + var
    var_sums.append(v_sum)

var_sums.remove(0)
print('\nAccmulated variance: ', var_sums)   

PCAs = pca.fit_transform(x)

# Select the top 3 principal components since they already cover 99.99% of the total variance 
PCAs_selected = PCAs[:,:3]
df_PCAs = pd.DataFrame(data=PCAs_selected, columns=['PC1','PC2', 'PC3'])
df_PCAs['classification']=y

display(HTML(df_PCAs.head(10).to_html()))


Explained variance:  [9.98554164e-01 1.27593111e-03 9.22007894e-05 6.06301416e-05
 1.62585779e-05 7.19160153e-07 7.10066343e-08 2.42617651e-08
 4.62245567e-10 5.68750674e-11]

Accmulated variance:  [0.998554164430876, 0.9998300955360926, 0.999922296325527, 0.9999829264671727, 0.9999991850450467, 0.9999999042051996, 0.9999999752118338, 0.999999999473599, 0.9999999999358445, 0.9999999999927196]


Unnamed: 0,PC1,PC2,PC3,classification
0,-327392.362379,-9456.50461,-823.359813,1
1,-327392.362379,-9456.50461,-823.359812,1
2,-327392.362379,-9456.504609,-823.359813,1
3,-327392.362379,-9456.50461,-823.359812,1
4,-327392.362379,-9456.504609,-823.359813,1
5,-327392.362379,-9456.50461,-823.359812,1
6,-327392.362379,-9456.50461,-823.359812,1
7,-327392.362379,-9456.504609,-823.359813,1
8,-327392.362379,-9456.50461,-823.359813,1
9,-327392.367965,-9455.983098,-821.65382,1


In [28]:
# Find the best parameters using the feature selection (forward)
# By using selected columns which is df_num_feature

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

# Define the parameter combinations to test
penalties = ['l1', 'l2']
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'saga']
max_iters = [100, 150, 200]
Cs = [0.1, 1.0, 10.0]
fit_intercepts = [True, False]

# Define the feature and target variables using df_PCAs
x = df_PCAs.drop('classification', axis=1)
y = df_PCAs['classification']

# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Initialize variables to track the best F1 score and AUC
best_f1 = 0
best_auc = 0
best_params = {}

# Iterate through parameter combinations
for penalty in penalties:
    for solver in solvers:
        # Check if the combination is valid
        if (penalty == 'l1' and solver not in ['liblinear', 'saga']) or \
           (penalty == 'elasticnet' and solver not in ['saga']) or \
           (penalty == 'l2' and solver == 'lbfgs'):
            continue

        for max_iter in max_iters:
            # liblinear solver doesn't support elasticnet penalty
            if solver == 'liblinear' and penalty == 'l1':
                continue
            
            for C in Cs:
                for fit_intercept in fit_intercepts:
                    # Create and fit a logistic regression model
                    clf = LogisticRegression(
                        penalty=penalty,
                        solver=solver,
                        max_iter=max_iter,
                        C=C,
                        fit_intercept=fit_intercept,
                        random_state=42
                    )
                    clf = clf.fit(x_train, y_train)
                    y_pred = clf.predict(x_test)
                    f1 = f1_score(y_test, y_pred, average='micro')

                    # Use LabelBinarizer to handle multiclass classification for AUC
                    lb = LabelBinarizer()
                    y_test_bin = lb.fit_transform(y_test)
                    y_pred_proba = clf.predict_proba(x_test)
                    auc = roc_auc_score(y_test_bin, y_pred_proba, average='macro')
                    
                    # Update best F1 score, AUC, and parameters if a better model is found
                    if f1 > best_f1 and auc > best_auc:
                        best_f1 = f1
                        best_auc = auc
                        best_params = {
                            'penalty': penalty,
                            'solver': solver,
                            'max_iter': max_iter,
                            'C': C,
                            'fit_intercept': fit_intercept
                        }

# Print the best parameters and corresponding F1 score and AUC
print("Best Parameters:")
print(best_params)

Best Parameters:
{'penalty': 'l2', 'solver': 'newton-cg', 'max_iter': 150, 'C': 0.1, 'fit_intercept': True}


In [29]:
# Define the feature and target variables using df_PCAs
x = df_PCAs.drop('classification', axis=1)
y = df_PCAs['classification']

# Define the logistic regression model with the best parameters
clf = LogisticRegression(penalty='l2', solver='newton-cg', max_iter=150, C=0.1, fit_intercept=True)

# Perform 5-fold cross-validation and calculate F1 score and AUC
f1_scores = cross_val_score(clf, x, y, cv=5, scoring='f1_micro')
auc_scores = cross_val_score(clf, x, y, cv=5, scoring='roc_auc_ovr')

# Calculate and print the mean values
mean_f1 = np.mean(f1_scores)
mean_auc = np.mean(auc_scores)

print('By 5-fold cross-validation with PCA: ')
print("F1 =", mean_f1)
print("AUC =", mean_auc)

By 5-fold cross-validation with PCA: 
F1 = 0.5924400000000001
AUC = 0.6599022208572196


# Result

### All Feature
F1 = 0.66122  
AUC = 0.7261120100759924

### Feature Selection (by forward method)
F1 = 0.8502700000000001  
AUC = 0.7808820990909792

### Feature Reduction (by PCA)
F1 = 0.5924400000000001  
AUC = 0.6599022208572196

#### In conclusion, the best model is the Feature Selection.