In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from tqdm import tqdm
import time

In [2]:
# Load the dataset that every later cell reads as `df`.
# NOTE(review): the path literal is empty in this copy of the notebook, so the
# call cannot succeed as written; supply the real CSV location before running.
# Later cells expect a 'class' column holding the target labels.
df = pd.read_csv(r'')

In [3]:
# Registry of the models evaluated by every feature-selection experiment below.
# The display name is used as the key in `results` and in the printed report;
# iteration order follows the order of this list.
# KNN and SVM are built with their library defaults; the seeded models all use
# random_state=42 and both tree ensembles use 300 estimators.
_named_models = [
    ("KNN", KNeighborsClassifier()),
    ("SVM", SVC()),
    ("Random Forest", RandomForestClassifier(n_estimators=300, random_state=42)),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("XGBoost", xgb.XGBClassifier(eval_metric='logloss', random_state=42)),
    ("CatBoost", CatBoostClassifier(verbose=0, random_state=42)),
    ("Extra Trees", ExtraTreesClassifier(n_estimators=300, random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
]
classifiers = dict(_named_models)

In [4]:
# Split the frame into the target column and the predictor columns.
# `y` is captured BEFORE the encoding step below, so it is expected to keep the
# labels exactly as they were read from the CSV (verify if pandas behaviour
# matters here); `X` is every column except 'class'.
y = df['class']
X = df.drop(columns=['class'])

# Overwrite df['class'] with integer codes so the classifiers (and df.corr())
# see a numeric target. The fitted encoder is kept for mapping codes back.
label_encoder = LabelEncoder()
label_encoder.fit(df['class'])
df['class'] = label_encoder.transform(df['class'])

## Correlation Coefficient

In [None]:
# Feature selection by correlation with the target, followed by an evaluation
# of every model in `classifiers` on the retained features.

# Pairwise correlations between all columns of `df`, including the encoded
# 'class' column (DataFrame.corr default method).
correlation_matrix = df.corr()

# Timing starts after the correlation matrix is built, so the reported duration
# covers feature filtering, scaling and model evaluation only.
start_time = time.time()

# Correlation of each feature with the target; the target's own entry is dropped.
target_corr = correlation_matrix['class'].drop('class')

# Features are kept when |correlation with 'class'| is strictly above this cut-off.
threshold = 0.15

selected_features = target_corr[abs(target_corr) > threshold]

filtered_data = df[selected_features.index.tolist() + ['class']]

# Not used anywhere in this cell (nothing is written to disk here); kept in
# case another cell reads it.
filtered_data_path = 'filtered_data.csv'

X_filtered_correlation = filtered_data.drop(columns=['class'])
y_filtered_correlation = filtered_data['class']

# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split below, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
X_filtered_correlation = scaler.fit_transform(X_filtered_correlation)

# name -> (min accuracy, max accuracy, mean accuracy, mean weighted F1)
results = {}

for name, clf in classifiers.items():
    accuracies = []
    f1_scores = []
    # Single repetition; raise the range to average over several random splits.
    for _ in tqdm(range(1)):
        # random_state=None: a different 70/30 split is drawn on every run.
        X_train, X_test, y_train, y_test = train_test_split(X_filtered_correlation, y_filtered_correlation, test_size=0.3, random_state=None, shuffle=True)

        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        accuracies.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    min_accuracy = np.min(accuracies)
    max_accuracy = np.max(accuracies)
    avg_accuracy = np.mean(accuracies)
    avg_f1_score = np.mean(f1_scores)
    results[name] = (min_accuracy, max_accuracy, avg_accuracy, avg_f1_score)

    #print(f'{name} - Min Accuracy: {min_accuracy:.4f}, Max Accuracy: {max_accuracy:.4f}, Average Accuracy: {avg_accuracy:.4f}, Average F1 Score: {avg_f1_score:.4f}')
    print(f'{name} - Average Accuracy: {avg_accuracy:.4f}')

end_time = time.time()
# Elapsed wall-clock seconds. The operands were previously reversed
# (start - end), which printed the duration with a negative sign.
print(f'{end_time - start_time}')
# Number of features that passed the correlation threshold.
selected_features.shape

## Chi-square test

In [None]:
# Chi-square feature selection followed by an evaluation of every model in
# `classifiers` on the retained features.

# Step 1: turn each feature into integer bin codes.
# Every column is cut into (at most) ten quantile bins, the interval labels are
# cast to strings (X_discretized) and those strings are label-encoded (X_encoded).
# NOTE(review): the codes come from LabelEncoder over string labels, so they
# presumably follow string sort order rather than numeric bin order - confirm
# this is acceptable input for chi2.
X_discretized = X.copy()
X_encoded = X.copy()
for col in X.columns:
    bin_labels = pd.qcut(X[col], q=10, duplicates='drop').astype(str)
    X_discretized[col] = bin_labels

    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(bin_labels)

# Step 2: score every encoded feature against the target (k='all' keeps all
# columns; only the p-values are used afterwards).
chi2_selector = SelectKBest(score_func=chi2, k='all')
chi2_selector.fit(X_encoded, y)
p_values = pd.Series(chi2_selector.pvalues_, index=X.columns)

# Step 3: keep the features whose p-value is strictly below the significance level.
significance_level_chi_square = 0.05
is_significant = p_values < significance_level_chi_square
selected_features = p_values.loc[is_significant].index

# Step 4: assemble the reduced data set from `df` (encoded 'class' column).
chi_square_columns = selected_features.tolist() + ['class']
filtered_df_chi_square = df[chi_square_columns]

y_filtered_chi_square = filtered_df_chi_square['class']
X_filtered_chi_square = filtered_df_chi_square.drop(columns=['class'])

# NOTE(review): the scaler is fitted on all rows before the split below.
scaler = StandardScaler()
X_filtered_chi_square = scaler.fit_transform(X_filtered_chi_square)

# Step 5: fit and score each classifier.
# name -> (min accuracy, max accuracy, mean accuracy, mean weighted F1)
results = {}

for name, clf in classifiers.items():
    accuracies, f1_scores = [], []

    # One repetition; random_state=None draws a fresh 70/30 split on every run.
    for _ in tqdm(range(1)):
        X_train, X_test, y_train, y_test = train_test_split(
            X_filtered_chi_square,
            y_filtered_chi_square,
            test_size=0.3,
            random_state=None,
            shuffle=True,
        )

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        accuracies.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    min_accuracy, max_accuracy = np.min(accuracies), np.max(accuracies)
    avg_accuracy, avg_f1_score = np.mean(accuracies), np.mean(f1_scores)
    results[name] = (min_accuracy, max_accuracy, avg_accuracy, avg_f1_score)

    print(f'{name} - Average Accuracy: {avg_accuracy:.4f}')
    print(f'{name} - Min Accuracy: {min_accuracy:.4f}, Max Accuracy: {max_accuracy:.4f}, Average Accuracy: {avg_accuracy:.4f}, Average F1 Score: {avg_f1_score:.4f}')

## ANOVA F-value

In [None]:
# ANOVA F-test feature selection followed by an evaluation of every model in
# `classifiers` on the retained features.

# Step 1: score every raw feature in X against the target y (k='all' keeps all
# columns; only the p-values are used afterwards).
f_selector = SelectKBest(score_func=f_classif, k='all')
f_selector.fit(X, y)
p_values_anova = pd.Series(f_selector.pvalues_, index=X.columns)

# Step 2: keep the features whose p-value is strictly below the significance level.
significance_level_anova = 0.05
is_significant = p_values_anova < significance_level_anova
selected_features = p_values_anova.loc[is_significant].index

# Step 3: assemble the reduced data set from `df` (encoded 'class' column).
anova_columns = selected_features.tolist() + ['class']
filtered_df_anova = df[anova_columns]

y_filtered_anova = filtered_df_anova['class']
X_filtered_anova = filtered_df_anova.drop(columns=['class'])

# NOTE(review): the scaler is fitted on all rows before the split below.
scaler = StandardScaler()
X_filtered_anova = scaler.fit_transform(X_filtered_anova)

# Step 4: fit and score each classifier.
# name -> (min accuracy, max accuracy, mean accuracy, mean weighted F1)
results = {}

for name, clf in classifiers.items():
    accuracies, f1_scores = [], []

    # One repetition; random_state=None draws a fresh 70/30 split on every run.
    for _ in tqdm(range(1)):
        X_train, X_test, y_train, y_test = train_test_split(
            X_filtered_anova,
            y_filtered_anova,
            test_size=0.3,
            random_state=None,
            shuffle=True,
        )

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        accuracies.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

    min_accuracy, max_accuracy = np.min(accuracies), np.max(accuracies)
    avg_accuracy, avg_f1_score = np.mean(accuracies), np.mean(f1_scores)
    results[name] = (min_accuracy, max_accuracy, avg_accuracy, avg_f1_score)

    print(f'{name} - Average Accuracy: {avg_accuracy:.4f}')
    print(f'{name} - Min Accuracy: {min_accuracy:.4f}, Max Accuracy: {max_accuracy:.4f}, Average Accuracy: {avg_accuracy:.4f}, Average F1 Score: {avg_f1_score:.4f}')