In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tabulate import tabulate

# Load Data
# Both CSVs are read from the current working directory.
train_df, test_df = (pd.read_csv(path) for path in ("train_data-1.csv", "test_data-1.csv"))

# Extract features and labels
# The "class" column is the target; every other column is kept as a feature.
X_train, y_train = train_df.drop(columns=["class"]), train_df["class"]
X_test, y_test = test_df.drop(columns=["class"]), test_df["class"]

# Scale Features
# The scaler is fitted on the training features only and then applied,
# unchanged, to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Models
# Every estimator gets the same fixed seed so that the grid search, the
# cross-validation scores and the reported test accuracies are repeatable
# from one run of this cell to the next.
RANDOM_STATE = 42
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Bagging": BaggingClassifier(random_state=RANDOM_STATE)
}

# Grid Params
# Candidate hyper-parameter values per model; the keys must match `models`
# because the tuning loop looks the grid up by model name.
param = {
    "Decision Tree": {"max_depth": [2, 4, 6, 8, 10, None]},
    "Random Forest": {"n_estimators": [10, 50, 100], "max_depth": [4, 8, None]},
    "Bagging": {"n_estimators": [10, 50, 100]}
}

# Tune Models
# One 5-fold grid search per model on the scaled training features; only the
# winning estimator of each search is kept, keyed by model name.
tuned_models = {}
for name, estimator in models.items():
    search = GridSearchCV(estimator, param[name], cv=5).fit(X_train_scaled, y_train)
    tuned_models[name] = search.best_estimator_

# CROSS VALIDATION FOR EACH FEATURE
# feat_score[feature][model_name] holds the mean 5-fold cross-validation score
# of the tuned model when it is given that single feature only.
feat_score = {}
for feature in X_train.columns:
    X_feature = X_train[[feature]]  # Use only one feature
    # Use a throwaway scaler: refitting the shared `scaler` on one column would
    # overwrite the all-feature statistics it learned when the data was loaded.
    X_feat_scaled = StandardScaler().fit_transform(X_feature)

    feat_score[feature] = {
        model_name: np.mean(cross_val_score(model, X_feat_scaled, y_train, cv=5))
        for model_name, model in tuned_models.items()
    }

# Best Feature for Each Model
# For every model, pick the feature with the highest single-feature CV score.
best_features = {model: max(feat_score, key=lambda f: feat_score[f][model]) for model in tuned_models}

# Use Best Feature to Train Models
# results_single_feature[model_name] = (best feature, test-set score).
results_single_feature = {}
for model_name, feature in best_features.items():
    # NOTE: this refits the tuned estimator in place, so after this loop it is
    # a one-feature model until it is fitted again.
    model = tuned_models[model_name]
    # A local scaler keeps the shared `scaler` (fitted on all features) intact;
    # it is fitted on the training column and reused as-is on the test column.
    feature_scaler = StandardScaler()
    model.fit(feature_scaler.fit_transform(X_train[[feature]]), y_train)
    accuracy = model.score(feature_scaler.transform(X_test[[feature]]), y_test)
    results_single_feature[model_name] = (feature, accuracy)

# TRAIN MODELS USING ALL FEATURES
# Refit every tuned estimator on the full scaled training matrix and store its
# score on the scaled test matrix, keyed by model name.
all_feature_result = {
    name: estimator.fit(X_train_scaled, y_train).score(X_test_scaled, y_test)
    for name, estimator in tuned_models.items()
}

# TABULATE PRINT RESULTS SECTION

# Rows for the single-feature table: model, chosen feature, accuracy
# formatted to four decimals.
best_single_feature_result = [
    [name, feat, f"{accuracy:.4f}"]
    for name, (feat, accuracy) in results_single_feature.items()
]

print("Results using the best single feature:")
print(tabulate(best_single_feature_result, headers=["Model", "Best Feature", "Accuracy"], tablefmt="grid"))

# Rows for the all-features table: model and accuracy formatted to four decimals.
all_feature_results = [
    [name, f"{accuracy:.4f}"]
    for name, accuracy in all_feature_result.items()
]

print("\nResults using all features:")
print(tabulate(all_feature_results, headers=["Model", "Accuracy"], tablefmt="grid"))

Results using the best single feature:
+---------------+----------------+------------+
| Model         | Best Feature   |   Accuracy |
+===============+================+============+
| Decision Tree | f7             |     0.6346 |
+---------------+----------------+------------+
| Random Forest | f7             |     0.6301 |
+---------------+----------------+------------+
| Bagging       | f1             |     0.5882 |
+---------------+----------------+------------+

Results using all features:
+---------------+------------+
| Model         |   Accuracy |
+===============+============+
| Decision Tree |     0.6923 |
+---------------+------------+
| Random Forest |     0.7319 |
+---------------+------------+
| Bagging       |     0.7059 |
+---------------+------------+


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split
from tabulate import tabulate

# Load Data
# Both CSVs are read from the current working directory.
train_df = pd.read_csv("train_data-1.csv")
test_df = pd.read_csv("test_data-1.csv")

# Extract features and labels
# "class" is the target column; everything else is used as a feature.
y_train, y_test = train_df["class"], test_df["class"]
X_train = train_df.drop(columns=["class"])
X_test = test_df.drop(columns=["class"])

# Define Models
# Every estimator gets the same fixed seed so that the grid searches and the
# reported test accuracies are repeatable from one run of this cell to the next.
RANDOM_STATE = 42
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Bagging": BaggingClassifier(random_state=RANDOM_STATE)
}

# Grid Search Parameters
# Candidate hyper-parameter values per model; the keys must match `models`
# because the grid is looked up by model name.
param_grid = {
    "Decision Tree": {"criterion": ["gini", "entropy", "log_loss"], "max_depth": [2, 4, 6, 8, 10, None]},
    "Random Forest": {"n_estimators": [10, 50, 100, 150, 200], "max_depth": [4, 8, None]},
    "Bagging": {"n_estimators": [10, 50, 100, 150]}
}

# ---- PART 1: BEST SINGLE FEATURE PER MODEL (TRAINING SET ONLY) ---- #
# For every model: grid-search each feature on its own, keep the feature whose
# best parameter setting has the highest mean 5-fold CV score, then report the
# test-set score of that tuned single-feature model.

tuned_models = {}
best_features = {}
results_single_feature = {}

for model_name, model in models.items():
    best_score = -1
    best_feature = None
    best_model = None

    for feature in X_train.columns:
        X_feat_train = X_train[[feature]]
        grid = GridSearchCV(model, param_grid[model_name], cv=5)
        grid.fit(X_feat_train, y_train)
        # best_score_ is already the mean cross-validated score of the winning
        # parameters, so a separate cross_val_score pass would only repeat
        # five more fits per feature.
        score = grid.best_score_

        if score > best_score:
            best_score = score
            best_feature = feature
            # With GridSearchCV's default refit=True this estimator has already
            # been trained on the whole single-feature training set.
            best_model = grid.best_estimator_

    tuned_models[model_name] = best_model
    best_features[model_name] = best_feature

    # No extra fit is needed here: best_model was refitted on
    # X_train[[best_feature]] by the grid search that produced it.
    test_acc = best_model.score(X_test[[best_feature]], y_test)
    results_single_feature[model_name] = (best_feature, test_acc)

# ---- PART 2: ALL FEATURES MODEL (TRAINING SET ONLY) ---- #
# Tune each model on the complete feature set with a 5-fold grid search and
# record the score of the winning estimator on the test set.

results_all_features = {}

for name, estimator in models.items():
    search = GridSearchCV(estimator, param_grid[name], cv=5).fit(X_train, y_train)
    results_all_features[name] = search.best_estimator_.score(X_test, y_test)

# ---- DISPLAY RESULTS ---- #

# Table: Single Best Feature Accuracy
# One row per model: name, selected feature, test accuracy to four decimals.
single_feat_table = [
    [name, feat, f"{acc:.4f}"]
    for name, (feat, acc) in results_single_feature.items()
]

print("Results using the best single feature:")
print(tabulate(single_feat_table, headers=["Model", "Best Feature", "Test Accuracy"], tablefmt="grid"))

# Table: All Features Accuracy
# One row per model: name and test accuracy to four decimals.
all_feat_table = [
    [name, f"{acc:.4f}"]
    for name, acc in results_all_features.items()
]

print("\nResults using all features:")
print(tabulate(all_feat_table, headers=["Model", "Test Accuracy"], tablefmt="grid"))

Results using the best single feature:
+---------------+----------------+-----------------+
| Model         | Best Feature   |   Test Accuracy |
+===============+================+=================+
| Decision Tree | f7             |          0.6403 |
+---------------+----------------+-----------------+
| Random Forest | f7             |          0.6403 |
+---------------+----------------+-----------------+
| Bagging       | f1             |          0.5995 |
+---------------+----------------+-----------------+

Results using all features:
+---------------+-----------------+
| Model         |   Test Accuracy |
+===============+=================+
| Decision Tree |          0.7025 |
+---------------+-----------------+
| Random Forest |          0.7274 |
+---------------+-----------------+
| Bagging       |          0.7081 |
+---------------+-----------------+


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tabulate import tabulate

# Load Data
# Both CSVs are read from the current working directory.
train_df = pd.read_csv("train_data-1.csv")
test_df = pd.read_csv("test_data-1.csv")

# Extract features and labels
# "class" is the target column; everything else is used as a feature.
y_train, y_test = train_df["class"], test_df["class"]
X_train = train_df.drop(columns=["class"])
X_test = test_df.drop(columns=["class"])

# Scale Features
# No scaling is actually applied in this cell: the "*_scaled" names are plain
# aliases of the raw feature frames, and the scaler is created but not fitted
# in this block.
scaler = StandardScaler()
X_train_scaled, X_test_scaled = X_train, X_test

# Define Models
# Every estimator gets the same fixed seed so that the grid search, the
# cross-validation scores and the reported accuracies are repeatable from one
# run of this cell to the next.
RANDOM_STATE = 42
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Bagging": BaggingClassifier(random_state=RANDOM_STATE)
}

# Grid Search Parameters
# Candidate hyper-parameter values per model; the keys must match `models`
# because the grid is looked up by model name.
param_grid = {
    "Decision Tree": {"criterion": ["gini", "entropy", "log_loss"], "max_depth": [2, 4, 6, 8, 10, None]},
    "Random Forest": {"n_estimators": [10, 50, 100, 150, 200], "max_depth": [4, 8, None]},
    "Bagging": {"n_estimators": [10, 50, 100, 150]}
}

# Tune Models and store best parameters
# Run a 5-fold grid search per model over the grid defined in this cell
# (`param_grid`), not the smaller `param` dict left over from an earlier cell,
# which this cell never defines and which would raise NameError on a fresh kernel.
tuned_models = {}
best_params = {}
for model_name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[model_name], cv=5)
    grid_search.fit(X_train_scaled, y_train)
    # Keep both the winning estimator and the parameter values that produced it.
    tuned_models[model_name] = grid_search.best_estimator_
    best_params[model_name] = grid_search.best_params_

# CROSS VALIDATION FOR EACH FEATURE
# feat_score[feature][model_name] holds the mean 5-fold cross-validation score
# of the tuned model when it is given that single (unscaled) column only.
feat_score = {}
for column in X_train.columns:
    single_column = X_train[[column]]
    feat_score[column] = {
        name: cross_val_score(estimator, single_column, y_train, cv=5).mean()
        for name, estimator in tuned_models.items()
    }

# Best Feature for Each Model
# For every model, pick the feature with the highest single-feature CV score.
best_features = {
    name: max(feat_score, key=lambda column: feat_score[column][name])
    for name in tuned_models
}

# Use Best Feature to Train Models
# Each tuned estimator is refitted in place on its best single column and
# scored on the same column of the test set.
results_single_feature = {}
for name, column in best_features.items():
    estimator = tuned_models[name].fit(X_train[[column]], y_train)
    results_single_feature[name] = (column, estimator.score(X_test[[column]], y_test))

# TRAIN MODELS USING ALL FEATURES
# The same estimator objects are then refitted on every feature and scored on
# the full test feature set.
all_feature_result = {
    name: estimator.fit(X_train_scaled, y_train).score(X_test_scaled, y_test)
    for name, estimator in tuned_models.items()
}

# TABULATE PRINT RESULTS SECTION

# Rows for the single-feature table: model, chosen feature, accuracy to
# four decimals.
best_single_feature_result = [
    [name, feat, f"{accuracy:.4f}"]
    for name, (feat, accuracy) in results_single_feature.items()
]

# Rows for the all-features table: model and accuracy to four decimals.
all_feature_results = [
    [name, f"{accuracy:.4f}"]
    for name, accuracy in all_feature_result.items()
]

# Rows for the parameter table: model and its winning grid-search parameters
# rendered as "name: value" pairs separated by commas.
best_param_results = [
    [name, ', '.join(f"{key}: {value}" for key, value in chosen.items())]
    for name, chosen in best_params.items()
]

# Emit the three tables in order: single feature, all features, parameters.
print("Results using the best single feature:")
print(tabulate(best_single_feature_result, headers=["Model", "Best Feature", "Accuracy"], tablefmt="grid"))

print("\nResults using all features:")
print(tabulate(all_feature_results, headers=["Model", "Accuracy"], tablefmt="grid"))

print("\nBest Parameters for each model:")
print(tabulate(best_param_results, headers=["Model", "Best Parameters"], tablefmt="grid"))