**Midterm Part 2**

*CS 4319*

*Seth Tourish*

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tabulate import tabulate

# Read the training and test splits from CSV files in the working directory.
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

# Separate the "class" label column from the remaining feature columns.
y_train, y_test = train_df["class"], test_df["class"]
X_train = train_df.drop("class", axis=1)
X_test = test_df.drop("class", axis=1)

# Standardize the features. The scaler is fitted on the training split only,
# and that same fitted transformation is applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Models
# Candidate classifiers keyed by display name; the same keys index `param`.
# A fixed random_state is passed to the estimators that use randomness
# (MLP weight initialization / early-stopping validation split, and the
# data shuffling of the sag, saga and liblinear solvers) so that repeated
# runs of this cell produce the same scores.
models = {
    "MLP": MLPClassifier(max_iter=1000, early_stopping=True, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "KNN": KNeighborsClassifier()
}

# Hyperparameter grids searched for each model; keys match those of `models`.
hidden_layer_options = [
    (2,), (5,), (10,), (15,), (20,), (50,),
    (50, 50), (100, 100), (200, 200),
    (100, 100, 100), (200, 200, 200), (500, 500, 500), (1000, 1000, 1000),
]
solver_options = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
neighbor_counts = range(1, 21)  # k = 1 .. 20 inclusive

param = {
    "MLP": {"hidden_layer_sizes": hidden_layer_options},
    "Logistic Regression": {"solver": solver_options},
    "KNN": {"n_neighbors": neighbor_counts},
}

# Grid-search every model with 5-fold cross-validation on the scaled training
# features and keep the best estimator found for each one.
tuned_models = {}
for name, estimator in models.items():
    search = GridSearchCV(estimator, param[name], cv=5).fit(X_train_scaled, y_train)
    tuned_models[name] = search.best_estimator_

# CROSS VALIDATION FOR EACH FEATURE
# feat_score[feature][model name] holds the mean 5-fold cross-validation score
# of that tuned model when it is given only that single column.
# NOTE(review): the columns come from the unscaled X_train, whereas the models
# were tuned on X_train_scaled -- confirm this is intended.
feat_score = {
    feature: {
        name: np.mean(cross_val_score(estimator, X_train[[feature]], y_train, cv=5))
        for name, estimator in tuned_models.items()
    }
    for feature in X_train.columns
}

# For every model, pick the feature with the highest mean CV score
# (on a tie, the first such feature in feat_score order wins).
best_features = {}
for name in tuned_models:
    best_features[name] = max(feat_score, key=lambda col: feat_score[col][name])

# Refit each tuned model on its best single feature and record the score it
# achieves on the same column of the test split.
# results_single_feature[model name] = (feature name, test score)
results_single_feature = {}
for name, column in best_features.items():
    estimator = tuned_models[name].fit(X_train[[column]], y_train)
    test_accuracy = estimator.score(X_test[[column]], y_test)
    results_single_feature[name] = (column, test_accuracy)

# TRAIN MODELS USING ALL FEATURES
# Refit each tuned model on the full scaled training set and store its score
# on the scaled test set, keyed by model name.
all_feature_result = {}
for model_name, model in tuned_models.items():
    model.fit(X_train_scaled, y_train)
    accuracy = model.score(X_test_scaled, y_test)
    # The subscript assignment must stay on one line inside the loop; split
    # across two lines it becomes a no-op expression followed by an invalid
    # unpacking of a float, and the dict is never filled.
    all_feature_result[model_name] = accuracy

# TABULATE PRINT RESULTS SECTION

# Single-feature table: one row per model with its chosen feature and the
# test score formatted to four decimals.
best_single_feature_result = [
    [name, column, f"{score:.4f}"]
    for name, (column, score) in results_single_feature.items()
]
print("Results using the best single feature:")
print(tabulate(best_single_feature_result, headers=["Model", "Best Feature", "Accuracy"], tablefmt="grid"))

# All-features table: one row per model with its test score.
all_feature_results = [
    [name, f"{score:.4f}"] for name, score in all_feature_result.items()
]
print("\nResults using all features:")
print(tabulate(all_feature_results, headers=["Model", "Accuracy"], tablefmt="grid"))

Results using the best single feature:
+---------------------+----------------+------------+
| Model               | Best Feature   |   Accuracy |
| MLP                 | f1             |     0.6595 |
+---------------------+----------------+------------+
| Logistic Regression | f1             |     0.6527 |
+---------------------+----------------+------------+
| KNN                 | f1             |     0.6561 |
+---------------------+----------------+------------+

Results using all features:
+---------------------+------------+
| Model               |   Accuracy |
| MLP                 |     0.7161 |
+---------------------+------------+
| Logistic Regression |     0.7048 |
+---------------------+------------+
| KNN                 |     0.7002 |
+---------------------+------------+


**Justify Single Feature Choices**

In training my models, my code chose feature f1 as the best single feature for every model. This was determined by the mean cross-validation score. For the MLP, f1 was most likely chosen because it contains patterns that indicate a complex correlation with the output rather than linear separability. For Logistic Regression, the best feature was also f1, likely because it has the strongest linear correlation with the class variable. For KNN, f1 was most likely chosen because it provides good separation between the two classes under the distance-based approach that KNN uses.

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tabulate import tabulate

# Read the training and test splits from CSV files in the working directory.
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

# Separate the "class" label column from the remaining feature columns.
y_train, y_test = train_df["class"], test_df["class"]
X_train = train_df.drop("class", axis=1)
X_test = test_df.drop("class", axis=1)

# Standardize the features. The scaler is fitted on the training split only,
# and that same fitted transformation is applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Models
# Candidate classifiers keyed by display name; the same keys index `param`.
# A fixed random_state is passed to the estimators that use randomness
# (MLP weight initialization / early-stopping validation split, and the
# data shuffling of the sag, saga and liblinear solvers) so that repeated
# runs of this cell produce the same scores.
models = {
    "MLP": MLPClassifier(max_iter=1000, early_stopping=True, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "KNN": KNeighborsClassifier()
}

# Hyperparameter grids searched for each model; keys match those of `models`.
hidden_layer_options = [
    (2,), (5,), (10,), (15,), (20,), (50,),
    (50, 50), (100, 100), (200, 200),
    (100, 100, 100), (200, 200, 200), (500, 500, 500), (1000, 1000, 1000),
]
solver_options = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
neighbor_counts = range(1, 21)  # k = 1 .. 20 inclusive

param = {
    "MLP": {"hidden_layer_sizes": hidden_layer_options},
    "Logistic Regression": {"solver": solver_options},
    "KNN": {"n_neighbors": neighbor_counts},
}

# Grid-search every model with 5-fold cross-validation on the scaled training
# features. For each model keep both the best estimator and the parameter
# combination that produced it.
tuned_models, best_params = {}, {}
for name, estimator in models.items():
    search = GridSearchCV(estimator, param[name], cv=5)
    search.fit(X_train_scaled, y_train)
    tuned_models[name] = search.best_estimator_
    best_params[name] = search.best_params_

# CROSS VALIDATION FOR EACH FEATURE
# feat_score[feature][model name] holds the mean 5-fold cross-validation score
# of that tuned model when it is given only that single column.
# NOTE(review): the columns come from the unscaled X_train, whereas the models
# were tuned on X_train_scaled -- confirm this is intended.
feat_score = {
    feature: {
        name: np.mean(cross_val_score(estimator, X_train[[feature]], y_train, cv=5))
        for name, estimator in tuned_models.items()
    }
    for feature in X_train.columns
}

# For every model, pick the feature with the highest mean CV score
# (on a tie, the first such feature in feat_score order wins).
best_features = {}
for name in tuned_models:
    best_features[name] = max(feat_score, key=lambda col: feat_score[col][name])

# Refit each tuned model on its best single feature and record the score it
# achieves on the same column of the test split.
# results_single_feature[model name] = (feature name, test score)
results_single_feature = {}
for name, column in best_features.items():
    estimator = tuned_models[name].fit(X_train[[column]], y_train)
    test_accuracy = estimator.score(X_test[[column]], y_test)
    results_single_feature[name] = (column, test_accuracy)

# TRAIN MODELS USING ALL FEATURES
# Refit each tuned model on the full scaled training set and map its name to
# the score it achieves on the scaled test set.
all_feature_result = {
    name: estimator.fit(X_train_scaled, y_train).score(X_test_scaled, y_test)
    for name, estimator in tuned_models.items()
}

# TABULATE PRINT RESULTS SECTION

# Single-feature rows: model, chosen feature, test score to four decimals.
best_single_feature_result = [
    [name, column, f"{score:.4f}"]
    for name, (column, score) in results_single_feature.items()
]

# All-features rows: model, test score to four decimals.
all_feature_results = [
    [name, f"{score:.4f}"] for name, score in all_feature_result.items()
]

# Best-parameter rows: model, then its chosen parameters rendered as
# "key: value" pairs joined by ", ".
best_param_results = [
    [name, ", ".join(f"{key}: {value}" for key, value in chosen.items())]
    for name, chosen in best_params.items()
]

# Emit the three tables in order: single feature, all features, parameters.
print("Results using the best single feature:")
print(tabulate(best_single_feature_result, headers=["Model", "Best Feature", "Accuracy"], tablefmt="grid"))

print("\nResults using all features:")
print(tabulate(all_feature_results, headers=["Model", "Accuracy"], tablefmt="grid"))

print("\nBest Parameters for each model:")
print(tabulate(best_param_results, headers=["Model", "Best Parameters"], tablefmt="grid"))


Results using the best single feature:
+---------------------+----------------+------------+
| Model               | Best Feature   |   Accuracy |
| MLP                 | f1             |     0.6629 |
+---------------------+----------------+------------+
| Logistic Regression | f1             |     0.6527 |
+---------------------+----------------+------------+
| KNN                 | f1             |     0.6561 |
+---------------------+----------------+------------+

Results using all features:
+---------------------+------------+
| Model               |   Accuracy |
| MLP                 |     0.7161 |
+---------------------+------------+
| Logistic Regression |     0.7048 |
+---------------------+------------+
| KNN                 |     0.7002 |
+---------------------+------------+

Best Parameters for each model:
+---------------------+--------------------------------+
| Model               | Best Parameters                |
| MLP                 | hidden_layer_sizes: (100, 100) |
