# Preprocessing

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = './data_A_T05.csv'
df = pd.read_csv(file_path)

# Task 1: Change the value of Y_Class from 2 to 1
df['Y_Class'].replace(2, 1, inplace=True)

# Task 2: Remove columns with more than 50% missing values
threshold = 0.5 * len(df)
df.dropna(thresh=threshold, axis=1, inplace=True)

# Task 3: Remove columns with only one unique value
cols_to_remove = [col for col in df.columns if df[col].nunique() <= 1]
df.drop(columns=cols_to_remove, inplace=True)

# Task 4: Remove columns with any NaN or infinite values in numerical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns
cols_to_remove = [col for col in numerical_cols if df[col].isna().any() or np.isinf(df[col].astype(float)).any()]
df.drop(columns=cols_to_remove, inplace=True)

# Task 5: Remove the columns 'LINE' and 'Y_Quality'
cols_to_remove = ['LINE', 'Y_Quality']
df.drop(columns=cols_to_remove, inplace=True, errors='ignore')  # errors='ignore' to skip if the column doesn't exist

# Task 6: Standardize the dataset, excluding 'Y_Class'
features_to_standardize = [col for col in df.columns if col != 'Y_Class']

scaler = StandardScaler()
df[features_to_standardize] = scaler.fit_transform(df[features_to_standardize])

# Task 7: Split the dataset into training and testing sets with class stratification
X = df.drop('Y_Class', axis=1)
y = df['Y_Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Verify the split
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((96, 776), (24, 776), (96,), (24,))

# Training & Performance Results

In [3]:
# Importing required libraries for model training and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Initialize the models
logistic_model = LogisticRegression(random_state=42)
lasso_logistic_model = LogisticRegression(penalty='l1', solver='saga', random_state=42)
decision_tree_model = DecisionTreeClassifier(random_state=42)
random_forest_model = RandomForestClassifier(random_state=42)
knn_model = KNeighborsClassifier()
one_class_svm_model = OneClassSVM()

# Create a dictionary to store the models and their names
models = {
    'Logistic Regression': logistic_model,
    'Logistic Regression with Lasso': lasso_logistic_model,
    'Decision Tree': decision_tree_model,
    'Random Forest': random_forest_model,
    'KNN': knn_model
}

# Dictionary to store classification reports
classification_reports = {}

# Train the models and evaluate their performance
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    classification_reports[model_name] = classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1'], output_dict=True)

# Train the One Class SVM model considering only Y_Class = 0
one_class_svm_model.fit(X_train[y_train == 0])

# Predict using One Class SVM
y_pred_one_class = one_class_svm_model.predict(X_test)
y_pred_one_class = np.where(y_pred_one_class == 1, 0, 1)  # Converting -1 to 1 and 1 to 0 for evaluation

# Evaluate the One Class SVM model
classification_reports['One Class SVM'] = classification_report(y_test, y_pred_one_class, target_names=['Class 0', 'Class 1'], output_dict=True)

classification_reports



{'Logistic Regression': {'Class 0': {'precision': 0.6666666666666666,
   'recall': 0.5,
   'f1-score': 0.5714285714285715,
   'support': 8},
  'Class 1': {'precision': 0.7777777777777778,
   'recall': 0.875,
   'f1-score': 0.823529411764706,
   'support': 16},
  'accuracy': 0.75,
  'macro avg': {'precision': 0.7222222222222222,
   'recall': 0.6875,
   'f1-score': 0.6974789915966387,
   'support': 24},
  'weighted avg': {'precision': 0.7407407407407408,
   'recall': 0.75,
   'f1-score': 0.7394957983193278,
   'support': 24}},
 'Logistic Regression with Lasso': {'Class 0': {'precision': 0.6666666666666666,
   'recall': 1.0,
   'f1-score': 0.8,
   'support': 8},
  'Class 1': {'precision': 1.0,
   'recall': 0.75,
   'f1-score': 0.8571428571428571,
   'support': 16},
  'accuracy': 0.8333333333333334,
  'macro avg': {'precision': 0.8333333333333333,
   'recall': 0.875,
   'f1-score': 0.8285714285714285,
   'support': 24},
  'weighted avg': {'precision': 0.8888888888888888,
   'recall': 0.833

## 표로 깔끔하게 정리

In [4]:
# Create an empty list to store the class-wise metrics for all models
class_report_df_list = []

# Loop through the classification reports to generate class-wise performance metrics
for model_name, class_report in classification_reports.items():
    for class_label, class_metrics in class_report.items():
        if class_label in ['Class 0', 'Class 1']:  # Only consider the individual classes
            class_metrics['Model'] = model_name
            class_metrics['Class'] = class_label
            class_report_df_list.append(class_metrics)

# Create a DataFrame to store the class-wise metrics
class_report_df = pd.DataFrame(class_report_df_list)

# Reorder the columns
class_report_df = class_report_df[['Model', 'Class', 'precision', 'recall', 'f1-score', 'support']]

class_report_df


Unnamed: 0,Model,Class,precision,recall,f1-score,support
0,Logistic Regression,Class 0,0.666667,0.5,0.571429,8
1,Logistic Regression,Class 1,0.777778,0.875,0.823529,16
2,Logistic Regression with Lasso,Class 0,0.666667,1.0,0.8,8
3,Logistic Regression with Lasso,Class 1,1.0,0.75,0.857143,16
4,Decision Tree,Class 0,0.4,0.5,0.444444,8
5,Decision Tree,Class 1,0.714286,0.625,0.666667,16
6,Random Forest,Class 0,1.0,0.625,0.769231,8
7,Random Forest,Class 1,0.842105,1.0,0.914286,16
8,KNN,Class 0,1.0,0.375,0.545455,8
9,KNN,Class 1,0.761905,1.0,0.864865,16
