In [3]:
import os
import pandas as pd
import numpy as np
import ast
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

# 병합된 CSV 파일 경로 (원본 메타데이터와 feature가 결합된 파일)
train_csv_path = "df_balanced_updated.csv"
valid_csv_path = "df_valid_updated.csv"
test_csv_path  = "df_test_updated.csv"

# --- Load Merged Data ---
df_train = pd.read_csv(train_csv_path)
df_valid = pd.read_csv(valid_csv_path)
df_test  = pd.read_csv(test_csv_path)

# --- Helper Function: 문자열 벡터 파싱 ---
def parse_vector(x):
    if not isinstance(x, str):
        return x
    x = x.strip()
    if x.startswith('[') and x.endswith(']'):
        try:
            # 우선 ast.literal_eval 시도
            return ast.literal_eval(x)
        except Exception:
            # 파싱 실패 시, 대괄호 제거 후 공백 기준으로 분리하여 float 리스트로 변환
            x = x[1:-1].strip()  # 앞뒤 대괄호 제거
            tokens = x.split()   # 공백(및 개행) 기준 분리
            try:
                return [float(token) for token in tokens if token]
            except Exception:
                return None
    return x

# --- Preprocessing: 벡터 형태의 컬럼 확장하기 ---
def expand_vector_columns(df, vector_cols):
    for col in vector_cols:
        # 각 셀에 대해 parse_vector 적용
        df[col] = df[col].apply(parse_vector)
        # 리스트 형태라면, 각 원소를 개별 컬럼으로 확장 (리스트 길이가 모두 동일하다고 가정)
        vector_df = pd.DataFrame(df[col].tolist(), index=df.index)
        vector_df = vector_df.add_prefix(col + '_')
        df = df.drop(col, axis=1).join(vector_df)
    return df

# 확장할 컬럼 리스트 (필요에 따라 수정 가능)
vector_columns = ['mfcc_mean', 'mfcc_std', 'mfcc_delta_mean', 'mfcc_delta_std']
df_train = expand_vector_columns(df_train, vector_columns)
df_valid = expand_vector_columns(df_valid, vector_columns)
df_test  = expand_vector_columns(df_test, vector_columns)

# --- Prepare Target and Features ---
target_col = 'accent_encoded'
# 학습에 사용하지 않을 메타데이터 컬럼 지정 (target 포함)
exclude_cols = ['filename', 'gender', 'accent', 'age', target_col]
# 전체 컬럼 중에서 제외할 컬럼을 제거하여 특성 컬럼 선택
feature_columns = [col for col in df_train.columns if col not in exclude_cols]

# X, y 배열로 추출
X_train = df_train[feature_columns].values
y_train = df_train[target_col].values

X_valid = df_valid[feature_columns].values
y_valid = df_valid[target_col].values

X_test = df_test[feature_columns].values
y_test = df_test[target_col].values

# --- Scale Features ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled  = scaler.transform(X_test)

# --- Model Training and Validation ---
model_performance = {}

# 1) K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_valid_scaled)
acc_knn = accuracy_score(y_valid, y_pred_knn)
model_performance['KNN'] = acc_knn
print("KNN accuracy:", acc_knn)

# 2) Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)
y_pred_dt = dt.predict(X_valid_scaled)
acc_dt = accuracy_score(y_valid, y_pred_dt)
model_performance['Decision Tree'] = acc_dt
print("Decision Tree accuracy:", acc_dt)

# 3) Support Vector Machine (SVM)
svm = SVC(kernel='rbf', C=1.0, random_state=42)
svm.fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_valid_scaled)
acc_svm = accuracy_score(y_valid, y_pred_svm)
model_performance['SVM'] = acc_svm
print("SVM accuracy:", acc_svm)

# 4) Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_valid_scaled)
acc_rf = accuracy_score(y_valid, y_pred_rf)
model_performance['Random Forest'] = acc_rf
print("Random Forest accuracy:", acc_rf)

# 5) Gradient Boosting
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb.fit(X_train_scaled, y_train)
y_pred_gb = gb.predict(X_valid_scaled)
acc_gb = accuracy_score(y_valid, y_pred_gb)
model_performance['Gradient Boosting'] = acc_gb
print("Gradient Boosting accuracy:", acc_gb)

# 6) AdaBoost
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
ada.fit(X_train_scaled, y_train)
y_pred_ada = ada.predict(X_valid_scaled)
acc_ada = accuracy_score(y_valid, y_pred_ada)
model_performance['AdaBoost'] = acc_ada
print("AdaBoost accuracy:", acc_ada)

# 7) XGBoost
xgb_clf = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1,
                            random_state=42, use_label_encoder=False,
                            eval_metric='mlogloss')
xgb_clf.fit(X_train_scaled, y_train)
y_pred_xgb = xgb_clf.predict(X_valid_scaled)
acc_xgb = accuracy_score(y_valid, y_pred_xgb)
model_performance['XGBoost'] = acc_xgb
print("XGBoost accuracy:", acc_xgb)

# 8) Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_valid_scaled)
acc_lr = accuracy_score(y_valid, y_pred_lr)
model_performance['Logistic Regression'] = acc_lr
print("Logistic Regression accuracy:", acc_lr)

# --- Determine the Best Model on Validation Set and Evaluate on Test Set ---
best_model_name = max(model_performance, key=model_performance.get)
print("\nBest model on validation set:", best_model_name)

if best_model_name == 'KNN':
    best_model = knn
elif best_model_name == 'Decision Tree':
    best_model = dt
elif best_model_name == 'SVM':
    best_model = svm
elif best_model_name == 'Random Forest':
    best_model = rf
elif best_model_name == 'Gradient Boosting':
    best_model = gb
elif best_model_name == 'AdaBoost':
    best_model = ada
elif best_model_name == 'XGBoost':
    best_model = xgb_clf
elif best_model_name == 'Logistic Regression':
    best_model = lr

y_test_pred = best_model.predict(X_test_scaled)
acc_test = accuracy_score(y_test, y_test_pred)
print("Test accuracy for best model ({}): {:.4f}".format(best_model_name, acc_test))


KNN accuracy: 0.607
Decision Tree accuracy: 0.455
SVM accuracy: 0.584
Random Forest accuracy: 0.5655
Gradient Boosting accuracy: 0.493
AdaBoost accuracy: 0.417


Parameters: { "use_label_encoder" } are not used.



XGBoost accuracy: 0.562
Logistic Regression accuracy: 0.4415

Best model on validation set: KNN
Test accuracy for best model (KNN): 0.5634
