In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

data_path = '/Users/xiaoguang_guo@mines.edu/Documents/voice_attack_data/script/features_extraction/google/no_trim/IO.csv'  # Update this path to your actual data file
data = pd.read_csv(data_path)

data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['label'])
X = data.drop('label', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

pipeline = ImbPipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=25)),  # Adding PCA with 25 components
    ('classifier', LogisticRegression(max_iter=1000, random_state=42, multi_class='multinomial', solver='lbfgs'))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.3247099404201944
Classification Report:
              precision    recall  f1-score   support

           0       0.21      0.36      0.27        22
           1       0.19      0.15      0.17        39
           2       0.05      0.03      0.04        33
           3       0.07      0.04      0.05        27
           4       0.11      0.10      0.11        29
           5       0.32      0.26      0.29        34
           6       0.11      0.06      0.07        36
           7       0.10      0.04      0.06        24
           8       0.44      0.26      0.33        42
           9       0.09      0.12      0.10        26
          10       0.14      0.06      0.09        31
          11       0.15      0.15      0.15        27
          12       0.09      0.04      0.06        25
          13       0.00      0.00      0.00        35
          14       0.25      0.32      0.28        34
          15       0.35      0.26      0.30        35
          16       0.22      