In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Read the extracted-feature table from disk into a DataFrame
data_path = '/Users/xiaoguang_guo@mines.edu/Documents/voice_attack_data/script/features_extraction/no_trim/IO.csv'  # Update this path to your actual data file
data = pd.read_csv(data_path)

# Assumes the CSV has a 'label' column holding the target; every other
# column is passed to the model as a feature.
y = data['label']
X = data.drop(columns=['label'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Var_smoothing parameters to tune
var_smoothing_params = [1e-09, 1e-08, 1e-07, 1e-06]

# Track the best candidate seen so far. Starting from -inf (rather than 0)
# guarantees the first candidate is always recorded, even if its accuracy is 0.
best_score = float("-inf")
best_param = None
best_pipeline = None  # fitted pipeline that achieved best_score
best_pred = None      # its predictions on X_test, reused for the final report

# NOTE(review): candidates are ranked by accuracy on the test split itself, so
# the printed "Best Score" is an optimistic estimate. Consider selecting
# var_smoothing via cross-validation on the training split and scoring the
# test split only once.
for var_smoothing in var_smoothing_params:
    # Scale -> PCA -> Gaussian Naive Bayes with the current var_smoothing.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Standardize features
        # random_state makes PCA repeatable when a randomized SVD solver is selected
        ('pca', PCA(n_components=25, random_state=42)),  # Reduce dimensionality
        ('nb', GaussianNB(var_smoothing=var_smoothing))  # Naive Bayes with current var_smoothing
    ])

    # Fit on the training data, then score on the held-out split
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Strict '>' keeps the earliest candidate on ties. Remember the winning
    # pipeline and predictions so the report below describes the best model,
    # not whichever model the loop happened to build last.
    if accuracy > best_score:
        best_score = accuracy
        best_param = var_smoothing
        best_pipeline = pipeline
        best_pred = y_pred

print("Best Score:", best_score)
print("Best var_smoothing Parameter:", best_param)
print("Classification Report for Best Parameter:")
# zero_division=0 reports 0.0 for classes that were never predicted (the same
# value as the default) without emitting an UndefinedMetricWarning per metric.
print(classification_report(y_test, best_pred, zero_division=0))


Best Score: 0.32421975754449317
Best var_smoothing Parameter: 1e-09
Classification Report for Best Parameter:
              precision    recall  f1-score   support

           1       0.19      0.09      0.12        34
           2       0.00      0.00      0.00        43
           3       0.28      0.12      0.17        40
           4       0.39      0.24      0.30        38
           5       0.14      0.05      0.07        40
           6       0.16      0.33      0.21        42
           7       0.53      0.55      0.54        49
           8       0.31      0.51      0.39        43
           9       0.07      0.03      0.04        39
          10       0.22      0.10      0.14        41
          11       0.21      0.68      0.32        31
          12       0.20      0.22      0.21        46
          13       0.12      0.08      0.10        25
          14       0.34      0.45      0.39        33
          15       0.23      0.08      0.12        39
          16       0.23  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
