In [2]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import numpy as np

# Location of the extracted-feature table
data_path = '/Users/xiaoguang_guo@mines.edu/Documents/voice_attack_data/script/features_extraction/google/no_trim/IO.csv'  # Update this path to your actual data file

# Read the CSV, turn +/-inf into NaN, then discard every row that contains a NaN
data = pd.read_csv(data_path).replace([np.inf, -np.inf], np.nan).dropna()

# Separate the target from the predictors; the target is expected in a column named 'label'
y = data['label']
X = data.drop(columns='label')

# Hold out 20% of the rows for evaluation (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model: standardize every feature -> keep 25 principal components -> 5-nearest-neighbour vote
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=25)),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])

# Train on the training split, then predict the held-out rows
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report overall accuracy followed by the per-class precision/recall/F1 table
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.2925682031984948
Classification Report:
              precision    recall  f1-score   support

           1       0.06      0.32      0.10        22
           2       0.13      0.36      0.19        39
           3       0.04      0.12      0.06        33
           4       0.07      0.22      0.11        27
           5       0.21      0.45      0.29        29
           6       0.18      0.35      0.24        34
           7       0.03      0.06      0.04        36
           8       0.06      0.12      0.08        24
           9       0.30      0.29      0.29        42
          10       0.10      0.23      0.14        26
          11       0.08      0.16      0.11        31
          13       0.08      0.22      0.12        27
          14       0.08      0.24      0.12        25
          15       0.08      0.11      0.09        35
          16       0.09      0.12      0.10        34
          17       0.15      0.20      0.17        35
          18       0.16      

In [4]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Manual grid search over KNN hyper-parameters on top of a scaler + PCA pipeline.
#
# Every combination in `params` is fitted on the training split and scored on
# the held-out split; the best-scoring fitted pipeline is kept so that the
# final classification report describes the same model as `best_params`.
#
# NOTE: the combinations are compared on the test split itself, so
# "Best Score" is an optimistic estimate once more than one combination is
# tried. Cross-validating on the training split would give an unbiased number.

# Load the dataset
data_path = '/Users/xiaoguang_guo@mines.edu/Documents/voice_attack_data/script/features_extraction/google/no_trim/IO.csv'  # Update this path to your actual data file

# Treat +/-inf as missing and drop every row that contains a missing value
data = pd.read_csv(data_path).replace([np.inf, -np.inf], np.nan).dropna()

# The target is expected in a column named 'label'; everything else is a feature
X = data.drop(columns='label')
y = data['label']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Parameter grid. Each list is currently narrowed to a single value;
# the wider candidate lists are kept alongside for reference.
params = {
    'n_neighbors': [9],        # wider grid: [3, 5, 7, 9]
    'weights': ['distance'],   # wider grid: ['uniform', 'distance']
    'metric': ['manhattan'],   # wider grid: ['euclidean', 'manhattan']
}

# Track the best result seen so far, including the fitted model and its
# predictions, so the report below does not depend on loop order.
best_score = 0
best_params = {}
best_pipeline = None
best_y_pred = None

# Iterate over all combinations of parameters
for n in params['n_neighbors']:
    for w in params['weights']:
        for m in params['metric']:
            # Create a pipeline with the current set of parameters
            pipeline = Pipeline([
                ('scaler', StandardScaler()),  # Standardize features
                ('pca', PCA(n_components=25)),  # Reduce dimensionality
                ('knn', KNeighborsClassifier(n_neighbors=n, weights=w, metric=m))  # KNN with current params
            ])

            # Fit on the training data and score on the held-out data
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            # Keep the first candidate unconditionally, then only strict improvements
            if best_pipeline is None or accuracy > best_score:
                best_score = accuracy
                best_params = {'n_neighbors': n, 'weights': w, 'metric': m}
                best_pipeline = pipeline
                best_y_pred = y_pred

# Fail loudly instead of reporting on a stale `pipeline` left over from an earlier cell
if best_pipeline is None:
    raise ValueError("Parameter grid is empty; no model was evaluated.")

print("Best Score:", best_score)
print("Best Parameters:")
print(best_params)
print("Classification Report for Best Parameters:")
# Use the predictions of the best pipeline, not of whichever pipeline was fitted last
print(classification_report(y_test, best_y_pred))


Best Score: 0.3576356224521794
Best Parameters:
{'n_neighbors': 9, 'weights': 'distance', 'metric': 'manhattan'}
Classification Report for Best Parameters:
              precision    recall  f1-score   support

           1       0.18      0.27      0.22        22
           2       0.23      0.23      0.23        39
           3       0.03      0.03      0.03        33
           4       0.14      0.19      0.16        27
           5       0.33      0.41      0.37        29
           6       0.39      0.38      0.39        34
           7       0.07      0.06      0.06        36
           8       0.17      0.21      0.19        24
           9       0.43      0.24      0.31        42
          10       0.14      0.23      0.17        26
          11       0.16      0.10      0.12        31
          13       0.11      0.15      0.12        27
          14       0.16      0.24      0.19        25
          15       0.18      0.11      0.14        35
          16       0.30      0.21