In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Read the extracted-feature table from disk
data_path = '/Users/xiaoguang_guo@mines.edu/Documents/voice_attack_data/script/features_extraction/no_trim/IO.csv'  # Update this path to your actual data file
data = pd.read_csv(data_path)

# The 'label' column is taken as the target; every other column is a feature
y = data['label']
X = data.drop(columns='label')

# Hold out 20% of the rows for evaluation (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline model: standardize -> project onto 25 principal components -> 5-NN.
# Wrapping the steps in a Pipeline means the scaler and PCA are fitted on the
# training split only and then re-applied unchanged at prediction time.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=25)),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])

# Train on the training split, then predict the held-out rows
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report overall accuracy followed by the per-class breakdown
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.3863812225947898
Classification Report:
              precision    recall  f1-score   support

           1       0.19      0.32      0.24        34
           2       0.11      0.21      0.14        43
           3       0.18      0.50      0.26        40
           4       0.29      0.50      0.37        38
           5       0.06      0.15      0.08        40
           6       0.22      0.38      0.28        42
           7       0.38      0.61      0.47        49
           8       0.39      0.56      0.46        43
           9       0.26      0.21      0.23        39
          10       0.11      0.17      0.14        41
          11       0.16      0.45      0.23        31
          12       0.15      0.28      0.20        46
          13       0.06      0.16      0.09        25
          14       0.42      0.39      0.41        33
          15       0.23      0.23      0.23        39
          16       0.30      0.17      0.22        46
          17       0.11      

In [5]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Load the dataset
data_path = '/Users/xiaoguang_guo@mines.edu/Documents/voice_attack_data/script/features_extraction/no_trim/IO.csv'  # Update this path to your actual data file
data = pd.read_csv(data_path)

# Assuming there's a column named 'label' for the target variable
X = data.drop('label', axis=1)
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Parameter grid setup
params = {
    'n_neighbors': [3, 5, 7, 9],  # Number of neighbors
    'weights': ['uniform', 'distance'],  # Weight function used in prediction
    'metric': ['euclidean', 'manhattan']  # Distance metric for choosing the neighbors
}

# Track the best candidate seen so far: its score, its parameters, the fitted
# pipeline itself and its test-set predictions, so the final report describes
# the winning model rather than whichever candidate happened to be fitted last.
best_score = 0
best_params = {}
best_pipeline = None
best_y_pred = None

# NOTE(review): candidates are ranked by accuracy on the test split, so
# "Best Score" is an optimistic estimate of generalization. Consider selecting
# hyper-parameters with cross-validation on the training split (e.g.
# GridSearchCV over the pipeline) and scoring the test split only once.

# Iterate over all combinations of parameters
for n in params['n_neighbors']:
    for w in params['weights']:
        for m in params['metric']:
            # Build a fresh pipeline for this combination; scaler and PCA are
            # re-fitted on the training data each time.
            candidate = Pipeline([
                ('scaler', StandardScaler()),  # Standardize features
                ('pca', PCA(n_components=25)),  # Reduce dimensionality
                ('knn', KNeighborsClassifier(n_neighbors=n, weights=w, metric=m))  # KNN with current params
            ])

            # Fit the pipeline on the training data
            candidate.fit(X_train, y_train)

            # Make predictions on the test set
            candidate_pred = candidate.predict(X_test)

            # Evaluate the model
            accuracy = accuracy_score(y_test, candidate_pred)

            # Keep this candidate if it beats the current best. The first
            # candidate is always accepted so a best model exists even when
            # every accuracy is 0.
            if best_pipeline is None or accuracy > best_score:
                best_score = accuracy
                best_params = {'n_neighbors': n, 'weights': w, 'metric': m}
                best_pipeline = candidate
                best_y_pred = candidate_pred

# Expose the winning model and its predictions under the names the rest of the
# notebook uses, so `pipeline` / `y_pred` match what is reported below.
pipeline = best_pipeline
y_pred = best_y_pred

print("Best Score:", best_score)
print("Best Parameters:")
print(best_params)
print("Classification Report for Best Parameters:")
print(classification_report(y_test, y_pred))


Best Score: 0.4286819705958215
Best Parameters:
{'n_neighbors': 9, 'weights': 'distance', 'metric': 'manhattan'}
Classification Report for Best Parameters:
              precision    recall  f1-score   support

           1       0.26      0.21      0.23        34
           2       0.19      0.09      0.12        43
           3       0.30      0.35      0.32        40
           4       0.50      0.47      0.49        38
           5       0.09      0.07      0.08        40
           6       0.24      0.26      0.25        42
           7       0.36      0.47      0.41        49
           8       0.55      0.51      0.53        43
           9       0.36      0.21      0.26        39
          10       0.21      0.10      0.13        41
          11       0.17      0.35      0.23        31
          12       0.22      0.15      0.18        46
          13       0.17      0.24      0.20        25
          14       0.55      0.55      0.55        33
          15       0.44      0.38