In [6]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Load your dataset
data_path = '/Users/xiaoguang_guo@mines.edu/Documents/voice_attack_data/script/features_extraction/no_trim/IO.csv'
data = pd.read_csv(data_path)

# Cleaning and preprocessing
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['label'])
X = data.drop('label', axis=1)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Preprocessing pipeline
pipeline = ImbPipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
])
pipeline.fit(X_train, y_train)
X_train_transformed = pipeline.transform(X_train)
X_test_transformed = pipeline.transform(X_test)

# XGBoost classifier
classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
classifier.fit(X_train_transformed, y_train)

# Prediction and evaluation
y_pred = classifier.predict(X_test_transformed)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test set:", accuracy)
print("Classification Report on test set:")
print(classification_report(y_test, y_pred))

# Feature importances
importances = classifier.feature_importances_
features = X.columns
importances_df = pd.DataFrame({'Feature': features, 'Importance': importances})
sorted_importances = importances_df.sort_values(by='Importance', ascending=False)

# Display sorted feature importances
print("Sorted Feature Importances:")
print(sorted_importances)


Accuracy on test set: 0.6115553262832086
Classification Report on test set:
              precision    recall  f1-score   support

           0       0.43      0.47      0.45        34
           1       0.52      0.28      0.36        43
           2       0.57      0.62      0.60        40
           3       0.47      0.50      0.49        38
           4       0.24      0.30      0.27        40
           5       0.37      0.33      0.35        42
           6       0.70      0.71      0.71        49
           7       0.68      0.60      0.64        43
           8       0.45      0.36      0.40        39
           9       0.44      0.29      0.35        41
          10       0.57      0.74      0.65        31
          11       0.63      0.52      0.57        46
          12       0.35      0.32      0.33        25
          13       0.51      0.67      0.58        33
          14       0.75      0.69      0.72        39
          15       0.59      0.52      0.55        46
     

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Initialize results list to store accuracies for different feature counts
results = []

# The transformed training set may not retain the feature names, we need to reassign them
X_train_transformed = pd.DataFrame(X_train_transformed, columns=X.columns)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=X.columns)

# Loop through feature subsets from top 5 to top 20 features
for top_n in range(5, 21):  # Adjust range to top 20
    # Select the top 'top_n' features based on importance
    top_features = sorted_importances['Feature'].head(top_n).tolist()  # Adjust variable name to sorted_importances
    
    # Subset the training and testing sets to the top 'top_n' features
    X_train_reduced = X_train_transformed[top_features]
    X_test_reduced = X_test_transformed[top_features]

    # Scale the reduced feature sets
    scaler = StandardScaler()
    X_train_reduced_scaled = scaler.fit_transform(X_train_reduced)
    X_test_reduced_scaled = scaler.transform(X_test_reduced)

    # Reinitialize and retrain the XGBoost Classifier on reduced feature set
    model_reduced = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    model_reduced.fit(X_train_reduced_scaled, y_train)

    # Make predictions with the reduced model
    y_pred_reduced = model_reduced.predict(X_test_reduced_scaled)

    # Calculate and store accuracy
    accuracy_reduced = accuracy_score(y_test, y_pred_reduced)
    results.append((top_n, accuracy_reduced))
    
    # Print the accuracy after each model run
    print(f"Top {top_n} Features Model Accuracy: {accuracy_reduced:.4f}")

# Print the final list of accuracies for each feature count
print("\nFinal List of Accuracies for Each Feature Count:")
for result in results:
    print(f"Top {result[0]} Features: Accuracy = {result[1]:.4f}")


Top 5 Features Model Accuracy: 0.4564
Top 6 Features Model Accuracy: 0.5156
Top 7 Features Model Accuracy: 0.5618
Top 8 Features Model Accuracy: 0.5740
Top 9 Features Model Accuracy: 0.5814
Top 10 Features Model Accuracy: 0.5859
Top 11 Features Model Accuracy: 0.5932
Top 12 Features Model Accuracy: 0.5978
Top 13 Features Model Accuracy: 0.6019
Top 14 Features Model Accuracy: 0.5950
Top 15 Features Model Accuracy: 0.5978
Top 16 Features Model Accuracy: 0.6046
Top 17 Features Model Accuracy: 0.6039
Top 18 Features Model Accuracy: 0.6059
Top 19 Features Model Accuracy: 0.6060
Top 20 Features Model Accuracy: 0.6072

Final List of Accuracies for Each Feature Count:
Top 5 Features: Accuracy = 0.4564
Top 6 Features: Accuracy = 0.5156
Top 7 Features: Accuracy = 0.5618
Top 8 Features: Accuracy = 0.5740
Top 9 Features: Accuracy = 0.5814
Top 10 Features: Accuracy = 0.5859
Top 11 Features: Accuracy = 0.5932
Top 12 Features: Accuracy = 0.5978
Top 13 Features: Accuracy = 0.6019
Top 14 Features: Accu

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Initialize results list to store accuracies for different feature counts
results = []

# The transformed training set may not retain the feature names, we need to reassign them
X_train_transformed = pd.DataFrame(X_train_transformed, columns=X.columns)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=X.columns)

# Loop through feature subsets from top 5 to top 20 features
for top_n in range(21, 30):  # Adjust range to top 20
    # Select the top 'top_n' features based on importance
    top_features = sorted_importances['Feature'].head(top_n).tolist()  # Adjust variable name to sorted_importances
    
    # Subset the training and testing sets to the top 'top_n' features
    X_train_reduced = X_train_transformed[top_features]
    X_test_reduced = X_test_transformed[top_features]

    # Scale the reduced feature sets
    scaler = StandardScaler()
    X_train_reduced_scaled = scaler.fit_transform(X_train_reduced)
    X_test_reduced_scaled = scaler.transform(X_test_reduced)

    # Reinitialize and retrain the XGBoost Classifier on reduced feature set
    model_reduced = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    model_reduced.fit(X_train_reduced_scaled, y_train)

    # Make predictions with the reduced model
    y_pred_reduced = model_reduced.predict(X_test_reduced_scaled)

    # Calculate and store accuracy
    accuracy_reduced = accuracy_score(y_test, y_pred_reduced)
    results.append((top_n, accuracy_reduced))
    
    # Print the accuracy after each model run
    print(f"Top {top_n} Features Model Accuracy: {accuracy_reduced:.4f}")

# Print the final list of accuracies for each feature count
print("\nFinal List of Accuracies for Each Feature Count:")
for result in results:
    print(f"Top {result[0]} Features: Accuracy = {result[1]:.4f}")


Top 21 Features Model Accuracy: 0.6108
Top 22 Features Model Accuracy: 0.6105
Top 23 Features Model Accuracy: 0.6090
Top 24 Features Model Accuracy: 0.6118
Top 25 Features Model Accuracy: 0.6119
Top 26 Features Model Accuracy: 0.6125
Top 27 Features Model Accuracy: 0.6101
Top 28 Features Model Accuracy: 0.6153
Top 29 Features Model Accuracy: 0.6162

Final List of Accuracies for Each Feature Count:
Top 21 Features: Accuracy = 0.6108
Top 22 Features: Accuracy = 0.6105
Top 23 Features: Accuracy = 0.6090
Top 24 Features: Accuracy = 0.6118
Top 25 Features: Accuracy = 0.6119
Top 26 Features: Accuracy = 0.6125
Top 27 Features: Accuracy = 0.6101
Top 28 Features: Accuracy = 0.6153
Top 29 Features: Accuracy = 0.6162


In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Initialize results list to store accuracies for different feature counts
results = []

# The transformed training set may not retain the feature names, we need to reassign them
X_train_transformed = pd.DataFrame(X_train_transformed, columns=X.columns)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=X.columns)

# Loop through feature subsets from top 5 to top 20 features
for top_n in range(30, 35):  # Adjust range to top 20
    # Select the top 'top_n' features based on importance
    top_features = sorted_importances['Feature'].head(top_n).tolist()  # Adjust variable name to sorted_importances
    
    # Subset the training and testing sets to the top 'top_n' features
    X_train_reduced = X_train_transformed[top_features]
    X_test_reduced = X_test_transformed[top_features]

    # Scale the reduced feature sets
    scaler = StandardScaler()
    X_train_reduced_scaled = scaler.fit_transform(X_train_reduced)
    X_test_reduced_scaled = scaler.transform(X_test_reduced)

    # Reinitialize and retrain the XGBoost Classifier on reduced feature set
    model_reduced = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    model_reduced.fit(X_train_reduced_scaled, y_train)

    # Make predictions with the reduced model
    y_pred_reduced = model_reduced.predict(X_test_reduced_scaled)

    # Calculate and store accuracy
    accuracy_reduced = accuracy_score(y_test, y_pred_reduced)
    results.append((top_n, accuracy_reduced))
    
    # Print the accuracy after each model run
    print(f"Top {top_n} Features Model Accuracy: {accuracy_reduced:.4f}")

# Print the final list of accuracies for each feature count
print("\nFinal List of Accuracies for Each Feature Count:")
for result in results:
    print(f"Top {result[0]} Features: Accuracy = {result[1]:.4f}")


Top 30 Features Model Accuracy: 0.6223
Top 31 Features Model Accuracy: 0.6166
Top 32 Features Model Accuracy: 0.6170
Top 33 Features Model Accuracy: 0.6192
Top 34 Features Model Accuracy: 0.6214

Final List of Accuracies for Each Feature Count:
Top 30 Features: Accuracy = 0.6223
Top 31 Features: Accuracy = 0.6166
Top 32 Features: Accuracy = 0.6170
Top 33 Features: Accuracy = 0.6192
Top 34 Features: Accuracy = 0.6214
