In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Load the data
data_path = '/Users/xiaoguang_guo@mines.edu/Documents/voice_attack_data/script/features_extraction/google/no_trim/IO.csv'
data = pd.read_csv(data_path)

# Replace infinite values with NaN to handle them properly
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Impute missing values using the median of each column
imputer = SimpleImputer(strategy='median')
X = data.drop('label', axis=1)
y = data['label']
X_imputed = imputer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Initialize and train the Decision Tree Classifier
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)

# Make predictions
y_pred = tree_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Get feature importances
importances = tree_model.feature_importances_

# Create a DataFrame of features and their importances
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': importances})

# Sort the DataFrame by importance in descending order
feature_importances_sorted = feature_importances.sort_values(by='importance', ascending=False)

# Print sorted feature importances
print("Sorted Feature Importances:")
print(feature_importances_sorted)


Accuracy: 0.3853
Sorted Feature Importances:
                       feature  importance
8         incoming_fft_std_dev    0.047464
39           outgoing_duration    0.044713
42            outgoing_entropy    0.040175
6            incoming_duration    0.036188
28      incoming_signal_energy    0.035702
..                         ...         ...
29  incoming_spectral_flatness    0.004469
30  incoming_spectral_kurtosis    0.004344
2              incoming_median    0.001065
31  incoming_spectral_skewness    0.000000
64  outgoing_spectral_skewness    0.000000

[66 rows x 2 columns]


In [9]:
# Select the top 15 features based on importance
top_features = feature_importances_sorted.head(15)['feature'].values

# Subset the training and testing sets to include only the top 15 features
indices = [list(X.columns).index(feat) for feat in top_features]  # Find column indices for top features
X_train_reduced = X_train[:, indices]
X_test_reduced = X_test[:, indices]

# Retrain the model on the reduced dataset
model_reduced = DecisionTreeClassifier(random_state=42)
model_reduced.fit(X_train_reduced, y_train)

# Make predictions with the reduced model
y_pred_reduced = model_reduced.predict(X_test_reduced)

# Evaluate the reduced model
accuracy_reduced = accuracy_score(y_test, y_pred_reduced)

# Print the model evaluation metrics for the reduced model
print(f"Top 15 Features Model Accuracy: {accuracy_reduced:.4f}")


Top 15 Features Model Accuracy: 0.4086
