In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# --- Baseline decision tree on all features ---------------------------------
# Loads the feature table, holds out 20% of the rows, fits a decision tree on
# the remaining rows, and ranks every feature by the fitted tree's
# `feature_importances_`.

# Location of the feature CSV. Set the IO_CSV_PATH environment variable to run
# this notebook on another machine; when it is unset the original absolute
# path is used, so existing behaviour is unchanged.
DEFAULT_DATA_PATH = '/Users/xiaoguang_guo@mines.edu/Documents/voice_attack_data/script/features_extraction/IO.csv'
data_path = os.environ.get('IO_CSV_PATH', DEFAULT_DATA_PATH)

# Shared by the split and the classifier so the run is repeatable.
RANDOM_STATE = 42
# Fraction of rows held out for evaluation.
TEST_SIZE = 0.2

# Load the data
data = pd.read_csv(data_path)

# 'label' is assumed to be the column to predict; every other column is
# treated as a feature.
X = data.drop(columns='label')
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Initialize and train the Decision Tree Classifier (training rows only)
tree_model = DecisionTreeClassifier(random_state=RANDOM_STATE)
tree_model.fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = tree_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# One importance value per column of X, in column order
importances = tree_model.feature_importances_

# Pair each feature name with its importance, then rank from most to least
# important. The original row labels are kept, so the index of the sorted
# frame is each feature's column position in X.
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': importances})
feature_importances_sorted = feature_importances.sort_values(by='importance', ascending=False)

# Print sorted feature importances
print("Sorted Feature Importances:")
print(feature_importances_sorted)


Accuracy: 0.4914
Sorted Feature Importances:
                       feature  importance
28      incoming_signal_energy    0.081220
39           outgoing_duration    0.050088
8         incoming_fft_std_dev    0.049175
6            incoming_duration    0.048865
27     incoming_cumulative_sum    0.047595
..                         ...         ...
30  incoming_spectral_kurtosis    0.003494
19      incoming_spectral_flux    0.003467
35             outgoing_median    0.001629
31  incoming_spectral_skewness    0.000000
64  outgoing_spectral_skewness    0.000000

[66 rows x 2 columns]


In [4]:
from sklearn.preprocessing import StandardScaler

# --- Feature-count sweep -----------------------------------------------------
# For every N from MIN_TOP_FEATURES to MAX_TOP_FEATURES (inclusive), retrain a
# decision tree on only the N highest-ranked features and record its accuracy
# on the held-out test split.
#
# The bounds below match what the loop has always executed (5 through 29);
# earlier comments describing the range as "5 to 15" were out of date.
MIN_TOP_FEATURES = 5
MAX_TOP_FEATURES = 29  # inclusive upper bound

# Feature names ordered from most to least important; looked up once because
# it does not change between iterations.
ranked_features = feature_importances_sorted['feature']

# (feature_count, accuracy) tuples, one per value of N, in sweep order.
results = []

for top_n in range(MIN_TOP_FEATURES, MAX_TOP_FEATURES + 1):
    # Select the top 'top_n' features based on importance.
    # `top_features` is intentionally a cell-level name: a later cell reads
    # the value left behind by the final iteration.
    top_features = ranked_features.head(top_n)

    # Subset the training and testing sets to the top 'top_n' features
    X_train_reduced = X_train[top_features]
    X_test_reduced = X_test[top_features]

    # Standardise using statistics from the training rows only, then apply the
    # same transform to the test rows.
    # NOTE(review): tree splits are threshold-based, so scaling is presumably
    # unnecessary for a DecisionTreeClassifier; kept so results stay
    # comparable with the recorded run -- confirm before removing.
    scaler = StandardScaler()
    X_train_reduced_scaled = scaler.fit_transform(X_train_reduced)
    X_test_reduced_scaled = scaler.transform(X_test_reduced)

    # Train a fresh tree on the reduced feature set
    model_reduced = DecisionTreeClassifier(random_state=42)
    model_reduced.fit(X_train_reduced_scaled, y_train)

    # Score the reduced model on the held-out rows.
    # NOTE(review): the same test split is used to compare every N, so the
    # best-looking N is likely optimistic -- consider a validation split.
    y_pred_reduced = model_reduced.predict(X_test_reduced_scaled)
    accuracy_reduced = accuracy_score(y_test, y_pred_reduced)
    results.append((top_n, accuracy_reduced))

    # Print the accuracy after each model run
    print(f"Top {top_n} Features Model Accuracy: {accuracy_reduced:.4f}")

# Print the final list of accuracies for each feature count
print("\nFinal List of Accuracies for Each Feature Count:")
for feature_count, feature_count_accuracy in results:
    print(f"Top {feature_count} Features: Accuracy = {feature_count_accuracy:.4f}")


Top 5 Features Model Accuracy: 0.3149
Top 6 Features Model Accuracy: 0.3407
Top 7 Features Model Accuracy: 0.4251
Top 8 Features Model Accuracy: 0.4607
Top 9 Features Model Accuracy: 0.4901
Top 10 Features Model Accuracy: 0.4805
Top 11 Features Model Accuracy: 0.4930
Top 12 Features Model Accuracy: 0.4994
Top 13 Features Model Accuracy: 0.5043
Top 14 Features Model Accuracy: 0.5037
Top 15 Features Model Accuracy: 0.5083
Top 16 Features Model Accuracy: 0.5067
Top 17 Features Model Accuracy: 0.5014
Top 18 Features Model Accuracy: 0.5001
Top 19 Features Model Accuracy: 0.5031
Top 20 Features Model Accuracy: 0.4975
Top 21 Features Model Accuracy: 0.4992
Top 22 Features Model Accuracy: 0.4964
Top 23 Features Model Accuracy: 0.5059
Top 24 Features Model Accuracy: 0.5022
Top 25 Features Model Accuracy: 0.5001
Top 26 Features Model Accuracy: 0.5009
Top 27 Features Model Accuracy: 0.5068
Top 28 Features Model Accuracy: 0.5032
Top 29 Features Model Accuracy: 0.4994

Final List of Accuracies for 

In [5]:
# Preview the first rows of the training split restricted to the columns named
# in `top_features`. That name is assigned inside the feature-count sweep loop,
# so it holds whatever the loop's last iteration selected.
X_train[list(top_features)].head()

Unnamed: 0,incoming_signal_energy,outgoing_duration,incoming_fft_std_dev,incoming_duration,incoming_cumulative_sum,incoming_entropy_of_energy,outgoing_entropy_of_energy,outgoing_entropy_packet_distribution,outgoing_fft_std_dev,incoming_thd,...,incoming_kurtosis,outgoing_mean_absolute_deviation,incoming_max_autocorrelation_peak,incoming_entropy,outgoing_thd,incoming_waveform_length,incoming_skewness,outgoing_num_peaks,incoming_first_diff_mean,outgoing_sma
25449,80042456,7.236268,7934.552536,7.176099,77124,4.050953,3.822722,4.781837,4849.53739,1.072214,...,-1.367147,194.15,35653570.0,1.421044,0.752257,19520,0.659775,10,0.240602,330.4
25626,695471628,15.327103,23767.609597,15.254672,560522,5.969797,5.353845,6.431313,7095.980617,0.932506,...,-1.894065,111.007397,240790100.0,1.194581,0.770062,168800,-0.029627,96,0.417391,227.56611
14555,72260772,6.42713,7542.899075,6.438063,74414,3.934239,4.175953,4.905218,8804.123459,1.045752,...,0.022319,382.332163,39494830.0,1.040901,0.749927,14880,1.352885,9,0.0,481.859296
36391,313731640,36.111888,16457.864398,36.104909,250356,5.152287,4.6143,5.617743,5992.810865,0.989619,...,-1.932306,113.394048,116630600.0,1.224928,0.736035,40224,0.022238,24,-0.100946,286.094118
29020,76942824,10.794659,7687.557566,10.787229,74604,3.934395,4.120054,5.101266,5224.974853,1.061495,...,-0.638122,139.909266,40801550.0,1.084524,0.814687,14992,1.109182,8,-0.104575,326.784314
