In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# --- Load and clean -------------------------------------------------------
# Read the feature table, turn +/-inf into NaN, then drop every row that
# contains a NaN, so only fully finite rows reach the model.
data_path = '/Users/xiaoguang_guo@mines.edu/Documents/voice_attack_data/script/features_extraction/IO.csv'
data = (
    pd.read_csv(data_path)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# --- Features / target ----------------------------------------------------
# 'label' is the target column; every remaining column is used as a feature.
y = data['label']
X = data.drop(columns='label')

# --- Hold-out split -------------------------------------------------------
# 80 % train / 20 % test, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Standardisation ------------------------------------------------------
# The scaler is fitted on the training portion only and then applied to both
# portions, so no statistics from the test rows are used during fitting.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Model ----------------------------------------------------------------
# 100-tree random forest with a fixed seed.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# --- Evaluation on the held-out rows ----------------------------------------
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

# --- Feature ranking ------------------------------------------------------
# Attach column names to the forest's importances and order them from most
# to least important.
feature_importances = model.feature_importances_
features = pd.Series(feature_importances, index=X.columns)
features = features.sort_values(ascending=False)

print("Feature Importances:")
print(features)


Accuracy: 0.6334
Classification Report:
              precision    recall  f1-score   support

           1       0.52      0.50      0.51        34
           2       0.36      0.12      0.18        43
           3       0.56      0.70      0.62        40
           4       0.59      0.63      0.61        38
           5       0.28      0.23      0.25        40
           6       0.36      0.45      0.40        42
           7       0.66      0.76      0.70        49
           8       0.62      0.60      0.61        43
           9       0.59      0.49      0.54        39
          10       0.16      0.07      0.10        41
          11       0.44      0.77      0.56        31
          12       0.58      0.48      0.52        46
          13       0.36      0.48      0.41        25
          14       0.72      0.88      0.79        33
          15       0.90      0.67      0.76        39
          16       0.77      0.52      0.62        46
          17       0.58      0.37      0.

In [16]:
# Sweep over the number of top-ranked features and record the test accuracy
# of a random forest retrained on each subset.
# Relies on `features`, `X_train`, `X_test`, `y_train` and `y_test` from the
# earlier full-feature cell.
results = []

# range(5, 15) evaluates the top 5 through top 14 features (upper bound is exclusive)
for top_n in range(5, 15):
    # `features` was sorted by descending importance when it was built, so
    # its first `top_n` index entries are the highest-ranked columns.
    top_features = features.index[:top_n]

    # Use a scaler local to this subset. Refitting the shared `scaler` here
    # would overwrite the full-feature fit from the earlier cell and leave
    # it unusable for the complete feature matrix.
    scaler_reduced = StandardScaler()
    X_train_reduced = scaler_reduced.fit_transform(X_train[top_features])
    X_test_reduced = scaler_reduced.transform(X_test[top_features])

    # Train a fresh forest with the same hyperparameters and seed each time
    model_reduced = RandomForestClassifier(n_estimators=100, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    # Score on the held-out rows
    y_pred_reduced = model_reduced.predict(X_test_reduced)
    accuracy_reduced = accuracy_score(y_test, y_pred_reduced)

    # Keep (feature count, accuracy) for the summary printed after the loop
    results.append((top_n, accuracy_reduced))

    # Progress line after each model run
    print(f"Top {top_n} Features Model Accuracy: {accuracy_reduced:.4f}")

# Summary of accuracy per feature count
print("\nFinal List of Accuracies for Each Feature Count:")
for n_features, subset_accuracy in results:
    print(f"Top {n_features} Features: Accuracy = {subset_accuracy:.4f}")



Top 5 Features Model Accuracy: 0.4430
Top 6 Features Model Accuracy: 0.4454
Top 7 Features Model Accuracy: 0.5380
Top 8 Features Model Accuracy: 0.5484
Top 9 Features Model Accuracy: 0.6087
Top 10 Features Model Accuracy: 0.6145
Top 11 Features Model Accuracy: 0.6162
Top 12 Features Model Accuracy: 0.6294
Top 13 Features Model Accuracy: 0.6252
Top 14 Features Model Accuracy: 0.6295

Final List of Accuracies for Each Feature Count:
Top 5 Features: Accuracy = 0.4430
Top 6 Features: Accuracy = 0.4454
Top 7 Features: Accuracy = 0.5380
Top 8 Features: Accuracy = 0.5484
Top 9 Features: Accuracy = 0.6087
Top 10 Features: Accuracy = 0.6145
Top 11 Features: Accuracy = 0.6162
Top 12 Features: Accuracy = 0.6294
Top 13 Features: Accuracy = 0.6252
Top 14 Features: Accuracy = 0.6295


In [17]:
# Sweep over the number of top-ranked features and record the test accuracy
# of a random forest retrained on each subset.
# Relies on `features`, `X_train`, `X_test`, `y_train` and `y_test` from the
# earlier full-feature cell.
results = []

# range(15, 20) evaluates the top 15 through top 19 features (upper bound is exclusive)
for top_n in range(15, 20):
    # `features` was sorted by descending importance when it was built, so
    # its first `top_n` index entries are the highest-ranked columns.
    top_features = features.index[:top_n]

    # Use a scaler local to this subset. Refitting the shared `scaler` here
    # would overwrite the full-feature fit from the earlier cell and leave
    # it unusable for the complete feature matrix.
    scaler_reduced = StandardScaler()
    X_train_reduced = scaler_reduced.fit_transform(X_train[top_features])
    X_test_reduced = scaler_reduced.transform(X_test[top_features])

    # Train a fresh forest with the same hyperparameters and seed each time
    model_reduced = RandomForestClassifier(n_estimators=100, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    # Score on the held-out rows
    y_pred_reduced = model_reduced.predict(X_test_reduced)
    accuracy_reduced = accuracy_score(y_test, y_pred_reduced)

    # Keep (feature count, accuracy) for the summary printed after the loop
    results.append((top_n, accuracy_reduced))

    # Progress line after each model run
    print(f"Top {top_n} Features Model Accuracy: {accuracy_reduced:.4f}")

# Summary of accuracy per feature count
print("\nFinal List of Accuracies for Each Feature Count:")
for n_features, subset_accuracy in results:
    print(f"Top {n_features} Features: Accuracy = {subset_accuracy:.4f}")

Top 15 Features Model Accuracy: 0.6278
Top 16 Features Model Accuracy: 0.6332
Top 17 Features Model Accuracy: 0.6327
Top 18 Features Model Accuracy: 0.6318
Top 19 Features Model Accuracy: 0.6415

Final List of Accuracies for Each Feature Count:
Top 15 Features: Accuracy = 0.6278
Top 16 Features: Accuracy = 0.6332
Top 17 Features: Accuracy = 0.6327
Top 18 Features: Accuracy = 0.6318
Top 19 Features: Accuracy = 0.6415


In [18]:

# Sweep over the number of top-ranked features and record the test accuracy
# of a random forest retrained on each subset.
# Relies on `features`, `X_train`, `X_test`, `y_train` and `y_test` from the
# earlier full-feature cell.
results = []

# range(20, 25) evaluates the top 20 through top 24 features (upper bound is exclusive)
for top_n in range(20, 25):
    # `features` was sorted by descending importance when it was built, so
    # its first `top_n` index entries are the highest-ranked columns.
    top_features = features.index[:top_n]

    # Use a scaler local to this subset. Refitting the shared `scaler` here
    # would overwrite the full-feature fit from the earlier cell and leave
    # it unusable for the complete feature matrix.
    scaler_reduced = StandardScaler()
    X_train_reduced = scaler_reduced.fit_transform(X_train[top_features])
    X_test_reduced = scaler_reduced.transform(X_test[top_features])

    # Train a fresh forest with the same hyperparameters and seed each time
    model_reduced = RandomForestClassifier(n_estimators=100, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    # Score on the held-out rows
    y_pred_reduced = model_reduced.predict(X_test_reduced)
    accuracy_reduced = accuracy_score(y_test, y_pred_reduced)

    # Keep (feature count, accuracy) for the summary printed after the loop
    results.append((top_n, accuracy_reduced))

    # Progress line after each model run
    print(f"Top {top_n} Features Model Accuracy: {accuracy_reduced:.4f}")

# Summary of accuracy per feature count
print("\nFinal List of Accuracies for Each Feature Count:")
for n_features, subset_accuracy in results:
    print(f"Top {n_features} Features: Accuracy = {subset_accuracy:.4f}")

Top 20 Features Model Accuracy: 0.6462
Top 21 Features Model Accuracy: 0.6456
Top 22 Features Model Accuracy: 0.6402
Top 23 Features Model Accuracy: 0.6424
Top 24 Features Model Accuracy: 0.6402

Final List of Accuracies for Each Feature Count:
Top 20 Features: Accuracy = 0.6462
Top 21 Features: Accuracy = 0.6456
Top 22 Features: Accuracy = 0.6402
Top 23 Features: Accuracy = 0.6424
Top 24 Features: Accuracy = 0.6402


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# --- Load and clean -------------------------------------------------------
# Read the trimmed feature table, turn +/-inf into NaN, then drop every row
# that contains a NaN, so only fully finite rows reach the model.
data_path = '/Users/xiaoguang_guo@mines.edu/Documents/voice_attack_data/script/features_extraction/trimmed_1s/IO_trimmed.csv'
data = (
    pd.read_csv(data_path)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# --- Features / target ----------------------------------------------------
# 'label' is the target column; every remaining column is used as a feature.
y = data['label']
X = data.drop(columns='label')

# --- Hold-out split -------------------------------------------------------
# 80 % train / 20 % test, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Standardisation ------------------------------------------------------
# The scaler is fitted on the training portion only and then applied to both
# portions, so no statistics from the test rows are used during fitting.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Model ----------------------------------------------------------------
# 100-tree random forest with a fixed seed.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# --- Evaluation on the held-out rows ----------------------------------------
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

# --- Feature ranking ------------------------------------------------------
# Attach column names to the forest's importances and order them from most
# to least important.
feature_importances = model.feature_importances_
features = pd.Series(feature_importances, index=X.columns)
features = features.sort_values(ascending=False)

print("Feature Importances:")
print(features)


Accuracy: 0.6003
Classification Report:
              precision    recall  f1-score   support

           1       0.52      0.32      0.40        34
           2       0.15      0.05      0.07        43
           3       0.54      0.65      0.59        40
           4       0.57      0.53      0.55        38
           5       0.33      0.20      0.25        40
           6       0.33      0.40      0.36        42
           7       0.71      0.73      0.72        49
           8       0.72      0.60      0.66        43
           9       0.79      0.49      0.60        39
          10       0.30      0.22      0.25        41
          11       0.43      0.52      0.47        31
          12       0.64      0.65      0.65        46
          13       0.31      0.44      0.36        25
          14       0.46      0.64      0.53        33
          15       0.67      0.72      0.69        39
          16       0.68      0.59      0.63        46
          17       0.56      0.47      0.