In [2]:
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
import shap
from sklearn.metrics import accuracy_score
import numpy as np

# Load the training and testing datasets
train_df = pd.read_csv('/content/portable_executable.csv')
test_df = pd.read_csv('/content/test.csv')

train_df = train_df.drop(['SHA256'], axis=1)
# Define features and target variable
X_train = train_df.drop('Type', axis=1)
y_train = train_df['Type']

# Extract features present in both training and testing datasets
common_features = list(set(X_train.columns) & set(test_df.columns))
X_train = X_train[common_features]
extracted_test_df = test_df[common_features]  # Select common features from test.csv
extracted_test_df.to_csv('extracted_test_records.csv', index=False)  # Save to a new file

# Split the training data
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train AdaBoost model
model = AdaBoostClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on validation data
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f"AdaBoost Accuracy: {accuracy}")

# Apply SHAP for feature importance
explainer = shap.Explainer(model.predict, X_train)
shap_values = explainer(X_val)

# Extract important features based on SHAP values
important_features = X_val.columns[np.abs(shap_values.values).mean(0).argsort()[::-1]]

# Extract selected features from the training dataset
extracted_features_df = train_df[important_features]

# Save extracted features to a CSV file
extracted_features_df.to_csv('extracted_features.csv', index=False)

AdaBoost Accuracy: 0.5932536141352847


PermutationExplainer explainer: 5604it [30:09,  3.09it/s]


In [3]:
extracted_df=pd.read_csv('/content/extracted_features.csv')
extracted_df.shape

(28014, 142)

In [4]:
extracted_test=pd.read_csv('/content/extracted_test_records.csv')
extracted_test.shape

(1480, 142)

In [6]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
# --- Preprocessing for extracted_features_type.csv ---
extracted_features_df = pd.read_csv('/content/extracted_features_type.csv')

# Identify categorical features for preprocessing
categorical_features_extracted = extracted_features_df.select_dtypes(include=['object']).columns.tolist()
  # Exclude 'Type' if it's your target

# One-Hot Encoding for extracted features
ohe_extracted = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data_extracted = ohe_extracted.fit_transform(extracted_features_df[categorical_features_extracted])
encoded_df_extracted = pd.DataFrame(encoded_data_extracted, columns=ohe_extracted.get_feature_names_out(categorical_features_extracted))
extracted_features_df = extracted_features_df.drop(categorical_features_extracted, axis=1)
extracted_features_df = pd.concat([extracted_features_df, encoded_df_extracted], axis=1)

# Save preprocessed extracted features
extracted_features_df.to_csv('preprocessed_extracted_features.csv', index=False)

# --- Preprocessing for extracted_test_records.csv ---
extracted_test_df = pd.read_csv('/content/extracted_test_records.csv')

# Identify categorical features for preprocessing
categorical_features_test = extracted_test_df.select_dtypes(include=['object']).columns.tolist()

# One-Hot Encoding for test records
encoded_data_test = ohe_extracted.transform(extracted_test_df[categorical_features_test])  # Use the same OHE object
encoded_df_test = pd.DataFrame(encoded_data_test, columns=ohe_extracted.get_feature_names_out(categorical_features_test))
extracted_test_df = extracted_test_df.drop(categorical_features_test, axis=1)
extracted_test_df = pd.concat([extracted_test_df, encoded_df_test], axis=1)

# Save preprocessed test records

In [7]:
# Load the preprocessed extracted features
preprocessed_df = pd.read_csv('/content/preprocessed_extracted_features.csv')

# Sample 500 records from each 'Type' class
sampled_df = preprocessed_df.groupby('Type').apply(lambda x: x.sample(n=200, random_state=42)).reset_index(drop=True)

# Save the sampled data to a new CSV file
sampled_df.to_csv('sampled_extracted_features_finall.csv', index=False)

  sampled_df = preprocessed_df.groupby('Type').apply(lambda x: x.sample(n=200, random_state=42)).reset_index(drop=True)
