In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import StackingClassifier
from sklearn.feature_selection import RFE
import joblib

In [22]:
# Load the dataset
data = pd.read_csv('c:/Users/USER/Documents/Data Science Journy/StackingOptimization/driving_data.csv')

# Assume target variable is in between, adjust accordingly
target_column = 'Class'  # Replace with the actual target column name
X = data.drop(columns=[target_column])  # Features
y = data[target_column]   # Target (Driver ID)

# Encode target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
# Feature Selection using Recursive Feature Elimination (RFE) with Random Forest
# Use Random Forest as the base model for RFE
base_model_for_rfe = RandomForestClassifier(n_estimators=100, random_state=42)

# Specify the number of features to select
n_features_to_select = 10  # You can adjust this based on your dataset

# Initialize RFE
rfe = RFE(estimator=base_model_for_rfe, n_features_to_select=n_features_to_select)

# Fit RFE on the training data
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

# Get the selected feature indices
selected_features = rfe.support_
print("Selected Features:", selected_features)

# Print the names of the selected features (if feature names are available)
feature_names = X.columns  # Use correct feature names
selected_feature_names = feature_names[selected_features]
print("Selected Feature Names:", selected_feature_names.tolist())

Selected Features: [False False False False  True False  True  True False False False  True
 False False  True False False False False False False False  True False
 False False False False False False False False False False  True  True
 False False False False False False False False False False False False
 False False False  True  True]
Selected Feature Names: ['Intake_air_pressure', 'Absolute_throttle_position', 'Engine_soacking_time', 'Long_Term_Fuel_Trim_Bank1', 'Torque_of_friction', 'Maximum_indicated_engine_torque', 'Engine_coolant_temperature.1', 'Wheel_velocity_front_left-hand', 'Time(s)', 'PathOrder']


In [None]:
# Define base models for stacking
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('lr', LogisticRegression()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
    ('nb', GaussianNB())
]

# Define meta-classifier
meta_classifier = LogisticRegression()

# Create the stacking ensemble classifier
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_classifier, cv=5)

# Train the stacking classifier on the selected features
stacking_classifier.fit(X_train_rfe, y_train)

# Make predictions on the test set
y_pred = stacking_classifier.predict(X_test_rfe)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy after RFE: {accuracy * 100:.2f}%')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')