In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import os

# Define your desired path
path = '/content/drive/My Drive/Hack@CSI/20242111'

# Create the directory if it doesn't exist
os.makedirs(path, exist_ok=True)

# Change the current working directory to the new path
os.chdir(path)

# Verify the current working directory
print("Current Working Directory:", os.getcwd())

Current Working Directory: /content/drive/My Drive/Hack@CSI/20242111


In [3]:
import numpy as np
import pandas as pd
import sys
import logging
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from datasets import import_accelerometer
from sklearn.metrics import roc_auc_score
import seaborn as sns

In [4]:
X_train, X_val, X_test, Y_train, Y_val, Y_test = import_accelerometer()

  values = array(values, copy=False, ndmin=arr.ndim, dtype=arr.dtype)


# Feature Selection

We are going to exploit the same models: Random Forests, Gradient Boosting, Logistic Regression

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

def backward_selection(X, y):
    """
    Perform backward selection to identify the best features for classification.

    Parameters:
    - X: DataFrame of features
    - y: Target array with class labels

    Returns:
    - selected_features: List of selected feature names
    """
    features = X.columns.tolist()
    selected_features = features.copy()
    best_score = -np.inf

    while len(selected_features) > 1:
        scores = {}
        for feature in selected_features:
            subset_features = [f for f in selected_features if f != feature]
            X_subset = X[subset_features]
            model = RandomForestClassifier(random_state=42)
            score = cross_val_score(model, X_subset, y, cv=5, scoring='accuracy').mean() #5-fold CROSS-VALIDATION
            scores[feature] = score
        worst_feature = min(scores, key=scores.get)
        if scores[worst_feature] > best_score:
            best_score = scores[worst_feature]
            selected_features.remove(worst_feature)
        else:
            break

    print("Backward Selection - Selected Features:", selected_features)
    return selected_features

def forward_selection(X, y):
    """
    Perform forward selection to identify the best features for classification.

    Parameters:
    - X: DataFrame of features
    - y: Target array with class labels

    Returns:
    - selected_features: List of selected feature names
    """
    features = X.columns.tolist()
    selected_features = []
    best_score = -np.inf

    while len(selected_features) < len(features):
        scores = {}
        for feature in features:
            if feature not in selected_features:
                subset_features = selected_features + [feature]
                X_subset = X[subset_features]
                model = RandomForestClassifier(random_state=42)
                score = cross_val_score(model, X_subset, y, cv=5, scoring='accuracy').mean() #5-fold CROSS-VALIDATION
                scores[feature] = score
        best_feature = max(scores, key=scores.get)
        if scores[best_feature] > best_score:
            best_score = scores[best_feature]
            selected_features.append(best_feature)
        else:
            break

    print("Forward Selection - Selected Features:", selected_features)
    return selected_features

# Assuming X_train contains features and Y_train is the target with class labels
selected_features_bw = backward_selection(pd.DataFrame(X_train), Y_train)
selected_features_fw = forward_selection(pd.DataFrame(X_train), Y_train)


Backward Selection - Selected Features: [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
Forward Selection - Selected Features: [10, 12, 0, 8, 14, 6, 16]


In [None]:
# Convert to DataFrame if not already
if not isinstance(X_train, pd.DataFrame):
    X_train = pd.DataFrame(X_train, columns=[f"Feature_{i}" for i in range(X_train.shape[1])])
print("Column Names in X_train:")
print(X_train.columns.tolist())


Column Names in X_train:
['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_10', 'Feature_11', 'Feature_12', 'Feature_13', 'Feature_14', 'Feature_15', 'Feature_16', 'Feature_17']


In [None]:
def filter_features(X_train, X_val, X_test, selected_features):
    """
    Filter the datasets to include only the selected features.

    Parameters:
    - X_train, X_val, X_test: Datasets (either pandas DataFrames or NumPy arrays).
    - selected_features: List of selected feature indices.

    Returns:
    - Filtered versions of X_train, X_val, X_test.
    """
    # Ensure all datasets are DataFrames with column names
    if not isinstance(X_train, pd.DataFrame):
        X_train = pd.DataFrame(X_train, columns=[f"Feature_{i}" for i in range(X_train.shape[1])])
    if not isinstance(X_val, pd.DataFrame):
        X_val = pd.DataFrame(X_val, columns=[f"Feature_{i}" for i in range(X_val.shape[1])])
    if not isinstance(X_test, pd.DataFrame):
        X_test = pd.DataFrame(X_test, columns=[f"Feature_{i}" for i in range(X_test.shape[1])])

    # Convert selected indices to column names
    selected_feature_names = [f"Feature_{i}" for i in selected_features]

    # Filter datasets using selected features
    X_train_filtered = X_train[selected_feature_names]
    X_val_filtered = X_val[selected_feature_names]
    X_test_filtered = X_test[selected_feature_names]

    return X_train_filtered, X_val_filtered, X_test_filtered

# Example usage with selected features
selected_features = selected_features_fw  # Replace with the actual selected features list

# Filter the datasets
X_train_filtered, X_val_filtered, X_test_filtered = filter_features(X_train, X_val, X_test, selected_features)

# Verify shapes of filtered datasets
print(f"Filtered X_train shape: {X_train_filtered.shape}")
print(f"Filtered X_val shape: {X_val_filtered.shape}")
print(f"Filtered X_test shape: {X_test_filtered.shape}")


Filtered X_train shape: (1353, 7)
Filtered X_val shape: (194, 7)
Filtered X_test shape: (387, 7)


# Models **without** Feature Selection

## Random Forest

In [None]:
classifier = RandomForestClassifier(n_estimators=100, max_depth=9,random_state=3)
classifier.fit(X_train,Y_train)
classifier.score(X_test,Y_test)



0.8087855297157622

## Gradient Boosting

In [None]:
classifier = GradientBoostingClassifier(n_estimators=100, max_depth=9,random_state=3)
classifier.fit(X_train,Y_train)
classifier.score(X_test,Y_test)

0.8268733850129198

## Logistic Regression

In [None]:
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train,Y_train)
classifier.score(X_test,Y_test)

0.5736434108527132

# Models **with** Feature selection

### Random Forest

In [None]:
classifier = RandomForestClassifier(n_estimators=200, max_depth=10,random_state=3) #Fine-tuning manuale
classifier.fit(X_train_filtered,Y_train)
classifier.score(X_test_filtered,Y_test)

0.8165374677002584

In [None]:
classifier = RandomForestClassifier(n_estimators=100, max_depth=9,random_state=3) #Fine-tuning manuale
classifier.fit(X_train_filtered,Y_train)
classifier.score(X_test_filtered,Y_test)

0.8062015503875969

### Gradient Boosting

In [None]:
classifier = GradientBoostingClassifier(n_estimators=100, max_depth=9,random_state=3)
classifier.fit(X_train_filtered,Y_train)
classifier.score(X_test_filtered,Y_test)

0.834625322997416

### Logistic Regression

In [None]:
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_filtered,Y_train)
classifier.score(X_test_filtered,Y_test)

0.5426356589147286