# In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold

# Seed both Python's and NumPy's global RNGs so reruns are reproducible
random_seed = 101
random.seed(random_seed)
np.random.seed(random_seed)

# Training split: labels are cast to str; features are every column
# except 'id' and 'target'
df_train = pd.read_csv('Train.csv').assign(
    target=lambda frame: frame['target'].astype(str)
)
X_train = df_train.drop(columns=['id', 'target'])
y_train = df_train['target']

# Validation split 1, prepared the same way as the training split
df_val1 = pd.read_csv('Validation_data1.csv').assign(
    target=lambda frame: frame['target'].astype(str)
)
X_val1 = df_val1.drop(columns=['id', 'target'])
y_val1 = df_val1['target']

# Validation split 2, prepared the same way as the training split
df_val2 = pd.read_csv('Validation_data2.csv').assign(
    target=lambda frame: frame['target'].astype(str)
)
X_val2 = df_val2.drop(columns=['id', 'target'])
y_val2 = df_val2['target']

# Column labels of the training features; reused to label the scaled frames
feature_names = list(X_train.columns)

# Standardise features: the scaler is fitted on the training split only and
# then applied unchanged to all three splits
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val1_scaled, X_val2_scaled = (
    pd.DataFrame(scaler.transform(split), columns=feature_names)
    for split in (X_train, X_val1, X_val2)
)

# Function to find highly correlated features using Spearman correlation
def correlation(dataset: pd.DataFrame, threshold: float) -> set:
    """Return the labels of columns that are highly correlated with an earlier column.

    The pairwise Spearman correlation matrix of ``dataset`` is computed, and a
    column is flagged when the absolute correlation between it and *any*
    column positioned before it is strictly greater than ``threshold``.

    Note that a column is flagged even when the earlier column it correlates
    with has itself been flagged; no re-check against the surviving columns
    is performed.

    Args:
        dataset: Frame whose columns are the candidate features.
        threshold: Absolute-correlation cut-off (strict ``>`` comparison).

    Returns:
        Set of column labels to drop. Pairs whose correlation is NaN are
        never flagged, because NaN does not compare greater than anything.
    """
    corr_matrix = dataset.corr(method='spearman')
    # One vectorised comparison replaces the per-pair Python loop. Keeping
    # only the strict lower triangle (k=-1) compares row i against columns
    # j < i and excludes the diagonal self-correlation.
    exceeds = np.tril(corr_matrix.abs().to_numpy() > threshold, k=-1)
    return set(corr_matrix.columns[exceeds.any(axis=1)])

# Identify the features flagged as redundant by the Spearman filter
corr_features = correlation(X_train_scaled, threshold=0.85)
print(f"Number of features removed due to high correlation: {len(corr_features)}")

# Drop the same columns from every split so the three frames stay aligned
X_train_filtered, X_val1_filtered, X_val2_filtered = (
    frame.drop(columns=corr_features)
    for frame in (X_train_scaled, X_val1_scaled, X_val2_scaled)
)

# Recursive Feature Elimination with Cross Validation: a linear-kernel SVC
# scored by accuracy over 10 stratified folds, removing one feature per step
min_features_to_select = 3
cv = StratifiedKFold(n_splits=10)
svc = SVC(C=1, kernel='linear', random_state=random_seed)
selector = RFECV(
    estimator=svc,
    cv=cv,
    step=1,
    scoring='accuracy',
    min_features_to_select=min_features_to_select,
).fit(X_train_filtered, y_train)

# Labels of the columns kept by the fitted selector
selected_features = X_train_filtered.columns[selector.get_support()]
print("Selected features:", list(selected_features))

# Keep only the selected features and save them
def save_selected_features(X, selected_columns, filename):
    """Write the ``selected_columns`` of ``X`` to ``filename`` as CSV.

    The columns are taken from ``X`` by label, so the function does not rely
    on any module-level fitted selector and the written values always match
    the written column headers.

    Args:
        X: DataFrame that contains (at least) the selected columns.
        selected_columns: Iterable of column labels to keep, in output order.
        filename: Destination path of the CSV file (written without the index).

    Raises:
        KeyError: If a requested column is not present in ``X``.
    """
    df_selected = X.loc[:, list(selected_columns)]
    df_selected.to_csv(filename, index=False)

# Write the reduced feature matrix of each split to its own CSV file
for _frame, _csv_name in (
    (X_train_filtered, 'Train_selected_features.csv'),
    (X_val1_filtered, 'Val1_selected_features.csv'),
    (X_val2_filtered, 'Val2_selected_features.csv'),
):
    save_selected_features(_frame, selected_features, _csv_name)

# Plot the mean cross-validation accuracy against the number of features kept
fig, ax = plt.subplots(figsize=(8, 5))
ax.set_title('Number of Features vs Cross-Validation Score', fontsize=12)
ax.set_xlabel('Number of Features Selected')
ax.set_ylabel('Cross-Validation Accuracy')

# The x-axis starts at min_features_to_select and has one tick per recorded score
mean_scores = selector.cv_results_['mean_test_score']
feature_counts = range(
    min_features_to_select,
    min_features_to_select + len(mean_scores),
)
ax.plot(feature_counts, mean_scores, color='m', linestyle='-', linewidth=2)

ax.grid(True, linestyle='--', alpha=0.6)
fig.tight_layout()
fig.savefig('Feature_Selection_CV_Scores.png', dpi=600)
plt.show()
