In [3]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
save_directory = "my_datasets"
import time
import concurrent.futures
from sklearn.preprocessing import LabelEncoder

In [4]:
# Names of the classification datasets. This list is not referenced by any other
# cell visible in this notebook.
list_df_classif=['adult', 'aloi',  'churn',   'eye',  'gesture', 'helena', 'higgs-small',  'jannis',  'otto']

In [13]:
# Names of the regression datasets. Each name is used as the file-name prefix
# (`{dataset}_secondorder_{config}_...csv`) when the splits are loaded below.
list_df_regr=['california_housing', 'microsoft','year', 'house']

## Отбор признаков с помощью алгоритма Grow-Shrink

In [None]:
from grow_shrink import GrowShrink

In [None]:
# Conditional-independence tests tried by the Grow-Shrink runs.
# NOTE(review): `PartialCorrelation` is not imported in any cell visible here;
# this line raises NameError on a fresh kernel unless it is imported elsewhere.
CI_tests = [PartialCorrelation()]  
# p-value thresholds passed to Grow-Shrink (`pvalue_thres`).
p_values = [0.01, 0.05, 0.1]



# Suffixes that select the dataset variant in the CSV file names.
dataset_configs = ['0', '01', '02', '03']

# Accumulator for result rows (re-initialised by the driver cell below).
gs_results = []
# Per-run time budget in seconds (10 minutes), read by run_with_timeout.
timeout = 10 * 60  


def gs_algorithm_running(X_train, X_test, y_train, y_test, CI_test, pva):
    """Run Grow-Shrink feature selection on the training split and time it.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    X_test, y_test :
        Unused; accepted so every selector shares the same call signature.
    y_train : array-like
        Training target; appended to the features as a column named 'target'.
    CI_test :
        Conditional-independence test object handed to ``GrowShrink``.
    pva : float
        p-value threshold forwarded to ``GrowShrink.run`` as ``pvalue_thres``.

    Returns
    -------
    tuple
        ``(selected_features, elapsed_time)``: whatever ``GrowShrink.run``
        returns, plus the elapsed wall-clock time in seconds.
    """
    start_time = time.time()

    target_var = 'target'

    y_train_df = pd.DataFrame(y_train, columns=[target_var])
    # Pin the target to X_train's index so concat pairs rows positionally
    # instead of silently producing NaN rows when the index is not 0..n-1.
    y_train_df.index = X_train.index
    train_df = pd.concat([X_train, y_train_df], axis=1)

    data_array = train_df.values

    standardizer = StandardizeTransform()
    standardizer.fit(data_array)
    data = standardizer.transform(data_array)

    # NOTE(review): only the bare array is passed on, not the column names --
    # confirm that GrowShrink can resolve the 'target' variable from it.
    # Use the CI test given as an argument (the previous code read a global
    # `ci_test`, ignoring this parameter).
    gs = GrowShrink(data=data, CI_test=CI_test)
    selected_features = gs.run(target_var=target_var, pvalue_thres=pva)

    elapsed_time = time.time() - start_time

    return selected_features, elapsed_time


def run_with_timeout(func, *args):
    """Call ``func(*args)`` in a worker thread, giving up after ``timeout`` seconds.

    ``timeout`` is the module-level time budget in seconds.

    Returns
    -------
    The value returned by ``func`` or, when the budget is exceeded, the
    sentinel pair ``(["max"], "max")``.

    Notes
    -----
    A thread cannot be killed, so after a timeout the worker keeps running in
    the background until ``func`` returns; only the caller is released.
    """
    # Deliberately not a `with` block: leaving one calls shutdown(wait=True),
    # which would block until the worker finishes and defeat the timeout.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    future = executor.submit(func, *args)
    try:
        return future.result(timeout=timeout)
    except concurrent.futures.TimeoutError:
        return ["max"], "max"
    finally:
        executor.shutdown(wait=False)

In [None]:
# Run Grow-Shrink for every regression dataset / config / CI test / p-value
# combination, collect one result row per run and save the table as CSV.
gs_results = []
for dataset in list_df_regr:
    print(dataset)

    for config in dataset_configs:
        print(f"Running for dataset config: {config}")

        X_train = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config}_X_train.csv")
        X_test = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config}_X_test.csv")
        y_train = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config}_y_train.csv").values.ravel()
        y_test = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config}_y_test.csv").values.ravel()

        print(X_train.shape)

        for ci_test in CI_tests:
            ci_name = ci_test.__class__.__name__
            print(ci_name)

            for p_value in p_values:
                feature_gs, gs_time = run_with_timeout(
                    gs_algorithm_running,
                    X_train, X_test, y_train, y_test, ci_test, p_value,
                )

                # A timed-out run comes back as the sentinel (["max"], "max").
                if feature_gs != ["max"]:
                    n_selected = len(feature_gs)
                else:
                    n_selected = "max"

                gs_results.append(dict(
                    dataset=dataset,
                    config=config,
                    CI_test=ci_name,
                    p_value=p_value,
                    gs_features=feature_gs,
                    gs_time=gs_time,
                    n_features=n_selected,
                ))

results_df = pd.DataFrame(gs_results)
results_df.to_csv('features_list_gs_regr.csv', index=False)

# Алгоритмы отбора признаков для сравнения

### Алгоритмы отбора признаков, адаптированные под задачу классификации

In [None]:
def algo_1(X_train, X_test, y_train, y_test):
    """Select features with a univariate ANOVA F-test (false-positive-rate mode, 0.01).

    ``X_test`` and ``y_test`` are accepted but not used.

    Returns ``(selected_features, elapsed_time)``: the names of the retained
    columns of ``X_train`` and the elapsed wall-clock time in seconds.
    """
    from sklearn.feature_selection import GenericUnivariateSelect, f_classif

    started = time.time()

    selector = GenericUnivariateSelect(score_func=f_classif, mode='fpr', param=0.01)
    # Only the fitted support mask is needed; the transformed matrix is discarded.
    selector.fit_transform(X_train, y_train)
    keep_mask = selector.get_support()
    chosen = X_train.columns[keep_mask].tolist()

    finished = time.time()
    return chosen, finished - started

In [None]:
def algo_lasso(X_train, X_test, y_train, y_test):
    """Select the features whose Lasso (alpha=0.1) coefficient is non-zero.

    The features are standardised before fitting. ``X_test`` and ``y_test``
    are accepted but not used.

    NOTE(review): ``Lasso`` is a regression model, so in this classification
    section ``y_train`` has to be numeric -- confirm this is intended.

    Returns ``(selected_features, elapsed_time)`` with the time in seconds.
    """
    started = time.time()

    standardized = StandardScaler().fit_transform(X_train)

    model = Lasso(alpha=0.1)
    model.fit(standardized, y_train)

    kept = X_train.columns[model.coef_ != 0].tolist()

    finished = time.time()
    return kept, finished - started

In [None]:
def xgb_f(X_train, X_test, y_train, y_test, n_features):
    """Return the ``n_features`` most important features of an XGBoost classifier.

    Labels are integer-encoded before fitting. Test rows whose label was not
    seen in training are dropped before the test labels are encoded; the test
    split does not otherwise influence the result.

    Returns ``(feature_list, elapsed_time)`` with the time in seconds.
    """
    started = time.time()
    from sklearn.preprocessing import LabelEncoder

    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)

    # Labels that occur only in the test split cannot be encoded: drop those rows.
    unknown = set(y_test) - set(encoder.classes_)
    if unknown:
        known_rows = ~np.isin(y_test, list(unknown))
        X_test, y_test = X_test[known_rows], y_test[known_rows]

    y_test = encoder.transform(y_test)

    booster = xgb.XGBClassifier()
    booster.fit(X_train, y_train)

    ranking = (
        pd.DataFrame({'Feature': X_train.columns, 'Importance': booster.feature_importances_})
        .sort_values(by='Importance', ascending=False)
        .head(n_features)
    )
    top_features = ranking['Feature'].tolist()

    return top_features, time.time() - started

In [None]:
def random_forest(X_train, X_test, y_train, y_test, n_features):
    """Return the ``n_features`` most important features of a random-forest classifier.

    The forest uses 100 trees and ``random_state=42``. ``X_test`` and
    ``y_test`` are accepted but not used.

    Returns ``(feature_list, elapsed_time)`` with the time in seconds.
    """
    started = time.time()

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)

    ranking = (
        pd.DataFrame({'Feature': X_train.columns, 'Importance': forest.feature_importances_})
        .sort_values(by='Importance', ascending=False)
        .head(n_features)
    )
    top_features = ranking['Feature'].tolist()

    return top_features, time.time() - started

### Алгоритмы отбора признаков, адаптированные под задачу регрессии

In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import pandas as pd
import numpy as np
import time
from sklearn.ensemble import RandomForestRegressor


def algo_1(X_train, X_test, y_train, y_test):
    """Select features with a univariate F-test for regression (FPR mode, 0.01).

    This definition replaces the classification ``algo_1`` defined in an
    earlier cell. ``X_test`` and ``y_test`` are accepted but not used.

    Returns ``(selected_features, elapsed_time)``: the names of the retained
    columns of ``X_train`` and the elapsed wall-clock time in seconds.
    """
    from sklearn.feature_selection import GenericUnivariateSelect, f_regression

    started = time.time()

    selector = GenericUnivariateSelect(score_func=f_regression, mode='fpr', param=0.01)
    # Only the fitted support mask is needed; the transformed matrix is discarded.
    selector.fit_transform(X_train, y_train)
    keep_mask = selector.get_support()
    chosen = X_train.columns[keep_mask].tolist()

    finished = time.time()
    return chosen, finished - started


def algo_lasso(X_train, X_test, y_train, y_test):
    """Select the features whose Lasso (alpha=0.1) coefficient is non-zero.

    The features are standardised before fitting. ``X_test`` and ``y_test``
    are accepted but not used.

    Returns ``(selected_features, elapsed_time)`` with the time in seconds.
    """
    started = time.time()

    standardized = StandardScaler().fit_transform(X_train)

    model = Lasso(alpha=0.1)
    model.fit(standardized, y_train)

    kept = X_train.columns[model.coef_ != 0].tolist()

    finished = time.time()
    return kept, finished - started

def xgb_f(X_train, X_test, y_train, y_test, n_features):
    """Return the ``n_features`` most important features of an XGBoost regressor.

    This definition replaces the classification ``xgb_f`` defined in an
    earlier cell. ``X_test`` and ``y_test`` are accepted but not used.

    Returns ``(feature_list, elapsed_time)`` with the time in seconds.
    """
    started = time.time()

    booster = xgb.XGBRegressor()
    booster.fit(X_train, y_train)

    ranking = (
        pd.DataFrame({'Feature': X_train.columns, 'Importance': booster.feature_importances_})
        .sort_values(by='Importance', ascending=False)
        .head(n_features)
    )
    top_features = ranking['Feature'].tolist()

    return top_features, time.time() - started

def random_forest(X_train, X_test, y_train, y_test, n_features):
    """Return the ``n_features`` most important features of a random-forest regressor.

    This definition replaces the classification ``random_forest`` defined in
    an earlier cell. The forest uses 100 trees and ``random_state=42``.
    ``X_test`` and ``y_test`` are accepted but not used.

    Returns ``(feature_list, elapsed_time)`` with the time in seconds.
    """
    started = time.time()

    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)

    ranking = (
        pd.DataFrame({'Feature': X_train.columns, 'Importance': forest.feature_importances_})
        .sort_values(by='Importance', ascending=False)
        .head(n_features)
    )
    top_features = ranking['Feature'].tolist()

    return top_features, time.time() - started


In [None]:
import pandas as pd
import time
# Run the baseline selectors (univariate test, Lasso, XGBoost, random forest)
# once per (dataset, config), using the Grow-Shrink run at p_value == 0.1 as the
# reference: XGBoost and random forest keep as many features as Grow-Shrink did.
# NOTE(review): this reads 'features_list_gs.csv', while the Grow-Shrink cell in
# this notebook writes 'features_list_gs_regr.csv' -- confirm the intended input.
gs_results_df = pd.read_csv('features_list_gs.csv')


algo1_results = []
lasso_results = []
xgb_results = []
rf_results = []


for idx, row in gs_results_df.iterrows():
    dataset = row['dataset']
    p_value = row['p_value']
    n_features = row['n_features']

    # Filter on the p-value first so that every (dataset, config) yields exactly
    # one row per baseline, whether or not the Grow-Shrink run timed out.
    if p_value != 0.1:
        continue

    # The CSV round trip turns '01' into the number 1; rebuild the file suffix.
    config_id = int(row['config'])
    config = '0' if config_id == 0 else f"0{config_id}"

    # Grow-Shrink timed out for this configuration: there is no feature budget
    # to compare against, so record placeholder rows. str() keeps the check
    # valid whether pandas parsed the column as numbers or as strings.
    if str(n_features) == 'max':
        for bucket in (algo1_results, lasso_results, xgb_results, rf_results):
            bucket.append({'dataset': dataset, 'config': config, 'selected_features': 0, 'time': 0})
        continue

    n_features = int(n_features)

    print(f"Running for dataset: {dataset}, config: {config}, p_value: {p_value}")

    X_train = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config}_X_train.csv")
    X_test = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config}_X_test.csv")
    y_train = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config}_y_train.csv").values.ravel()
    y_test = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config}_y_test.csv").values.ravel()

    selected_features_algo1, algo1_time = algo_1(X_train, X_test, y_train, y_test)
    algo1_results.append({'dataset': dataset, 'config': config, 'selected_features': selected_features_algo1, 'time': algo1_time})

    selected_features_lasso, lasso_time = algo_lasso(X_train, X_test, y_train, y_test)
    lasso_results.append({'dataset': dataset, 'config': config, 'selected_features': selected_features_lasso, 'time': lasso_time})

    selected_features_xgb, xgb_time = xgb_f(X_train, X_test, y_train, y_test, n_features)
    xgb_results.append({'dataset': dataset, 'config': config, 'selected_features': selected_features_xgb, 'time': xgb_time})

    selected_features_rf, rf_time = random_forest(X_train, X_test, y_train, y_test, n_features)
    rf_results.append({'dataset': dataset, 'config': config, 'selected_features': selected_features_rf, 'time': rf_time})

algo1_df = pd.DataFrame(algo1_results)
lasso_df = pd.DataFrame(lasso_results)
xgb_df = pd.DataFrame(xgb_results)
rf_df = pd.DataFrame(rf_results)

algo1_df.to_csv('algo1_results.csv', index=False)
lasso_df.to_csv('lasso_results.csv', index=False)
xgb_df.to_csv('xgb_results.csv', index=False)
rf_df.to_csv('rf_results.csv', index=False)

print("Feature selection results saved.")


# Модели классификации для проверки качества

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
import lightgbm as lgb
import time

def train_mlp_classifier(X_train, X_test, y_train, y_test, feature_subset=None):
    """Fit an MLP classifier and score it with macro-averaged F1 on the test split.

    When ``feature_subset`` is given, both splits are restricted to those
    columns first. The timer covers fitting, prediction and scoring.

    Returns ``(f1, elapsed_time)`` with the time in seconds.
    """
    if feature_subset is not None:
        X_train, X_test = X_train[feature_subset], X_test[feature_subset]

    started = time.time()

    classifier = MLPClassifier(max_iter=200, random_state=42)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    f1 = f1_score(y_test, predictions, average='macro')

    finished = time.time()
    return f1, finished - started


def train_lgbm_classifier(X_train, X_test, y_train, y_test, feature_subset=None):
    """Fit a LightGBM classifier and score it with macro-averaged F1 on the test split.

    When ``feature_subset`` is given, both splits are restricted to those
    columns first. The timer covers fitting, prediction and scoring.

    Returns ``(f1, elapsed_time)`` with the time in seconds.
    """
    if feature_subset is not None:
        X_train, X_test = X_train[feature_subset], X_test[feature_subset]

    started = time.time()

    classifier = lgb.LGBMClassifier(max_iter=200, random_state=42, verbosity=-1)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    f1 = f1_score(y_test, predictions, average='macro')

    finished = time.time()
    return f1, finished - started

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Score the Grow-Shrink feature subsets with the classification models defined
# above (MLP and LightGBM, macro F1) and append the scores to the results table.
import ast

gs_results_df = pd.read_csv('features_list_gs.csv')


mlp_f1_scores = []
mlp_times = []
lgbm_f1_scores = []
lgbm_times = []

for idx, row in gs_results_df.iterrows():
    dataset = row['dataset']
    config = row['config']
    gs_features = row['gs_features']

    # The CSV round trip turns '01' into the number 1; rebuild the file suffix.
    if config == 0:
        config_str = '0'
    else:
        config_str = f"0{config:.0f}"

    X_train = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config_str}_X_train.csv")
    X_test = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config_str}_X_test.csv")

    y_train = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config_str}_y_train.csv").values.ravel()
    y_test = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config_str}_y_test.csv").values.ravel()

    # The feature list was stored as the repr of a Python list. Parse it
    # properly so feature names containing commas or quotes survive.
    try:
        parsed_features = ast.literal_eval(gs_features)
    except (ValueError, SyntaxError):
        # Not a valid literal (e.g. a bare `max`): fall back to manual splitting.
        parsed_features = gs_features.strip('[]').replace("'", "").split(', ')

    # No selection available -> train on all features. A timed-out run is saved
    # as the list ['max'], which reads back from CSV as the string "['max']".
    if parsed_features in ([], ['max'], ['']):
        selected_features = None
    else:
        selected_features = list(parsed_features)

    if selected_features is not None:
        X_train_selected = X_train[selected_features]
        X_test_selected = X_test[selected_features]
    else:
        X_train_selected = X_train
        X_test_selected = X_test

    print(selected_features)

    # Use the classifiers of this section: the regressor helpers are defined
    # only in a later cell and return RMSE, not F1.
    mlp_f1, mlp_time = train_mlp_classifier(X_train_selected, X_test_selected, y_train, y_test)
    lgbm_f1, lgbm_time = train_lgbm_classifier(X_train_selected, X_test_selected, y_train, y_test)

    mlp_f1_scores.append(mlp_f1)
    mlp_times.append(mlp_time)
    lgbm_f1_scores.append(lgbm_f1)
    lgbm_times.append(lgbm_time)

gs_results_df['mlp_f1'] = mlp_f1_scores
gs_results_df['mlp_time'] = mlp_times
gs_results_df['lgbm_f1'] = lgbm_f1_scores
gs_results_df['lgbm_time'] = lgbm_times


gs_results_df.to_csv('features_list_gs_with_configs_and_f1.csv', index=False)

print("F1 scores and times added to the dataset and saved.")

# Модели регрессии для оценки качества

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
import lightgbm as lgb
import xgboost as xgb

def train_mlp_regressor(X_train, X_test, y_train, y_test, feature_subset=None):
    """Grid-search an MLP regressor (3-fold CV) and report its test RMSE.

    When ``feature_subset`` is given, both splits are restricted to those
    columns first. The timer covers the grid search, prediction and scoring.

    Returns ``(rmse, elapsed_time)``, both rounded to two decimals; the
    unrounded RMSE is printed.
    """
    if feature_subset is not None:
        X_train, X_test = X_train[feature_subset], X_test[feature_subset]

    search_space = dict(
        alpha=[0.001, 0.01],
        learning_rate_init=[0.01, 0.1],
        max_iter=[1000],
    )

    started = time.time()

    search = GridSearchCV(
        MLPRegressor(random_state=42),
        search_space,
        cv=3,
        scoring='neg_mean_squared_error',
    )
    search.fit(X_train, y_train)

    predictions = search.best_estimator_.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    elapsed_time = time.time() - started

    print(f"MLP RMSE: {rmse}")

    return round(rmse, 2), round(elapsed_time, 2)


def train_lgbm_regressor(X_train, X_test, y_train, y_test, feature_subset=None):
    """Grid-search a LightGBM regressor (3-fold CV) and report its test RMSE.

    When ``feature_subset`` is given, both splits are restricted to those
    columns first. The timer covers the grid search, prediction and scoring.

    Returns ``(rmse, elapsed_time)``, both rounded to two decimals; the
    unrounded RMSE is printed.
    """
    if feature_subset is not None:
        X_train, X_test = X_train[feature_subset], X_test[feature_subset]

    search_space = dict(
        n_estimators=[500],
        learning_rate=[0.05, 0.1],
        num_leaves=[100],
    )

    started = time.time()

    search = GridSearchCV(
        lgb.LGBMRegressor(random_state=42, verbosity=-1),
        search_space,
        cv=3,
        scoring='neg_mean_squared_error',
    )
    search.fit(X_train, y_train)

    predictions = search.best_estimator_.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    elapsed_time = time.time() - started

    print(f"LGBM RMSE: {rmse}")

    return round(rmse, 2), round(elapsed_time, 2)


def train_xgb_regressor(X_train, X_test, y_train, y_test, feature_subset=None):
    """Grid-search an XGBoost regressor (3-fold CV) and report its test RMSE.

    When ``feature_subset`` is given, both splits are restricted to those
    columns first. The timer covers the grid search, prediction and scoring.

    Returns ``(rmse, elapsed_time)``, both rounded to two decimals; the
    unrounded RMSE is printed.
    """
    if feature_subset is not None:
        X_train, X_test = X_train[feature_subset], X_test[feature_subset]

    search_space = dict(
        n_estimators=[500],
        learning_rate=[0.05, 0.1],
        max_depth=[6],
    )

    started = time.time()

    search = GridSearchCV(
        xgb.XGBRegressor(objective='reg:squarederror', random_state=42, verbosity=0),
        search_space,
        cv=3,
        scoring='neg_mean_squared_error',
    )
    search.fit(X_train, y_train)

    predictions = search.best_estimator_.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    elapsed_time = time.time() - started

    print(f"XGBoost RMSE: {rmse}")

    return round(rmse, 2), round(elapsed_time, 2)


def train_linear_regressor(X_train, X_test, y_train, y_test, feature_subset=None):
    """Fit a ridge regression (alpha=1.0) and report its test RMSE.

    Despite the function name and the printed label, the model is ``Ridge``,
    not plain ``LinearRegression``. When ``feature_subset`` is given, both
    splits are restricted to those columns first.

    Returns ``(rmse, elapsed_time)``, both rounded to two decimals; the
    unrounded RMSE is printed.
    """
    if feature_subset is not None:
        X_train, X_test = X_train[feature_subset], X_test[feature_subset]

    started = time.time()

    ridge = Ridge(alpha=1.0)
    ridge.fit(X_train, y_train)

    predictions = ridge.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    elapsed_time = time.time() - started

    print(f"Linear Regression RMSE: {rmse}")

    return round(rmse, 2), round(elapsed_time, 2)

In [None]:
import pandas as pd


# Train all four regressors on the full feature set of every regression dataset
# and configuration, then save the RMSE / timing table as CSV.
dataset_configs = ['0', '01', '02', '03']

datasets, configs = [], []
mlp_rmse_scores, mlp_times = [], []
lgbm_rmse_scores, lgbm_times = [], []
xgb_rmse_scores, xgb_times = [], []
linreg_rmse_scores, linreg_times = [], []

for dataset in list_df_regr:
    for config in dataset_configs:

        X_train = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config}_X_train.csv")
        X_test = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config}_X_test.csv")
        y_train = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config}_y_train.csv").values.ravel()
        y_test = pd.read_csv(f"{save_directory}/{dataset}_secondorder_{config}_y_test.csv").values.ravel()

        # Same order as before: MLP, LightGBM, XGBoost, ridge.
        mlp_rmse, mlp_time = train_mlp_regressor(X_train, X_test, y_train, y_test)
        lgbm_rmse, lgbm_time = train_lgbm_regressor(X_train, X_test, y_train, y_test)
        xgb_rmse, xgb_time = train_xgb_regressor(X_train, X_test, y_train, y_test)
        linreg_rmse, linreg_time = train_linear_regressor(X_train, X_test, y_train, y_test)

        # Append to each column list only after all four models have finished.
        for column, value in (
            (datasets, dataset),
            (configs, config),
            (mlp_rmse_scores, mlp_rmse),
            (mlp_times, mlp_time),
            (lgbm_rmse_scores, lgbm_rmse),
            (lgbm_times, lgbm_time),
            (xgb_rmse_scores, xgb_rmse),
            (xgb_times, xgb_time),
            (linreg_rmse_scores, linreg_rmse),
            (linreg_times, linreg_time),
        ):
            column.append(value)

results_df = pd.DataFrame(dict(
    dataset=datasets,
    config=configs,
    mlp_rmse=mlp_rmse_scores,
    mlp_time=mlp_times,
    lgbm_rmse=lgbm_rmse_scores,
    lgbm_time=lgbm_times,
    xgb_rmse=xgb_rmse_scores,
    xgb_time=xgb_times,
    linreg_rmse=linreg_rmse_scores,
    linreg_time=linreg_times,
))

results_df.to_csv('mlp_lgbm_xgb_linreg_rmse_results_regr_with_configs.csv', index=False)

print("RMSE scores and times saved for MLP, LGBM, XGBoost, and Linear Regression for all datasets and configurations.")


