In [None]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import numpy as np

In [195]:
import resource

memory_limit = 100 * 1024**3 # 100GB
resource.setrlimit(resource.RLIMIT_AS, (memory_limit, memory_limit))

In [None]:
df = pd.read_csv('data/separated_datasets/dataset_18.csv')
groups = pd.read_csv('data/Heterotic_groups.csv')

In [198]:
X = df
y = groups

In [None]:
all_feature_importances = []
significant_features = []
total_features = X.shape[1]


# Division into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initiate the model
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.05,
    depth=4,
    l2_leaf_reg=10,
    loss_function='MultiClass',
    eval_metric='TotalF1',
    cat_features=X.columns.tolist(),  
    verbose=100,
    random_seed=42,
    early_stopping_rounds=20,
    used_ram_limit='32gb',
    thread_count=1,
    bagging_temperature = 0.8)

# Training
model.fit(X_train, y_train)

# Getting the importance of features
importances = model.get_feature_importance(type='FeatureImportance')
feature_series = pd.Series(importances, index=X.columns)
all_feature_importances.append(feature_series)

# Z-score normalization
mean = feature_series.mean()
std = feature_series.std()
z_scores = (feature_series - mean) / std

# Selection of features with |z| > 3
passed = feature_series[np.abs(z_scores) > 3]
if not passed.empty:
    significant_features.append(passed)

# Merging into a single DataFrame
importances_df = pd.concat(all_feature_importances, axis=0)

# Values that have passed the 3 sigma threshold
if significant_features:
    significant_features_df = pd.concat(significant_features, axis=0)
else:
    significant_features_df = pd.Series([], dtype=float)

0:	learn: 0.4173005	total: 15s	remaining: 24m 46s
99:	learn: 0.9958739	total: 25m 35s	remaining: 0us


In [None]:
# Create DataFrame with significant features
significant_features_df = significant_features_df.reset_index()
significant_features_df.columns = ['Feature', 'Importance']
significant_features_df_sorted = significant_features_df.sort_values(by='Importance', ascending=False)

In [205]:
significant_features_df_sorted.to_csv('/mnt/users/ib_2025/Corn_ML_project/data/importances/catboost_significant_importances_df18.csv', index=False)