In [89]:
import sys
from pathlib import Path
# Put the parent of the current working directory at the front of sys.path so
# that the `src.*` imports below resolve. Assumes the notebook is launched from
# a sub-folder of the project root - TODO confirm.
ROOT = Path.cwd().parent
root_str = str(ROOT)
if root_str not in sys.path:
    sys.path.insert(0, root_str)

from src.data_loader import (
    load_raw_data,
    save_processed_data,
    load_splits,
    load_processed_data
)

from src.feature_engineering import (
    get_highly_correlated_pairs,
    compare_pairs_by_target_mi,
)
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Feature Selection на основе статистических методов

In [66]:
# Raw dataset as returned by the project loader.
df = load_raw_data()

# Splits requested without scaling and with an encoded target; only the
# training features/target and the test features are pulled out here.
data = load_splits(encoded_y=True, scaled=False)
X_train, y_train, X_test = (
    data[split_name] for split_name in ("X_train", "y_train", "X_test")
)

# Scaled training features read back from the processed-data folder
# (presumably the scaled counterpart of X_train - verify against the
# preprocessing step that wrote this file).
X_train_scaled = load_processed_data("data/processed/X_train_scaled.csv")

### Ищем мультиколлинеарные признаки: это особенно важно для линейных моделей

In [18]:
# Feature pairs flagged by the project helper at a 0.70 threshold
# (presumably on absolute pairwise correlation - confirm in src.feature_engineering).
pairs = get_highly_correlated_pairs(
    X_train,
    threshold=0.7,
)

### Решаем, какой признак из пары удалить, основываясь на Mutual Information


In [20]:
# Compare the members of each flagged pair against the target using the
# project's mutual-information helper. The result is read later through its
# "drop_feature" column (presumably one row per pair - confirm in
# src.feature_engineering).
decision_table = compare_pairs_by_target_mi(pairs, X_train, y_train)


In [21]:
# De-duplicate the drop decisions: the same feature can be listed more than once
# in the "drop_feature" column, but it only needs to be dropped once.
drop_decisions = decision_table["drop_feature"]
features_to_drop = drop_decisions.unique()

In [69]:
# Remove the same correlated features from both splits so that train and test
# keep an identical column set.
X_train_cor_reduced, X_test_cor_reduced = (
    split.drop(features_to_drop, axis="columns") for split in (X_train, X_test)
)

# Persist the reduced feature sets; the final call is left as the last
# expression so its return value stays the cell output.
save_processed_data(X_train_cor_reduced, "data/processed/X_train_cor_reduced.csv")
save_processed_data(X_test_cor_reduced, "data/processed/X_test_cor_reduced.csv")



WindowsPath('C:/Users/AlexK/OneDrive/Рабочий стол/Классификаци типов личности/data/processed/X_test_cor_reduced.csv')

### На этапе статистического отбора удалено 2 признака: `public_speaking_comfort`, `alone_time_preference`.
### Новые наборы признаков сохранены в CSV-файлы.

# Использую wrapper-метод для отбора признаков: RFECV

In [62]:

# Recursive feature elimination with cross-validation: a logistic regression is
# refitted while one feature is removed per step, and each candidate subset is
# scored with macro-F1 on 5 stratified folds.
model = LogisticRegression(random_state=42, max_iter=1000)
folds = StratifiedKFold(n_splits=5)
rfecv = RFECV(model, step=1, cv=folds, scoring="f1_macro", n_jobs=-1)

# NOTE(review): X_train_scaled was scaled before the folds are formed, so the
# validation folds have presumably contributed to the scaler statistics -
# confirm whether scaling should instead happen inside a Pipeline per fold.
rfecv.fit(X_train_scaled, y_train)



0,1,2
,estimator  estimator: ``Estimator`` instance A supervised learning estimator with a ``fit`` method that provides information about feature importance either through a ``coef_`` attribute or through a ``feature_importances_`` attribute.,LogisticRegre...ndom_state=42)
,"step  step: int or float, default=1 If greater than or equal to 1, then ``step`` corresponds to the (integer) number of features to remove at each iteration. If within (0.0, 1.0), then ``step`` corresponds to the percentage (rounded down) of features to remove at each iteration. Note that the last iteration may remove fewer than ``step`` features in order to reach ``min_features_to_select``.",1
,"min_features_to_select  min_features_to_select: int, default=1 The minimum number of features to be selected. This number of features will always be scored, even if the difference between the original feature count and ``min_features_to_select`` isn't divisible by ``step``. .. versionadded:: 0.20",1
,"cv  cv: int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross-validation, - integer, to specify the number of folds. - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if ``y`` is binary or multiclass, :class:`~sklearn.model_selection.StratifiedKFold` is used. If the estimator is not a classifier or if ``y`` is neither binary nor multiclass, :class:`~sklearn.model_selection.KFold` is used. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. .. versionchanged:: 0.22  ``cv`` default value of None changed from 3-fold to 5-fold.",StratifiedKFo...shuffle=False)
,"scoring  scoring: str or callable, default=None Scoring method to evaluate the :class:`RFE` selectors' performance. Options: - str: see :ref:`scoring_string_names` for options. - callable: a scorer callable object (e.g., function) with signature  ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. - `None`: the `estimator`'s  :ref:`default evaluation criterion ` is used.",'f1_macro'
,"verbose  verbose: int, default=0 Controls verbosity of output.",0
,"n_jobs  n_jobs: int or None, default=None Number of cores to run in parallel while fitting across folds. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. .. versionadded:: 0.18",-1
,"importance_getter  importance_getter: str or callable, default='auto' If 'auto', uses the feature importance either through a `coef_` or `feature_importances_` attributes of estimator. Also accepts a string that specifies an attribute name/path for extracting feature importance. For example, give `regressor_.coef_` in case of :class:`~sklearn.compose.TransformedTargetRegressor` or `named_steps.clf.feature_importances_` in case of :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`. If `callable`, overrides the default feature importance getter. The callable is passed with the fitted estimator and it should return importance for each feature. .. versionadded:: 0.24",'auto'

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",42
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. 
seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [75]:
# rfecv.support_ is a boolean mask over the columns RFECV was fitted on
# (X_train_scaled). Take the names from the fitted selector itself, so the mask
# can never be applied to a frame with a different column order; indexing
# X_train.columns with it would silently mislabel features in that case.
fitted_features = pd.Index(rfecv.feature_names_in_)
droped_features = fitted_features[~rfecv.support_]

# Drop by name from the unscaled splits; a KeyError here means the scaled and
# unscaled frames do not share the same feature names.
X_train_rfecv_reduced = X_train.drop(columns=droped_features)
X_test_rfecv_reduced = X_test.drop(columns=droped_features)

# Persist both reduced splits; the last call stays the final expression so its
# return value remains the cell output.
save_processed_data(X_train_rfecv_reduced, "data/processed/X_train_rfecv_reduced.csv")
save_processed_data(X_test_rfecv_reduced, "data/processed/X_test_rfecv_reduced.csv")

WindowsPath('C:/Users/AlexK/OneDrive/Рабочий стол/Классификаци типов личности/data/processed/X_test_rfecv_reduced.csv')

## Признаки, удалённые после использования оберточного метода: `creativity`, `emotional_stability`, `stress_handling`

# FEATURE ENGINEERING
### Создаю несколько признаков исходя из домена
### `social_index` — агрегирует признаки, которые показывают общую «ориентацию на внешний социальный контакт». Ожидаемая связь с таргетом: чем выше значение, тем выше склонность к экстраверсии.
### `introversion_index` — интроверсия проявляется как предпочтение уединения, внутренней рефлексии, склонность к спокойным занятиям (чтение, рутина), а также склонность слушать, а не выступать.
### `ie_balance` и `ie_abs_balance` — разница между показателями экстраверсии и интроверсии прямо отражает направление индивидуальных предпочтений. Эти признаки показывают склонность к типу личности: `ie_balance > 0` → склонность к экстраверсии, `ie_balance < 0` → склонность к интроверсии; малое значение `ie_abs_balance` → ближе к амбиверсии.
### `activity_index` — активность и поиск стимуляции (sensation seeking) коррелируют с экстраверсией, но частично независимы от неё (связь с импульсивностью). Этот индекс полезен для разделения «социальной» и «поведенческой» составляющих экстраверсии.


In [85]:
# Base feature table for the engineered indices, read from the processed-data
# folder. The cells below expect it to contain the individual trait columns
# (e.g. 'social_energy', 'deep_reflection', 'risk_taking').
Features = load_processed_data("data/processed/Features.csv")

In [87]:
# Work on an independent (deep) copy so that adding engineered columns never
# modifies the frame that was loaded from disk.
new_features = Features.copy(deep=True)

In [90]:

def _mean_of_columns(frame, columns):
    """Return the row-wise arithmetic mean of `columns`, added left to right."""
    total = frame[columns[0]]
    for column in columns[1:]:
        total = total + frame[column]
    return total / len(columns)


# Outward social orientation: mean of five sociability traits.
new_features['social_index'] = _mean_of_columns(
    new_features,
    ['social_energy', 'group_comfort', 'party_liking', 'friendliness', 'talkativeness'],
)

# Preference for solitude, reflection and quiet activities: mean of five traits.
new_features['introversion_index'] = _mean_of_columns(
    new_features,
    ['alone_time_preference', 'deep_reflection', 'reading_habit',
     'routine_preference', 'listening_skill'],
)

# Signed difference between the two indices, plus its magnitude.
new_features['ie_balance'] = new_features['social_index'] - new_features['introversion_index']
new_features['ie_abs_balance'] = new_features['ie_balance'].abs()  # small value -> closer to ambiversion

# Activity / stimulation seeking: mean of five traits.
new_features['activity_index'] = _mean_of_columns(
    new_features,
    ['sports_interest', 'adventurousness', 'excitement_seeking',
     'risk_taking', 'travel_desire'],
)


In [92]:
# Persist the frame with the engineered indices. The call is the cell's last
# expression, so whatever the helper returns (the recorded output shows a path)
# is displayed as the cell output.
save_processed_data(new_features, "data/processed/new_features.csv")

WindowsPath('C:/Users/AlexK/OneDrive/Рабочий стол/Классификаци типов личности/data/processed/new_features.csv')