In [1]:
# IPython magic: move the kernel's working directory one level up.
# NOTE(review): presumably so the project-local `problem` and `display`
# modules imported below can be resolved — confirm.
%cd ..

/Users/erwan/Programmes/2022 Datacamp/solar_wind


#### Tests de régression logistique et de HistGradientBoosting sur une base de données ré-échantillonnée
- __But__ : Observer les performances sur une base de données "plus" équilibrée
- Toutes les variables retenues dans le pré-traitement de beta_light_1 sont utilisées
- La prédiction est lissée par une fenêtre glissante (rolling) ; ce lissage se règle via le paramètre `smooth` de `display_res`

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from problem import get_train_data, get_test_data
from problem import turn_prediction_to_event_list

# Fonction perso
from display import plot_event, multiple_plots, consecutive_plots, display_timeline, show_densities
from display import display_res


In [3]:
def get_fe_data(data, threshold=50):
    """
    Build the engineered feature table from a raw (un-thresholded) series.

    Parameters
    ----------
    data : pandas.Series
        Raw values indexed by a monotonic datetime-like index (time-based
        rolling windows are used). The notebook passes ``data_train['Beta']``.
    threshold : float, default 50
        Upper cap applied to every value before any smoothing.

    Returns
    -------
    pandas.DataFrame
        Same index as ``data``, with the columns, in this order:

        - ``base-value``  : capped series smoothed by a centred 30-minute mean
        - ``l_small_avg`` : trailing 2-hour mean of ``base-value``
        - ``r_small_avg`` : forward-looking 2-hour mean of ``base-value``
        - ``diff-mean``   : ``|r_small_avg - l_small_avg|``
        - ``mean-ratio``  : centred 4-hour mean divided by centred 4-day mean
          (the division is not guarded against a zero denominator)
    """
    # Cap extreme values, then apply a light centred moving average to reduce
    # the noise before deriving any other feature.
    var_s = data.clip(upper=threshold).rolling('30 min', center=True).mean()

    # Trailing and forward-looking short means; their gap highlights
    # transitions between states. The forward-looking mean is obtained by
    # rolling over the time-reversed series and restoring the original order.
    var_l_2h_s = var_s.rolling('2h').mean()
    var_r_2h_s = var_s.iloc[::-1].rolling('2h').mean().iloc[::-1]

    # Short centred mean relative to a very long one: highlights stretches
    # where the value is low compared with its surrounding level.
    var_4h_s = var_s.rolling('4h', center=True).mean()
    var_long_s = var_s.rolling('4 d', center=True).mean()

    return pd.DataFrame({
        'base-value': var_s,
        'l_small_avg': var_l_2h_s,
        'r_small_avg': var_r_2h_s,
        'diff-mean': (var_r_2h_s - var_l_2h_s).abs(),
        'mean-ratio': var_4h_s / var_long_s,
    })

In [4]:
# Data: load the train and test splits (features and labels) through the
# project's `problem` helpers.
data_train, labels_train = get_train_data()
data_test, labels_test = get_test_data()

# Events: turn the training labels into an event list, and do the same for
# the complementary mask (samples whose label equals 0).
events = turn_prediction_to_event_list(labels_train)
no_events = turn_prediction_to_event_list(labels_train == 0)

In [9]:
# Engineered features for both splits, derived from the raw 'Beta' column.
fe_train_data = get_fe_data(data_train['Beta'])
fe_test_data = get_fe_data(data_test['Beta'])

# Rebalance the training set: randomly select 60% of the label-0 rows
# (fixed seed, so the selection is reproducible); these rows get discarded,
# which raises the proportion of label-1 rows.
drop_fe_train = (
    fe_train_data.loc[labels_train == 0]
    .sample(frac=0.6, random_state=1)
)

# Remove the selected rows and keep only the labels of the surviving rows.
reduce_fe_data_train = fe_train_data.drop(index=drop_fe_train.index)
reduce_fe_labels_train = labels_train.loc[reduce_fe_data_train.index]

# Preview of the reduced training features.
reduce_fe_data_train.head(6)

Unnamed: 0,base-value,l_small_avg,r_small_avg,diff-mean,mean-ratio
1997-10-01 00:00:00,8.395598,8.395598,9.201718,0.806121,14.461996
1997-10-01 00:50:00,14.377697,11.763342,4.731365,7.031977,10.622299
1997-10-01 01:10:00,11.088533,11.780372,2.586794,9.193578,9.632399
1997-10-01 01:40:00,2.965246,9.97249,0.779794,9.192696,8.441518
1997-10-01 02:00:00,0.50175,8.543898,0.566436,7.977462,7.601529
1997-10-01 02:40:00,0.52809,4.731365,0.622356,4.109009,4.507745


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

In [7]:
# Feature columns fed to the model.
cols = ['base-value', 'mean-ratio', 'diff-mean', 'l_small_avg', 'r_small_avg']
print("Colonnes : ", cols)

# L2-penalised logistic regression trained on the rebalanced training set.
# `fit` returns the estimator itself, so construction and fitting are chained.
log_model = LogisticRegression(penalty='l2', max_iter=1000, random_state=0).fit(
    reduce_fe_data_train[cols],
    reduce_fe_labels_train,
)
print('Coefs :', log_model.coef_)

# Report the results on the test features, with prediction smoothing enabled.
display_res(fe_test_data[cols], labels_test, smooth=True, models=[log_model])

Colonnes :  ['base-value', 'mean-ratio', 'diff-mean', 'l_small_avg', 'r_small_avg']
Coefs : [[ 1.27625424  0.27614672  5.50891476 -5.87439751 -5.85802273]]
Loss : 0.17030031958611996
              precision    recall  f1-score   support

           0       0.98      0.96      0.97    191755
           1       0.56      0.68      0.61     13819

    accuracy                           0.94    205574
   macro avg       0.77      0.82      0.79    205574
weighted avg       0.95      0.94      0.94    205574

ev_prec 0.2987012987012987
ev_rec 0.6542056074766356
-------------


In [8]:
# Feature columns fed to the model.
cols = ['base-value', 'mean-ratio', 'diff-mean', 'l_small_avg', 'r_small_avg']
print("Colonnes : ", cols)

# Histogram-based gradient boosting trained on the rebalanced training set.
# `fit` returns the estimator itself, so construction and fitting are chained.
histboost_model = HistGradientBoostingClassifier(max_iter=1000, random_state=0).fit(
    reduce_fe_data_train[cols],
    reduce_fe_labels_train,
)

# Report the results on the test features, with prediction smoothing enabled.
display_res(fe_test_data[cols], labels_test, smooth=True, models=[histboost_model])

Colonnes :  ['base-value', 'mean-ratio', 'diff-mean', 'l_small_avg', 'r_small_avg']
Loss : 0.14957351567945368
              precision    recall  f1-score   support

           0       0.97      0.97      0.97    191755
           1       0.60      0.64      0.62     13819

    accuracy                           0.95    205574
   macro avg       0.78      0.80      0.79    205574
weighted avg       0.95      0.95      0.95    205574

ev_prec 0.3037383177570093
ev_rec 0.6168224299065421
-------------
