In [1]:
# Move up one directory so that the `problem` and `display` modules imported below resolve.
# NOTE(review): this is relative, so re-running the cell without restarting the kernel
# moves up one more level each time.
%cd ..

/Users/erwan/Programmes/2022 Datacamp/solar_wind


#### Tests Régression Logistique sur deux pré-traitements de données différents
- __But__ : Conclure sur l'importance de certaines données pour les performances du modèle
- Pré-traitement de beta_light_1 : comparaison avec et sans r_small et l_small
- Pas de ré-échantillonnage des données
- Le predict utilise un rolling

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss, classification_report

from problem import get_train_data, get_test_data
from problem import turn_prediction_to_event_list

# Fonction perso
from display import plot_event, multiple_plots, consecutive_plots, display_timeline, show_densities
from display import display_res


In [3]:
def fe_data_v1(data):
    """
    Build the "v1" feature table from the raw ``Beta`` column of ``data``.

    ``data`` must be indexed by timestamps, because every window below is
    expressed as a time offset. The ``Beta`` values are capped at 50 and
    lightly smoothed (centred 30-minute mean) to reduce noise before any
    other feature is derived.

    Returns a DataFrame sharing the index of ``data`` with the columns:

    - ``base``: capped and smoothed signal;
    - ``l_small_avg``: 3-hour rolling mean of ``base`` (default, backward-looking window);
    - ``r_small_avg``: the same 3-hour rolling mean computed on the reversed
      series, i.e. looking forward in time;
    - ``diff-mean``: ``r_small_avg - l_small_avg``;
    - ``mean-ratio``: centred 4-hour mean divided by centred 4-day mean.
    """
    beta_cap = 50
    smoothed = data['Beta'].clip(upper=beta_cap).rolling('30 min', center=True).mean()

    # Centred short-term and long-term levels, only used for the ratio feature.
    short_centered = smoothed.rolling('4h', center=True).mean()
    long_centered = smoothed.rolling('4 d', center=True).mean()

    # One-sided 3h means: the regular rolling window looks at the past; running it on
    # the reversed series and flipping the result back gives the mirror-image window.
    left_avg = smoothed.rolling('3h').mean()
    right_avg = smoothed.iloc[::-1].rolling('3h').mean().iloc[::-1]

    return pd.DataFrame({
        'base': smoothed,
        'l_small_avg': left_avg,
        'r_small_avg': right_avg,
        'diff-mean': right_avg - left_avg,
        'mean-ratio': short_centered / long_centered,
    })

def fe_data_v2(data):
    """
    Build the "v2" feature table from the raw ``Beta`` column of ``data``.

    ``data`` must be indexed by timestamps, because every window below is
    expressed as a time offset. The ``Beta`` values are capped at 50 and
    lightly smoothed (centred 30-minute mean) before the other features are
    derived. All windows are centred.

    Returns a DataFrame sharing the index of ``data`` with the columns:

    - ``base``: capped and smoothed signal;
    - ``small_avg``: centred 5-hour mean of ``base``;
    - ``global_avg``: centred 4-day mean of ``base``;
    - ``mean-ratio``: ``small_avg / global_avg``;
    - ``std``: centred 20-hour rolling standard deviation of ``small_avg``,
      with missing values back-filled from the next valid value.
    """
    seuil = 50
    # Vectorised cap (NaN values are left untouched), then light smoothing.
    var_s = data['Beta'].clip(upper=seuil).rolling('30 min', center=True).mean()

    # Short-term and long-term centred levels of the smoothed signal.
    var_small_s = var_s.rolling('5 h', center=True).mean()
    var_long_s = var_s.rolling('4 d', center=True).mean()

    df = pd.DataFrame({
        'base': var_s,
        'small_avg': var_small_s,
        'global_avg': var_long_s,
        'mean-ratio': var_small_s / var_long_s,
    })

    # A window holding a single sample has no standard deviation (NaN); fill those
    # gaps backwards. `.bfill()` replaces the deprecated `fillna(method='bfill')`.
    df['std'] = var_small_s.rolling('20 h', center=True).std().bfill()

    return df

In [4]:
# Données
data_train, labels_train = get_train_data()
data_test, labels_test = get_test_data()

# Évenements
events = turn_prediction_to_event_list(labels_train)
no_events = turn_prediction_to_event_list(labels_train == 0)

In [5]:
# Build both feature sets (v1 and v2), separately for the train and the test data.
# NOTE(review): fe_data_v1 / fe_data_v2 use centred rolling windows, so each row's
# features also depend on later samples of the same split.
fe_data_train_v1 = fe_data_v1(data_train)
fe_data_test_v1 = fe_data_v1(data_test)

fe_data_train_v2 = fe_data_v2(data_train)
fe_data_test_v2 = fe_data_v2(data_test)

In [6]:
# Quick look at the first rows of the v1 training features.
fe_data_train_v1.head(2)

Unnamed: 0,base,l_small_avg,r_small_avg,diff-mean,mean-ratio
1997-10-01 00:00:00,8.395598,8.395598,6.308409,-2.087189,14.461996
1997-10-01 00:10:00,9.571798,8.983698,5.877799,-3.105899,13.486491


In [7]:
# Quick look at the first rows of the v2 test features.
fe_data_test_v2.head(2)

Unnamed: 0,base,small_avg,global_avg,mean-ratio,std
2008-01-01 00:00:00,0.903901,0.837953,1.996491,0.419713,0.139008
2008-01-01 00:10:00,0.895658,0.862736,1.990628,0.433399,0.144529


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier

In [9]:
# L2-penalised logistic regression with a fixed seed. This single instance is
# shared by the two experiments below: each of them calls `fit` on it again.
log_model = LogisticRegression(
    penalty="l2",
    max_iter=1000,
    random_state=0,
    solver='lbfgs'
)

In [10]:
# Bdd 1
print(fe_data_train_v1.drop(columns=['diff-mean', 'mean-ratio']).columns)
log_model.fit(fe_data_train_v1.drop(columns=['diff-mean', 'mean-ratio']), labels_train)

display_res(fe_data_test_v1.drop(columns=['diff-mean', 'mean-ratio']), labels_test, smooth=False, models=[log_model])

Index(['base', 'l_small_avg', 'r_small_avg'], dtype='object')
Loss : 0.14240060109566596
              precision    recall  f1-score   support

           0       0.94      1.00      0.97    191755
           1       0.98      0.14      0.25     13819

    accuracy                           0.94    205574
   macro avg       0.96      0.57      0.61    205574
weighted avg       0.94      0.94      0.92    205574

ev_prec 0.2222222222222222
ev_rec 0.03738317757009346
-------------


In [11]:
# Bdd 2 
print(fe_data_train_v2.columns)
log_model.fit(fe_data_train_v2, labels_train)


display_res(fe_data_test_v2, labels_test, smooth=False, models=[log_model])

Index(['base', 'small_avg', 'global_avg', 'mean-ratio', 'std'], dtype='object')
Loss : 0.14341102610252973
              precision    recall  f1-score   support

           0       0.94      1.00      0.97    191755
           1       0.91      0.13      0.23     13819

    accuracy                           0.94    205574
   macro avg       0.93      0.57      0.60    205574
weighted avg       0.94      0.94      0.92    205574

ev_prec 0.19999999999999996
ev_rec 0.028037383177570097
-------------
