# ⚙️ Initiez-vous au MLOps (partie 1/2)

## 📐 Feature engineering
### 🛠️ Préparez l'environnement de travail
#### 📦 Import des modules python

In [19]:
import sys
import os
# Make the project's local packages importable from this notebook.
# NOTE: '__file__' is a quoted string literal here, not the __file__ variable:
# os.path.abspath() resolves it against the current working directory, so the
# two dirname() calls yield the PARENT of the working directory.
# Presumably that parent is the repository root containing `src/` — confirm
# the notebook is always launched from its own folder.
# This line must run before the `from src...` import below.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))

import pandas as pd
import numpy as np
from src.visualization.visu_text import print_title, print_end, print_col, quick_df_info
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset that the feature-engineering cells below extend.
# The path is relative to the working directory, like the sys.path entry above.
df = pd.read_csv("../data/processed/survey_lung_cancer_clean.csv")


🎯 Features prioritaires (top 10)

1. `SMOKING_x_AGE` – le risque augmente exponentiellement avec l'âge
2. `RESPIRATORY_SYMPTOMS` – agrégation des 3 symptômes respiratoires clés
3. `TOTAL_SYMPTOMS` – vue d'ensemble de l'état de santé
4. `HIGH_RISK_PROFILE` – profil démographique à risque
5. `CANCER_TRIAD` – les 3 symptômes classiques réunis
6. `SMOKER_WITH_RESP_SYMPTOMS` – interaction critique
7. `SMOKING_x_ALCOHOL` – facteurs de risque combinés
8. `SEVERE_SYMPTOMS` – indicateur de gravité
9. `AGE_SQUARED` – capture la non-linéarité
10. `BEHAVIORAL_RISK_SCORE` – facteurs comportementaux

In [20]:
# Feature engineering: derive interaction terms, symptom scores and risk flags
# from the survey columns and store them as new columns on `df`.

# Binary indicator columns used in the arithmetic below. They are read through
# an explicit int64 view so the sums remain integer counts even if this cell is
# re-run after those columns have been cast to bool elsewhere in the notebook
# (adding/multiplying bool Series gives boolean results, not counts).
indicator_cols = [
    'SMOKING', 'ALCOHOL CONSUMING', 'PEER_PRESSURE', 'YELLOW_FINGERS',
    'ANXIETY', 'FATIGUE', 'ALLERGY', 'WHEEZING', 'COUGHING',
    'SHORTNESS OF BREATH', 'SWALLOWING DIFFICULTY', 'CHEST PAIN',
]
flags = df[indicator_cols].astype('int64')

# Smoking x age interaction
df['SMOKING_x_AGE'] = flags['SMOKING'] * df['AGE']

# Combined tobacco + alcohol exposure (True only when both are present)
df['SMOKING_x_ALCOHOL'] = (flags['SMOKING'] * flags['ALCOHOL CONSUMING']).astype(bool)

# Combined respiratory symptoms (count, bounded to 0..3)
df['RESPIRATORY_SYMPTOMS'] = (
    flags[['WHEEZING', 'COUGHING', 'SHORTNESS OF BREATH']].sum(axis=1).clip(0, 3)
)

# Total symptom score
df['TOTAL_SYMPTOMS'] = flags[[
    'YELLOW_FINGERS', 'ANXIETY', 'FATIGUE', 'ALLERGY', 'WHEEZING',
    'COUGHING', 'SHORTNESS OF BREATH', 'SWALLOWING DIFFICULTY', 'CHEST PAIN',
]].sum(axis=1)

# Behavioural risk-factor score
df['BEHAVIORAL_RISK_SCORE'] = flags[
    ['SMOKING', 'ALCOHOL CONSUMING', 'PEER_PRESSURE']
].sum(axis=1)

# Severe symptom score
df['SEVERE_SYMPTOMS'] = flags[
    ['CHEST PAIN', 'SWALLOWING DIFFICULTY', 'SHORTNESS OF BREATH']
].sum(axis=1)

# Age categories. pd.cut intervals are right-closed by default, so the bins are
# (0, 50], (50, 60], (60, 70] and (70, inf). The open-ended last bin keeps ages
# above 100 in '70+' instead of silently turning them into NaN.
df['AGE_GROUP'] = pd.cut(df['AGE'], bins=[0, 50, 60, 70, np.inf],
                         labels=['<50', '50-60', '60-70', '70+'])

# High risk: male + smoker + age > 60
# (GENDER == 1 is taken to mean "male", per the original note — confirm encoding)
df['HIGH_RISK_PROFILE'] = ((df['GENDER'] == 1) &
                           (flags['SMOKING'] == 1) &
                           (df['AGE'] > 60))

# Age squared (non-linear relationship)
df['AGE_SQUARED'] = df['AGE'] ** 2

# Classic lung-cancer triad: cough + chest pain + shortness of breath together
df['CANCER_TRIAD'] = ((flags['COUGHING'] == 1) &
                      (flags['CHEST PAIN'] == 1) &
                      (flags['SHORTNESS OF BREATH'] == 1))

# Smoker with at least one respiratory symptom
df['SMOKER_WITH_RESP_SYMPTOMS'] = (flags['SMOKING'] * df['RESPIRATORY_SYMPTOMS']).astype(bool)

# Advanced symptoms (swallowing difficulty + chest pain)
df['ADVANCED_SYMPTOMS'] = (flags['SWALLOWING DIFFICULTY'] * flags['CHEST PAIN']).astype(bool)

# Symptoms-to-age ratio (normalisation); +1 in the denominator avoids division by zero
df['SYMPTOMS_PER_AGE'] = df['TOTAL_SYMPTOMS'] / (df['AGE'] + 1)

# Share of respiratory symptoms among all symptoms; +1 avoids division by zero
df['RESP_SYMPTOM_RATIO'] = df['RESPIRATORY_SYMPTOMS'] / (df['TOTAL_SYMPTOMS'] + 1)

quick_df_info(df)
df.head()


┌-------------------------------* Information *-------------------------------┐
├─------- Shape: (17940, 31) - Colonnes:
├─ID                        int64     
├─GENDER                    int64     
├─AGE                       float64   
├─LUNG_CANCER               int64     
├─SMOKING                   int64     
├─YELLOW_FINGERS            int64     
├─ANXIETY                   int64     
├─PEER_PRESSURE             int64     
├─CHRONIC DISEASE           int64     
├─FATIGUE                   int64     
├─ALLERGY                   int64     
├─WHEEZING                  int64     
├─ALCOHOL CONSUMING         int64     
├─COUGHING                  int64     
├─SHORTNESS OF BREATH       int64     
├─SWALLOWING DIFFICULTY     int64     
├─CHEST PAIN                int64     
├─SMOKING_x_AGE             float64   
├─SMOKING_x_ALCOHOL         bool      
├─RESPIRATORY_SYMPTOMS      int64     
├─TOTAL_SYMPTOMS            int64     
├─BEHAVIORAL_RISK_SCORE     int64     
├─SEVERE_SYMPTOMS   

Unnamed: 0,ID,GENDER,AGE,LUNG_CANCER,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,...,BEHAVIORAL_RISK_SCORE,SEVERE_SYMPTOMS,AGE_GROUP,HIGH_RISK_PROFILE,AGE_SQUARED,CANCER_TRIAD,SMOKER_WITH_RESP_SYMPTOMS,ADVANCED_SYMPTOMS,SYMPTOMS_PER_AGE,RESP_SYMPTOM_RATIO
0,1,1,69.0,1,1,0,0,1,0,1,...,3,1,60-70,True,4761.0,False,True,False,0.042857,0.5
1,2,1,71.0,1,1,1,0,0,1,0,...,1,2,70+,True,5041.0,False,True,False,0.069444,0.333333
2,3,1,61.0,0,1,0,0,1,1,0,...,2,3,60-70,True,3721.0,False,True,True,0.080645,0.333333
3,4,1,55.0,1,1,1,0,1,0,0,...,3,3,50-60,False,3025.0,False,True,True,0.089286,0.333333
4,5,0,56.0,1,1,0,0,0,0,1,...,2,3,50-60,False,3136.0,False,True,True,0.105263,0.285714


In [21]:
# Columns to store as booleans in the exported feature table
col_bool = [
    'SMOKING',
    'YELLOW_FINGERS',
    'ANXIETY',
    'PEER_PRESSURE',
    'CHRONIC DISEASE',
    'WHEEZING',
    'ALCOHOL CONSUMING',
    'COUGHING',
    'SHORTNESS OF BREATH',
    'SWALLOWING DIFFICULTY',
    'LUNG_CANCER',
    'ALLERGY',
    'FATIGUE',
    'CHEST PAIN',
]
# Cast only the listed columns that are actually present in the frame;
# names missing from `df` are skipped.
df = df.astype({name: bool for name in col_bool if name in df.columns})

# Persist the engineered features as parquet (without the index) and preview them
df.to_parquet('../data/processed/survey_lung_cancer_features.parquet', engine='pyarrow', index=False)
df.head()

Unnamed: 0,ID,GENDER,AGE,LUNG_CANCER,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,...,BEHAVIORAL_RISK_SCORE,SEVERE_SYMPTOMS,AGE_GROUP,HIGH_RISK_PROFILE,AGE_SQUARED,CANCER_TRIAD,SMOKER_WITH_RESP_SYMPTOMS,ADVANCED_SYMPTOMS,SYMPTOMS_PER_AGE,RESP_SYMPTOM_RATIO
0,1,1,69.0,True,True,False,False,True,False,True,...,3,1,60-70,True,4761.0,False,True,False,0.042857,0.5
1,2,1,71.0,True,True,True,False,False,True,False,...,1,2,70+,True,5041.0,False,True,False,0.069444,0.333333
2,3,1,61.0,False,True,False,False,True,True,False,...,2,3,60-70,True,3721.0,False,True,True,0.080645,0.333333
3,4,1,55.0,True,True,True,False,True,False,False,...,3,3,50-60,False,3025.0,False,True,True,0.089286,0.333333
4,5,0,56.0,True,True,False,False,False,False,True,...,2,3,50-60,False,3136.0,False,True,True,0.105263,0.285714
