# ⚙️ Initiez-vous au MLOps (partie 1/2)

## 📐 Feature engineering
### 🛠️ Préparez l'environnement de travail
#### 📦 Import des modules Python

In [3]:
import sys
import os

# Make the project root importable so that the `src` package imported below
# can be resolved. `__file__` is normally undefined inside a notebook kernel,
# so the root is derived from the current working directory: it is the parent
# of the directory the kernel runs in.
PROJECT_ROOT = os.path.dirname(os.getcwd())
# Guarded so that re-running this cell does not stack duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import pandas as pd
import numpy as np
from src.visualization.visu_text import print_title, print_end, print_col, quick_df_info

# Load the cleaned survey data; the path is relative to the working directory.
# NOTE(review): the rendered `df.head()` output further down lists an
# "Unnamed: 0" header first — check whether the CSV stores a saved index
# column (if so, `index_col=0` would keep it out of the feature set) or
# whether this is only a display artefact.
df = pd.read_csv("../data/processed/survey_lung_cancer_clean.csv")


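🎯 Features prioritaires (top 10)

1. `SMOKING_x_AGE` — interaction tabac × âge : le risque lié au tabac augmente avec l'âge
2. `RESPIRATORY_SYMPTOMS` — agrégation des 3 symptômes respiratoires clés (sifflements, toux, essoufflement)
3. `TOTAL_SYMPTOMS` — nombre total de symptômes, pour une vue d'ensemble de l'état de santé
4. `HIGH_RISK_PROFILE` — profil démographique à risque (homme, fumeur, plus de 60 ans)
5. `CANCER_TRIAD` — les 3 symptômes classiques présents ensemble (toux, douleur thoracique, essoufflement)
6. `SMOKER_WITH_RESP_SYMPTOMS` — interaction critique entre tabagisme et symptômes respiratoires
7. `SMOKING_x_ALCOHOL` — facteurs de risque combinés (tabac et alcool)
8. `SEVERE_SYMPTOMS` — indicateur de gravité
9. `AGE_SQUARED` — capture la non-linéarité de l'effet de l'âge
10. `BEHAVIORAL_RISK_SCORE` — facteurs de risque comportementaux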

In [5]:
# Feature engineering: each statement below adds one derived column to `df`
# in place. Every derived column is computed from the base survey columns (or
# from a derived column created earlier in this same cell), so re-running the
# cell overwrites the same columns with the same values.
# NOTE(review): the sums and products assume the survey indicators are
# encoded as 0/1 — confirm against the cleaning step.

# Smoking x age interaction
df['SMOKING_x_AGE'] = df['SMOKING'] * df['AGE']

# Smoking x alcohol interaction
df['SMOKING_x_ALCOHOL'] = df['SMOKING'] * df["ALCOHOL CONSUMING"]

# Combined respiratory symptoms: sum of three indicators, bounded to [0, 3]
df['RESPIRATORY_SYMPTOMS'] = (df['WHEEZING'] + df['COUGHING'] +
                               df['SHORTNESS OF BREATH']).clip(0, 3)

# Total symptom score.
# The 'FATIGUE ' and 'ALLERGY ' keys carry a trailing space on purpose here;
# they must match the CSV header exactly.
df['TOTAL_SYMPTOMS'] = (df['YELLOW_FINGERS'] + df['ANXIETY'] +
                        df['FATIGUE '] + df['ALLERGY '] + df['WHEEZING'] +
                        df['COUGHING'] + df['SHORTNESS OF BREATH'] +
                        df['SWALLOWING DIFFICULTY'] + df['CHEST PAIN'])

# Behavioural risk-factor score
df['BEHAVIORAL_RISK_SCORE'] = (df['SMOKING'] + df['ALCOHOL CONSUMING'] +
                                df['PEER_PRESSURE'])

# Severe symptom score
df['SEVERE_SYMPTOMS'] = (df['CHEST PAIN'] + df['SWALLOWING DIFFICULTY'] +
                          df['SHORTNESS OF BREATH'])

# Age brackets. pd.cut uses right-closed intervals by default, so the bins are
# (0, 50], (50, 60], (60, 70] and (70, inf): age 50 lands in '<50' and age 60
# in '50-60'. The last edge is open-ended so that ages above 100 fall into
# '70+' instead of silently becoming NaN.
df['AGE_GROUP'] = pd.cut(df['AGE'], bins=[0, 50, 60, 70, np.inf],
                         labels=['<50', '50-60', '60-70', '70+'])

# High-risk profile: male + smoker + age > 60
# (assumes GENDER == 1 encodes male — TODO confirm against the cleaning step)
df['HIGH_RISK_PROFILE'] = ((df['GENDER'] == 1) &
                            (df['SMOKING'] == 1) &
                            (df['AGE'] > 60)).astype(int)

# Squared age (lets a model pick up a non-linear age effect)
df['AGE_SQUARED'] = df['AGE'] ** 2

# Classic lung-cancer triad: all three symptoms present at once
df['CANCER_TRIAD'] = ((df['COUGHING'] == 1) &
                      (df['CHEST PAIN'] == 1) &
                      (df['SHORTNESS OF BREATH'] == 1)).astype(int)

# Smoker with respiratory symptoms (reuses RESPIRATORY_SYMPTOMS from above)
df['SMOKER_WITH_RESP_SYMPTOMS'] = (df['SMOKING'] * df['RESPIRATORY_SYMPTOMS'])

# Advanced symptoms (swallowing difficulty together with chest pain)
df['ADVANCED_SYMPTOMS'] = (df['SWALLOWING DIFFICULTY'] * df['CHEST PAIN'])

# Symptoms-to-age ratio; the +1 keeps the denominator non-zero for AGE == 0
df['SYMPTOMS_PER_AGE'] = df['TOTAL_SYMPTOMS'] / (df['AGE'] + 1)

# Share of respiratory symptoms; the +1 keeps the denominator non-zero when
# no symptom is reported
df['RESP_SYMPTOM_RATIO'] = df['RESPIRATORY_SYMPTOMS'] / (df['TOTAL_SYMPTOMS'] + 1)

df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,...,BEHAVIORAL_RISK_SCORE,SEVERE_SYMPTOMS,AGE_GROUP,HIGH_RISK_PROFILE,AGE_SQUARED,CANCER_TRIAD,SMOKER_WITH_RESP_SYMPTOMS,ADVANCED_SYMPTOMS,SYMPTOMS_PER_AGE,RESP_SYMPTOM_RATIO
0,1,69,0,1,1,0,0,1,0,1,...,1,3,60-70,0,4761,1,0,1,0.114286,0.333333
1,1,74,1,0,0,0,1,1,1,0,...,1,3,70+,1,5476,0,1,1,0.066667,0.166667
2,0,59,0,0,0,1,0,1,0,1,...,1,2,50-60,0,3481,1,0,0,0.083333,0.5
3,1,63,1,1,1,0,0,0,0,0,...,2,2,60-70,1,3969,0,0,1,0.0625,0.0
4,0,63,0,1,0,0,0,0,0,1,...,0,1,60-70,0,3969,0,0,0,0.0625,0.6
