In [1]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# ============================================================
# 1. DATA LOADING
# ============================================================
# Relative path of the CSV dataset (resolved against the current working
# directory when the cell runs).
DATA_PATH = "cancer-risk-factors.csv"

# Load the full dataset into a DataFrame.
df = pd.read_csv(DATA_PATH)

# Quick sanity check: show the (rows, columns) shape of what was loaded.
# The message is user-facing French text and is stored as proper UTF-8.
print("✅ Données chargées :", df.shape)

# ============================================================
# 2. FEATURES & TARGET
# ============================================================
# Name of the column the classifier learns to predict.
TARGET = "Cancer_Type"

# Input columns, in the exact order they are handed to the model.
FEATURES = [
    "Age",
    "Gender",
    "Smoking",
    "Alcohol_Use",
    "Obesity",
    "Family_History",
    "Diet_Red_Meat",
    "Diet_Salted_Processed",
    "Fruit_Veg_Intake",
    "Physical_Activity",
    "Air_Pollution",
    "Occupational_Hazards",
    "BRCA_Mutation",
    "H_Pylori_Infection",
    "Calcium_Intake",
    "Overall_Risk_Score",
    "BMI",
    "Physical_Activity_Level",
]

# Split the frame into the feature matrix and the target column.
X = df.loc[:, FEATURES]
y = df.loc[:, TARGET]

# ============================================================
# 3. CANCER TYPE ENCODING
# ============================================================
# Map each distinct target value to an integer code. The fitted encoder is
# kept so the codes can be translated back to the original labels later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Show the classes the encoder found; their position in `classes_` is the
# integer code assigned to them.
# The message is user-facing French text and is stored as proper UTF-8.
print("🧬 Types de cancer détectés :", list(label_encoder.classes_))

# ============================================================
# 4. TRAIN / TEST SPLIT
# ============================================================
# Hold out 20% of the rows for evaluation. The split is stratified on the
# encoded target and uses a fixed seed so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# ============================================================
# 5. MODEL - RANDOM FOREST
# ============================================================
# Build the forest (200 trees, depth capped at 12, balanced class weights,
# fixed seed) and fit it on the training split in a single expression;
# `fit` returns the estimator itself, so `model` is the trained classifier.
model = RandomForestClassifier(
    class_weight="balanced",
    max_depth=12,
    n_estimators=200,
    random_state=42,
).fit(X_train, y_train)

# ============================================================
# 6. EVALUATION
# ============================================================
# Predict the encoded class of every held-out row.
y_pred = model.predict(X_test)

# Overall accuracy on the test split.
# Messages are user-facing French text and are stored as proper UTF-8.
print("\n📊 Accuracy :", accuracy_score(y_test, y_pred))
print("\n📊 Rapport de classification :\n")

# Per-class precision / recall / F1. `labels` is passed explicitly (the
# encoder's codes are 0..n_classes-1) so the report always has one row per
# known class and stays aligned with `target_names`, even if some class is
# absent from both `y_test` and `y_pred`.
print(
    classification_report(
        y_test,
        y_pred,
        labels=list(range(len(label_encoder.classes_))),
        target_names=label_encoder.classes_,
    )
)

# ============================================================
# 7. SAVING THE MODEL & ENCODER
# ============================================================
# Persist the trained classifier and the fitted label encoder side by side;
# the encoder is needed to turn predicted integer codes back into labels.
joblib.dump(model, "model_cancer_type.pkl")
joblib.dump(label_encoder, "cancer_type_encoder.pkl")

# Confirm where each artifact was written.
# Messages are user-facing French text and are stored as proper UTF-8.
print("\n💾 Modèle sauvegardé : model_cancer_type.pkl")
print("💾 Encodeur sauvegardé : cancer_type_encoder.pkl")


✅ Données chargées : (2000, 21)
🧬 Types de cancer détectés : ['Breast', 'Colon', 'Lung', 'Prostate', 'Skin']

📊 Accuracy : 0.7675

📊 Rapport de classification :

              precision    recall  f1-score   support

      Breast       0.84      0.78      0.81        92
       Colon       0.79      0.76      0.78        84
        Lung       0.79      0.90      0.84       105
    Prostate       0.73      0.75      0.74        61
        Skin       0.61      0.53      0.57        58

    accuracy                           0.77       400
   macro avg       0.75      0.75      0.75       400
weighted avg       0.77      0.77      0.76       400


💾 Modèle sauvegardé : model_cancer_type.pkl
💾 Encodeur sauvegardé : cancer_type_encoder.pkl
