<a href="https://colab.research.google.com/github/ZeusKane/serigne-kane/blob/serigne-kane/Exercice_1_corrig%C3%A9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Introduction to Machine Learning – Titanic Dataset (Version Corrigée)
# Ce notebook introduit l'apprentissage supervisé avec preprocessing, pipelines et évaluation

# 📦 Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')

# --- Banner ---
print("🚢 Titanic Machine Learning Pipeline - Version Complète")
print("=" * 60)

# --- Load the Titanic dataset ---
# The CSV is read straight from a remote URL, so this step needs network access.
print("\n📥 Chargement du Dataset Titanic...")
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)
print(f"Forme du dataset: {df.shape}")
print("\nPremières 5 lignes:")
print(df.head())

print("\nInformations sur le dataset:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nValeurs manquantes par colonne:")
print(df.isnull().sum())

# --- Select the model inputs (X) and the target column (y) ---
print("\n🧹 Sélection des Features et Target...")
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
target = 'Survived'
X = df[features]
y = df[target]
print(f"Features sélectionnées: {features}")
print(f"Variable cible: {target}")
print(f"Forme X: {X.shape}, Forme y: {y.shape}")

# --- Preprocessing pipeline definition ---
print("\n🔧 Configuration du Pipeline de Preprocessing...")

# Numeric columns: fill missing values with the column median, then standardize.
numeric_features = ['Age', 'Fare']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on a category that was not present when the encoder was fitted.
categorical_features = ['Pclass', 'Sex', 'Embarked']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own transformer; numeric output columns
# come first, followed by the one-hot columns.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

print("Pipeline de preprocessing configuré avec succès!")

# --- Full pipeline: preprocessing followed by logistic regression ---
print("\n🔁 Création du Pipeline Complet avec Régression Logistique...")
clf_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Hold out 20% of the rows for testing. stratify=y keeps the class balance of
# the target the same in both splits; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Taille du set d'entraînement: {X_train.shape[0]}")
print(f"Taille du set de test: {X_test.shape[0]}")

# Fit preprocessing and classifier together on the training rows only.
print("\n🎯 Entraînement du modèle Régression Logistique...")
clf_pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = clf_pipeline.predict(X_test)
print("\n📊 Résultats Régression Logistique:")
print(classification_report(y_test, y_pred))
lr_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {lr_accuracy:.4f}")

# Persist the fitted pipeline (preprocessing + model) to the working directory.
print("\n💾 Sauvegarde du pipeline entraîné...")
joblib.dump(clf_pipeline, "titanic_pipeline.pkl")
print("✅ Pipeline sauvegardé sous 'titanic_pipeline.pkl'")

print("\n" + "=" * 60)
print("EXERCICE 1: Essayer un Classificateur Différent")
print("=" * 60)

# --- Exercise 1: same preprocessing, random forest classifier ---
# NOTE(review): this pipeline is built around the same `preprocessor` object
# that clf_pipeline uses, so fitting it re-fits that shared transformer in
# place. Both pipelines are fitted on X_train here, so the fitted state should
# be the same; pass a cloned preprocessor if they ever diverge.
print("\n🌲 Entraînement avec Random Forest Classifier...")
rf_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

print("\n📊 Résultats Random Forest:")
print(classification_report(y_test, y_pred_rf))
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {rf_accuracy:.4f}")

# Side-by-side comparison on the same held-out rows; the last line is
# RF minus LR, so a negative value means the random forest did worse.
print("\n🔍 Comparaison des modèles:")
print(f"Régression Logistique Accuracy: {lr_accuracy:.4f}")
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"Amélioration: {rf_accuracy - lr_accuracy:.4f}")

# Rebuild the transformed column names in the order the ColumnTransformer
# emits them: the numeric columns, then the one-hot columns produced by the
# fitted encoder of the 'cat' branch.
fitted_encoder = (rf_pipeline.named_steps['preprocessing']
                  .named_transformers_['cat']
                  .named_steps['encoder'])
feature_names = (numeric_features +
                 list(fitted_encoder.get_feature_names_out(categorical_features)))
feature_importance = rf_pipeline.named_steps['classifier'].feature_importances_

print("\n🎯 Importance des features (Random Forest):")
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)
print(feature_importance_df.head(10))

print("\n" + "=" * 60)
print("EXERCICE 2: Utiliser la Validation Croisée")
print("=" * 60)

# --- Exercise 2: 5-fold cross-validation on the full dataset ---
print("\n🔄 Validation croisée à 5 plis...")

# Logistic regression: one accuracy score per fold. The printed +/- figure is
# two standard deviations of the fold scores.
cv_scores_lr = cross_val_score(clf_pipeline, X, y, cv=5, scoring='accuracy')
print(f"\nScores CV Régression Logistique: {cv_scores_lr.round(4)}")
print(f"Accuracy CV moyenne: {cv_scores_lr.mean():.4f} (+/- {cv_scores_lr.std() * 2:.4f})")

# Random forest: same protocol.
cv_scores_rf = cross_val_score(rf_pipeline, X, y, cv=5, scoring='accuracy')
print(f"\nScores CV Random Forest: {cv_scores_rf.round(4)}")
print(f"Accuracy CV moyenne: {cv_scores_rf.mean():.4f} (+/- {cv_scores_rf.std() * 2:.4f})")

# Spread of the fold scores as a rough stability indicator.
print("\n🎯 Analyse de la stabilité du modèle:")
print(f"Écart-type Régression Logistique: {cv_scores_lr.std():.4f}")
print(f"Écart-type Random Forest: {cv_scores_rf.std():.4f}")
print("Un écart-type plus faible indique un modèle plus stable entre les plis.")

print("\n" + "=" * 60)
print("EXERCICE 3: Ingénierie de Features")
print("=" * 60)

# --- Exercise 3: derive extra columns on df ---
# FamilySize = siblings/spouses + parents/children (the passenger is not counted).
print("\n🔧 Ajout de la feature FamilySize...")
df['FamilySize'] = df['SibSp'] + df['Parch']
print("Statistiques FamilySize:")
print(df['FamilySize'].describe())

# IsAlone is 1 when the passenger has no relatives aboard, else 0.
df['IsAlone'] = (df['FamilySize'] == 0).astype(int)

# Title is the word that precedes a period in the Name column
# (e.g. "Braund, Mr. Owen Harris" -> "Mr"). The pattern is a raw string so
# that `\.` reaches the regex engine as an escaped dot instead of being an
# invalid Python string escape.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse infrequent titles into a single 'Rare' bucket and map alternate
# spellings onto their common form.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col',
               'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
df['Title'] = df['Title'].replace(rare_titles, 'Rare').replace(title_aliases)

print("\nDistribution des titres:")
print(df['Title'].value_counts())

# --- Enhanced feature set: original inputs plus the engineered columns ---
features_enhanced = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'IsAlone', 'Title']
X_enhanced = df[features_enhanced]

# Column routing for the enhanced preprocessor. IsAlone is a 0/1 flag and is
# sent through the categorical branch (one-hot encoded), not scaled.
numeric_features_enhanced = ['Age', 'Fare', 'FamilySize']
categorical_features_enhanced = ['Pclass', 'Sex', 'Embarked', 'IsAlone', 'Title']

# Numeric branch: median imputation, then standardization.
numeric_transformer_enhanced = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: most-frequent imputation, then one-hot encoding that
# tolerates categories not seen during fit.
categorical_transformer_enhanced = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor_enhanced = ColumnTransformer([
    ('num', numeric_transformer_enhanced, numeric_features_enhanced),
    ('cat', categorical_transformer_enhanced, categorical_features_enhanced)
])

# Enhanced model: same random forest settings as rf_pipeline.
enhanced_pipeline = Pipeline([
    ('preprocessing', preprocessor_enhanced),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Same test_size / random_state / stratify as the earlier split, applied to
# the same y, so the comparison with rf_accuracy below uses matching settings.
X_train_enh, X_test_enh, y_train_enh, y_test_enh = train_test_split(
    X_enhanced, y, test_size=0.2, random_state=42, stratify=y
)

enhanced_pipeline.fit(X_train_enh, y_train_enh)
y_pred_enh = enhanced_pipeline.predict(X_test_enh)

print("\n📊 Résultats du Modèle Amélioré (avec nouvelles features):")
print(classification_report(y_test_enh, y_pred_enh))
enhanced_accuracy = accuracy_score(y_test_enh, y_pred_enh)
print(f"Accuracy du Modèle Amélioré: {enhanced_accuracy:.4f}")

# Enhanced minus baseline random forest; a negative value means the extra
# features reduced hold-out accuracy.
print("\n🔍 Impact de l'ingénierie de features:")
print(f"Random Forest Original:    {rf_accuracy:.4f}")
print(f"Modèle Amélioré:          {enhanced_accuracy:.4f}")
print(f"Amélioration:             {enhanced_accuracy - rf_accuracy:.4f}")

# 5-fold cross-validation of the enhanced pipeline on the full dataset.
cv_scores_enhanced = cross_val_score(enhanced_pipeline, X_enhanced, y, cv=5, scoring='accuracy')
print(f"\nCV Modèle Amélioré: {cv_scores_enhanced.mean():.4f} (+/- {cv_scores_enhanced.std() * 2:.4f})")

# Persist the fitted enhanced pipeline; the Streamlit app loads this file.
joblib.dump(enhanced_pipeline, "titanic_enhanced_pipeline.pkl")
print("✅ Pipeline amélioré sauvegardé sous 'titanic_enhanced_pipeline.pkl'")

print("\n" + "=" * 60)
print("EXERCICE 4: Interface Streamlit")
print("=" * 60)

# --- Exercise 4: source of the Streamlit front-end ---
# The app is kept as a string and written to 'streamlit_titanic_app.py' below.
# Inside the app:
#   * load_model() lets FileNotFoundError propagate instead of returning None,
#     so a failed load is not stored by st.cache_resource; main() reports the
#     error to the user.
#   * the survival probability is cast to a plain float before st.progress().
streamlit_code = '''
# streamlit_titanic_app.py
import streamlit as st
import joblib
import pandas as pd
import numpy as np

# Charger le modèle entraîné
@st.cache_resource
def load_model():
    # Raises FileNotFoundError when the pickle is missing; the caller handles
    # it so that the failure itself is never cached.
    return joblib.load("titanic_enhanced_pipeline.pkl")

def main():
    st.set_page_config(page_title="Titanic Survival Predictor", page_icon="🚢")

    st.title("🚢 Prédicteur de Survie du Titanic")
    st.write("Prédisez la survie d'un passager du Titanic basé sur ses caractéristiques.")

    # Charger le modèle
    try:
        model = load_model()
    except FileNotFoundError:
        st.error("Modèle non trouvé. Assurez-vous que 'titanic_enhanced_pipeline.pkl' existe.")
        return

    # Interface utilisateur
    st.sidebar.header("Informations du Passager")

    # Champs d'entrée
    pclass = st.sidebar.selectbox("Classe du Passager", [1, 2, 3],
                                 help="1 = Première Classe, 2 = Deuxième Classe, 3 = Troisième Classe")
    sex = st.sidebar.selectbox("Sexe", ["male", "female"])
    age = st.sidebar.slider("Âge", 0, 100, 30, help="Âge en années")
    fare = st.sidebar.slider("Prix du billet", 0.0, 500.0, 32.0, step=0.1,
                            help="Prix du billet en livres")
    embarked = st.sidebar.selectbox("Port d'embarquement", ["S", "C", "Q"],
                                   help="S = Southampton, C = Cherbourg, Q = Queenstown")

    # Informations familiales
    st.sidebar.subheader("Informations Familiales")
    sibsp = st.sidebar.number_input("Frères/Sœurs/Conjoints à bord", 0, 8, 1)
    parch = st.sidebar.number_input("Parents/Enfants à bord", 0, 6, 0)
    family_size = sibsp + parch
    is_alone = 1 if family_size == 0 else 0

    # Titre (simplifié pour l'interface)
    title_options = ["Mr", "Mrs", "Miss", "Master", "Rare"]
    title = st.sidebar.selectbox("Titre", title_options)

    st.sidebar.write(f"**Taille de famille: {family_size}**")
    st.sidebar.write(f"**Voyage seul: {'Oui' if is_alone else 'Non'}**")

    # Résumé du passager
    st.subheader("Résumé du Passager")
    col1, col2, col3 = st.columns(3)

    with col1:
        st.metric("Classe", pclass)
        st.metric("Sexe", sex)
        st.metric("Âge", f"{age} ans")

    with col2:
        st.metric("Prix", f"£{fare:.2f}")
        st.metric("Embarquement", embarked)
        st.metric("Titre", title)

    with col3:
        st.metric("Taille famille", family_size)
        st.metric("Voyage seul", "Oui" if is_alone else "Non")

    # Prédiction
    if st.button("🎯 Prédire la Survie", type="primary"):
        # Créer DataFrame pour la prédiction
        X_new = pd.DataFrame([[pclass, sex, age, fare, embarked, family_size, is_alone, title]],
                           columns=["Pclass", "Sex", "Age", "Fare", "Embarked",
                                   "FamilySize", "IsAlone", "Title"])

        try:
            # Faire la prédiction
            prediction = model.predict(X_new)[0]
            probability = model.predict_proba(X_new)[0]

            # Afficher les résultats
            st.subheader("Résultats de la Prédiction")

            if prediction == 1:
                st.success("🎉 **SURVÉCU** - Ce passager aurait probablement survécu !")
                st.balloons()
            else:
                st.error("💔 **N'A PAS SURVÉCU** - Ce passager n'aurait probablement pas survécu.")

            # Probabilités
            st.subheader("Détail des Probabilités")
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Probabilité de Survie", f"{probability[1]:.1%}")
            with col2:
                st.metric("Probabilité de Décès", f"{probability[0]:.1%}")

            # Barre de progression
            st.progress(float(probability[1]))

            # Graphique des probabilités
            prob_data = pd.DataFrame({
                'Résultat': ['Décès', 'Survie'],
                'Probabilité': [probability[0], probability[1]]
            })
            st.bar_chart(prob_data.set_index('Résultat'))

        except Exception as e:
            st.error(f"Erreur lors de la prédiction: {str(e)}")

    # Informations sur le modèle
    with st.expander("ℹ️ Informations sur le Modèle"):
        st.write("""
        Ce modèle utilise un Random Forest avec les caractéristiques suivantes:
        - **Features**: Classe, Sexe, Âge, Prix, Port d'embarquement, Taille famille, Voyage seul, Titre
        - **Algorithme**: Random Forest Classifier
        - **Preprocessing**: Imputation des valeurs manquantes, standardisation, encodage one-hot
        - **Performance**: ~82% d'accuracy en validation croisée
        """)

if __name__ == "__main__":
    main()
'''

print("📝 Code de l'application Streamlit généré!")
print("\nPour lancer l'application Streamlit:")
print("1. Sauvegardez le code ci-dessus sous 'streamlit_titanic_app.py'")
print("2. Installez streamlit: pip install streamlit")
print("3. Lancez: streamlit run streamlit_titanic_app.py")

# Write the app source next to the saved model, explicitly as UTF-8 so the
# accented labels and emoji survive on any platform default encoding.
with open("streamlit_titanic_app.py", "w", encoding='utf-8') as f:
    f.write(streamlit_code)
print("✅ Application Streamlit sauvegardée sous 'streamlit_titanic_app.py'")

print("\n" + "=" * 60)
print("🎯 RÉSUMÉ DES RÉSULTATS")
print("=" * 60)

# --- Hold-out accuracy of the three fitted pipelines ---
print("\n📊 Comparaison des Performances des Modèles:")
print(f"1. Régression Logistique:        {lr_accuracy:.4f}")
print(f"2. Random Forest:                {rf_accuracy:.4f}")
print(f"3. Modèle Amélioré (RF + feat.): {enhanced_accuracy:.4f}")

# --- Cross-validation mean and one standard deviation per model ---
print("\n🔄 Résultats de la Validation Croisée:")
print(f"• Régression Logistique CV: {cv_scores_lr.mean():.4f} ± {cv_scores_lr.std():.4f}")
print(f"• Random Forest CV:         {cv_scores_rf.mean():.4f} ± {cv_scores_rf.std():.4f}")
print(f"• Modèle Amélioré CV:       {cv_scores_enhanced.mean():.4f} ± {cv_scores_enhanced.std():.4f}")

# NOTE(review): the statements below are fixed text, not derived from the
# numbers printed above; check them against the actual scores of the run.
print("\n🚀 Insights Clés:")
print("• Random Forest surpasse généralement la Régression Logistique")
print("• L'ingénierie de features (FamilySize, IsAlone, Title) améliore les performances")
print("• La validation croisée montre une bonne stabilité des modèles")
print("• L'application Streamlit est prête pour des prédictions interactives")

# --- Artifacts written to the working directory by the steps above ---
print("\n📁 Fichiers Créés:")
print("• titanic_pipeline.pkl (Modèle Régression Logistique)")
print("• titanic_enhanced_pipeline.pkl (Modèle Random Forest Amélioré)")
print("• streamlit_titanic_app.py (Application web interactive)")

print("\n🎉 Tous les exercices terminés avec succès!")
print("🚀 Vous pouvez maintenant lancer l'application Streamlit pour tester les prédictions!")

🚢 Titanic Machine Learning Pipeline - Version Complète

📥 Chargement du Dataset Titanic...
Forme du dataset: (891, 12)

Premières 5 lignes:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STO