In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from sklearn.impute import SimpleImputer

In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import re

In [8]:
# 1. Création d'un DataFrame Titanic-like avec plus de passagers
data = {
    'PassengerId': list(range(1, 11)),
    'Survived': [0, 1, 1, 1, 0, 0, 1, 0, 1, 0],
    'Pclass': [3, 1, 3, 1, 3, 2, 2, 3, 1, 2],
    'Name': [
        'Braund, Mr. Owen Harris',
        'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
        'Heikkinen, Miss. Laina',
        'Futrelle, Mrs. Jacques Heath (Lily May Peel)',
        'Allen, Mr. William Henry',
        'Moran, Mr. James',
        'McCarthy, Mr. Timothy J',
        'Palsson, Master. Gosta Leonard',
        'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)',
        'Nasser, Mrs. Nicholas (Adele Achem)'
    ],
    'Sex': ['male', 'female', 'female', 'female', 'male', 'male', 'male', 'male', 'female', 'female'],
    'Age': [22, 38, 26, 35, None, 27, 54, 2, 27, 14],
    'SibSp': [1, 1, 0, 1, 0, 0, 0, 3, 0, 1],
    'Parch': [0, 0, 0, 0, 0, 0, 0, 1, 2, 0],
    'Ticket': [
        'A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450',
        '330877', '17463', '349909', '347742', '237736'
    ],
    'Fare': [7.25, 71.2833, 7.925, 53.1, 8.05, 8.4583, 51.8625, 21.075, 11.1333, 30.0708],
    'Cabin': [None, 'C85', None, 'C123', None, None, 'E46', None, None, None],
    'Embarked': ['S', 'C', 'S', 'S', None, 'Q', 'S', 'S', 'S', 'C']

}

In [9]:
titanic_df = pd.DataFrame(data)
titanic_df.head()  # Affiche les 5 premières lignes du DataFrame initial

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,,0,0,373450,8.05,,


In [10]:
# 2. Suppression des lignes où 'Cabin' est manquant
titanic_df = titanic_df.dropna(subset=['Cabin'])
titanic_df.head()  # Affiche les 5 premières lignes après suppression


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,1,2,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S


In [11]:
# 3. Imputation de l'âge manquant par la moyenne
imputer = SimpleImputer(strategy='mean')
titanic_df['Age'] = imputer.fit_transform(titanic_df[['Age']])
titanic_df.head()  # Affiche les 5 premières lignes après imputation


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,1,2,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S


In [12]:
# 4. Remplissage des valeurs manquantes d'Embarked par 'Unknown'
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('Unknown')
titanic_df.head()  # Affiche les 5 premières lignes après remplissage


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,1,2,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S


In [13]:
# 5. Création de la variable FamilySize
titanic_df['FamilySize'] = titanic_df['SibSp'] + titanic_df['Parch'] + 1
titanic_df.head()  # Affiche les 5 premières lignes après ajout de FamilySize


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2
6,7,1,2,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1


In [14]:
# 6. Extraction du titre depuis le nom
titanic_df['Title'] = titanic_df['Name'].apply(
    lambda name: re.search(r',\s*([^\.]*)\.', name).group(1).strip() if re.search(r',\s*([^\.]*)\.', name) else 'Unknown'
)
titanic_df.head()  # Affiche les 5 premières lignes après extraction du titre


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,Title
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,Mrs
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,Mrs
6,7,1,2,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,Mr


In [15]:
# 7. Encodage one-hot des variables catégorielles (Sex, Embarked, Title)
titanic_df = pd.get_dummies(titanic_df, columns=['Sex', 'Embarked', 'Title'], drop_first=True)
titanic_df.h

AttributeError: 'DataFrame' object has no attribute 'h'

In [16]:
# 8. Normalisation des variables numériques (Age, Fare, FamilySize)
scaler = StandardScaler()
titanic_df[['Age', 'Fare', 'FamilySize']] = scaler.fit_transform(titanic_df[['Age', 'Fare', 'FamilySize']])
titanic_df.head()  # Affiche les 5 premières lignes après normalisation

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,FamilySize,Sex_male,Embarked_S,Title_Mrs
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",-0.519584,1,0,PC 17599,1.411922,C85,0.707107,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",-0.879297,1,0,113803,-0.636264,C123,0.707107,False,True,True
6,7,1,2,"McCarthy, Mr. Timothy J",1.398881,0,0,17463,-0.775658,E46,-1.414214,True,True,False
