# Day6 — Feature selection & Train/Test split
Date: 11/10/2025
Goal: Select features, encode categorical variables, split into train/test sets, and standardize the numeric features.

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the Titanic training set (path is relative to the notebook's working
# directory) and report its size and column names before previewing it.
df = pd.read_csv('../data/titanic_train.csv')

shape_report = f"df's shape: {df.shape}\n\n"
columns_report = f"Columns:\n {df.columns}"
print(shape_report)
print(columns_report)

# Last expression of the cell: rich preview of the first rows.
df.head()

df's shape: (891, 12)


Columns:
 Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [20]:
def preprocess_basic(df):
    """Return a cleaned copy of the passenger frame with engineered features.

    The input frame is never modified. In the returned copy:

    * ``Title``      - text between the comma and the first period of
      ``Name`` (e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``), stripped of
      surrounding whitespace; NaN when the name does not match that pattern.
    * ``Age``        - missing values are filled with the median age of the
      row's (``Pclass``, ``Sex``) group, falling back to the overall median
      when that group has no known ages.
    * ``Embarked``   - missing values are filled with the most frequent
      value; left untouched when the column has no known value at all.
    * ``FamilySize`` - ``SibSp + Parch + 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Name``, ``Pclass``, ``Sex``, ``Age``,
        ``Embarked``, ``SibSp`` and ``Parch``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left unchanged.
    """
    df = df.copy()

    # Title: capture everything between ", " and the first "." of the name.
    # expand=False yields a Series instead of a one-column DataFrame.
    df['Title'] = df['Name'].str.extract(r',\s*([^\.]+)\.', expand=False).str.strip()

    # Age: fill NaN with the median of the (Pclass, Sex) group. The built-in
    # 'median' transform avoids a Python-level lambda per group, and filling
    # the original column keeps every age that was already known.
    overall_median = df['Age'].median()
    group_median = df.groupby(['Pclass', 'Sex'])['Age'].transform('median')
    age = df['Age'].fillna(group_median)
    # A group without any known age has a NaN median; use the overall median
    # there (skipped when no age is known anywhere).
    if pd.notna(overall_median):
        age = age.fillna(overall_median)
    df['Age'] = age

    # Embarked: fill NaN with the most frequent value. mode() is empty when
    # the whole column is missing, so guard before indexing into it.
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

    # FamilySize: siblings/spouses + parents/children + the passenger.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    return df

# Run the basic preprocessing on the raw frame.
df2 = preprocess_basic(df)

# Predictor columns fed to the model; 'Survived' is the target.
features = [
    'Pclass', 'Sex', 'Age', 'Fare',
    'FamilySize', 'Title', 'Embarked',
]
y = df2['Survived']
# Copy so that later edits to X never write back into df2.
X = df2.loc[:, features].copy()

In [21]:
# One-hot encode the categorical features (Sex, Embarked, Title).
# drop_first=True drops the first level of each feature, so k categories
# become k-1 indicator columns.
# Encoding starts from df2[features] rather than from X itself, so this cell
# gives the same result when re-run (re-encoding an already encoded X would
# fail because the original columns are gone).
X = pd.get_dummies(df2[features], columns=['Sex', 'Embarked', 'Title'], drop_first=True)

In [23]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible. The fourth result is the test labels, bound to y_test
# (it was previously misspelled t_test).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

print(X_train.shape)
print(X_test.shape)

(712, 23)
(179, 23)


In [30]:
# Standardize the continuous features. The scaler is fitted on the training
# rows only and then applied to both splits, so no statistics from the test
# set enter the transformation.
num_cols = ['Age', 'Fare', 'FamilySize']
scaler = StandardScaler()
scaler.fit(X_train[num_cols])

X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Show the scaled training columns.
X_train[num_cols]

Unnamed: 0,Age,Fare,FamilySize
8,-0.153550,-0.416075,0.728468
150,1.699116,-0.388502,-0.536579
221,-0.153550,-0.379091,-0.536579
365,0.078034,-0.493013,-0.536579
324,-0.307938,0.741305,5.788655
...,...,...,...
241,-0.578119,-0.329560,0.095945
253,0.078034,-0.317672,0.095945
390,0.541200,1.740846,1.360991
667,-0.307938,-0.482611,-0.536579
