In [1]:
import numpy as np
import pandas as pd

# Load the dataset
df = pd.read_csv('train.csv')

# Display the first few rows
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)

# View the updated dataframe
df.head()


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [3]:
from sklearn.model_selection import train_test_split

# Separate features and target
X = df.drop(columns=['Survived'])
y = df['Survived']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display the shape of train and test sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# View a sample of X_train
X_train.head()


X_train shape: (712, 7)
X_test shape: (179, 7)
y_train shape: (712,)
y_test shape: (179,)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Create transformer for missing values
trf1 = ColumnTransformer([
    ('impute_age', SimpleImputer(), ['Age']),
    ('impute_embarked', SimpleImputer(strategy='most_frequent'), ['Embarked'])
], remainder='passthrough')

# Fit and transform on X_train to test
X_train_trf1 = trf1.fit_transform(X_train)

# Show result
print("Imputation step completed. Sample data (first 5 rows):")
print(pd.DataFrame(X_train_trf1).head())


Imputation step completed. Sample data (first 5 rows):
      0  1  2       3  4  5       6
0  45.5  S  1    male  0  0    28.5
1  23.0  S  2    male  0  0    13.0
2  32.0  S  3    male  0  0   7.925
3  26.0  S  3    male  1  0  7.8542
4   6.0  S  3  female  4  2  31.275


In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Create a pipeline chaining imputation and encoder
impute_encode_pipeline = Pipeline([
    ('imputer', trf1),
    ('ohe', ColumnTransformer([
        ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 6])
    ], remainder='passthrough'))
])

# Fit and transform on X_train
X_train_trf2 = impute_encode_pipeline.fit_transform(X_train)

# Show result
print("One-Hot Encoding step completed. Sample data (first 5 rows):")
print(pd.DataFrame(X_train_trf2).head())


One-Hot Encoding step completed. Sample data (first 5 rows):
   0    1    2    3    4    5    6    7    8    9    ...  218  219  220  221  \
0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

   222   223 224     225 226 227  
0  0.0  45.5   1    male   0   0  
1  0.0  23.0   2    male   0   0  
2  0.0  32.0   3    male   0   0  
3  0.0  26.0   3    male   1   0  
4  0.0   6.0   3  female   4   2  

[5 rows x 228 columns]


In [10]:
print(X_train.columns)


Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')


In [13]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

numeric_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Sex', 'Embarked']

# Imputers for missing values
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Numeric pipeline: impute then scale
numeric_transformer = Pipeline(steps=[
    ('imputer', numeric_imputer),
    ('scaler', MinMaxScaler())
])

# Categorical pipeline: impute then encode
categorical_transformer = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Now fit the pipeline (missing values handled)
pipe.fit(X_train, y_train)

# Predict on test data
y_pred = pipe.predict(X_test)


In [14]:
y_pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1])