# Step 1: Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split


# Step 2: Load the Titanic Dataset
# Download Titanic dataset from https://www.kaggle.com/c/titanic/data and load it

Alternatively, download the data from GitHub:
https://github.com/datasciencedojo/datasets/blob/master/titanic.csv


In [2]:
# Read the raw Titanic passenger table from a CSV in the working directory.
data = pd.read_csv(filepath_or_buffer='titanic.csv')

In [3]:
# Quick look at the raw data: schema / non-null counts, then the first rows.
print("Dataset Overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
data.info()
print(data.head())

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

    

# Step 3: Handle Missing Values
# Missing 'Age' values - fill with the median

In [4]:
# Learn the median of 'Age' and use it to fill the missing entries.
# The imputer is kept in `age_imputer` so the fitted statistic stays available.
age_imputer = SimpleImputer(strategy='median')
age_imputer.fit(data[['Age']])
data['Age'] = age_imputer.transform(data[['Age']])

# Missing 'Embarked' values - fill with the most frequent value

In [5]:
# Replace missing 'Embarked' entries with the column's most frequent value.
embarked_imputer = SimpleImputer(strategy='most_frequent')
# fit_transform returns a 2-D array with one column; flatten it so a plain
# 1-D array is assigned to the column. Assigning the 2-D object array directly
# is not reliable across pandas versions.
data['Embarked'] = embarked_imputer.fit_transform(data[['Embarked']]).ravel()


# Drop the 'Cabin' column as it has too many missing values


In [6]:
# Remove the sparsely populated 'Cabin' column from the working frame.
data = data.drop('Cabin', axis=1)


# Step 4: Encode Categorical Features
# One-hot encoding for 'Embarked'

In [7]:
# Expand 'Embarked' into indicator columns; drop_first=True omits the first
# category so the remaining indicators are not redundant with each other.
data = pd.get_dummies(
    data=data,
    columns=['Embarked'],
    drop_first=True,
)


# Label encoding for 'Sex' (Male = 1, Female = 0)


In [8]:
# Turn the 'Sex' strings into integer codes.
# LabelEncoder numbers classes in sorted order; the stated Male = 1 / Female = 0
# mapping presumably relies on the raw labels being 'female' and 'male' —
# verify against label_encoder.classes_.
label_encoder = LabelEncoder()
label_encoder.fit(data['Sex'])
data['Sex'] = label_encoder.transform(data['Sex'])


# Step 5: Feature Scaling
# Normalize numerical features using StandardScaler

In [9]:
# Standardize the continuous columns 'Age' and 'Fare' (zero mean, unit variance).
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in this notebook, so held-out rows influence the learned
# mean/std. Consider fitting on the training split only.
scaler = StandardScaler()
scaler.fit(data[['Age', 'Fare']])
data[['Age', 'Fare']] = scaler.transform(data[['Age', 'Fare']])


# Step 6: Drop Unnecessary Columns
# Drop irrelevant or ID columns (e.g., 'PassengerId', 'Name', 'Ticket')

In [10]:
# Discard the identifier-like columns before modelling.
data = data.drop(
    ['PassengerId', 'Name', 'Ticket'],
    axis='columns',
)


# Step 7: Split the Dataset
# Separate features and target

In [11]:
# Target: the 'Survived' column. Features: every other remaining column.
y = data['Survived']
X = data.drop('Survived', axis=1)

# Split the dataset into training and testing sets


In [12]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Report the dimensions of each split as a sanity check.
print("\nShapes after splitting:")
print(
    f"X_train: {X_train.shape}, X_test: {X_test.shape}, "
    f"y_train: {y_train.shape}, y_test: {y_test.shape}"
)



Shapes after splitting:
X_train: (712, 8), X_test: (179, 8), y_train: (712,), y_test: (179,)


# Step 8: Save the Preprocessed Dataset


In [14]:
# Write each split to its own CSV in the working directory, without the index.
for split, filename in (
    (X_train, 'X_train_preprocessed.csv'),
    (X_test, 'X_test_preprocessed.csv'),
    (y_train, 'y_train_preprocessed.csv'),
    (y_test, 'y_test_preprocessed.csv'),
):
    split.to_csv(filename, index=False)

print("\nPreprocessing Complete. Preprocessed data saved.")


Preprocessing Complete. Preprocessed data saved.
