# Titanic - Data Cleaning and Feature Engineering
## Objective
In this section, we will clean the dataset by handling missing values,
converting categorical variables, and creating new features to improve model performance.

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the training and test splits from CSV files in the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print('Train dataset information before cleaning:')
train_df.info()
print('\nTest dataset information before cleaning:')
test_df.info()

## Handling Missing Values
We will fill missing values using the median for numerical columns and mode for categorical ones.

In [None]:
# Every imputer below is fitted on the training split only and then applied to
# the test split, so no fill value is learned from test data.
# The transformers return 2-D arrays of shape (n_rows, 1); .ravel() flattens
# them to 1-D before they are assigned back to a single column.

# Age: fill gaps with the training median.
imputer_median = SimpleImputer(strategy='median')
train_df['Age'] = imputer_median.fit_transform(train_df[['Age']]).ravel()
test_df['Age'] = imputer_median.transform(test_df[['Age']]).ravel()

# Fare: a dedicated imputer, so the Age imputer is not re-fitted and the fill
# value is the training median rather than a statistic of the test split.
imputer_fare = SimpleImputer(strategy='median')
train_df['Fare'] = imputer_fare.fit_transform(train_df[['Fare']]).ravel()
test_df['Fare'] = imputer_fare.transform(test_df[['Fare']]).ravel()

# Embarked: categorical, so fill gaps with the most frequent training value.
imputer_mode = SimpleImputer(strategy='most_frequent')
train_df['Embarked'] = imputer_mode.fit_transform(train_df[['Embarked']]).ravel()
test_df['Embarked'] = imputer_mode.transform(test_df[['Embarked']]).ravel()

## Feature Engineering
Creating new meaningful features from existing data.

In [None]:
def _extract_title(name):
    """Return the text between the first comma and the next period, stripped.

    For example, 'Braund, Mr. Owen Harris' yields 'Mr'.
    """
    after_comma = name.split(',')[1]
    before_period = after_comma.split('.')[0]
    return before_period.strip()


# Derive the same three columns on both splits so their schemas stay in step.
for frame in (train_df, test_df):
    # SibSp + Parch + 1; the +1 presumably counts the passenger themself.
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

    # 1 unless the family size exceeds one, in which case 0.
    frame['IsAlone'] = (~(frame['FamilySize'] > 1)).astype('int64')

    # Honorific parsed out of the Name column.
    frame['Title'] = frame['Name'].apply(_extract_title)

## Encoding Categorical Variables
Converting categorical features into numerical format using label encoding and one-hot encoding.

In [None]:
# Sex: integer-encode with a mapping learned on the training split and reused
# for the test split, so both splits share the same codes.
le = LabelEncoder()
train_df['Sex'] = le.fit_transform(train_df['Sex'])
test_df['Sex'] = le.transform(test_df['Sex'])

# One-hot encode Embarked and Title.
# Calling get_dummies independently on each split yields different columns
# whenever the splits contain different category values, and drop_first then
# removes a different baseline in each. Pinning both splits to the categories
# seen in training guarantees identical dummy columns and the same baseline.
# Caveat: a test value never seen in training becomes NaN and is encoded as an
# all-zero row, which is indistinguishable from the dropped baseline category.
dummy_columns = ['Embarked', 'Title']
for column in dummy_columns:
    train_categories = sorted(train_df[column].dropna().unique())
    category_dtype = pd.CategoricalDtype(categories=train_categories)
    train_df[column] = train_df[column].astype(category_dtype)
    test_df[column] = test_df[column].astype(category_dtype)

train_df = pd.get_dummies(train_df, columns=dummy_columns, drop_first=True)
test_df = pd.get_dummies(test_df, columns=dummy_columns, drop_first=True)

## Scaling Numerical Features
Standardizing numerical features to zero mean and unit variance so they are on a comparable scale.

In [None]:
# Columns to standardise; the scaler's statistics are learned from the training
# split only and then applied unchanged to both splits.
numeric_features = ['Age', 'Fare', 'FamilySize']

scaler = StandardScaler()
scaler.fit(train_df[numeric_features])

train_df[numeric_features] = scaler.transform(train_df[numeric_features])
test_df[numeric_features] = scaler.transform(test_df[numeric_features])

## Removing Unnecessary Columns
Dropping irrelevant features that are not needed for model training.

In [None]:
# Keep the test identifiers in a separate Series before the column is dropped,
# so later predictions can still be matched back to individual passengers.
test_passenger_ids = test_df['PassengerId'].copy()

columns_to_drop = ['Name', 'Ticket', 'Cabin', 'PassengerId']

# Rebind instead of mutating in place: drop() returns a new frame, which keeps
# the lineage of each variable explicit.
train_df = train_df.drop(columns=columns_to_drop)
test_df = test_df.drop(columns=columns_to_drop)

# Print the label on its own line; sharing a line with the frame shifts the
# header row and misaligns it with the values below.
print('Train dataset after cleaning:')
print(train_df.head())
print('Test dataset after cleaning:')
print(test_df.head())

## Saving Cleaned Data
We will now save the cleaned datasets for further use.

In [None]:
# Write each cleaned split to its own CSV, omitting the row index.
cleaned_outputs = (
    ('train_cleaned.csv', train_df),
    ('test_cleaned.csv', test_df),
)
for output_path, frame in cleaned_outputs:
    frame.to_csv(output_path, index=False)

print('Cleaned datasets saved successfully!')