In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Titanic feature engineering: read 'titanic.csv', impute missing values,
# derive Title/Deck features, label-encode the categorical columns,
# standardize Age/Fare, and write the result to 'titanic_processed.csv'.
#
# NOTE(review): the imputers and the scaler below are fit on the whole file.
# If this data is later split into train/test sets, consider fitting them on
# the training part only.

# Load the dataset
df = pd.read_csv('titanic.csv')

# Display the first few rows of the dataframe
print(df.head())

# Display the summary of the dataframe.
# DataFrame.info() prints the report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Feature Engineering and Data Preprocessing

# 1. Handling missing values
# - Age: Fill missing values with the median age
# - Cabin: Fill missing values with 'Unknown'
# - Embarked: Fill missing values with the mode

# SimpleImputer returns an (n_rows, 1) array for a single-column input;
# ravel() flattens it so a plain 1-D column is assigned (same as Embarked).
imputer_age = SimpleImputer(strategy='median')
df['Age'] = imputer_age.fit_transform(df[['Age']]).ravel()

# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object and is
# not guaranteed to update df (it is deprecated under pandas Copy-on-Write).
df['Cabin'] = df['Cabin'].fillna('Unknown')

imputer_embarked = SimpleImputer(strategy='most_frequent')
df['Embarked'] = imputer_embarked.fit_transform(df[['Embarked']]).ravel()

# 2. Feature Extraction
# - Extract titles from the 'Name' column: the first run of letters that is
#   preceded by a space and followed by a period. The pattern is a raw string
#   so that '\.' reaches the regex engine without being treated as a
#   (invalid) string escape.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# - Extract deck information from the 'Cabin' column: the first character of
#   the cabin string (rows filled with 'Unknown' above therefore get 'U').
df['Deck'] = df['Cabin'].str[0]

# 3. Encoding categorical variables
# - Sex, Embarked, Title, Deck
#
# A fresh encoder is fitted per column and stored in `label_encoders`, so each
# column's label mapping stays available (e.g. for inverse_transform).
# Refitting one shared instance would keep only the last column's mapping.
# `label_encoder` is still left bound to the Deck encoder after the loop.
label_encoders = {}
for column in ('Sex', 'Embarked', 'Title', 'Deck'):
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# 4. Dropping unnecessary columns
# - Name, Ticket, Cabin (Title and Deck were already derived from Name/Cabin)

df.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

# 5. Scaling numerical features
# - Age, Fare

scaler = StandardScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])

# Display the first few rows of the processed dataframe
print(df.head())

# Save the processed dataframe to a new CSV file (without the index column)
df.to_csv('titanic_processed.csv', index=False)

print("Feature engineering and data preprocessing completed. Processed data saved to 'titanic_processed.csv'.")