In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [56]:
# Load the dataset
df = pd.read_csv("C:\\Users\\asaha\\Downloads\\titanic.csv")

In [57]:
# Display the first few rows of the dataframe
print(df.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


# DATA PREPROCESSING 

In [58]:
# Drop irrelevant columns
df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [69]:
# Handle missing values
# For 'Age', we'll use the mean of the column to fill missing values
mean_age = df['Age'].mean()
df.fillna({'Age': mean_age}, inplace=True)

In [70]:
# Debugging: Check for missing values before imputation
print("Missing values before filling (Embarked):")
print(df['Embarked'].isnull().sum())

Missing values before filling (Embarked):
2


In [72]:
# For 'Embarked', we'll use the most frequent value to fill missing values manually
most_frequent_embarked = df['Embarked'].mode().iloc[0]
df.fillna({'Embarked': most_frequent_embarked}, inplace=True)

In [73]:
# Debugging: Check for missing values after imputation
print("Missing values after filling (Embarked):")
print(df['Embarked'].isnull().sum())

Missing values after filling (Embarked):
0


In [74]:
# Encode categorical variables
label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])
df['Embarked'] = label_encoder.fit_transform(df['Embarked'])

# FEATURE ENGINEERING

In [75]:
# Create new features
# Family Size
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

In [76]:
# Is Alone (binary feature indicating if the passenger was alone)
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

In [77]:
# Age Groups
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 20, 40, 60, 80], labels=['Child', 'Teenager', 'Adult', 'Middle-aged', 'Senior'])
df['AgeGroup'] = label_encoder.fit_transform(df['AgeGroup'].astype(str))

In [78]:
# Fare Bins
df['FareBin'] = pd.qcut(df['Fare'], 4, labels=[1, 2, 3, 4])

In [79]:
# Drop columns that have been transformed into new features
df.drop(['SibSp', 'Parch', 'Age', 'Fare'], axis=1, inplace=True)

# FINAL STEPS

In [80]:
# Separate features and target variable
X = df.drop('Survived', axis=1)
y = df['Survived']

In [81]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [82]:
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [83]:
# Display the first few rows of the preprocessed training data
print(pd.DataFrame(X_train).head())

          0         1         2         3         4         5         6
0 -1.614136  0.724310  0.563525 -0.554666  0.812203  0.783679  0.455844
1 -0.400551  0.724310  0.563525 -0.554666  0.812203 -0.661864 -0.445714
2  0.813034  0.724310  0.563525 -0.554666  0.812203 -0.661864 -0.445714
3  0.813034  0.724310  0.563525  0.040096 -1.231219 -0.661864 -1.347272
4  0.813034 -1.380624  0.563525  3.013909 -1.231219  0.060908  1.357401


#### At this point, X_train and X_test are ready to be used in machine learning models