In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant

In [2]:
# Read the raw passenger table from the CSV file into `df`.
raw_csv_path = "../data/titanic_dataset.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Display the column labels of the frame that was just loaded.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# Keep only the columns used by the rest of the preprocessing.
# .copy() makes `df` an independent frame instead of a selection of the
# loaded one, so the column assignments in later cells do not trigger
# SettingWithCopyWarning or depend on view-vs-copy behaviour.
req_cols = ['Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin', 'Fare', 'Embarked', 'Survived']
df = df[req_cols].copy()
df

Unnamed: 0,Name,Sex,Age,SibSp,Parch,Cabin,Fare,Embarked,Survived
0,"Braund, Mr. Owen Harris",male,22.0,1,0,,7.2500,S,0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,C85,71.2833,C,1
2,"Heikkinen, Miss. Laina",female,26.0,0,0,,7.9250,S,1
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,C123,53.1000,S,1
4,"Allen, Mr. William Henry",male,35.0,0,0,,8.0500,S,0
...,...,...,...,...,...,...,...,...,...
865,"Bystrom, Mrs. (Karolina)",female,42.0,0,0,,13.0000,S,1
866,"Duran y More, Miss. Asuncion",female,27.0,1,0,,13.8583,C,1
867,"Roebling, Mr. Washington Augustus II",male,31.0,0,0,A24,50.4958,S,0
868,"van Melkebeke, Mr. Philemon",male,,0,0,,9.5000,S,0


In [5]:
# Replace the full passenger name with the title found inside it.
# The pattern captures the first run of ASCII letters that is directly
# followed by a period (e.g. "Mr" in "Braund, Mr. Owen Harris").
# It is written as a raw string: in a regular string literal '\.' is an
# invalid escape sequence that newer Python versions warn about.
titles = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
# Put Title in the position Name occupies, then remove Name.
df.insert(df.columns.get_loc('Name'), 'Title', titles)
df = df.drop(columns=['Name'])
df


Unnamed: 0,Title,Sex,Age,SibSp,Parch,Cabin,Fare,Embarked,Survived
0,Mr,male,22.0,1,0,,7.2500,S,0
1,Mrs,female,38.0,1,0,C85,71.2833,C,1
2,Miss,female,26.0,0,0,,7.9250,S,1
3,Mrs,female,35.0,1,0,C123,53.1000,S,1
4,Mr,male,35.0,0,0,,8.0500,S,0
...,...,...,...,...,...,...,...,...,...
865,Mrs,female,42.0,0,0,,13.0000,S,1
866,Miss,female,27.0,1,0,,13.8583,C,1
867,Mr,male,31.0,0,0,A24,50.4958,S,0
868,Mr,male,,0,0,,9.5000,S,0


In [6]:
# Fill missing Embarked entries with the most frequent value of the column.
embarked_mode = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(embarked_mode)
# Number of Embarked values still missing after the fill.
df['Embarked'].isna().sum()

np.int64(0)

In [7]:
# A fare of exactly 0 is treated as missing: blank those entries out first.
fare_without_zeros = df['Fare'].mask(df['Fare'] == 0)
# Fill every missing fare with the median of the remaining fares.
df['Fare'] = fare_without_zeros.fillna(fare_without_zeros.median())
# Number of Fare values still missing after the fill.
df['Fare'].isnull().sum()



np.int64(0)

In [8]:
# Fill missing ages with the median of the known ages.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)
# Number of Age values still missing after the fill.
df['Age'].isnull().sum()

np.int64(0)

In [9]:
# Reduce Cabin to its first character (the deck letter); passengers with no
# recorded cabin get the placeholder 'U'.
# Missing cabins are detected with notna() instead of relying on NaN being
# stringified to 'nan' and then replacing the letter 'n', which would also
# relabel any real cabin code that starts with a lowercase "n".
cabin_known = df['Cabin'].notna()
deck = df['Cabin'].astype(str).str[0].where(cabin_known, 'U')
# Put Deck in the position Cabin occupies, then remove Cabin.
df.insert(df.columns.get_loc('Cabin'), 'Deck', deck)
df = df.drop(columns=['Cabin'])
df



Unnamed: 0,Title,Sex,Age,SibSp,Parch,Deck,Fare,Embarked,Survived
0,Mr,male,22.0,1,0,U,7.2500,S,0
1,Mrs,female,38.0,1,0,C,71.2833,C,1
2,Miss,female,26.0,0,0,U,7.9250,S,1
3,Mrs,female,35.0,1,0,C,53.1000,S,1
4,Mr,male,35.0,0,0,U,8.0500,S,0
...,...,...,...,...,...,...,...,...,...
865,Mrs,female,42.0,0,0,U,13.0000,S,1
866,Miss,female,27.0,1,0,U,13.8583,C,1
867,Mr,male,31.0,0,0,A,50.4958,S,0
868,Mr,male,28.0,0,0,U,9.5000,S,0


In [10]:
# Frequency of each Deck value; dropna=False would also list NaN if any remained.
df['Deck'].value_counts(dropna=False)

Deck
U    671
C     57
B     45
D     32
E     32
A     15
F     13
G      4
T      1
Name: count, dtype: int64

In [11]:
# List the distinct values of every categorical column as indented JSON.
cat_cols = ['Embarked', 'Deck', 'Sex', 'Title']
unique_values = {}
for col in cat_cols:
    # Values appear in order of first occurrence in the frame.
    unique_values[col] = df[col].unique().tolist()
pretty_json = json.dumps(unique_values, indent=2)
print(pretty_json)

{
  "Embarked": [
    "S",
    "C",
    "Q"
  ],
  "Deck": [
    "U",
    "C",
    "E",
    "G",
    "D",
    "A",
    "B",
    "F",
    "T"
  ],
  "Sex": [
    "male",
    "female"
  ],
  "Title": [
    "Mr",
    "Mrs",
    "Miss",
    "Master",
    "Don",
    "Rev",
    "Dr",
    "Mme",
    "Ms",
    "Major",
    "Lady",
    "Sir",
    "Mlle",
    "Col",
    "Capt",
    "Countess",
    "Jonkheer"
  ]
}


In [12]:
# Replace Fare by log(1 + Fare).
# Run this cell once per fresh `df`: running it again applies the log twice.
df = df.assign(Fare=np.log1p(df['Fare']))

In [13]:
# for decision trees
# Adds the IsAlone flag and label-encodes the categorical columns in place.

# new feature
# IsAlone is 1 when SibSp + Parch + 1 equals 1, otherwise 0.
family_size = df['SibSp'] + df['Parch'] + 1
is_alone = (family_size == 1).astype(int)
# Drop any IsAlone left from an earlier run first, so the column always
# lands immediately before Age instead of after it on a re-run.
df = df.drop(columns=['IsAlone'], errors='ignore')
df.insert(df.columns.get_loc('Age'), 'IsAlone', is_alone)

cat_cols = ['Embarked', 'Deck', 'Sex', 'Title']

# One encoder per column, kept in `label_encoders`, so every column's
# code -> label mapping stays available (classes_ / inverse_transform).
# Refitting a single shared encoder would keep only the last column's mapping.
# NOTE: run once per fresh `df`; re-encoding already encoded integers would
# sort them as strings and change the codes.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

df.head()


Unnamed: 0,Title,Sex,IsAlone,Age,SibSp,Parch,Deck,Fare,Embarked,Survived
0,12,1,0,22.0,1,0,8,2.110213,2,0
1,13,0,0,38.0,1,0,2,4.280593,0,1
2,9,0,1,26.0,0,0,8,2.188856,2,1
3,13,0,0,35.0,1,0,2,3.990834,2,1
4,12,1,1,35.0,0,0,8,2.202765,2,0


In [13]:
# target
target = df['Survived']
df = df.drop(columns=['Survived'])

# new Features
df['IsAlone'] = ((df['SibSp'] + df['Parch'] + 1) == 1).astype(int)
df.insert(df.columns.get_loc('Age'), 'IsAlone', df.pop('IsAlone'))

cat_cols = ['Embarked', 'Deck', 'Sex', 'Title']

# dummy variables
df_dummies = pd.get_dummies(df[cat_cols], drop_first=True)

# original features + new
original_features = df.drop(columns=cat_cols)

df = pd.concat([df_dummies, original_features, target], axis=1)
df

Unnamed: 0,Embarked_Q,Embarked_S,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U,...,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,IsAlone,Age,SibSp,Parch,Fare,Survived
0,False,True,False,False,False,False,False,False,False,True,...,False,False,False,False,0,22.0,1,0,2.110213,0
1,False,False,False,True,False,False,False,False,False,False,...,True,False,False,False,0,38.0,1,0,4.280593,1
2,False,True,False,False,False,False,False,False,False,True,...,False,False,False,False,1,26.0,0,0,2.188856,1
3,False,True,False,True,False,False,False,False,False,False,...,True,False,False,False,0,35.0,1,0,3.990834,1
4,False,True,False,False,False,False,False,False,False,True,...,False,False,False,False,1,35.0,0,0,2.202765,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865,False,True,False,False,False,False,False,False,False,True,...,True,False,False,False,1,42.0,0,0,2.639057,1
866,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,0,27.0,1,0,2.698559,1
867,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,1,31.0,0,0,3.941500,0
868,False,True,False,False,False,False,False,False,False,True,...,False,False,False,False,1,28.0,0,0,2.351375,0


In [14]:
# The last column of `df` is the target; every column before it is a feature.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the shuffled
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)


In [16]:
# Preview the training features returned by the split.
X_train

Unnamed: 0,Title,Sex,IsAlone,Age,SibSp,Parch,Deck,Fare,Embarked
762,12,1,1,20.0,0,0,8,2.107689,0
670,13,0,0,40.0,1,1,8,3.688879,2
457,13,0,0,28.0,1,0,3,3.967694,2
522,12,1,1,28.0,0,0,8,2.107178,0
743,12,1,0,24.0,1,0,8,2.839078,2
...,...,...,...,...,...,...,...,...,...
106,9,0,1,21.0,0,0,8,2.157559,2
270,12,1,1,28.0,0,0,8,3.465736,2
860,12,1,0,41.0,2,0,8,2.715244,2
435,9,0,0,14.0,1,2,1,4.795791,2


In [17]:
# save dataset without scaling for decision tree
# Parch and SibSp are removed from both splits before saving.
X_train = X_train.drop(columns=['Parch', 'SibSp'])
X_test = X_test.drop(columns=['Parch', 'SibSp'])

# feature_names is stored as a fixed-width unicode array rather than an
# object array: object arrays are pickled inside the archive and cannot be
# read back with np.load's default allow_pickle=False.
np.savez(
    "../data/tree_processed_data.npz",
    X_train=X_train.to_numpy(dtype=np.float64),
    X_test=X_test.to_numpy(dtype=np.float64),
    y_train=y_train.to_numpy(dtype=np.float64),
    y_test=y_test.to_numpy(dtype=np.float64),
    feature_names=X_train.columns.to_numpy(dtype=str)
)

In [17]:
# Standardise Fare and Age. The scaler is fitted on the training split only
# and the fitted parameters are then applied to the test split.
scaled_cols = ['Fare', 'Age']
scaler = StandardScaler()
X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])
X_test[scaled_cols] = scaler.transform(X_test[scaled_cols])

In [18]:
# Preview the training features after Fare and Age have been scaled.
X_train

Unnamed: 0,Embarked_Q,Embarked_S,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U,...,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,IsAlone,Age,SibSp,Parch,Fare
762,False,False,False,False,False,False,False,False,False,True,...,True,False,False,False,False,1,-0.714269,0,0,-1.017865
670,False,True,False,False,False,False,False,False,False,True,...,False,True,False,False,False,0,0.806999,1,1,0.741386
457,False,True,False,False,True,False,False,False,False,False,...,False,True,False,False,False,0,-0.105762,1,0,1.051599
522,False,False,False,False,False,False,False,False,False,True,...,True,False,False,False,False,1,-0.105762,0,0,-1.018433
743,False,True,False,False,False,False,False,False,False,True,...,True,False,False,False,False,0,-0.410016,1,0,-0.204112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,False,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,1,-0.638206,0,0,-0.962378
270,False,True,False,False,False,False,False,False,False,True,...,True,False,False,False,False,1,-0.105762,0,0,0.493114
860,False,True,False,False,False,False,False,False,False,True,...,True,False,False,False,False,0,0.883062,2,0,-0.341891
435,False,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,0,-1.170649,1,2,1.972948


In [19]:
# Positional index of the 'Age' column within X_train.
X_train.columns.get_loc('Age')

28

In [20]:
# Select the columns from Age onwards for the collinearity check.
# The start position is looked up from the column label instead of being
# hard-coded as 28, so it stays correct if the number of dummy columns changes.
age_pos = X_train.columns.get_loc('Age')
X_subset = X_train.iloc[:, age_pos:]
# Add intercept
X_const = add_constant(X_subset)

In [21]:
# check for multicollinearity
# Compute the variance inflation factor of every column of X_const
# (including the added constant) and print them, largest first.
design_matrix = X_const.values
vif_values = []
for col_idx in range(X_const.shape[1]):
    vif_values.append(variance_inflation_factor(design_matrix, col_idx))
vif_data = pd.DataFrame({'Feature': X_const.columns, 'VIF': vif_values})
print(vif_data.sort_values('VIF', ascending=False))

  Feature       VIF
0   const  1.421920
3   Parch  1.408773
2   SibSp  1.378020
4    Fare  1.270644
1     Age  1.150392


In [22]:
# Remove Parch and SibSp from both splits.
dropped_cols = ['Parch', 'SibSp']
X_train, X_test = (split.drop(columns=dropped_cols) for split in (X_train, X_test))

In [23]:
# recheck collinearity
# Same VIF computation as before, on the columns from Age onwards that remain
# after the drop. The start position is looked up from the column label
# instead of the hard-coded 28, so it follows any change in column layout.
age_pos = X_train.columns.get_loc('Age')
X_subset = X_train.iloc[:, age_pos:]
X_const = add_constant(X_subset)
vif_data = pd.DataFrame({
    'Feature': X_const.columns,
    'VIF': [variance_inflation_factor(X_const.values, i)
            for i in range(X_const.shape[1])]
})
print(vif_data.sort_values('VIF', ascending=False))

  Feature       VIF
2    Fare  1.015369
1     Age  1.015369
0   const  1.000000


In [24]:
# save everything in one .npz file
# Features and targets are written as float64 arrays.
# feature_names is stored as a fixed-width unicode array rather than an
# object array: object arrays are pickled inside the archive and cannot be
# read back with np.load's default allow_pickle=False.
np.savez(
    "../data/processed_data.npz",
    X_train=X_train.to_numpy(dtype=np.float64),
    X_test=X_test.to_numpy(dtype=np.float64),
    y_train=y_train.to_numpy(dtype=np.float64),
    y_test=y_test.to_numpy(dtype=np.float64),
    feature_names=X_train.columns.to_numpy(dtype=str)
)

In [25]:
# post preprocessing check
from scipy.stats import skew, kurtosis

In [26]:
# Read the saved archive back; from here on the four split names refer to
# plain NumPy arrays rather than pandas objects.
data = np.load("../data/processed_data.npz")
X_train, X_test, y_train, y_test = (
    data[key] for key in ("X_train", "X_test", "y_train", "y_test")
)

In [27]:
# basic statistics
# Column-wise summary of the columns from index 28 onwards.
tail_cols = X_train[:, 28:]
print("Shape:", tail_cols.shape)
print("Mean:", np.mean(tail_cols, axis=0))
print("Median:", np.median(tail_cols, axis=0))
print("Std deviation:", np.std(tail_cols, axis=0))
print("Min:", np.min(tail_cols, axis=0))
print("Max:", np.max(tail_cols, axis=0))
print("Range:", np.ptp(tail_cols, axis=0))

Shape: (696, 2)
Mean: [-8.67760525e-17 -4.84924999e-17]
Median: [-0.10576199 -0.31341322]
Std deviation: [1. 1.]
Min: [-2.18457452 -1.56944611]
Max: [3.84953443 3.58081291]
Range: [6.03410895 5.15025902]


In [28]:
# count missing values
# Per-column number of NaN entries in the columns from index 28 onwards.
nan_mask = np.isnan(X_train[:, 28:])
print("NaN count per column:", nan_mask.sum(axis=0))

NaN count per column: [0 0]


In [29]:
# percentiles
# Column-wise quartiles of the columns from index 28 onwards, computed in a
# single call; each row of the result belongs to one requested percentile.
q25, q50, q75 = np.percentile(X_train[:, 28:], [25, 50, 75], axis=0)

print("25%:", q25)
print("50% (median):", q50)
print("75%:", q75)

25%: [-0.56214235 -0.91272195]
50% (median): [-0.10576199 -0.31341322]
75%: [0.42668175 0.50263489]


In [30]:
# Column-wise skewness and kurtosis (scipy defaults) of the columns from
# index 28 onwards.
tail_cols = X_train[:, 28:]
print("Skewness:", skew(tail_cols, axis=0))
print("Kurtosis:", kurtosis(tail_cols, axis=0))

Skewness: [0.47397642 0.97373913]
Kurtosis: [0.90117947 0.29255347]
