In [6]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the raw train and test splits
train = pd.read_csv("../data/raw/train.csv")
test = pd.read_csv("../data/raw/test.csv")

# Tag each split so the rows can be told apart again after the merge.
# After the concat, TrainSplit is NaN on test rows and TestSplit is NaN on train rows.
train["TrainSplit"] = 1
test["TestSplit"] = 0
test["Survived"] = None  # placeholder so both frames expose a "Survived" column

# ignore_index=True gives every row a unique label; without it both splits keep
# their own 0..n-1 labels and the combined index contains duplicates.
full = pd.concat([train, test], sort=False, ignore_index=True)

# Extract the title: the run of letters that follows a space and ends at a ".".
# Raw string so "\." reaches the regex engine untouched (a plain "\." is an
# invalid string escape and raises a SyntaxWarning on recent Python versions).
full["Title"] = full["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)

# Normalize all titles to a standard set: fold spelling variants into
# Miss / Mrs and group the infrequent titles under "Rare".
rare_titles = ["Lady", "Countess", "Capt", "Col", "Don", "Dr",
               "Major", "Rev", "Sir", "Jonkheer", "Dona"]
title_map = {"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"}
title_map.update({title: "Rare" for title in rare_titles})
full["Title"] = full["Title"].replace(title_map)

print(full["Title"].value_counts())

Title
Mr        757
Miss      264
Mrs       198
Master     61
Rare       29
Name: count, dtype: int64


In [7]:
# Count the missing values in every column of the combined frame
missing_per_column = full.isna().sum()
print(missing_per_column)

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
TrainSplit      418
TestSplit       891
Title             0
dtype: int64


In [3]:
# Fill the gaps in Age, Fare and Embarked.
# NOTE(review): the fill values are computed on `full`, i.e. on train and test
# rows together, so test-set information flows into the training features.
age_median = full["Age"].median()
fare_median = full["Fare"].median()
embarked_mode = full["Embarked"].mode()[0]  # mode() returns a Series; take the first entry

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# `full[col]`: the in-place call acts on a temporary Series, which is deprecated
# in pandas 2.x and does not update `full` at all under Copy-on-Write.

# Use median for Age
full["Age"] = full["Age"].fillna(age_median)

# Use median for Fare
full["Fare"] = full["Fare"].fillna(fare_median)

# Use mode for Embarked
full["Embarked"] = full["Embarked"].fillna(embarked_mode)

# Check NaNs
print(full.isnull().sum())

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1014
Embarked          0
TrainSplit      418
TestSplit       891
Title             0
dtype: int64


In [4]:
# Replace each categorical column with integer codes.
# One encoder per column is kept in `encoders` so every fitted mapping
# (encoders[col].classes_) stays available, e.g. for inverse_transform.
# Refitting a single shared encoder would retain only the last column's mapping.
categorical_cols = ["Sex", "Embarked", "Title"]
encoders = {}

for col in categorical_cols:
    encoders[col] = LabelEncoder()
    full[col] = encoders[col].fit_transform(full[col])

# `label` keeps its previous meaning: the encoder fitted on the last column.
label = encoders[categorical_cols[-1]]

print(full[categorical_cols].head(10))


   Sex  Embarked  Title
0    1         2      2
1    0         0      3
2    0         2      1
3    0         2      3
4    1         2      2
5    1         1      2
6    1         2      2
7    1         2      0
8    0         2      3
9    0         0      3


In [None]:
# Split the combined frame back into its two parts using the marker columns.
# .copy() detaches the subsets from `full`, so later edits to them neither warn
# (SettingWithCopyWarning) nor depend on whether pandas returned a view.
train_df = full[full["TrainSplit"] == 1].copy()
test_df = full[full["TestSplit"] == 0].copy()

features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "Title"]

X = train_df[features]
# Survived was padded with None for the test rows before the merge, so the
# column is not guaranteed to be integer-typed here; cast it explicitly.
y = train_df["Survived"].astype(int)
X_test = test_df[features]

# Hold out 20% of the labelled rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# to_csv does not create missing directories, so make sure the target exists.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
train_path = processed_dir / "train_clean.csv"
test_path = processed_dir / "test_clean.csv"

# Save the features plus the integer-typed target, in the same column order
# as features + ["Survived"].
X.assign(Survived=y).to_csv(train_path, index=False)
X_test.to_csv(test_path, index=False)

# Read the files back as a sanity check on what was written
train_clean = pd.read_csv(train_path)
test_clean = pd.read_csv(test_path)

print(train_clean.head())
print(test_clean.head())
