In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the raw training and test CSVs (in that order) into DataFrames.
original_data, final_test_data = map(
    pd.read_csv, ('data/train.csv', 'data/test.csv')
)




In [4]:
# function definitions

def feature_engineering(df):
    """Derive model features on ``df`` in place and return the same frame.

    Steps performed:

    * ``FamilySize`` = ``SibSp`` + ``Parch`` + 1, and ``IsAlone`` (1 when
      ``FamilySize`` is 1, else 0).
    * ``Title`` extracted from ``Name`` (the word followed by a period),
      with uncommon titles collapsed to ``'Rare'`` and ``Mlle``/``Ms``/``Mme``
      mapped onto ``Miss``/``Miss``/``Mrs``.
    * Missing ``Age`` and ``Fare`` filled with the column median; missing
      ``Embarked`` filled with the most frequent value.
    * ``AgeGroup`` (5 equal-width bins) and ``FareGroup`` (quartiles).
    * ``Name``, ``Ticket`` and ``Cabin`` dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``SibSp``, ``Parch``, ``Name``, ``Age``, ``Fare``,
        ``Embarked``, ``Ticket`` and ``Cabin``. The frame is modified in
        place; pass a copy if the original must be preserved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    The medians, the mode and the bin edges are computed from the frame
    passed in, so two frames processed by separate calls are not guaranteed
    to share the same fill values or the same ``AgeGroup``/``FareGroup`` bins.
    """
    # The passenger is counted as part of their own family, hence the + 1.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype('int64')

    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']
    title_map = {title: 'Rare' for title in rare_titles}
    title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    df['Title'] = df['Title'].replace(title_map)

    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form writes to a temporary under pandas Copy-on-Write
    # and leaves `df` unchanged.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # mode() is empty when every value is missing; skip the fill in that case
    # rather than failing on the [0] lookup.
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode[0])

    df['AgeGroup'] = pd.cut(df['Age'], 5)
    df['FareGroup'] = pd.qcut(df['Fare'], 4)

    # In-place drop keeps the "mutate and return the same frame" contract.
    df.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

    return df

In [5]:
def one_hot_encode(df):
    """Return a new frame with ``Sex``, ``Embarked`` and ``Title`` one-hot encoded.

    Each of the three columns is replaced by one indicator column per
    distinct value; all other columns are carried over unchanged. The input
    frame is not modified.
    """
    categorical_columns = ['Sex', 'Embarked', 'Title']
    return pd.get_dummies(df, columns=categorical_columns)

In [6]:
def category_encoding(df):
    """Replace the categorical ``AgeGroup``/``FareGroup`` columns with their integer codes.

    Both columns must have a categorical dtype. The frame is modified in
    place and the same object is returned.
    """
    for binned_column in ('AgeGroup', 'FareGroup'):
        df[binned_column] = df[binned_column].cat.codes
    return df

In [7]:
def normalize(df, features=None, scaler=None):
    """Return a copy of ``df`` with the selected numeric columns scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``features``. It is not
        modified; a copy is scaled and returned.
    features : sequence of str, optional
        Columns to scale. Defaults to ``['Age', 'Fare', 'FamilySize']``.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When supplied it is only used to transform,
        so statistics learned on one frame can be applied to another. When
        omitted, a new ``StandardScaler`` is fitted on ``df`` itself, which
        is what calling ``normalize(df)`` has always done.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``features`` columns replaced by their
        scaled values.
    """
    if features is None:
        features = ['Age', 'Fare', 'FamilySize']
    else:
        features = list(features)

    standardized_df = df.copy()

    if scaler is None:
        # No scaler supplied: learn mean/variance from this very frame.
        scaler = StandardScaler()
        scaler.fit(standardized_df[features])

    standardized_df[features] = scaler.transform(standardized_df[features])

    return standardized_df

In [7]:
# Run the full preprocessing chain on a copy so `original_data` stays untouched.
training_data = (
    original_data.copy()
    .pipe(feature_engineering)
    .pipe(one_hot_encode)
    .pipe(category_encoding)
    .pipe(normalize)
)

In [8]:
# Same preprocessing chain for the held-out test file, again on a copy.
# Each step derives its statistics from the frame it receives, so the fill
# values, bins and scaling here come from the test rows themselves.
final_test_data_copy = (
    final_test_data.copy()
    .pipe(feature_engineering)
    .pipe(one_hot_encode)
    .pipe(category_encoding)
    .pipe(normalize)
)



In [14]:
# Hold out 20% of the processed training rows as a local "pre-test" set.
# random_state is fixed so the same rows land in each split on every run.
# (train_test_split is imported with the other dependencies at the top.)
train_data, pre_test_data = train_test_split(
    training_data,
    test_size=0.2,
    random_state=42,
)




In [15]:
# Persist both halves of the split as CSV, omitting the row index.
pre_test_data.to_csv(path_or_buf='data/pre_test.csv', index=False)
train_data.to_csv(path_or_buf='data/train_data.csv', index=False)

In [8]:
# Persist the full (unsplit) processed training frame, omitting the row index.
training_data.to_csv(path_or_buf='data/final_training_data.csv', index=False)



In [None]:
# Persist the processed test frame, omitting the row index.
final_test_data_copy.to_csv(path_or_buf='data/final_test_data.csv', index=False)

