# Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OrdinalEncoder, MinMaxScaler
import joblib

In [2]:
def power(x):
    """Return ``x`` raised to the power 0.6.

    Works on anything that supports ``**`` with a float exponent (scalars,
    NumPy arrays, pandas objects); the result has whatever type that
    operation yields for ``x``.
    """
    return pow(x, 0.6)

In [3]:
def sum_family(X):
    """Add the first two columns of ``X`` and return them as one column.

    ``X`` is indexed positionally (``X[:, 0]`` and ``X[:, 1]``), so it must be
    a 2-D array-like supporting that indexing. The element-wise sum is
    reshaped to ``(-1, 1)`` so the result is a single-column 2-D array.
    """
    first_col = X[:, 0]
    second_col = X[:, 1]
    combined = first_col + second_col
    return combined.reshape(-1, 1)

In [4]:
def family_feature_names_out(transformer=None, input_features=None):
    """Return the output feature name for the ``sum_family`` transformer.

    scikit-learn invokes a callable ``feature_names_out`` with two positional
    arguments, ``(transformer, input_features)``. A one-parameter function
    raises ``TypeError`` as soon as ``get_feature_names_out()`` is called, so
    both arguments are accepted here. Both are optional so that the earlier
    single-argument call form (positional or ``input_features=...``) still
    works.

    Parameters
    ----------
    transformer : object, optional
        The ``FunctionTransformer`` instance passed by scikit-learn. Unused.
    input_features : array-like of str, optional
        Names of the input columns. Unused: the output name is fixed.

    Returns
    -------
    list of str
        Always ``["FamilySize"]``.
    """
    return ["FamilySize"]

In [5]:
# Column names grouped by how preprocessing treats them:
# columns that are dropped outright, and columns that are summed together.
attributes_to_drop = [
    'Name',
    'Ticket',
    'Cabin',
    'Embarked',
]
attributes_to_sum = [
    'SibSp',
    'Parch',
]

In [6]:
# Age preprocessing: impute missing values, apply the `power` transform
# (x ** 0.6), then rescale to the [0, 1] range.
# NOTE(review): the ColumnTransformer feeds this pipeline a single column
# ("Age"), so IterativeImputer has no other features to model from; confirm
# that is intended (it may reduce to its initial fill strategy).
age_pipeline = make_pipeline(
    IterativeImputer(),
    # feature_names_out="one-to-one" makes get_feature_names_out() available
    # on this step (and so on the whole pipeline / ColumnTransformer); without
    # it FunctionTransformer does not expose that method. The transformed
    # values are unaffected.
    FunctionTransformer(power, feature_names_out="one-to-one"),
    MinMaxScaler(feature_range=(0, 1)),
)

In [7]:
# Column-wise preprocessing. Columns not named in any entry are passed through
# unchanged (remainder='passthrough').
preprocessing = ColumnTransformer(
    [
        # Reuse the shared column lists instead of repeating the literals.
        ("drop1", "drop", attributes_to_drop),
        ("fare", MinMaxScaler(feature_range=(0, 1)), ["Fare"]),
        # Despite the step name this is an OrdinalEncoder; the name is kept so
        # generated feature names and parameter keys stay the same.
        ("1hotencoder", OrdinalEncoder(), ["Sex"]),
        # validate=True converts the input to an array before sum_family runs,
        # which is what allows sum_family to index with X[:, i].
        (
            "family",
            FunctionTransformer(
                sum_family,
                validate=True,
                feature_names_out=family_feature_names_out,
            ),
            attributes_to_sum,
        ),
        # No separate "drop" entry is needed for SibSp/Parch: columns already
        # selected by the "family" entry are not part of the remainder, so
        # they are never passed through.
        ("age", age_pipeline, ["Age"]),
    ],
    remainder='passthrough',
    force_int_remainder_cols=False,
)

# Predictions

In [8]:
# Location of the serialized model, relative to the working directory.
MODEL_PATH = 'random_forest.pkl'
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code, so only load artifacts from a trusted source. Presumably the pickled
# object refers to the helper functions defined earlier in this notebook by
# name; confirm before renaming or removing them.
model_reloaded = joblib.load(MODEL_PATH)

In [9]:
# Location of the test CSV, relative to the working directory.
TEST_CSV_PATH = '../titanic/test.csv'
# Read the test set into a DataFrame with pandas' default parsing options.
test_data = pd.read_csv(TEST_CSV_PATH)

In [10]:
# Split the identifier column off the feature frame.
# The guard makes this cell safe to re-run: once 'PassengerId' has been
# removed, running it again is a no-op instead of raising KeyError.
if 'PassengerId' in test_data.columns:
    passenger_id = test_data['PassengerId'].copy()
    # Rebind rather than drop(..., inplace=True) so the frame object that was
    # loaded from disk is not mutated behind other references to it.
    test_data = test_data.drop(columns='PassengerId')
elif 'passenger_id' not in globals():
    # First run and the column is genuinely missing: fail loudly, as before.
    raise KeyError('PassengerId')

In [11]:
# Run the reloaded model on the test frame; 'PassengerId' has already been
# removed from test_data at this point, so it is not passed to the model.
predictions = model_reloaded.predict(test_data)

In [12]:
# Pair every passenger id with its prediction (one tuple per row), then build
# the two-column result frame from those rows.
submission_rows = list(zip(passenger_id, predictions))
predictions_df = pd.DataFrame(
    submission_rows,
    columns=['PassengerId', 'Survived'],
)

In [13]:
# Show the assembled predictions as the cell output (the rendered table is
# truncated in the middle for long frames).
predictions_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [14]:
# Output file, relative to the working directory.
PREDICTIONS_CSV_PATH = 'predictions.csv'
# index=False keeps the DataFrame's row index out of the file, so the CSV has
# only the 'PassengerId' and 'Survived' columns.
predictions_df.to_csv(PREDICTIONS_CSV_PATH, index=False)