# Cleaning up the notebook

Let's start by putting our "final product" code into one cell.

In [None]:
import pandas
import pandas._typing
import sklearn.base
import sklearn.ensemble
import sklearn.model_selection


train_raw = pandas.read_csv("train.csv")
test_raw = pandas.read_csv("test.csv")

train = train_raw.set_index("PassengerId")
test = test_raw.set_index("PassengerId")
y = train["Survived"]

columns = ["Pclass", "Sex", "Age", "Fare"]
# Copy so the encoding below edits our own frame, not a slice of `train`.
X = train[columns].copy()

# TODO use an actual encoder
# Encode the "Sex" column only. Assigning through `.loc[mask, :]` would
# overwrite every feature in the matching rows with the same constant.
sex_codes = {"male": 1, "female": 0}
X["Sex"] = X["Sex"].map(sex_codes)

# The raw columns may contain missing values, which not every estimator
# accepts. Fill them with medians computed on the training features only.
feature_medians = X.median()
X = X.fillna(feature_medians)


def train_test_model(
        X: pandas._typing.ArrayLike,
        y: pandas._typing.ArrayLike,  # TODO score column as a param and pass full DF?
        model: sklearn.base.BaseEstimator,
        # TODO scorer as a param?
    ) -> (sklearn.base.BaseEstimator, float):
    """Fit ``model`` on one part of the data and score it on the held-out part.

    The data is divided with ``sklearn.model_selection.train_test_split``
    using ``random_state=0``, so repeated calls produce the same split.

    Args:
        X: Feature matrix of the full labelled data set.
        y: Target values aligned with the rows of ``X``.
        model: Unfitted estimator; it is fitted in place.

    Returns:
        Tuple of the trained estimator and its accuracy on the held-out part.
    """
    # The cell only imports sklearn.model_selection; import the metrics
    # submodule explicitly instead of relying on it being loaded indirectly.
    import sklearn.metrics

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=0
    )
    model.fit(X_train, y_train)
    # accuracy_score takes (y_true, y_pred) in that order.
    accuracy = sklearn.metrics.accuracy_score(y_test, model.predict(X_test))
    return model, accuracy


estimator, accuracy = train_test_model(X, y, sklearn.ensemble.RandomForestClassifier(random_state=0))

# Apply exactly the same preparation to the test features.
X_test = test[columns].copy()
X_test["Sex"] = X_test["Sex"].map(sex_codes)
X_test = X_test.fillna(feature_medians)
preds = estimator.predict(X_test)

outputs = X_test.copy()

outputs["Survived"] = preds
outputs = outputs["Survived"]
outputs.to_csv("preds.csv")
!head preds.csv

In [None]:
import pandas
import pandas._typing
import sklearn.base
import sklearn.ensemble
import sklearn.model_selection

import utils


train_raw = pandas.read_csv("train.csv")
test_raw = pandas.read_csv("test.csv")

train = train_raw.set_index("PassengerId")
test = test_raw.set_index("PassengerId")
y = train["Survived"]

columns = ["Pclass", "Sex", "Age", "Fare"]
# Copy so the encoding below edits our own frame, not a slice of `train`.
X = train[columns].copy()

# TODO use an actual encoder
# Encode the "Sex" column only. Assigning through `.loc[mask, :]` would
# overwrite every feature in the matching rows with the same constant.
sex_codes = {"male": 1, "female": 0}
X["Sex"] = X["Sex"].map(sex_codes)

# The raw columns may contain missing values, which not every estimator
# accepts. Fill them with medians computed on the training features only.
feature_medians = X.median()
X = X.fillna(feature_medians)

estimator, accuracy = utils.train_test_model(X, y, sklearn.ensemble.RandomForestClassifier(random_state=0))

# Apply exactly the same preparation to the test features.
X_test = test[columns].copy()
X_test["Sex"] = X_test["Sex"].map(sex_codes)
X_test = X_test.fillna(feature_medians)
preds = estimator.predict(X_test)

outputs = X_test.copy()

outputs["Survived"] = preds
outputs = outputs["Survived"]
outputs.to_csv("preds.csv")
!head preds.csv

Let's get the actual encoder in next.

In [None]:
import pandas
import sklearn.compose
import sklearn.ensemble
import sklearn.pipeline
import sklearn.preprocessing

import utils


# Needed for the imputation step below; the cell's import list does not
# include this submodule.
import sklearn.impute

train_raw = pandas.read_csv("train.csv")
test_raw = pandas.read_csv("test.csv")

train = train_raw.set_index("PassengerId")
test = test_raw.set_index("PassengerId")
y = train["Survived"]

columns = ["Pclass", "Sex", "Age", "Fare"]
X = train[columns]

categorical_encoder = sklearn.preprocessing.OrdinalEncoder()
transformers = sklearn.compose.make_column_transformer(
    (categorical_encoder, ["Sex"]),
    # The default remainder="drop" would discard every column except "Sex".
    remainder="passthrough",
)
pipeline = sklearn.pipeline.make_pipeline(
    transformers,
    # The passed-through columns may contain missing values; fill them with
    # per-column medians learned during fit.
    sklearn.impute.SimpleImputer(strategy="median"),
    sklearn.ensemble.RandomForestClassifier(random_state=0),
)

estimator, accuracy = utils.train_test_model(X, y, pipeline)
X_test = test[columns]
preds = estimator.predict(X_test)

outputs = X_test.copy()

outputs["Survived"] = preds
outputs = outputs["Survived"]
outputs.to_csv("preds.csv")
!head preds.csv

Next I will break this out into functions, as the current code is a bit of an eyesore.

In [None]:
import pandas
import sklearn.compose
import sklearn.ensemble
import sklearn.pipeline
import sklearn.preprocessing

import utils


def get_train_and_test_data():
    """Load "train.csv" and "test.csv" and split the training data.

    Both files are indexed by their "PassengerId" column.

    Returns:
        Tuple of (training features, training target, test frame). The test
        frame is returned with all of its columns.
    """
    raw_frames = [pandas.read_csv(path) for path in ("train.csv", "test.csv")]
    labelled, unlabelled = (
        frame.set_index("PassengerId") for frame in raw_frames
    )
    target = labelled["Survived"]
    features = labelled[["Pclass", "Sex", "Age", "Fare"]]
    return features, target, unlabelled


def make_pipeline():
    """Build the unfitted preprocessing-plus-model pipeline.

    The pipeline ordinal-encodes the "Sex" column, passes every other input
    column through unchanged, fills missing values with per-column medians,
    and ends with a random forest classifier (``random_state=0``).

    Returns:
        The unfitted pipeline.
    """
    # Imported locally because the surrounding cell does not import
    # sklearn.impute.
    import sklearn.impute

    categorical_encoder = sklearn.preprocessing.OrdinalEncoder()
    transformers = sklearn.compose.make_column_transformer(
        (categorical_encoder, ["Sex"]),
        # The default remainder="drop" would discard every column except "Sex".
        remainder="passthrough",
    )
    pipeline = sklearn.pipeline.make_pipeline(
        transformers,
        # The passed-through columns may contain missing values.
        sklearn.impute.SimpleImputer(strategy="median"),
        sklearn.ensemble.RandomForestClassifier(random_state=0),
    )
    return pipeline

    
X_train, y_train, test = get_train_and_test_data()
# Use the values returned above. `X` and `y` are not defined in this cell and
# would only resolve to leftovers from earlier notebook cells.
estimator, accuracy = utils.train_test_model(X_train, y_train, make_pipeline())

# `columns` is local to get_train_and_test_data, so take the feature names
# from the training frame instead.
X_test = test[list(X_train.columns)]
preds = estimator.predict(X_test)

outputs = X_test.copy()
outputs["Survived"] = preds
outputs = outputs["Survived"]
outputs.to_csv("preds.csv")
!head preds.csv

These functions are still pretty hard-coded and brittle (not portable at all), and could be abstracted out a little more to make them more flexible. 

But there are a few things I can fix before abstracting them any further:

1. `columns` should be a module-level constant, as should `"Survived"`.
2. Creating the result frame and writing it to the file system should be in its own function.

Let's do those.

In [None]:
import pandas
import pandas._typing
import sklearn.base
import sklearn.compose
import sklearn.ensemble
import sklearn.pipeline
import sklearn.preprocessing

import utils


# Column that both CSV files are indexed by after loading.
INDEX_COLUMN = "PassengerId"
# Columns selected from the training data as model inputs.
FEATURE_COLUMNS = ["Pclass", "Sex", "Age", "Fare"]
# Column holding the training labels; also the name of the column written to "preds.csv".
TARGET_COLUMN = "Survived"


def get_train_and_test_data() -> (pandas.DataFrame, pandas.Series, pandas.DataFrame):
    """Load "train.csv" and "test.csv" and split the training data.

    Both files are indexed by ``INDEX_COLUMN``.

    Returns:
        Tuple of (X_train, y_train, test). ``X_train`` holds the
        ``FEATURE_COLUMNS`` of the training data and ``y_train`` its
        ``TARGET_COLUMN``. ``test`` is the test frame with all of its
        columns; it is not reduced to the feature columns here.
    """
    raw_frames = [pandas.read_csv(path) for path in ("train.csv", "test.csv")]
    labelled, unlabelled = (
        frame.set_index(INDEX_COLUMN) for frame in raw_frames
    )
    target = labelled[TARGET_COLUMN]
    features = labelled[FEATURE_COLUMNS]
    return features, target, unlabelled


def make_pipeline() -> sklearn.base.BaseEstimator:
    """Build the unfitted preprocessing-plus-model pipeline.

    The pipeline ordinal-encodes the "Sex" column, passes every other input
    column through unchanged, fills missing values with per-column medians,
    and ends with a random forest classifier (``random_state=0``).

    Returns:
        The unfitted pipeline.
    """
    # Imported locally because the surrounding cell does not import
    # sklearn.impute.
    import sklearn.impute

    categorical_encoder = sklearn.preprocessing.OrdinalEncoder()
    transformers = sklearn.compose.make_column_transformer(
        (categorical_encoder, ["Sex"]),
        # The default remainder="drop" would discard every column except "Sex".
        remainder="passthrough",
    )
    pipeline = sklearn.pipeline.make_pipeline(
        transformers,
        # The passed-through columns may contain missing values.
        sklearn.impute.SimpleImputer(strategy="median"),
        sklearn.ensemble.RandomForestClassifier(random_state=0),
    )
    return pipeline


def save_predictions_to_csv(
        X_test: pandas.DataFrame,
        preds: pandas._typing.ArrayLike,
    ):
    """Write the predictions to a CSV named "preds.csv".

    Args:
        X_test: the test set, without predictions. It is not modified; its
            index labels the rows of the output.
        preds: the predictions from the model, one per row of ``X_test``.

    Side Effects:
        Writes "preds.csv", containing the index of ``X_test`` and a single
        ``TARGET_COLUMN`` column holding the predictions.
    """
    # assign() works on a copy, so the caller's frame is left untouched.
    labelled = X_test.assign(**{TARGET_COLUMN: preds})
    labelled[TARGET_COLUMN].to_csv("preds.csv")

    
def main():
    """Load the data, train the model, predict on the test set and save the result.

    Side Effects:
        Writes the predictions to "preds.csv" via ``save_predictions_to_csv``.
    """
    X_train, y_train, test = get_train_and_test_data()
    # Use the values returned above. `X` and `y` are not defined in this
    # function and would only resolve to leftovers from earlier cells.
    estimator, _accuracy = utils.train_test_model(X_train, y_train, make_pipeline())

    # `columns` is not defined in this cell; the feature list now lives in
    # the module-level constant.
    X_test = test[FEATURE_COLUMNS]
    preds = estimator.predict(X_test)
    save_predictions_to_csv(X_test, preds)
    
    
# Run the whole workflow when this cell executes.
main()
!head preds.csv

One further change I could make later is to refactor these functions to take the feature, index, and target columns as parameters. Or not.

There are reasonable arguments for either approach (see Robert C. Martin's _Clean Code_).

However, there's still model creation and training logic in `main`. Let's get that wrapped up in its own function first.

In [6]:
import pandas
import pandas._typing
import sklearn.base
import sklearn.compose
import sklearn.ensemble
import sklearn.pipeline
import sklearn.preprocessing

import utils


# Column that both CSV files are indexed by after loading.
INDEX_COLUMN = "PassengerId"
# Columns selected from both the training and the test data as model inputs.
FEATURE_COLUMNS = ["Pclass", "Sex", "Age", "Fare"]
# Column holding the training labels; also the name of the column written to "preds.csv".
TARGET_COLUMN = "Survived"


def get_train_and_test_data() -> (pandas.DataFrame, pandas.Series, pandas.DataFrame):
    """Load "train.csv" and "test.csv" and select the modelling columns.

    Both files are indexed by ``INDEX_COLUMN``.

    Returns:
        Tuple of (X_train, y_train, X_test). The two feature frames hold
        ``FEATURE_COLUMNS``; ``y_train`` is the ``TARGET_COLUMN`` of the
        training data.
    """
    raw_frames = [pandas.read_csv(path) for path in ("train.csv", "test.csv")]
    labelled, unlabelled = (
        frame.set_index(INDEX_COLUMN) for frame in raw_frames
    )
    train_features = labelled[FEATURE_COLUMNS]
    train_target = labelled[TARGET_COLUMN]
    test_features = unlabelled[FEATURE_COLUMNS]
    return train_features, train_target, test_features


def make_pipeline() -> sklearn.base.BaseEstimator:
    """Build the unfitted preprocessing-plus-model pipeline.

    The pipeline ordinal-encodes the "Sex" column, passes every other input
    column through unchanged, fills missing values with per-column medians,
    and ends with a random forest classifier (``random_state=0``).

    Returns:
        The unfitted pipeline.
    """
    # Imported locally because the surrounding cell does not import
    # sklearn.impute.
    import sklearn.impute

    categorical_encoder = sklearn.preprocessing.OrdinalEncoder()
    transformers = sklearn.compose.make_column_transformer(
        (categorical_encoder, ["Sex"]),
        # The default remainder="drop" would discard every column except "Sex".
        remainder="passthrough",
    )
    pipeline = sklearn.pipeline.make_pipeline(
        transformers,
        # The passed-through columns may contain missing values.
        sklearn.impute.SimpleImputer(strategy="median"),
        sklearn.ensemble.RandomForestClassifier(random_state=0),
    )
    return pipeline


def train_and_predict(X_train, y_train, X_test) -> pandas._typing.ArrayLike:
    """Build a fresh pipeline, train it, and predict on the test set.

    Args:
        X_train: training features.
        y_train: training target values.
        X_test: features to make predictions for.

    Returns:
        The trained estimator's predictions for ``X_test``.
    """
    # TODO log accuracy
    # The accuracy reported by utils.train_test_model is currently discarded.
    fitted_model, _accuracy = utils.train_test_model(
        X_train, y_train, make_pipeline()
    )
    return fitted_model.predict(X_test)


def save_predictions_to_csv(
        X_test: pandas.DataFrame,
        preds: pandas._typing.ArrayLike,
    ):
    """Write the predictions to a CSV named "preds.csv".

    Args:
        X_test: the test set, without predictions. It is not modified; its
            index labels the rows of the output.
        preds: the predictions from the model, one per row of ``X_test``.

    Side Effects:
        Writes "preds.csv", containing the index of ``X_test`` and a single
        ``TARGET_COLUMN`` column holding the predictions.
    """
    # assign() works on a copy, so the caller's frame is left untouched.
    labelled = X_test.assign(**{TARGET_COLUMN: preds})
    labelled[TARGET_COLUMN].to_csv("preds.csv")

    
def main():
    """Run the workflow: load the data, train and predict, then save the predictions."""
    train_features, train_target, test_features = get_train_and_test_data()
    save_predictions_to_csv(
        test_features,
        train_and_predict(train_features, train_target, test_features),
    )
    
    
# Run the whole workflow when this cell executes.
main()
!head preds.csv

PassengerId,Survived
892,0
893,1
894,0
895,0
896,1
897,0
898,1
899,0
900,1


Look at how clean that is. 

I'm happy with this as an hour's work.