In [None]:
import os

# Both paths start unset; a value assigned here takes precedence over the
# environment, and the environment takes precedence over the built-in default.
output_dir = None
data_dir = None

if not output_dir:
    output_dir = os.getenv("OUTPUT_DIR", "/app/output_folder")
if not data_dir:
    data_dir = os.getenv("DATA_DIR", "/app/data")

# Create the output location up front; an already-existing directory is fine.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Third-party dependencies used by the cells below: pandas for reading the CSVs
# and building the output table, scikit-learn for KNN imputation, scaling,
# the logistic-regression classifier and stratified cross-validation.
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Progress marker so the executed notebook shows the import cell completed.
print('loaded libraries')

loaded libraries


In [None]:
# Load the training and test tables from the configured data directory.
train = pd.read_csv(f'{data_dir}/train.csv')
test  = pd.read_csv(f'{data_dir}/test.csv')

# Keep an untouched snapshot of the test table (PassengerId is dropped from
# `test` during preprocessing but is needed again when writing predictions).
# Copying the frame already in memory avoids parsing test.csv a second time
# and guarantees both frames come from the same read of the file.
test_raw = test.copy()  # keep PassengerId safe
print('loaded data')

loaded data


In [4]:
# Remove columns that are not used as model features. Rebinding (instead of
# inplace=True) keeps the cell safe to re-run; errors='ignore' makes a repeat
# drop a no-op.
drop_cols = ['Cabin', 'PassengerId', 'Name', 'Ticket', 'Embarked']
train = train.drop(columns=drop_cols, errors='ignore')
test = test.drop(columns=drop_cols, errors='ignore')

print("dropped ['Cabin', 'PassengerId', 'Name', 'Ticket', 'Embarked'] from data")

num_features = ["Age", "Fare", "SibSp", "Parch", "Pclass"]

scaler = StandardScaler()
imputer = KNNImputer(n_neighbors=5, weights="distance")

# NOTE(review): the scaler and imputer are fit on the full training frame here,
# i.e. before the cross-validation split performed in a later cell, so the CV
# folds share scaling/imputation statistics.

# Train: standardize, impute in standardized space, map back to original units.
scaled_train = scaler.fit_transform(train[num_features])
imputed_scaled_train = imputer.fit_transform(scaled_train)
imputed_train = pd.DataFrame(
    scaler.inverse_transform(imputed_scaled_train),
    columns=num_features,
    index=train.index,
)
# Only fill cells that were actually missing. Overwriting every cell with the
# scaled -> unscaled round trip would cast the integer columns to float and add
# floating-point round-off to values that were already known.
train[num_features] = train[num_features].fillna(imputed_train)

# Test: reuse the statistics and neighbours learned from the training frame.
scaled_test = scaler.transform(test[num_features])
imputed_scaled_test = imputer.transform(scaled_test)
imputed_test = pd.DataFrame(
    scaler.inverse_transform(imputed_scaled_test),
    columns=num_features,
    index=test.index,
)
test[num_features] = test[num_features].fillna(imputed_test)

# The message names "Age", but any missing value in num_features is imputed.
print('imputed missing "Age" values (Method = KNN)')

# Only training rows are dropped: the test frame must keep every row so the
# predictions stay aligned with the PassengerId values saved from the raw file.
train = train.dropna(subset=num_features + ['Sex', 'Survived'])

print('dropped any remaining rows with Null values')

dropped ['Cabin', 'PassengerId', 'Name', 'Ticket', 'Embarked'] from data
imputed missing "Age" values (Method = KNN)
dropped any remaining rows with Null values


In [5]:
# Single source of truth for the model's input columns.
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']

y_train = train['Survived']

# One-hot encode the training features, dropping the first level of each
# categorical column to avoid a redundant indicator.
X_train = pd.get_dummies(train[feature_cols], drop_first=True)

# Encode the test features with ALL levels kept, then align to the training
# columns. Using drop_first on the test frame independently would choose the
# dropped level from the test data alone: if the test frame happened to contain
# only the level that training kept, that indicator would be dropped and then
# re-created as all zeros by reindex, silently mis-encoding every row.
# Columns absent from the test encoding are filled with 0; extra ones are dropped.
X_test = pd.get_dummies(test[feature_cols])
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

print('partitioned data into "X_train", "y_train", "X_test"')

partitioned data into "X_train", "y_train", "X_test"


In [6]:
# Evaluation setup: shuffled, stratified 5-fold split with a fixed seed so the
# reported score is reproducible, and a logistic-regression classifier with a
# raised iteration cap.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
clf = LogisticRegression(solver="lbfgs", max_iter=2000)

# One accuracy value per fold.
acc = cross_val_score(clf, X_train, y_train, scoring="accuracy", cv=cv)
print("trained model and used stratified 5-fold cross validation to evaluate")

# Summarize the folds as mean and standard deviation.
mean_acc, std_acc = acc.mean(), acc.std()
print("CV Accuracy: {:.3f} ± {:.3f}".format(mean_acc, std_acc))

trained model and used stratified 5-fold cross validation to evaluate
CV Accuracy: 0.801 ± 0.024


In [None]:
# Train on the full training set and label the test rows in one chained call
# (fit returns the fitted estimator itself).
y_pred = clf.fit(X_train, y_train).predict(X_test)
print('fitted model and predicted on X_test')

# Pair each prediction with the PassengerId taken from the untouched copy of
# the test file, since that column was dropped from the feature frame.
passenger_ids = test_raw["PassengerId"].values
predictions = pd.DataFrame({"PassengerId": passenger_ids, "Survived": y_pred})

# Write the result without the DataFrame index column.
out_path = os.path.join(output_dir, "predictions_python.csv")
predictions.to_csv(out_path, index=False)
print("predictions saved to " + out_path)

fitted model and predicted on X_test
predictions saved to outputs/predictions_python.csv
