In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
from sklearn_pandas import DataFrameMapper
import pandas as pd

In [27]:
from sklearn.base import BaseEstimator, TransformerMixin

class WithinGroupMeanImputer(BaseEstimator, TransformerMixin):
    def __init__(self, group_var):
        self.group_var = group_var
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        # the copy leaves the original dataframe intact
        X_ = pd.DataFrame(X.copy())
        for col in X_.columns:
            if X_[col].dtypes == 'float64':
                X_.loc[(X[col].isna()) & X_[self.group_var].notna(), col] = X_[self.group_var].map(X_.groupby(self.group_var)[col].mean())
                X_[col] = X_[col].fillna(X_[col].mean())
        return X_

In [5]:
def load_data():
    data_url = "https://raw.githubusercontent.com/Mjboothaus/titanic/main/data"
    titanic_train = pd.read_csv(f"{data_url}/train.csv")
    titanic_test= pd.read_csv(f"{data_url}/test.csv")
    return titanic_train, titanic_test

In [6]:
titanic_train, titanic_test = load_data()

In [7]:
titanic_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
titanic_test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [9]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [10]:
drop_fields = ["PassengerId", "Name", "Ticket", "Fare", "Cabin", "Embarked"]

# Dropped "Embarked" as it should have nothing to do with passenger survival or otherwise
# Probably also "Cabin" -- assume there is a relationship with "Pclass"

In [11]:
titanic_train.drop(drop_fields, axis=1, inplace=True)

In [12]:
titanic_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,male,22.0,1,0
1,1,1,female,38.0,1,0
2,1,3,female,26.0,0,0
3,1,1,female,35.0,1,0
4,0,3,male,35.0,0,0


In [13]:
features = ["Pclass", "Sex", "Age", "SibSp", "Parch"]

In [14]:
X = titanic_train[features]

In [15]:
y = titanic_train[["Survived"]]

In [16]:
X.isna().sum()

Pclass      0
Sex         0
Age       177
SibSp       0
Parch       0
dtype: int64

In [17]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  891 non-null    int64  
 1   Sex     891 non-null    object 
 2   Age     714 non-null    float64
 3   SibSp   891 non-null    int64  
 4   Parch   891 non-null    int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 34.9+ KB


In [18]:
X["Sex_encode"] = X["Sex"].map({"male": 0, "female": 1})
X.drop("Sex", axis=1, inplace=True)

In [19]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Sex_encode
0,3,22.0,1,0,0
1,1,38.0,1,0,1
2,3,26.0,0,0,1
3,1,35.0,1,0,1
4,3,35.0,0,0,0


In [20]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      891 non-null    int64  
 1   Age         714 non-null    float64
 2   SibSp       891 non-null    int64  
 3   Parch       891 non-null    int64  
 4   Sex_encode  891 non-null    int64  
dtypes: float64(1), int64(4)
memory usage: 34.9 KB


In [21]:
y.isna().sum()

Survived    0
dtype: int64

So need to do something with the 177/891 n/a values for age

In [22]:
# Why don't we impute using the dataframe-wide median? Because data contains heterogeneous groups 
#    e.g. 1st-class Titanic passengers tended to be older than 3rd-class passengers

print("Median values by Pclass\n")
print(titanic_train.groupby("Pclass")["Age"].median())
print("\nStandard Deviations by Pclass")
print(titanic_train.groupby("Pclass")["Age"].std(), "\n")
print("Median / Standard deviation for entire population")
print(titanic_train["Age"].median(), titanic_train["Age"].std())

Median values by Pclass

Pclass
1    37.0
2    29.0
3    24.0
Name: Age, dtype: float64

Standard Deviations by Pclass
Pclass
1    14.802856
2    14.001077
3    12.495398
Name: Age, dtype: float64 

Median / Standard deviation for entire population
28.0 14.526497332334042


In [23]:
from sklearn.impute import SimpleImputer

In [34]:
mapper = DataFrameMapper([("Pclass", StandardScaler(), {'suffix': '_std_scale'}),
            ("Age", WithinGroupMeanImputer(group_var="Pclass"), {'suffix': '_median_imputed'}),
            ("SibSp", None),
            ("Parch", None),
            ("Sex_encode", None)], 
            df_out=True)

# SimpleImputer(strategy="median")

In [35]:
mapper

In [36]:
X_mapped = mapper.transform(X)

AttributeError: 'DataFrameMapper' object has no attribute 'built_features'

In [None]:
mapper.transformed_names_

In [None]:
X_mapped.info()

In [None]:
y.shape

In [None]:
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_mapped, y)
clf.predict(X_mapped)
clf.score(X_mapped, y)

In [None]:
len(clf.predict(X_mapped))