In [2]:
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold

from sklearn.metrics import accuracy_score

In [46]:
# Load the prepped dataset and hold the 'Survived' target as an object column
titanic = (
    pd.read_csv('../data/classification_dataset.csv')
    .astype({'Survived': 'object'})
)

In [47]:
# Columns used as model inputs, listed in the order they are selected
variables = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
    'ticket_count',
    'spouse_present',
    'cabin_letter',
    'cabin_number',
    'ticket_letter',
    'ticket_number',
    'sibling_present',
]

# Subset of the inputs that is routed through the OneHotEncoder
categorical_variables = [
    'Sex',
    'Embarked',
    'cabin_letter',
    'ticket_letter',
]

In [90]:
# Set up train test split
X = titanic[variables]
y = titanic['Survived']

# Stratify on the target so both partitions keep the same class balance;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Cast each partition's own labels to int. (Previously y_test was built from
# y_train, which silently replaced the held-out labels with the training ones.)
y_train = y_train.astype('int')
y_test = y_test.astype('int')

# Guard against labels drifting out of step with their feature rows.
assert y_train.index.equals(X_train.index), "y_train is not aligned with X_train"
assert y_test.index.equals(X_test.index), "y_test is not aligned with X_test"

In [91]:
# Preprocessing + model pipeline:
#   ct       - one-hot encode the categorical columns (first level dropped) and
#              pass every other column through unchanged
#   pf       - add pairwise interaction terms (no squared terms, no bias column)
#   vt       - VarianceThreshold with its default threshold
#   scaler   - standardise the surviving columns
#   logistic - L1-penalised logistic regression fitted with the saga solver
pipe = Pipeline(
    steps = [
        ('ct', ColumnTransformer(
            transformers = [
                # NOTE(review): `sparse` was renamed to `sparse_output` in
                # scikit-learn 1.2 - switch when the environment is upgraded.
                ('ohe', OneHotEncoder(sparse=False, drop='first'), categorical_variables)
            ],
            remainder = 'passthrough')),
        ('pf', PolynomialFeatures(interaction_only=True, include_bias=False)),
        ('vt', VarianceThreshold()),
        ('scaler', StandardScaler()),
        # saga shuffles the data while optimising, so pin the seed to make the
        # fitted coefficients repeatable from run to run.
        ('logistic', LogisticRegression(max_iter=10000, penalty='l1', solver='saga',
                                        random_state=42))
    ]
)

In [92]:
# Fit every pipeline step on the training partition; the fitted pipeline is
# the cell's displayed value.
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohe',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  ['Sex', 'Embarked',
                                                   'cabin_letter',
                                                   'ticket_letter'])])),
                ('pf',
                 PolynomialFeatures(include_bias=False, interaction_only=True)),
                ('vt', VarianceThreshold()), ('scaler', StandardScaler()),
                ('logistic',
                 LogisticRegression(max_iter=10000, penalty='l1',
                                    solver='saga'))])

In [93]:
# Rebuild the column names the classifier sees by replaying each pipeline step.
features = list(pipe['ct'].named_transformers_['ohe'].get_feature_names_out(categorical_variables))
# With remainder='passthrough' the non-encoded columns are appended after the
# one-hot columns, in their original order.
features += [x for x in X_train.columns if x not in categorical_variables]
features = list(pipe['pf'].get_feature_names_out(features))
# Keep only the names of the columns VarianceThreshold retained.
features = list(np.array(features)[pipe['vt'].get_support()])

logistic = pipe['logistic']

# For a binary target scikit-learn stores a single row in coef_ / intercept_,
# and that row describes the log-odds of classes_[1]. Looking a class label up
# in classes_ and using its position as the row index only worked by accident
# for label 0 (and would raise IndexError for label 1), so index the single
# row directly and record which class it refers to.
assert len(logistic.classes_) == 2 and logistic.coef_.shape[0] == 1, \
    "coefficient table below assumes a binary target"
survived = logistic.classes_[1]  # class whose log-odds the coefficients describe
idx = 0                          # the only row of coef_ / intercept_

# Every coefficient must have a name, otherwise the table would be mislabeled.
assert len(features) == logistic.coef_.shape[1], \
    "reconstructed feature names do not match the number of coefficients"

coefficients = pd.DataFrame({
    'variable': ['intercept'] + features,
    'coefficient': [logistic.intercept_[idx]] + list(logistic.coef_[idx])
})


# Show only the terms the L1 penalty did not shrink to exactly zero.
coefficients[coefficients['coefficient'] != 0]

Unnamed: 0,variable,coefficient
0,intercept,-0.621949
1,Sex_male,-0.814376
2,Embarked_Q,0.054842
3,Embarked_S,-0.096923
8,cabin_letter_E,0.013898
...,...,...
545,Fare cabin_number,0.167452
550,ticket_count ticket_number,-0.292957
553,spouse_present ticket_number,0.038662
554,cabin_number ticket_number,-0.200824


In [86]:
from sklearn.model_selection import GridSearchCV

array([-0.62013152])

In [84]:
# Dimensions of the loaded dataset as (rows, columns)
titanic.shape

(891, 17)