In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
train_df = pd.read_csv('data/train.csv')
print(train_df.shape)

(891, 12)


In [3]:
# Remove rows with missing data
# train_df.dropna(subset=['Age'], inplace=True)
# train_df

In [4]:
# Convert values to numbers

def make_numeric(df):
    # Sex: male = 0, female = 1
    df['Sex'] = df['Sex'].map(lambda sex: 0 if sex == 'male' else 1)

    # Embarked: S = 0, C = 1, Q = 3, else = 3
    def map_embarked(embarked):
        if embarked == 'S':
            return 0
        elif embarked == 'C':
            return 1
        elif embarked == 'Q':
            return 2
        return 3

    df['Embarked'] = df['Embarked'].map(map_embarked)

    # Age: replace missing ages with the average
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    
    # Fare: replace missing fares with the average
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

make_numeric(train_df)

In [5]:
# Feature engineering

def engineer_features(df):
    # Deck number: letter of cabin to ASCII numeral
    df['Cabin'] = df['Cabin'].fillna('')

    def map_cabin_to_deck(cabin):
        if cabin:
            return ord(cabin[0]) - ord('A') + 1
        return 0

    df['Deck'] = df['Cabin'].map(map_cabin_to_deck)

engineer_features(train_df)

In [6]:
# Select a subset of the columns to work with
get_columns = lambda df: df[['PassengerId', 'Survived', 'Pclass', 'Age', 'Sex', 'Fare', 'Embarked', 'SibSp', 'Parch', 'Deck']]
train_df = get_columns(train_df)
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Age,Sex,Fare,Embarked,SibSp,Parch,Deck
0,1,0,3,22.000000,0,7.2500,0,1,0,0
1,2,1,1,38.000000,1,71.2833,1,1,0,3
2,3,1,3,26.000000,1,7.9250,0,0,0,0
3,4,1,1,35.000000,1,53.1000,0,1,0,3
4,5,0,3,35.000000,0,8.0500,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.000000,0,13.0000,0,0,0,0
887,888,1,1,19.000000,1,30.0000,0,0,0,2
888,889,0,3,29.699118,1,23.4500,0,1,2,0
889,890,1,1,26.000000,0,30.0000,1,0,0,3


In [7]:
# Train test split
X_train, X_val, y_train, y_val = train_test_split(train_df.drop('Survived', axis=1), train_df['Survived'])

In [8]:
# Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [9]:
# Get the training error
model.score(X_train, y_train)

0.7979041916167665

In [10]:
# Get the cross validation error
model.score(X_val, y_val)

0.820627802690583

In [11]:
# What score is doing
prediction = (model.predict(X_val) > 0.5).astype(int)
np.sum(prediction == y_val) / len(y_val)

0.820627802690583

In [12]:
# Determine whether model is overfitting or underfitting
# If cross validation error is greater than the training error, then it is overfitting
# If the errors are very different, then we may need more training examples

ITERS = 100
overfit = 0
total_score_train = 0
total_score_val = 0

for _ in range(ITERS):
    X_train, X_val, y_train, y_val = train_test_split(train_df.drop(['PassengerId', 'Survived'], axis=1), train_df['Survived'])
    
    model = LogisticRegression(max_iter=10_000)
    model.fit(X_train, y_train)
    
    score_train = model.score(X_train, y_train)
    score_val = model.score(X_val, y_val)
    
    total_score_train += score_train
    total_score_val += score_val
    
    if score_train > score_val:
        overfit += 1

print(f'% of models overfitting = {overfit / ITERS}')
print(f'avg training score = {total_score_train / ITERS}')
print(f'avg val score = {total_score_val / ITERS}')

% of models overfitting = 0.64
avg training score = 0.812589820359281
avg val score = 0.8035426008968617


In [13]:
# Train a model on the entire training set without splitting it

X_train = train_df.drop(['PassengerId', 'Survived'], axis=1)
y_train = train_df['Survived']
model = LogisticRegression(max_iter=10_000)
model.fit(X_train, y_train)

score = model.score(X_train, y_train)
print(score)

# Output predictions on test set
test_df = pd.read_csv('data/test.csv')
make_numeric(test_df)
engineer_features(test_df)
test_df = test_df[['PassengerId', 'Pclass', 'Age', 'Sex', 'Fare', 'Embarked', 'SibSp', 'Parch', 'Deck']]

X_test = test_df.drop(['PassengerId'], axis=1)
# X_test[X_test.isna().any(axis=1)]

prediction = (model.predict(X_test) > 0.5).astype(int)

predictions_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': prediction
})

predictions_df.to_csv('test_predictions.csv', index=False)

     PassengerId  Survived  Pclass        Age  Sex     Fare  Embarked  SibSp  \
0              1         0       3  22.000000    0   7.2500         0      1   
1              2         1       1  38.000000    1  71.2833         1      1   
2              3         1       3  26.000000    1   7.9250         0      0   
3              4         1       1  35.000000    1  53.1000         0      1   
4              5         0       3  35.000000    0   8.0500         0      0   
..           ...       ...     ...        ...  ...      ...       ...    ...   
886          887         0       2  27.000000    0  13.0000         0      0   
887          888         1       1  19.000000    1  30.0000         0      0   
888          889         0       3  29.699118    1  23.4500         0      1   
889          890         1       1  26.000000    0  30.0000         1      0   
890          891         0       3  32.000000    0   7.7500         2      0   

     Parch  Deck  
0        0     0  
1

In [14]:
# TODO: try neural network