In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [2]:
yColumn = "Survived"
dfTrain = pd.read_csv("./train.csv")
dfTest = pd.read_csv("./test.csv")

y = dfTrain[yColumn]
dfTrain.drop(yColumn, axis=1, inplace=True)
X_full = dfTrain.append(dfTest, ignore_index=True)

In [3]:
def adjustDataframe(dataFrame):
##  Get name prefix
    namePrefix = dataFrame["Name"].str.split(', ', 1).str[1]
    namePrefix = namePrefix.str.split('. ',1).str[0]
    dataFrame["NamePrefix"] = namePrefix
##  Split ticket into numeric and prefix parts
    spacedTicket = ' ' + dataFrame['Ticket'].astype(str)
    ticketSplit = spacedTicket.str.rsplit(' ',1,expand=True)
    dataFrame["TicketPrefix"] = ticketSplit[0]
    dataFrame["TicketPrefix"] = dataFrame["TicketPrefix"].str.replace(".","")
    dataFrame["TicketPrefix"] = dataFrame["TicketPrefix"].str.replace(" ","")
    dataFrame["TicketNumber"] = ticketSplit[1]  
#Check if there were NaN for those columns
    dataFrame["FamilySize"] = dataFrame["SibSp"] + dataFrame["Parch"]  + 1

## There are multiple cabins per some of passengers - how to proceed?
# For now I will only use deck name and amount of cabins. For later I should check how could I use multiple entries
    dataFrame["CabinDeck"] = dataFrame["Cabin"].str[:1]
    dataFrame["CabinsAmount"] = dataFrame["Cabin"].str.split(" ",-1).str.len()

    del namePrefix, spacedTicket, ticketSplit
    dataFrame.drop(["Name", "Ticket", "Cabin"], axis=1, inplace=True)
    return dataFrame

In [4]:
## Fare and Pclass have highest correlations. Low correlations: Age, Parch.  SibSp and PassengerId correlation < 0.05
#XTrain.corrwith(y)
X_full = adjustDataframe(X_full)
X_full = X_full.replace(r'^\s*$', np.nan, regex=True)

#### Remove Nan values in all relevant columns

In [5]:
# Remove nan values and transform categories into separate column for each column value
numericColumns = ["Pclass", "Age", "FamilySize", "Fare", "CabinsAmount"]
categoricalColumns = ["Sex","NamePrefix", "TicketPrefix", "CabinDeck", "Embarked"]

mean_Imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_numeric = mean_Imputer.fit_transform(X_full[numericColumns])
X_numeric = StandardScaler().fit_transform(X_numeric)
categorical_data = X_full[categoricalColumns].fillna("N/A").astype(str)
encoder = DictVectorizer(sparse = False)
X_categorical = encoder.fit_transform(categorical_data.T.to_dict().values())

In [6]:
# Split data back to train and test
X_train = np.hstack((X_numeric[0:891,:],X_categorical[0:891,:]))
X_final_test = np.hstack((X_numeric[891:1309,:],X_categorical[891:1309,:]))
# Test set from Kaggle does not have prediction result, so I should split training set
X_tmp_train, X_tmp_test, y_tmp_train, y_tmp_test = train_test_split(X_train, y)

# np.corrcoef(X_train[:,1], y)

In [7]:
#param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
param_grid={ 'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 7, 8, 9, 10]}
cvList = [2,3,4,5,6,7]
for cv in cvList:
    
    #cv = 4
    optimizer = GridSearchCV(LogisticRegression(class_weight='balanced', fit_intercept=True, penalty="l2"), param_grid, cv=cv, n_jobs=-1)
    #optimizer = GridSearchCV(LogisticRegression(class_weight='balanced', fit_intercept=True, penalty="l2"), param_grid, cv = cv, n_jobs=-1)
    # Probably put full training data set here?
    optimizer.fit(X_tmp_train, y_tmp_train)
    # print(optimizer.best_estimator_)
    # print("Score ",optimizer.best_score_)
    optimizer.fit(X_tmp_test, y_tmp_test)
    # optimizer.best_score_
    scoreTrain = optimizer.score(X_tmp_train, y_tmp_train)
    scoreTest = optimizer.score(X_tmp_test, y_tmp_test)
    print("CV ", cv, "Lambda ", optimizer.best_estimator_.C, "Train score ", scoreTrain, "Test score ", scoreTest)
    roc_auc_score(y_tmp_test, optimizer.predict(X_tmp_test))
    # first submission score was 0.757, however final result was 0.77


CV  2 Lambda  9 Train score  0.7889221556886228 Test score  0.8609865470852018
CV  3 Lambda  1 Train score  0.8008982035928144 Test score  0.8430493273542601
CV  4 Lambda  0.1 Train score  0.7889221556886228 Test score  0.8026905829596412
CV  5 Lambda  1 Train score  0.8008982035928144 Test score  0.8430493273542601
CV  6 Lambda  1 Train score  0.8008982035928144 Test score  0.8430493273542601
CV  7 Lambda  1 Train score  0.8008982035928144 Test score  0.8430493273542601


Tests to do:
- test if polynomial features could improve the performance
- combine sibsp and parch into FamilySize column (sibsp+parch+traveller)

In [8]:
## Write answer
answer = pd.DataFrame(columns=["PassengerId", "Survived"])
answer["PassengerId"] = dfTest["PassengerId"]
answer["Survived"] = optimizer.predict(X_final_test)
answer.to_csv("answer.csv", index=False)