In [6]:
import pandas as pd

In [11]:
def prefix_check(string):
    if string.isdigit():
        return 'None'
    else:
        string = string.replace('.', '').replace('/', '')
        return string

def mean_encode_map(series, target):
    test_df = pd.concat([series, target], axis=1)
    incidence = series.value_counts().to_dict()
    mean_map = test_df.groupby(0).mean().to_dict()['Survived']
    average_value = sum(mean_map.values())/len(mean_map)
    for value in mean_map:
        if incidence[value] < 3:
            mean_map[value] = average_value
    return mean_map
    
def ticket_prefix(train, test):    
    tick = train['Ticket']
    space_split = tick.str.split(expand=True)
    tick_prefix = space_split[0].apply(prefix_check)
    mean_map = mean_encode_map(tick_prefix, train['Survived'])
    train['ticket_prefix'] = tick_prefix.map(mean_map)
    test_prefix = test['Ticket'].str.split(expand=True)[0].apply(prefix_check)
    test['ticket_prefix'] = test_prefix.map(mean_map)
    test['ticket_prefix'].fillna(test['ticket_prefix'].mean(), inplace=True)
    return train, test

def cleanTitanic(path1, path2):
    train = pd.read_csv(path1)
    test = pd.read_csv(path2)
    group_update = [train, test]
    for item in group_update:
        values = {'Age': item['Age'].mean(), 'Embarked':item['Embarked'].mode()[0] }
        item.fillna(value = values, inplace=True)
        item['female'] = pd.get_dummies(item['Sex'])['female']
        emb = pd.get_dummies(item['Embarked'])
        item['Emb Cherbourg'] = emb['C']
        item['Emb Queenstown'] = emb['Q']
        item['Emb Southampton'] = emb['S']
    train, test = ticket_prefix(train, test)
    return train, test
  


In [12]:
data, predict = cleanTitanic('titanic_data/train (1).csv', 'titanic_data/test_2.csv' )
X = data[['Pclass',  'Age', 'SibSp',
       'Parch', 'Fare', 'female', 'Emb Cherbourg',
       'Emb Queenstown', 'Emb Southampton', 'ticket_prefix']]
y = data['Survived']

In [13]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'female',
       'Emb Cherbourg', 'Emb Queenstown', 'Emb Southampton', 'ticket_prefix'],
      dtype='object')

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [37]:
def logistic_fit_and_predict(X_train, X_test, y_train, y_test, predict):
    from sklearn.linear_model import LogisticRegression
    from sklearn import preprocessing
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_scaled = scaler.transform(X_train)
    classifier = LogisticRegression(max_iter=1000, random_state=42)
    classifier.fit(X_scaled, y_train)
    X_test_scaled = scaler.transform(X_test)
    print(f"Training Data Score: {classifier.score(X_scaled, y_train)}")
    print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
    predict_sub = predict[['Pclass',  'Age', 'SibSp',
       'Parch', 'Fare', 'female', 'Emb Cherbourg',
       'Emb Queenstown', 'Emb Southampton', 'ticket_prefix']]
    predict_scaled = scaler.transform(predict_sub)
    survived = classifier.predict(predict_scaled)
    predictions = pd.DataFrame({'PassengerId':list(predict['PassengerId']), 'Survived':list(survived)})
    return predictions

In [38]:
predictions = logistic_fit_and_predict(X_train, X_test, y_train, y_test, predict)

Training Data Score: 0.7846441947565543
Testing Data Score: 0.651685393258427


In [39]:
def KNN_and_predict(X_train, X_test, y_train, y_test, predict):
    from sklearn.neighbors import KNeighborsClassifier as KNN
    from sklearn import preprocessing
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_scaled = scaler.transform(X_train)
    classifier = KNN(leaf_size= 30, n_neighbors= 10, weights= 'uniform', random_state=42)
    classifier.fit(X_scaled, y_train)
    X_test_scaled = scaler.transform(X_test)
    print(f"Training Data Score: {classifier.score(X_scaled, y_train)}")
    print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
    predict_sub = predict[['Pclass',  'Age', 'SibSp',
       'Parch', 'Fare', 'female', 'Emb Cherbourg',
       'Emb Queenstown', 'Emb Southampton', 'ticket_prefix']]
    predict_scaled = scaler.transform(predict_sub)
    survived = classifier.predict(predict_scaled)
    predictions = pd.DataFrame({'PassengerId':list(predict['PassengerId']), 'Survived':list(survived)})
    return predictions

In [40]:
predictions = logistic_fit_and_predict(X_train, X_test, y_train, y_test, predict)

Training Data Score: 0.7846441947565543
Testing Data Score: 0.651685393258427


In [41]:
predictions.to_csv('submissions/s10_19_6(Ticket_Cleaned).csv', index=False)