In [10]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier


TITANIC_PATH = os.path.join("datasets", "titanic")

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

y_train = train_data['Survived'].values
X_train = train_data.drop('Survived', axis=1)

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)


class CustomDataFrameSelector(BaseEstimator, TransformerMixin):
    repl_name = {
        'Rev': 0,
        'Sir': 0,
        'Don': 0,
        'Lady': 0,
        'Jonkheer': 0,
        'the Countess': 0,
        'Ms': 0,
        'Capt': 0,
        'Mme': 0,
        'Mlle': 0,
        'Mr': 1,
        'Mrs': 2,
        'Miss': 3,
        'Master': 4,
        'Dr': 4,
        'Col': 5,
        'Major': 5
    }

    repl_parch = {
        6: 0,
        4: 0,
        5: 1,
        0: 2,
        3: 3,
        1: 4,
        2: 5
    }

    repl_embark = {
        'S': 0,
        'Q': 1,
        'C': 2
    }

    repl_sib = {
        8: 0,
        5: 0,
        4: 1,
        3: 2,
        0: 3,
        2: 4,
        1: 4
    }

    repl_cab = {
        'T': 0,
        'D': 1,
        'E': 1,
        'B': 1,
        'U': 2,
        'F': 3,
        'C': 4,
        'A': 5,
        'G': 6
    }

    repl_sex = {
        'male': 1,
        'famale': 0
    }

    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_in = X.copy()
        X_in['Name'] = X_in['Name'].map(lambda x: x.split(',')[1].split('.')[0].strip())
        titles = X_in['Name'].unique()

        X_in['Age'].fillna(-1, inplace=True)
        medians = dict()
        for title in titles:
            median = X_in.Age[(X["Age"] != -1) & (X['Name'] == title)].median()
            medians[title] = median

        for index, row in X_in.iterrows():
            if row['Age'] == -1:
                X_in.loc[index, 'Age'] = medians[row['Name']]

        X_in['Fare'].fillna(-1, inplace=True)
        medians = dict()
        for pclass in X_in['Pclass'].unique():
            median = X_in.Fare[(X["Fare"] != -1) & (X_in['Pclass'] == pclass)].median()
            medians[pclass] = median

        for index, row in X_in.iterrows():
            if row['Fare'] == -1:
                X_in.loc[index, 'Fare'] = medians[row['Pclass']]

        X_in['Name'] = X_in['Name'].apply(lambda x: self.repl_name.get(x))
        X_in['Parch'] = X_in['Parch'].apply(lambda x: self.repl_parch.get(x))
        X_in['Embarked'] = X_in['Embarked'].apply(lambda x: self.repl_embark.get(x))
        X_in['SibSp'] = X_in['SibSp'].apply(lambda x: self.repl_sib.get(x))
        X_in['Sex'] = X_in['Sex'].apply(lambda x: self.repl_sex.get(x))

        X_in['Cabin'].fillna('U', inplace=True)
        X_in['Cabin'] = X_in['Cabin'].apply(lambda x: x[0])
        X_in['Cabin'] = X_in['Cabin'].apply(lambda x: self.repl_cab.get(x))


        return X_in[['Age', 'Name', 'Fare', 'Pclass', 'Parch', 'Embarked', 'SibSp', 'Cabin', 'Sex']]


class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

num_pipeline = Pipeline([
        ("preproces_data", CustomDataFrameSelector()),
        ("select_numeric", DataFrameSelector(["Age", "Fare"])),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

cat_pipeline = Pipeline([
        ("preproces_data", CustomDataFrameSelector()),
        ("select_cat", DataFrameSelector(['Name', 'Pclass', 'Parch', 'Embarked', 'SibSp', 'Cabin', 'Sex'])),
        ("cat_encoder", OneHotEncoder(sparse=False, handle_unknown = 'ignore')),
    ])


preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])




classifier = RandomForestClassifier(bootstrap=False, n_jobs=-1)

pipe = Pipeline([
        ('preprocessing', preprocess_pipeline), 
        ('classifier', classifier)])


seed=4201337
kfold = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)



file_mum = 0
for i in range(10):
    X, X_test, y, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    pipe.fit(X, y)

    score = pipe.score(X_test, y_test)
    print(score)
     
    if score > 0.84082:
        submission = pd.DataFrame({
        "PassengerId": test_data["PassengerId"],
        "Survived": pipe.predict(test_data)
        })

        submission.to_csv("submission" + file_mum + ".csv", index=False)
        file_mum+=1






0.8268156424581006
0.8156424581005587
0.8156424581005587
0.7932960893854749
0.8100558659217877
0.8044692737430168
0.8100558659217877
0.8044692737430168
0.8156424581005587
0.8100558659217877
