In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


In [2]:
class FalseScaler:
    def __init__(self):
        pass

    def fit(self, X):
        return X

    def transform(self, X):
        return X

    def fit_transform(self, X):
        return X

    def inverse_transform(self, X):
        return X


In [3]:
def load_and_preprocess_data( data: str, drop=[], X_slice=slice(0, -1), y_slice=-1, x_label=[], test_size=0.2,
        y_label=False, columns_to_encode=[], columns_to_scale=[], scale_y=True, random_state=None, shuffle=True, drop_first=True):
    """load dataset from csv file and preprocess it

    Args:
        data (str): path to dataset csv file
        drop (list, optional): columns to drop from dataset. Defaults to [].
        X_slice (slice, optional): independent variables slice. Defaults to slice(0, -1).
        y_slice (int|slice, optional): dependent variables slice. Defaults to -1.
        x_label (list, optional): independent variables to be label encoded. Defaults to [].
        y_label (bool, optional): label encode dependent variables. Defaults to False.
        columns_to_encode (list, optional): columns to get_dummies. Defaults to [].
        columns_to_scale (list, optional): columns to normalize. Defaults to [].
        scale_y (bool, optional): normalize the dependent variables. Defaults to True.
        random_state (int, optional): train test split random state. Defaults to None.
        drop_first (bool, optional): drop the first dummy column. Defaults to True.
    """
    raw_dataset = pd.read_csv(data)
    dataset = raw_dataset.drop(drop, axis=1)
    X = dataset.iloc[:, X_slice]
    y = dataset.iloc[:, y_slice].values

    # X_label_encoder= ColumnTransformer([('Label',LabelEncoder(), x_labels)], remainder='passthrough')
    # X = X_label_encoder.fit_transform(X)

    # imputer_embarked  = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
    # imputer_age = SimpleImputer(missing_values=np.nan, strategy="mean")

    # X['Embarked'] = imputer_embarked.fit_transform(X['Embarked'].values.reshape(-1,1))
    # X['Age'] = imputer_age.fit_transform(X['Age'].values.reshape(-1,1))
    X['Embarked'].fillna('unknown', inplace=True)
    X['Age'].fillna(29, inplace=True)
    # return X
    # print(X.isna().sum().sum())
    # X = X.dropna(axis=0)

    X_label_encoders = {}
    y_label_encoder = LabelEncoder()

    if x_label != []:
        for label in x_label:
            X_label_encoders[label] = LabelEncoder()
            X_label_encoders[label].fit(X[label])
    def X_label_encode(X_new):
        X_new = X_new.copy()
        if x_label != []:
            for label in x_label:
                # X_label_encoders[label] = LabelEncoder()
                X_new[label] = X_label_encoders[label].transform(X_new[label])
        return X_new

    X = X_label_encode(X)

    # X = X_label_encoder.fit_transform(X)
    y_label_encoder.fit(y)

    def y_label_encode(y_new):
        if y_label:
            y_new = y_label_encoder.transform(y_new)
            return y_new
        else:
            return y_new
    y = y_label_encode(y)

    # column_transformer = ColumnTransformer([('OneHotEncode', OneHotEncoder(drop='first'), columns_to_encode)], remainder='passthrough')
    # X = column_transformer.fit_transform(X)
    one_hot_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
    encoded_columns = []
    if columns_to_encode != []:
        one_hot_encoder.fit(X[columns_to_encode])
        encoded_columns = one_hot_encoder.get_feature_names_out()
    def get_dummies(X_new):
        if columns_to_encode != []:
            X_new_dummies = one_hot_encoder.transform(X_new[columns_to_encode]).toarray()
            # print(X_dummies.shape)
            # print(one_hot_encoder.get_feature_names_out())
        #     X_new = pd.get_dummies(
            X_new = pd.concat([X_new.drop(columns_to_encode,axis=1), pd.DataFrame(data=X_new_dummies,columns=encoded_columns)], axis=1)
        #         X_new, columns=columns_to_encode, drop_first=drop_first)
        return X_new
    X_dummies = get_dummies(X)


    # print(column_transformer
    column_order = X_dummies.columns.values
    
    X_train, X_test, y_train, y_test = X_dummies,X_dummies,y,y
    if test_size > 0:
        X_train, X_test, y_train, y_test = train_test_split(
            X_dummies, y, test_size=test_size, random_state=random_state,shuffle=shuffle)

    # X_scaler = ColumnTransformer(
    #         [('Scaler', StandardScaler(), columns_to_scale)], remainder='passthrough')
    # y_scaler = ColumnTransformer([('Scaler', StandardScaler(), scale_y)], remainder='passthrough')
    X_scaler = StandardScaler()
        
    y_scaler = StandardScaler() if scale_y else FalseScaler()
    X_train_scaled = X_train.copy()
    if columns_to_scale != []:
        # print(X_train)
        X_train_scaled[columns_to_scale] = X_scaler.fit_transform(X_train[columns_to_scale])
    X_train_scaled = X_train_scaled[column_order]
    X_test_scaled = X_test.copy()
    if columns_to_scale != []:
        X_test_scaled[columns_to_scale] = X_scaler.transform(X_test[columns_to_scale])
    X_test_scaled = X_test_scaled[column_order]
    y_train_scaled = y_scaler.fit_transform(y_train)
    y_test_scaled = y_scaler.transform(y_test)
    X_tf_validation, X_tf_test,y_tf_validation, y_tf_test = train_test_split(X_test_scaled,y_test, test_size=0.5)
    y_tf_validation = y_scaler.transform(y_tf_validation)

    def scaler(X_new):
        X_new_scaled = X_new.copy()
        if columns_to_scale != []:
            X_new_scaled[columns_to_scale] = X_scaler.transform(X_new[columns_to_scale])
        return X_new_scaled

    def preprocess(path):
        X_new_raw = pd.read_csv(path)
        X_new = X_new_raw.drop(drop, axis=1)
        # X_new2 = X_new_raw.drop(drop, axis=1).reset_index(drop=True)
        # print(X_new['Sex'].unique())
        X_new['Embarked'].fillna('unknown', inplace=True)
        X_new['Age'].fillna(29, inplace=True)
        X_new['Fare'].fillna(0, inplace=True)
        X_new = X_label_encode(X_new)
        X_new = get_dummies(X_new)
        # X_new.drop(['SibSp_8'],axis=1)
        X_new_scaled = scaler(X_new)
        return {
            "X_test": X_new,
            # "X_test2": X_new2,
            "X_test_raw": X_new_raw,
            "X_test_scaled": X_new_scaled
        }




    return {
        "X":X,
        # "X_raw":X_raw,
        "X_train": X_train,
        "X_dummies": X_dummies,
        "X_train_scaled": X_train_scaled,
        "X_test": X_test,
        "X_test_scaled": X_test_scaled,
        "X_tf_test": X_tf_test,
        "X_tf_validation": X_tf_validation,
        "X_scaler": X_scaler,
        # "X_label_encoder": X_label_encoder,
        "preprocess": preprocess,
        "y":y,
        "y_train": y_train,
        "y_train_scaled": y_train_scaled,
        "y_test": y_test,
        "y_test_scaled": y_test_scaled,
        "y_tf_test": y_tf_test,
        "y_tf_validation": y_tf_validation,
        "y_scaler": y_scaler,
        "y_label_encoder": y_label_encoder,
    }


In [25]:
all_features = ['PassengerId','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']

target_feature = ['Survived']

# train_features = ['Pclass','Sex','Age','Fare']
train_features = ['Pclass','Sex','Age','Fare']


drop_features = [feat for feat in all_features if feat not in train_features]

numerical_features = ['Age','Fare']
# numerical_features = ['Age','Fare']

label_features = ['Sex']

categorical_features = [feat for feat in train_features if feat not in numerical_features+label_features]

# categorical_features_no_label = [feat for feat in categorical_features if feat not in label_features]

print('Drop features: {}\nCategorical: {}\n'.format(drop_features,categorical_features))

Drop features: ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
Categorical: ['Pclass']



In [26]:
raw_dataset = pd.read_csv('train.csv')
dataset = raw_dataset
X_male = dataset[dataset['Sex']=='male'][train_features]
y_male = dataset[dataset['Sex']=='male'][target_feature].values
X_female = dataset[dataset['Sex']=='female'][train_features]
y_female = dataset[dataset['Sex']=='female'][target_feature].values

dataset_test = pd.read_csv('test.csv')
test_data_raw_male_base = dataset_test[dataset_test['Sex']=='male']
test_data_raw_female_base = dataset_test[dataset_test['Sex']=='female']
test_data_raw_male = test_data_raw_male_base[train_features]
test_data_raw_female = test_data_raw_female_base[train_features]
# test_data_raw.loc[test_data_raw['Parch']==9,'Parch'] = np.nan
# test_data_raw.iloc[342]
#

In [27]:
test_data_raw_male

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,34.5,7.8292
2,2,male,62.0,9.6875
3,3,male,27.0,8.6625
5,3,male,14.0,9.2250
7,2,male,26.0,29.0000
...,...,...,...,...
407,1,male,50.0,211.5000
413,3,male,,8.0500
415,3,male,38.5,7.2500
416,3,male,,8.0500


In [28]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin

num = make_pipeline(IterativeImputer(max_iter=10, random_state=0),StandardScaler())
cat = make_pipeline(SimpleImputer(missing_values=np.nan, strategy='most_frequent'),OneHotEncoder( drop='first',handle_unknown='ignore', sparse=False))

ct_male = ColumnTransformer([
    ('Numerical', num, numerical_features),
    ('Label', OrdinalEncoder(), label_features), 
    # ('Fill categorical', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), categorical_features),
    ('Categorical', cat, categorical_features),
], remainder='passthrough')
ct_female = ColumnTransformer([
    ('Numerical', num, numerical_features),
    ('Label', OrdinalEncoder(), label_features), 
    # ('Fill categorical', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), categorical_features),
    ('Categorical', cat, categorical_features),
], remainder='passthrough')

X_ct_male = ct_male.fit_transform(X_male)
X_ct_female = ct_female.fit_transform(X_female)

test_data_male = ct_male.transform(test_data_raw_male)
test_data_female = ct_female.transform(test_data_raw_female)

# print(X_ct)


In [29]:

def get_feature_out(estimator, feature_in):
    if hasattr(estimator,'get_feature_names'):
        if isinstance(estimator, _VectorizerMixin):
            # handling all vectorizers
            return [f'vec_{f}' \
                for f in estimator.get_feature_names_out()]
        else:
            return estimator.get_feature_names_out(feature_in)
    elif isinstance(estimator, SelectorMixin):
        return np.array(feature_in)[estimator.get_support()]
    else:
        return feature_in


def get_ct_feature_names(ct):
    # handles all estimators, pipelines inside ColumnTransfomer
    # doesn't work when remainder =='passthrough'
    # which requires the input column names.
    output_features = []

    for name, estimator, features in ct.transformers_:
        if name!='remainder':
            if isinstance(estimator, Pipeline):
                current_features = features
                for step in estimator:
                    current_features = get_feature_out(step, current_features)
                features_out = current_features
            else:
                features_out = get_feature_out(estimator, features)
            output_features.extend(features_out)
        elif estimator=='passthrough':
            output_features.extend(ct.feature_names_in_[features])
                
    return output_features

In [30]:
pd.DataFrame(X_ct_male, 
             columns=get_ct_feature_names(ct_male))

Unnamed: 0,Age,Fare,Sex,Pclass_2,Pclass_3
0,-0.669497,-0.423980,0.0,0.0,1.0
1,0.331012,-0.405419,0.0,0.0,1.0
2,-0.020785,-0.395945,0.0,0.0,1.0
3,1.793295,0.611092,0.0,0.0,0.0
4,-2.208742,-0.103221,0.0,0.0,1.0
...,...,...,...,...,...
572,-0.207723,-0.348575,0.0,1.0,0.0
573,-0.438610,-0.428620,0.0,0.0,1.0
574,-0.284686,-0.290572,0.0,1.0,0.0
575,-0.361648,0.103852,0.0,0.0,0.0


In [31]:
pd.DataFrame(test_data_female, 
             columns=get_ct_feature_names(ct_female)).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       152 non-null    float64
 1   Fare      152 non-null    float64
 2   Sex       152 non-null    float64
 3   Pclass_2  152 non-null    float64
 4   Pclass_3  152 non-null    float64
dtypes: float64(5)
memory usage: 6.1 KB


In [32]:
X_train_male, X_test_male, y_train_male, y_test_male = train_test_split(X_ct_male, y_male, test_size=0.20, random_state=0)

X_test2_male, X_val_male, y_test2_male, y_val_male = train_test_split(X_test_male,y_test_male,test_size=0.5, random_state=0)

X_train_female, X_test_female, y_train_female, y_test_female = train_test_split(X_ct_female, y_female, test_size=0.20, random_state=0)

X_test2_female, X_val_female, y_test2_female, y_val_female = train_test_split(X_test_female,y_test_female,test_size=0.5, random_state=0)

In [33]:
from sklearn.tree import DecisionTreeClassifier
classifier_male = DecisionTreeClassifier(random_state = 0)
classifier_female = DecisionTreeClassifier(random_state = 0)
classifier_male.fit(X_train_male,y_train_male)
classifier_female.fit(X_train_female,y_train_female)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier_male.predict(X_test_male)
cm = confusion_matrix(y_test_male, y_pred)
print(cm)
accuracy_score(y_test_male, y_pred)

[[76 11]
 [17 12]]


0.7586206896551724

In [34]:
from sklearn.ensemble import RandomForestClassifier
classifier2_male = RandomForestClassifier(n_estimators = 1000,)
classifier2_male.fit(X_train_male, y_train_male)
classifier2_female = RandomForestClassifier(n_estimators = 1000,)
classifier2_female.fit(X_train_female, y_train_female)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier2_male.predict(X_test_male)
cm = confusion_matrix(y_test_male, y_pred)
print(cm)
accuracy_score(y_test_male, y_pred)

  classifier2_male.fit(X_train_male, y_train_male)
  classifier2_female.fit(X_train_female, y_train_female)


[[80  7]
 [20  9]]


0.7672413793103449

In [35]:
# Training the K-NN model on the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier4_male = KNeighborsClassifier(n_neighbors = 21, metric = 'minkowski', p = 2)
classifier4_male.fit(X_train_male,y_train_male)
classifier4_female = KNeighborsClassifier(n_neighbors = 21, metric = 'minkowski', p = 2)
classifier4_female.fit(X_train_female,y_train_female)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier4_male.predict(X_test_male)
cm = confusion_matrix(y_test_male, y_pred)
print(cm)
accuracy_score(y_test_male, y_pred)

[[87  0]
 [28  1]]


  return self._fit(X, y)
  return self._fit(X, y)


0.7586206896551724

In [36]:
from sklearn.svm import SVC
classifier3_male = SVC(kernel = 'rbf')
classifier3_male.fit(X_train_male, y_train_male)
classifier3_female = SVC(kernel = 'rbf')
classifier3_female.fit(X_train_female, y_train_female)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier3_male.predict(X_test_male)
cm = confusion_matrix(y_test_male, y_pred)
print(cm)
accuracy_score(y_test_male, y_pred)

[[87  0]
 [26  3]]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.7758620689655172

In [37]:
from sklearn.svm import SVC
classifier5_male = SVC(kernel = 'linear')
classifier5_male.fit(X_train_male, y_train_male)
classifier5_female = SVC(kernel = 'linear')
classifier5_female.fit(X_train_female, y_train_female)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier5_male.predict(X_test_male)
cm = confusion_matrix(y_test_male, y_pred)
print(cm)
accuracy_score(y_test_male, y_pred)

[[87  0]
 [29  0]]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.75

In [38]:
import tensorflow as tf
output_size = 2

batch_size = 50

input_size = 9

max_epochs = 200

hidden_layer_size = 1500

model_male = tf.keras.Sequential([
    # tf.keras.layers.Dense(input_size,activation='relu'),
    # tf.keras.layers.Dense(input_size,activation='sigmoid'),
    # tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    # # tf.keras.layers.Dense(hidden_layer_size,activation='sigmoid'),
    # tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    # tf.keras.layers.Dense(2,activation='relu'),
    # tf.keras.layers.Dense(hidden_layer_size/2,activation='sigmoid'),
    tf.keras.layers.Dense(2,activation='softmax'),
])

model_male.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
early_stopper = tf.keras.callbacks.EarlyStopping(patience=5)

model_male.fit(
    X_train_male,
    y_train_male,
    epochs=max_epochs,
    batch_size=batch_size,
    validation_data=(X_val_male, y_val_male),
    verbose=2,
    callbacks=[early_stopper]
    )

Epoch 1/200
10/10 - 4s - loss: 0.5133 - accuracy: 0.8265 - val_loss: 0.5388 - val_accuracy: 0.7414
Epoch 2/200
10/10 - 1s - loss: 0.4644 - accuracy: 0.8265 - val_loss: 0.5555 - val_accuracy: 0.7414
Epoch 3/200
10/10 - 1s - loss: 0.4197 - accuracy: 0.8265 - val_loss: 0.5991 - val_accuracy: 0.7414
Epoch 4/200
10/10 - 1s - loss: 0.4176 - accuracy: 0.8373 - val_loss: 0.5724 - val_accuracy: 0.7414
Epoch 5/200
10/10 - 1s - loss: 0.4009 - accuracy: 0.8460 - val_loss: 0.6070 - val_accuracy: 0.7414
Epoch 6/200
10/10 - 1s - loss: 0.3897 - accuracy: 0.8503 - val_loss: 0.6809 - val_accuracy: 0.7241


<keras.callbacks.History at 0x21a6d1c69a0>

In [39]:
import tensorflow as tf
output_size = 2

batch_size = 50

input_size = 9

max_epochs = 200

hidden_layer_size = 1500

model_female = tf.keras.Sequential([
    # tf.keras.layers.Dense(input_size,activation='relu'),
    # tf.keras.layers.Dense(input_size,activation='sigmoid'),
    # tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    # # tf.keras.layers.Dense(hidden_layer_size,activation='sigmoid'),
    # tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
    # tf.keras.layers.Dense(2,activation='relu'),
    # tf.keras.layers.Dense(hidden_layer_size/2,activation='sigmoid'),
    tf.keras.layers.Dense(2,activation='softmax'),
])

model_female.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
early_stopper = tf.keras.callbacks.EarlyStopping(patience=5)

model_female.fit(
    X_train_female,
    y_train_female,
    epochs=max_epochs,
    batch_size=batch_size,
    validation_data=(X_val_female, y_val_female),
    verbose=2,
    callbacks=[early_stopper]
    )

Epoch 1/200
6/6 - 3s - loss: 0.7120 - accuracy: 0.7171 - val_loss: 0.6791 - val_accuracy: 0.8438
Epoch 2/200
6/6 - 1s - loss: 0.6596 - accuracy: 0.7171 - val_loss: 0.5549 - val_accuracy: 0.7500
Epoch 3/200
6/6 - 1s - loss: 0.5264 - accuracy: 0.7410 - val_loss: 0.4536 - val_accuracy: 0.6562
Epoch 4/200
6/6 - 1s - loss: 0.4807 - accuracy: 0.7530 - val_loss: 0.4781 - val_accuracy: 0.7188
Epoch 5/200
6/6 - 1s - loss: 0.4831 - accuracy: 0.6813 - val_loss: 0.4312 - val_accuracy: 0.8750
Epoch 6/200
6/6 - 1s - loss: 0.4539 - accuracy: 0.7450 - val_loss: 0.4710 - val_accuracy: 0.6562
Epoch 7/200
6/6 - 1s - loss: 0.4496 - accuracy: 0.7610 - val_loss: 0.4239 - val_accuracy: 0.6562
Epoch 8/200
6/6 - 1s - loss: 0.4328 - accuracy: 0.7849 - val_loss: 0.3827 - val_accuracy: 0.8438
Epoch 9/200
6/6 - 2s - loss: 0.4286 - accuracy: 0.7291 - val_loss: 0.4360 - val_accuracy: 0.8750
Epoch 10/200
6/6 - 1s - loss: 0.4438 - accuracy: 0.7610 - val_loss: 0.4599 - val_accuracy: 0.6562
Epoch 11/200
6/6 - 1s - loss:

<keras.callbacks.History at 0x21a6f694820>

In [40]:
test_loss, test_accuracy = model_male.evaluate(X_test2_male,y_test2_male)
# test_loss, test_accuracy = model.evaluate(data['X_test_scaled'],data['y_test_scaled'])

print(f'Test loss: {"%.4f"% test_loss}, Accuracy: {"%.2f" % (test_accuracy *100)}%')

Test loss: 0.4523, Accuracy: 79.31%


In [41]:
# y_tf_pred_raw = model.predict(test_data)
# # y_tf_pred_raw = data['y_scaler'].inverse_transform(model.predict(data['X_tf_test']))
# y_tf_pred = np.array([ [np.argmax(x)] for x in y_tf_pred_raw])
# y_tf_pred


In [42]:
# y_tf_pred_raw.shape

In [43]:
data_targets_raw = pd.read_csv('submission_perfect.csv')
data_targets_male = data_targets_raw[data_targets_raw['PassengerId'].isin(test_data_raw_male_base['PassengerId'])]['Survived'].values.reshape(-1,1)
data_targets_female = data_targets_raw[data_targets_raw['PassengerId'].isin(test_data_raw_female_base['PassengerId'])]['Survived'].values.reshape(-1,1)
# data_targets


In [44]:

y_tf_pred_raw_male = model_male.predict(test_data_male)
# y_tf_pred_raw = data['y_scaler'].inverse_transform(model.predict(data['X_tf_test']))
y_tf_pred_male = np.array([ [np.argmax(x)] for x in y_tf_pred_raw_male])
y_tf_pred_raw_female = model_female.predict(test_data_female)
# y_tf_pred_raw = data['y_scaler'].inverse_transform(model.predict(data['X_tf_test']))
y_tf_pred_female = np.array([ [np.argmax(x)] for x in y_tf_pred_raw_female])

results = pd.DataFrame({
    "DecisionTree": [accuracy_score(data_targets_male,classifier_male.predict(test_data_male)),accuracy_score(data_targets_female,classifier_female.predict(test_data_female))],
    "RandomForest": [accuracy_score(data_targets_male,classifier2_male.predict(test_data_male)),accuracy_score(data_targets_female,classifier2_female.predict(test_data_female))],
    "KNeigbors": [accuracy_score(data_targets_male,classifier4_male.predict(test_data_male)), accuracy_score(data_targets_female,classifier4_female.predict(test_data_female))],
    "SVC -rbf": [accuracy_score(data_targets_male,classifier3_male.predict(test_data_male)), accuracy_score(data_targets_female,classifier3_female.predict(test_data_female))],
    "SVC - linear": [accuracy_score(data_targets_male,classifier5_male.predict(test_data_male)), accuracy_score(data_targets_female,classifier5_female.predict(test_data_female))],
    "TensorFlow": [accuracy_score(data_targets_male,y_tf_pred_male), accuracy_score(data_targets_female,y_tf_pred_female)],
}, index=['male', 'female'])

results

Unnamed: 0,DecisionTree,RandomForest,KNeigbors,SVC -rbf,SVC - linear,TensorFlow
male,0.74812,0.770677,0.81203,0.81203,0.804511,0.796992
female,0.723684,0.756579,0.697368,0.710526,0.723684,0.723684


In [45]:
# output = pd.DataFrame({'PassengerId': dataset_test['PassengerId'], 'Survived': y_tf_pred.ravel()})
# output.to_csv('submission.csv', index=False)