# Titanic - A Kaggle Submission
## By Michael Neiman

## 0. Data Import

In [21]:
import pandas as pd

X_test = pd.read_csv('titanic_test.csv', header=0)
X_train = pd.read_csv('titanic_train.csv', header=0)
y_train = X_train.pop('survived')
y_train = pd.to_numeric(y_train, errors='coerce')

## 1. Data Preprocessing

Let's first create copies of the data:

In [22]:
# Create copies of the data:
X_train_preprocessed = X_train.copy()
X_test_preprocessed = X_test.copy()

### 1.1. Dropping Irrelevant or hard to use data:

We will remove the following columns:
- passenger_id
- name
- ticket
- cabin (might use later)
- embarked
- boat (might use later)
- home.dest

In [23]:
def drop_irrelevant_columns(df: pd.DataFrame):
    df.drop(['passenger_id', 'name', 'ticket', 'cabin', 'embarked', 'boat', 'home.dest'], axis=1)
    return df

X_test_preprocessed = X_test_preprocessed.drop(columns = ['passenger_id', 'name', 'ticket', 'cabin', 'embarked', 'boat', 'home.dest'])
X_train_preprocessed = X_train_preprocessed.drop(columns = ['passenger_id', 'name', 'ticket', 'cabin', 'embarked', 'boat', 'home.dest'])

### 1.2. Using an ordinal encoder on the "sex" column:

In [24]:
from sklearn.preprocessing import OrdinalEncoder

s = (X_train_preprocessed.dtypes == 'object')
cat_cols = list(s[s].index)

ordinal_encoder = OrdinalEncoder()
X_train_preprocessed[cat_cols] = ordinal_encoder.fit_transform(X_train_preprocessed[cat_cols])
X_test_preprocessed[cat_cols] = ordinal_encoder.transform(X_test_preprocessed[cat_cols])

### 1.4. Replacing the "body" variable:

The order of the bodies bears no significance and might hinder the performance of our model. Thus, we change it to "1" if a number exists and "0" if it doesn't.

In [25]:
def preprocess_body(data: pd.DataFrame):
    data['body'][data['body'].notna()] = 1
    data['body'][data['body'].isna()] = 0
    return data

X_train_preprocessed = preprocess_body(X_train_preprocessed)
X_test_preprocessed = preprocess_body(X_test_preprocessed)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['body'][X['body'].notna()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['body'][X['body'].isna()] = 0



### 1.4. Imputing columns with missing data and mark the rows as such:

In [26]:
cols_with_missing = [col for col in X_train_preprocessed.columns
                     if X_train_preprocessed[col].isnull().any()]

for col in cols_with_missing:
    X_train_preprocessed[col + '_was_missing'] = X_train_preprocessed[col].isnull()
    X_test_preprocessed[col + '_was_missing'] = X_test_preprocessed[col].isnull()

from sklearn.impute import SimpleImputer

imp = SimpleImputer()
X_train_imputed = pd.DataFrame(imp.fit_transform(X_train_preprocessed))
X_test_imputed = pd.DataFrame(imp.fit_transform(X_test_preprocessed))

X_train_imputed.columns = X_train_preprocessed.columns
X_test_imputed.columns = X_test_preprocessed.columns

X_train_preprocessed = X_train_imputed
X_test_preprocessed = X_test_imputed

### 1.5. Scaling the numeric data:

In [27]:
from sklearn.preprocessing import StandardScaler

scaler= StandardScaler().fit(pd.concat([X_train_preprocessed, X_test_preprocessed], axis=0))

X_train_scaled = pd.DataFrame(scaler.transform(X_train_preprocessed), columns=X_train_preprocessed.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_preprocessed), columns=X_test_preprocessed.columns)

X_train_preprocessed = X_train_scaled
X_test_preprocessed = X_test_scaled

## 2. Training and Evaluation.

Now let's try some ML techniques which we've encountered:

### 2.1. Preparing the validation data:

In [28]:
from sklearn.model_selection import train_test_split

X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train_preprocessed, y_train, test_size=0.3, random_state=42)

### 2.2. Training the data on some different classifiers:

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

logistic_regressor = LogisticRegression(random_state=0).fit(X_train_preprocessed, y_train)
xgb_classifier = XGBClassifier(random_state=0).fit(X_train_preprocessed, y_train)
tree_classifier = DecisionTreeClassifier(random_state=0).fit(X_train_preprocessed, y_train)
random_forest_classifier = RandomForestClassifier(random_state=0).fit(X_train_preprocessed, y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




### 2.3. Assessing the cross-validation score of each model:

In [30]:
from sklearn.model_selection import cross_val_score

score_logistic = cross_val_score(logistic_regressor, X_train_split, y_train_split)
score_xgb = cross_val_score(xgb_classifier, X_train_split, y_train_split)
score_tree = cross_val_score(tree_classifier, X_train_split, y_train_split)
score_random_forest = cross_val_score(random_forest_classifier, X_train_split, y_train_split)

print(f"logistic: {score_logistic}")
print(f"xgb: {score_xgb}")
print(f"tree: {score_tree}")
print(f"random forest: {score_random_forest}")

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


logistic: [0.81512605 0.82352941 0.70588235 0.79831933 0.75630252]
xgb: [0.79831933 0.82352941 0.69747899 0.80672269 0.72268908]
tree: [0.73109244 0.79831933 0.65546218 0.79831933 0.68067227]
random forest: [0.79831933 0.85714286 0.74789916 0.8487395  0.73109244]
