In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

import seaborn as sns

In [None]:
# Load the training and test splits from the shared data directory.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# Keep an untouched snapshot of the raw training data for later reference.
# Copying the frame we just parsed avoids reading the same CSV from disk twice.
original_train = train.copy()

In [None]:
# Preview the first 20 rows of the training data.
train.head(20)

### Categorizing data

#### Important
* pclass
* sex
* age
* fare
* embarked
* cabin

#### Useful
* sibsp
* parch
* name
* ticket

`Name` and `Ticket number` may seem useless, but they can help us learn about families, because family members are more likely to share the same survival outcome.

In [None]:
# Count the missing values in every column of the training data.
train.isna().sum()

# Filling missing values

## 1. Age

In [None]:
# Summary statistics of the Age column.
train.Age.describe()

In [None]:
# Histogram of passenger ages.
train.Age.hist()

In [None]:
# Age statistics broken down by passenger class.
train.groupby(by='Pclass').Age.describe()

In [None]:
# Age statistics broken down by sex.
train.groupby(by='Sex').Age.describe()

In [None]:
# Age statistics for every (Pclass, Sex) combination.
train.groupby(by=['Pclass', 'Sex']).Age.describe()

In [None]:
# Box plot of Age per passenger class, with one box per sex.
sns.boxplot(data=train, x='Pclass', y='Age', hue='Sex')
plt.show()

We can see that `Age` depends on `Pclass` and `Sex`.\
Females are mostly younger, and people in the 1st class are older than people in the other two classes.

So it's better to fill missing `Age` values using the median age within each (Sex, Pclass) group, not one global value.

In [None]:
# Median age within each (Pclass, Sex) group.
train.groupby(by=['Pclass', 'Sex']).Age.median()

In [None]:
# Per-row median age of the passenger's own (Pclass, Sex) group.
group_median_age = train.groupby(['Pclass', 'Sex'])['Age'].transform('median')

# Only the missing ages are replaced; known ages are left as they are.
train['Age'] = train['Age'].fillna(group_median_age)

# Number of ages still missing after the fill.
train['Age'].isna().sum()

In [None]:
# Number of missing ages in the test data.
test.Age.isna().sum()

In [None]:
# Impute missing test ages with the (Pclass, Sex) medians computed on the
# TRAINING data, so both frames are preprocessed with the same statistics
# and nothing is estimated from the test set itself.
train_age_medians = (
    train.groupby(['Pclass', 'Sex'])['Age']
    .median()
    .rename('TrainAgeMedian')
)

# Left-join the group medians onto the test rows; the test index is preserved,
# so the result aligns row-for-row with test['Age'].
test_age_fill = test[['Pclass', 'Sex']].join(
    train_age_medians, on=['Pclass', 'Sex']
)['TrainAgeMedian']

test['Age'] = test['Age'].fillna(test_age_fill)

# Fallback: a (Pclass, Sex) combination that never occurs in train has no
# group median, so use the overall training median for those rows.
test['Age'] = test['Age'].fillna(train['Age'].median())

# Number of ages still missing after the fill.
test['Age'].isnull().sum()

Let's move on to the next missing values.

In [None]:
# Re-check the remaining missing values per column.
train.isna().sum()

## 2. Embarked

In [None]:
# Frequency of each embarkation port.
train.Embarked.value_counts()

The mode is **S**. So it's the best option for missing values.

In [None]:
# Most frequent embarkation port (first entry of the mode result).
most_common_port = train['Embarked'].mode().iloc[0]

# Replace the missing ports with that value.
train['Embarked'] = train['Embarked'].fillna(most_common_port)

# Number of ports still missing after the fill.
train['Embarked'].isna().sum()

In [None]:
# Number of missing embarkation ports in the test data.
test.Embarked.isna().sum()

In [None]:
# Re-check the remaining missing values per column.
train.isna().sum()

## 3. Cabin

In [None]:
# First 20 values of the Cabin column.
train.Cabin.head(n=20)

These `NaN`s can mean two things:
1. There's no information about the passenger's cabin.
2. The passenger didn't have any cabin.

Since there weren't enough cabins for all passengers, some passengers certainly had no cabin.\
So instead of filling the nulls with a statistic like the median or mode, let's assume they mean the passenger didn't have a cabin.

In [None]:
# 1 when a cabin value is recorded for the passenger, 0 when it is missing.
cabin_recorded = train['Cabin'].notna()
train['HasCabin'] = cabin_recorded.astype(int)

train.HasCabin.head()

In [None]:
# First 10 cabin values that are actually recorded.
train.loc[train['Cabin'].notna(), 'Cabin'].head(10)

The letter of each cabin is important to us, not its number.\
So let's create a new column called `Deck`.

In [None]:
# Deck is the first character of the cabin string; '0' marks a missing cabin.
cabin_known = train['Cabin'].notna()
deck_letter = train['Cabin'].str[0]
train['Deck'] = np.where(cabin_known, deck_letter, '0')

In [None]:
# Number of passengers per deck value.
train.Deck.value_counts()

We created a new column called `Deck` and filled it with the letter of each cabin. If `Cabin` was null, we filled it with the value `0`, meaning this passenger (maybe) didn't have a cabin.

But some passengers may have more than just one cabin. So let's add another column.

In [None]:
# Number of whitespace-separated cabin entries per passenger; 0 when Cabin is missing.
cabin_known = train['Cabin'].notna()
cabins_listed = train['Cabin'].str.split().str.len()
train['CabinCount'] = np.where(cabin_known, cabins_listed, 0)

In [None]:
# Number of passengers per cabin count.
train.CabinCount.value_counts()

Let's check if there's a relationship between `CabinCount` and `Survived`.

In [None]:
# Mean of Survived (i.e. survival rate) for each cabin count.
train.groupby(by='CabinCount').Survived.mean()

Now we don't need the `Cabin` column.

In [None]:
# The raw Cabin text is no longer needed once HasCabin, Deck and CabinCount exist.
train = train.drop('Cabin', axis='columns')

train.head()

In [None]:
# Final check of missing values in the training data.
train.isna().sum()

In [None]:
# Do all the above steps for test

# 1 when a cabin is recorded, 0 when it is missing. This must use notna()
# (not isna()) so the flag has the same meaning as train['HasCabin'];
# the inverted encoding would flip the feature for the test data.
test['HasCabin'] = test['Cabin'].notna().astype(int)

# Deck is the first character of the cabin string; '0' marks a missing cabin.
test['Deck'] = np.where(test['Cabin'].notna(),
                        test['Cabin'].str[0],
                        '0')

# Number of whitespace-separated cabin entries; 0 when Cabin is missing.
test['CabinCount'] = np.where(test['Cabin'].notna(),
                              test['Cabin'].str.split().str.len(),
                              0)

# The raw Cabin text is no longer needed once the derived columns exist.
test = test.drop(columns=['Cabin'])

test.isnull().sum()

# Preparing the dataset for training the model

In [None]:
# Target vector: the Survived column of the training data.
y = train.Survived
y.head()

For `X`, we should drop some columns that won't help much in training the model (or might confuse it).

In [None]:
# Columns excluded from the feature matrix: the target plus identifier/free-text fields.
non_feature_columns = ['Survived', 'PassengerId', 'Name', 'Ticket']

X = train.drop(columns=non_feature_columns)
X.head()

Now we have to convert the string (categorical) columns into numeric indicator columns.

In [None]:
# Inspect the column dtypes before encoding.
X.dtypes

In [None]:
# One-hot encode the non-numeric columns; drop_first omits the first level of each.
X = pd.get_dummies(data=X, drop_first=True)

X.head()

In [None]:
# Inspect the column dtypes after one-hot encoding.
X.dtypes

In [None]:
# Convert only the boolean indicator columns to 0/1 integers.
# Casting the whole frame to int would truncate the continuous features
# (e.g. an Age of 0.42 becomes 0 and a Fare of 7.25 becomes 7), discarding
# information the models can use.
bool_columns = [column for column in X.columns if X[column].dtype == bool]
X = X.astype({column: int for column in bool_columns})
X.head()

In [None]:
# Inspect the column dtypes after the integer cast.
X.dtypes

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Training the model

I have chosen two models from the scikit-learn cheat sheet: `LogisticRegression` and `LinearSVC`.

## LogisticRegression

In [None]:
# Build and fit the logistic regression classifier in one step (fit returns the estimator).
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
log_model

In [None]:
# Predicted labels for the validation rows from the logistic regression model.
log_preds = log_model.predict(X=X_val)

## LinearSVC

In [None]:
# Linear support-vector classifier. random_state is fixed (same seed as the
# train/validation split) so that any data shuffling done by the solver is
# repeatable and a fresh "Restart & Run All" reproduces the same model.
svc_model = LinearSVC(max_iter=5000, random_state=42)
svc_model.fit(X_train, y_train)

In [None]:
# Predicted labels for the validation rows from the linear SVC model.
svc_preds = svc_model.predict(X=X_val)

## Comparing the results

In [None]:
# Report validation accuracy for both models, one line per model.
labelled_predictions = (
    ('LogisticRegression accuracy: ', log_preds),
    ('LinearSVC accuracy: ', svc_preds),
)
for label, predictions in labelled_predictions:
    print(label, accuracy_score(y_val, predictions))