# Titanic competition

### Libraries

In [16]:
import numpy as np
import pandas as pd

In [17]:
data = pd.read_csv('./datasets/train.csv')
test = pd.read_csv('./datasets/test.csv')

data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [19]:
cat_cols = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
for col in cat_cols:
    print(data[[col, 'Survived']].groupby(col).mean())

        Survived
Pclass          
1       0.629630
2       0.472826
3       0.242363
        Survived
Sex             
female  0.742038
male    0.188908
       Survived
SibSp          
0      0.345395
1      0.535885
2      0.464286
3      0.250000
4      0.166667
5      0.000000
8      0.000000
       Survived
Parch          
0      0.343658
1      0.550847
2      0.500000
3      0.600000
4      0.000000
5      0.200000
6      0.000000
          Survived
Embarked          
C         0.553571
Q         0.389610
S         0.336957


# Data Preprocessing and Feature Generation

### Feature Generation:
Title - the title of a person (e.g. "Mr." or "Lady.")
<br>pclass_sex = Pclass + Sex
<br>FamilySize = SibSp + Parch + 1
<br>isAlone = {1, if FamilySize == 1; 0 otherwise}

In [20]:
def get_title(fullname):
    name = fullname.split(',')[1]
    title = name.split()[0]
    return title

In [21]:
data['Title'] = data['Name'].apply(get_title)
test['Title'] = test['Name'].apply(get_title)

rare_titles = ['Don.', 'Rev.', 'Dr.', 'Mme.', 'Ms.', 'Major.', 'Lady.', 'Sir.', 'Mlle.', 'Col.', 'Capt.', 'the', 'Jonkheer.', 'Dona.']

data_rare_title_filt = data['Title'].isin(rare_titles)
test_rare_title_filt = test['Title'].isin(rare_titles)

data.loc[data_rare_title_filt, 'Title'] = 'Rare'
test.loc[test_rare_title_filt, 'Title'] = 'Rare'

In [22]:
data['pclass_sex'] = data['Pclass'].map(str) + data['Sex'].map(str)
test['pclass_sex'] = test['Pclass'].map(str) + test['Sex'].map(str)

In [23]:
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

In [24]:
data['isAlone'] = np.zeros(data.shape[0])

data_alone_mask = (data['FamilySize'] == 1)
data.loc[data_alone_mask, 'isAlone'] = 1

test['isAlone'] = np.zeros(test.shape[0])

test_alone_mask = (test['FamilySize'] == 1)
test.loc[test_alone_mask, 'isAlone'] = 1

### Data Preprocessing
Fill missing values: Age, Fare, Embarked
<br>Normalize: Age, Fare
<br>Categorize: Age, Fare

In [25]:
data['Age'].fillna(data['Age'].median(), inplace=True)
test['Age'].fillna(test['Age'].median(), inplace=True)

data['Fare'].fillna(data['Fare'].median(), inplace=True)
test['Fare'].fillna(test['Fare'].median(), inplace=True)

In [26]:
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
test['Embarked'].fillna(test['Embarked'].mode()[0], inplace=True)

In [27]:
# normalize age

age_df = pd.concat([data['Age'], test['Age']])

age_max, age_min = age_df.max(), age_df.min()

data_age = data['Age'] / (age_max - age_min)
test_age = test['Age'] / (age_max - age_min)

data['Age'] = data_age
test['Age'] = test_age

In [28]:
# normalize fare

fare_df = pd.concat([data['Fare'], test['Fare']])

fare_max, fare_min = fare_df.max(), fare_df.min()

data_fare = data['Fare'] / (fare_max - fare_min)
test_fare = test['Fare'] / (fare_max - fare_min)

data['Fare'] = data_fare
test['Fare'] = test_fare

In [29]:
# categorize age and fare

categorized_age = np.array(pd.cut(pd.concat([data_age, test_age]), 5), dtype=str)
data['Age'] = categorized_age[:data_age.shape[0]]
test['Age'] = categorized_age[data_age.shape[0]:]

categorized_fare = np.array(pd.cut(pd.concat([data_fare, test_fare]), 5), dtype=str)
data['Fare'] = categorized_fare[:data_fare.shape[0]]
test['Fare'] = categorized_fare[data_fare.shape[0]:]

categorized_numeric = ['Age', 'Fare']

In [30]:
data[['Age', 'Survived']].groupby('Age').mean()

Unnamed: 0_level_0,Survived
Age,Unnamed: 1_level_1
"(0.00113, 0.202]",0.55
"(0.202, 0.402]",0.344168
"(0.402, 0.602]",0.404255
"(0.602, 0.802]",0.434783
"(0.802, 1.002]",0.090909


# Label Encoding

In [31]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

In [32]:
# le = LabelEncoder()
# for col in categorized_numeric:
#     data[col] = le.fit_transform(data[col])
#     test[col] = le.transform(test[col])

In [33]:
ohe = LabelBinarizer()

ohe_cols = ['pclass_sex', 'Embarked', 'Title'] + categorized_numeric

for col in ohe_cols:
    ohe.fit(data[col].to_numpy())
    
    data_one_hot_col = ohe.transform(data[col].to_numpy())
    test_one_hot_col = ohe.transform(test[col].to_numpy())
    
    cats = data[col].unique()
    for i in range(len(cats)):
        data[cats[i]] = data_one_hot_col.T[i]
        test[cats[i]] = test_one_hot_col.T[i]

# Remove Useless Columns

In [34]:
to_remove = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Pclass', 'Sex'] + ohe_cols

data.drop(to_remove, axis=1, inplace=True)
test.drop(to_remove, axis=1, inplace=True)

y = data['Survived']
data.drop(['Survived'], axis=1, inplace=True)

data.head()

Unnamed: 0,FamilySize,isAlone,3male,1female,3female,1male,2female,2male,S,C,...,Rare,"(0.202, 0.402]","(0.402, 0.602]","(0.602, 0.802]","(0.00113, 0.202]","(0.802, 1.002]","(-0.001, 0.2]","(0.4, 0.6]","(0.2, 0.4]","(0.8, 1.0]"
0,2,0.0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
1,2,0.0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,1,1.0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,2,0.0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,1,1.0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0


# Prediction

### Read Submission .csv

In [35]:
submission_df = pd.read_csv('./datasets/gender_submission.csv', index_col='PassengerId')

### Logistic Regression

In [36]:
from sklearn.linear_model import LogisticRegression

In [37]:
clf = LogisticRegression()
clf.fit(data, y)

LogisticRegression()

In [38]:
predicted = clf.predict(test)

In [39]:
submission_df['Survived'] = predicted

In [40]:
# submission_df.to_csv('./results/log_reg_onehot_all_all.csv')

### Random Forest Classifier

In [46]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=2)
model.fit(data, y)
predictions = model.predict(test)

In [47]:
submission_df['Survived'] = predictions

In [48]:
submission_df.to_csv('./results/random_forest_last.csv')

In [49]:
submission_df['Survived'].value_counts()

0    264
1    154
Name: Survived, dtype: int64

In [45]:
y.value_counts()

0    549
1    342
Name: Survived, dtype: int64