In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
import warnings
# Silence every warning for the rest of the session; note that this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings(action='ignore')

# Read both CSVs. `data` holds the two frames so that later cells can apply the
# same transformation to train and test in a single loop.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)
data = [train_df, test_df]

In [66]:
# Structural overview of both frames. DataFrame.info() prints its report and
# returns None, so wrapping it in print() also emits a trailing "None" line.
for frame in (train_df, test_df):
    print(frame.info())
print('missing info for age/cabin')

# Mean of Survived within each value of a single grouping column.
print('survival rates:')
for heading, column in (('by class:', 'Pclass'), ('by sex:', 'Sex')):
    print(heading)
    rates = train_df[[column, 'Survived']].groupby([column], as_index=False).mean()
    print(rates)

# Per-sex mode, then per-sex mean, of Age and Survived.
print('by age:')
age_by_sex = train_df[['Age', 'Sex', 'Survived']].groupby(['Sex'], as_index=False)
print(age_by_sex.agg(lambda values: values.mode()[0]))
print(age_by_sex.mean())
# Author's note: the two tables printed last turned out not to be informative.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pcl

In [67]:
# Feature engineering applied to both frames through the shared `data` list.
# NOTE: this cell is not safe to re-run on its own. Once Sex has been
# integer-coded, mapping it a second time turns every row into 0. Restart the
# kernel and run all cells instead.

# --- Title: honorific taken from Name, rare titles pooled, then integer-coded ---
title_aliases = {
    **dict.fromkeys(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir',
         'Jonkheer', 'Dona'],
        'Rare',
    ),
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
}
mapping = {"Mr": 1, "Mrs": 2, "Master": 3, "Rare": 4, "Miss": 5}
for d in data:
    titles = d['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Titles missing from `mapping` become 0. The cast keeps the column integer
    # even when such a title is present (map() would otherwise yield floats).
    d['Title'] = titles.replace(title_aliases).map(mapping).fillna(0).astype(int)
print(train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean())

# --- Sex: female -> 1, male -> 0 (anything else -> 0) ---
for d in data:
    d['Sex'] = d['Sex'].map({'female': 1, 'male': 0}).fillna(0).astype(int)
print(train_df[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean())

# --- Fare: fill the missing test fare, then bin into three ordinal bands ---
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: under pandas Copy-on-Write the in-place form does not update
# test_df, which leaves a NaN that breaks the integer cast below.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
print(train_df['Fare'].describe())
# Right-inclusive bands: Fare <= 7 -> 1, 7 < Fare <= 39 -> 2, Fare > 39 -> 3.
fare_bins = [-np.inf, 7.0, 39.0, np.inf]
for d in data:
    d['Fare_New'] = pd.cut(d['Fare'], bins=fare_bins, labels=[1, 2, 3]).astype(int)
print(train_df[['Fare_New', 'Survived']].groupby(['Fare_New'], as_index=False).mean())
print(train_df['Fare_New'].describe())

# --- Embarked: fill gaps with the most frequent training port, then encode ---
freq = train_df.Embarked.dropna().mode()[0]
for d in data:
    d['Embarked'] = d['Embarked'].fillna(freq).map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
print(train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean())

# --- Deck: first character of Cabin, 'U' (coded 0) when Cabin is missing ---
deck_codes = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8, 'U': 0}
for d in data:
    d['Deck'] = d['Cabin'].str[0].fillna('U').map(deck_codes)
print(train_df[['Deck', 'Survived']].groupby(['Deck'], as_index=False).mean())

# --- Surname: letters immediately before the comma in Name, label-encoded ---
for d in data:
    d['Surname'] = d['Name'].str.extract('([A-Za-z]+),', expand=False)
# Fit on train and test together so every surname in either frame has a code.
le = LabelEncoder()
le.fit(pd.concat([train_df['Surname'], test_df['Surname']]).astype(str))
for d in data:
    d['Surname'] = le.transform(d['Surname'].astype(str))
    d['Family_Group'] = d['Surname'].astype(str) + '_' + d['Fare_New'].astype(str)
# Not consumed by any cell visible in this notebook; kept so the name stays defined.
family_survival = train_df.groupby('Family_Group')['Survived'].mean()

# --- Family_Survival (train): mean survival of the *other* passengers sharing
# the surname, 0.5 when nobody else shares it. Computed from per-surname totals
# rather than a row-by-row scan of the whole frame.
surname_groups = train_df.groupby('Surname')['Survived']
others_count = surname_groups.transform('count') - 1
others_survived = surname_groups.transform('sum') - train_df['Survived']
train_df['Family_Survival'] = (others_survived / others_count).where(others_count > 0, 0.5)

# --- Family_Survival (test): per-surname mean from train, 0.5 for unseen surnames ---
family_survival_dict = train_df.groupby('Surname')['Survived'].mean().to_dict()
test_df['Family_Survival'] = test_df['Surname'].map(family_survival_dict).fillna(0.5)

# --- Surname_Count: occurrences of the surname across train and test combined ---
surname_counts = pd.concat([train_df['Surname'], test_df['Surname']]).value_counts()
for d in data:
    d['Surname_Count'] = d['Surname'].map(surname_counts).fillna(1)

# drop() returns new frames, so rebuild `data` to keep it pointing at the
# current train/test objects rather than the pre-drop ones.
train_df = train_df.drop(['Surname'], axis=1)
test_df = test_df.drop(['Surname'], axis=1)
data = [train_df, test_df]
print(train_df[['Surname_Count', 'Survived']].groupby(['Surname_Count'], as_index=False).mean())

   Title  Survived
0      1  0.156673
1      2  0.793651
2      3  0.575000
3      4  0.347826
4      5  0.702703
   Sex  Survived
0    0  0.188908
1    1  0.742038
count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64
   Fare_New  Survived
0         1  0.071429
1         2  0.331858
2         3  0.621622
count    891.000000
mean       2.176207
std        0.456336
min        1.000000
25%        2.000000
50%        2.000000
75%        2.000000
max        3.000000
Name: Fare_New, dtype: float64
   Embarked  Survived
0         0  0.339009
1         1  0.553571
2         2  0.389610
   Deck  Survived
0     0  0.299854
1     1  0.466667
2     2  0.744681
3     3  0.593220
4     4  0.757576
5     5  0.750000
6     6  0.615385
7     7  0.500000
8     8  0.000000
   Surname_Count  Survived
0              1  0.332584
1              2  0.473373
2              3  0.

In [68]:
# Raw columns excluded from the model inputs in both frames.
unused_columns = ['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Fare', 'Family_Group', 'Age']
train_df = train_df.drop(columns=['PassengerId'] + unused_columns)
# test_df keeps PassengerId here; it is removed only from the feature matrix below.
test_df = test_df.drop(columns=unused_columns)

# Split into model inputs and target.
X_train = train_df.drop(columns='Survived')
Y_train = train_df['Survived']
X_test = test_df.drop(columns='PassengerId').copy()
print(X_test.head())

   Pclass  Sex  Embarked  Title  Fare_New  Deck  Family_Survival  \
0       3    0         2      1         2     0             0.75   
1       3    1         0      2         1     0             0.50   
2       2    0         2      1         2     0             0.50   
3       3    0         0      1         2     0             0.50   
4       3    1         0      2         2     0             1.00   

   Surname_Count  
0              5  
1              1  
2              1  
3              1  
4              2  


In [69]:
# Depth-limited random forest with a fixed seed, fitted on the full training set.
m = RandomForestClassifier(random_state=1, max_depth=5, n_estimators=100)
m.fit(X_train, Y_train)
y_pred = m.predict(X_test)

# Accuracy on the data the model was fitted on, as a percentage.
train_accuracy = m.score(X_train, Y_train)
print('Training Accuracy: ', train_accuracy*100)

# Mean accuracy over 10 cross-validation folds, as a percentage.
cv_scores = cross_val_score(m, X_train, Y_train, cv=10, scoring='accuracy')
print('Cross validation average score: ', cv_scores.mean()*100)

# A second forest with default hyper-parameters; only constructed here, never fitted.
clf = RandomForestClassifier(random_state=10)

Training Accuracy:  85.18518518518519
Cross validation average score:  83.61173533083645


In [70]:
# Two-column submission table: test passenger ids next to the predicted labels,
# written without the index column.
passenger_ids = test_df["PassengerId"]
sub = pd.DataFrame({"PassengerId": passenger_ids, "Survived": y_pred})
sub.to_csv('../data/gender_submission.csv', index=False)