In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
import warnings
warnings.filterwarnings('ignore')
# Read the train and test CSVs. `data` keeps both frames in one list so later
# cells can loop over them and apply the same transformation to each.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)
data = [train_df, test_df]

In [50]:
# Column dtypes and non-null counts for both frames.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() emits a stray "None" line after the report.
train_df.info()
test_df.info()
print('missing info for age/cabin')

# Mean of Survived (i.e. the survival rate) per passenger class and per sex.
print('survival rates:')
print('by class:')
print(train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean())
print('by sex:')
print(train_df[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean())

# Most frequent value, then mean, of Age and Survived within each Sex group.
# Both tables are grouped by sex rather than by age, so they say little about
# how age relates to survival.
print('by age:')
print(train_df[['Age', 'Sex', 'Survived']].groupby(['Sex'], as_index=False).agg(lambda x: x.mode()[0]))
print(train_df[['Age', 'Sex', 'Survived']].groupby(['Sex'], as_index=False).mean())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pcl

In [51]:
# Feature engineering applied identically to every frame in `data`.
# Each section adds or recodes one feature and then prints its relation to
# Survived on the training frame.

# --- Title ------------------------------------------------------------------
# Take the word that precedes a period in Name, pool the listed titles into
# 'Rare', fold spelling variants into Miss/Mrs, and encode as integers.
# Titles that are not in `mapping` (or names with no match) become 0.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
mapping = {"Mr": 1, "Mrs": 2, "Master": 3, "Rare": 4, "Miss": 5}
for d in data:
    titles = d.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    titles = titles.replace(rare_titles, 'Rare').replace(title_aliases)
    # astype(int) keeps the dtype the same in every frame, whether or not the
    # frame contained an unmapped title.
    d['Title'] = titles.map(mapping).fillna(0).astype(int)
print(train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean())

# --- Sex --------------------------------------------------------------------
# female -> 1, male -> 0; anything else falls back to 0.
# NOTE: this step is not re-runnable. Once Sex is numeric, no value matches
# the string keys, so a second run would set every row to 0.
for d in data:
    d['Sex'] = d['Sex'].map({'female': 1, 'male': 0}).fillna(0).astype(int)

# --- Age --------------------------------------------------------------------
# Missing ages are filled with the median age of the passenger's own
# (Sex, Pclass) group, computed within the same frame and rounded to the
# nearest 0.5. A group with no known ages falls back to the frame's overall
# median. Ages are then truncated to whole years.
for d in data:
    group_median = d.groupby(['Sex', 'Pclass'])['Age'].transform('median')
    group_median = group_median.fillna(d['Age'].median())
    age_guess = np.floor(group_median / 0.5 + 0.5) * 0.5
    d['Age'] = d['Age'].fillna(age_guess).astype(int)
print(train_df[['Age', 'Survived']].groupby(['Age'], as_index=False).mean())

# --- Family size / travelling alone -----------------------------------------
for d in data:
    d['Family_size'] = d['SibSp'] + d['Parch'] + 1
    # Assign the whole column at once. The chained form
    # d['Alone'].loc[mask] = 1 writes to a temporary object and does not
    # update the frame when pandas Copy-on-Write is active.
    d['Alone'] = (d['Family_size'] == 1).astype(int)
print(train_df[['Alone', 'Survived']].groupby(['Alone'], as_index=False).mean())
# In the training data, passengers travelling alone survive less often than
# passengers travelling with family.

# --- Fare -------------------------------------------------------------------
# One passenger in test_df has no Fare value; fill it with the test median.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection, which may leave the frame unchanged.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
print(train_df['Fare'].describe())

# Fare buckets: 1 = fare <= 7, 2 = 7 < fare <= 39, 3 = fare > 39
# (pd.cut intervals are closed on the right by default).
fare_bin_edges = [-np.inf, 7.0, 39.0, np.inf]
for d in data:
    d['Fare_New'] = pd.cut(d['Fare'], bins=fare_bin_edges, labels=[1, 2, 3]).astype(int)
print(train_df[['Fare_New', 'Survived']].groupby(['Fare_New'], as_index=False).mean())
print(train_df['Fare_New'].describe())

   Title  Survived
0      1  0.156673
1      2  0.793651
2      3  0.575000
3      4  0.347826
4      5  0.702703
    Age  Survived
0     0  1.000000
1     1  0.714286
2     2  0.300000
3     3  0.833333
4     4  0.700000
..  ...       ...
66   66  0.000000
67   70  0.000000
68   71  0.000000
69   74  0.000000
70   80  1.000000

[71 rows x 2 columns]
   Alone  Survived
0      0  0.505650
1      1  0.303538
count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64
   Fare_New  Survived
0         1  0.071429
1         2  0.331858
2         3  0.621622
count    891.000000
mean       2.176207
std        0.456336
min        1.000000
25%        2.000000
50%        2.000000
75%        2.000000
max        3.000000
Name: Fare_New, dtype: float64


In [52]:
# Drop the raw columns that are not used as model inputs. PassengerId is
# removed from the training frame here but kept in test_df, where the
# submission cell still reads it.
unused_columns = ['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_df = train_df.drop(columns=unused_columns + ['PassengerId'])
test_df = test_df.drop(columns=unused_columns)

# Features / target for training, and the matching feature matrix for test.
Y_train = train_df['Survived']
X_train = train_df.drop(columns='Survived')
X_test = test_df.drop(columns='PassengerId').copy()
print(X_test.head())

   Pclass  Sex  Age  Title  Family_size  Alone  Fare_New
0       3    0   34      1            1      1         2
1       3    1   47      2            2      0         1
2       2    0   62      1            1      1         2
3       3    0   27      1            1      1         2
4       3    1   22      2            3      0         2


In [54]:
# Fit a depth-limited random forest on the training features and predict the
# test set.
m = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
m.fit(X_train, Y_train)
y_pred = m.predict(X_test)

# Accuracy on the data the model was fitted on.
train_accuracy = m.score(X_train, Y_train)
print('Training Accuracy: ', train_accuracy*100)

# Mean accuracy over 10 cross-validation folds of the training data.
cv_scores = cross_val_score(m, X_train, Y_train, cv=10, scoring='accuracy')
print('Cross validation average score: ', cv_scores.mean()*100)

Training Accuracy:  84.17508417508418
Cross validation average score:  83.27715355805245


In [55]:
# Pair each test PassengerId with its predicted Survived value and write the
# two columns to CSV without the index.
sub = test_df[["PassengerId"]].assign(Survived=y_pred)
sub.to_csv('../data/gender_submission.csv', index=False)