In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install pydotplus

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

import itertools

from sklearn.preprocessing import LabelBinarizer

sns.set(style='white', context='notebook', palette='deep')

In [4]:
# Fraction of the training rows that train_test_split holds out for evaluation.
test_data_split = 0.3
# Seed passed as random_state to train_test_split.
random_seed = 2

In [5]:
# Load the Titanic train and test CSVs (paths are relative to the working directory).
df_train = pd.read_csv("../input/titanic/train.csv")
df_test = pd.read_csv("../input/titanic/test.csv")
# Keep an untouched copy of the test frame: later cells overwrite df_test with
# engineered features, while the submission cells read PassengerId from this copy.
df_result = df_test.copy()

In [8]:
# Name of the label column in df_train.
target = "Survived"

# ***Exploratory Data Analysis (EDA)***

In [9]:
# Column dtypes and non-null counts of the training frame.
df_train.info()

In [10]:
# (rows, columns) of the training frame.
df_train.shape

In [11]:
# First 100 training rows.
# NOTE(review): this is a large dump, and a later cell shows df_train.head() again.
df_train.head(100)

In [12]:
# Column names of the training frame as an array.
df_train.columns.values

In [13]:
# First five training rows.
df_train.head()

In [14]:
# Share of passengers per sex (fractions rather than counts, since normalize=True).
sex_share = df_train['Sex'].value_counts(normalize=True)
print('\n\nSex Ratio of Passengers', sex_share, sep='\n')

In [15]:
# Mean of the target column (survival rate) per sex, drawn as a bar chart.
df_train.groupby('Sex')[target].mean().plot.bar()

In [16]:
# Distribution of passenger ages in 30 bins.
hist = df_train['Age'].hist(bins=30)

In [17]:
# Mean of the target column within 20 equal-width age bands, drawn as a bar chart.
age_band = pd.cut(df_train["Age"], 20)
df_train[target].groupby(age_band).mean().plot(kind='bar')

In [18]:
# Mean of the target column per fare band, drawn as a bar chart.
# include_lowest=True closes the first band on the left: with pd.cut's default
# left-open intervals a fare of exactly 0 falls outside (0, 5] and is silently
# excluded from the plot.
fare_edges = [0, 5, 10, 20, 40, 70, 100, 1000]
fare_band = pd.cut(df_train["Fare"], fare_edges, include_lowest=True)
df_train[target].groupby(fare_band).mean().plot(kind='bar')

In [19]:
# Mean of the target column per passenger class, drawn as a bar chart.
df_train.groupby("Pclass")[target].mean().plot.bar()

# ***Feature Engineering***

In [20]:
def add_title(df):
    """Derive ``first_name``, ``last_name`` and ``title`` columns from ``Name``.

    ``Name`` is split on every comma or period. Piece 0 becomes the last name,
    piece 1 the title and piece 2 the first name, each stripped of surrounding
    whitespace. The frame is modified in place and nothing is returned.
    """
    pieces = df['Name'].str.split(',|\\.', expand = True)
    # Same column creation order as before: first_name, last_name, title.
    for column, position in (('first_name', 2), ('last_name', 0), ('title', 1)):
        df[column] = pieces[position].str.strip()
    
# Add the name-derived columns to both frames (add_title modifies its argument in place).
add_title(df_train)
add_title(df_test)

In [21]:
# Titles grouped by the social status they are mapped to.
_TITLES_BY_STATUS = {
    'Military': ('Capt', 'Col', 'Major'),
    'Noble': ('Don', 'Dona', 'Jonkheer', 'Lady', 'Sir', 'the Countess'),
    'Dr': ('Dr',),
    'Common': ('Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms'),
    'Clergy': ('Rev',),
}

# Flat title -> status lookup built from the groups above.
status_map = {
    title: status
    for status, titles in _TITLES_BY_STATUS.items()
    for title in titles
}

def add_social_status(df):
    """Add ``social_status`` by looking each ``title`` up in ``status_map`` (in place).

    Titles that are not keys of ``status_map`` produce a missing value.
    """
    df['social_status'] = df['title'].map(status_map)
    
# Map titles to a social status on both frames (reads the `title` column, writes `social_status`).
add_social_status(df_train)
add_social_status(df_test)

In [22]:
def add_family_members(df):
    """Add ``family_members``, the row-wise sum of ``Parch`` and ``SibSp`` (in place)."""
    df['family_members'] = df['Parch'].add(df['SibSp'])

# Add the family-size column to both frames (in place).
add_family_members(df_train)
add_family_members(df_test)

In [23]:
def add_deck_1_and_2(df):
    """Add ``deck`` and ``deck_2``: the deck letters of the first two cabins in ``Cabin``.

    Digits are removed from ``Cabin`` and the remainder is split on spaces, so
    ``'C23 C25'`` gives ``deck='C'`` and ``deck_2='C'``. Rows with no cabin, or
    with no second cabin, are left missing in the corresponding column. The
    frame is modified in place and nothing is returned.
    """
    # regex=True is required: Series.str.replace treats the pattern as a literal
    # string by default in pandas 2.x, which would leave the cabin numbers in place.
    letters = df['Cabin'].str.replace('[0-9]', '', regex=True)
    # reindex guarantees that both piece columns exist even when no row lists a
    # second cabin (the split then yields one column and decks[1] would raise KeyError).
    decks = letters.str.split(' ', expand=True).reindex(columns=[0, 1])
    df['deck'] = decks[0]
    df['deck_2'] = decks[1]

# Add the cabin-derived deck columns to both frames (in place).
add_deck_1_and_2(df_train)
add_deck_1_and_2(df_test)

In [24]:
def add_len_features(df):
    """Add ``name_len`` and ``ticket_len``, the lengths of ``Name`` and ``Ticket`` (in place)."""
    for source, feature in (('Name', 'name_len'), ('Ticket', 'ticket_len')):
        df[feature] = df[source].map(len)
    
# Add the name/ticket length columns to both frames (in place).
add_len_features(df_train)
add_len_features(df_test)

In [25]:
# Mean of the target column per engineered social status, drawn as a bar chart.
df_train.groupby('social_status')[target].mean().plot.bar()

In [26]:
# Mean of the target column per family size, drawn as a bar chart.
df_train.groupby('family_members')[target].mean().plot.bar()

# ***Missing-Value Imputation***

In [27]:
# Which training columns contain at least one missing value, before imputation.
df_train.isna().any()

In [28]:
def augment_age(df):
    """Flag which ages were present, then fill missing ``Age`` with the column mean.

    ``age_available`` is True where ``Age`` was not missing. The mean is taken
    from the frame passed in, so each frame is filled with its own average.
    The frame is modified in place and nothing is returned.
    """
    age_known = df['Age'].notna()
    df['age_available'] = age_known
    mean_age = df['Age'].mean()
    df['Age'] = df['Age'].fillna(mean_age)
    
# Impute Age on both frames; each call uses the mean of the frame it receives.
augment_age(df_train)
augment_age(df_test)

In [29]:
def augment_deck_1_and_2(df):
    """Replace missing ``deck`` and ``deck_2`` values with the string ``'NA'`` (in place)."""
    for column in ('deck', 'deck_2'):
        df[column] = df[column].fillna('NA')
    
# Fill missing deck values with 'NA' on both frames (in place).
augment_deck_1_and_2(df_train)
augment_deck_1_and_2(df_test)

In [30]:
def augment_embarked(df):
    """Replace missing ``Embarked`` values with the string ``'NA'`` (in place)."""
    port_known = df['Embarked'].notna()
    df['Embarked'] = df['Embarked'].where(port_known, 'NA')

# Fill missing Embarked values with 'NA' on both frames (in place).
augment_embarked(df_train)
augment_embarked(df_test)

In [31]:
def augment_fare(df):
    """Flag which fares were present, then fill missing ``Fare`` with the column mean.

    ``fare_available`` is True where ``Fare`` was not missing. The mean is taken
    from the frame passed in. The frame is modified in place and nothing is
    returned.
    """
    # The indicator uses an underscore, matching the `age_available` column that
    # augment_age creates (it was previously written as 'fare available').
    df['fare_available'] = df['Fare'].notna()
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

# Impute Fare on both frames; each call uses the mean of the frame it receives.
augment_fare(df_train)
augment_fare(df_test)

In [32]:
# Re-check which training columns still contain missing values after imputation.
df_train.isna().any()

In [33]:
# Columns kept for modelling: the label plus the raw and engineered features.
needed_columns = (
    target,
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
    "title",
    "social_status",
    "family_members",
    "deck",
    "deck_2",
    "name_len",
    "ticket_len",
    "age_available",
)

def drop_unneeded_columns(df):
    """Return a new frame holding only the columns of ``df`` listed in ``needed_columns``.

    Columns named in ``needed_columns`` but absent from ``df`` are ignored.
    """
    unwanted = df.columns[~df.columns.isin(needed_columns)]
    return df.drop(columns=unwanted)
    
# Rebind both names to the reduced frames (drop_unneeded_columns returns a new frame).
df_train = drop_unneeded_columns(df_train)
df_test = drop_unneeded_columns(df_test)

In [34]:
# Columns that are replaced by one-hot indicator columns.
categorical_columns = (
    "Sex",
    "Pclass",
    "Embarked",
    "title",
    "social_status",
    "deck",
    "deck_2",
    "age_available",
)

def encode_categorical_features(df):
    """Return a new frame with every column in ``categorical_columns`` one-hot encoded.

    The remaining columns keep their order and come first. The indicator columns
    follow, grouped in ``categorical_columns`` order and named
    ``<column>_<value>``.
    """
    indicator_frames = [
        pd.get_dummies(df[name], prefix=name) for name in categorical_columns
    ]
    untouched = df.drop(columns=list(categorical_columns))
    return pd.concat([untouched, *indicator_frames], axis=1)
        
# One-hot encode each frame separately; the resulting column sets can differ
# when a category value occurs in only one of the frames.
df_train = encode_categorical_features(df_train)
df_test = encode_categorical_features(df_test)

# ***SPLITTING TEST/TRAIN DATA***

In [35]:
# Pull the label column out of df_train before later cells drop columns from it.
Y_train = df_train[target]

In [36]:
# Columns present in one frame but not in the other.
# NOTE(review): the train-only set presumably also contains the label column,
# since df_test is built from test.csv — confirm that file carries no label.
columns_to_drop_from_train = set(df_train.columns.values) - set(df_test.columns.values)
columns_to_drop_from_test = set(df_test.columns.values) - set(df_train.columns.values)

In [37]:
# Keep only the columns that exist in both frames. The test side now drops the
# computed test-only set instead of a hard-coded "title_Dona", so any indicator
# column that appears on one side only is handled.
df_train = df_train.drop(columns=list(columns_to_drop_from_train))
df_test = df_test.drop(columns=list(columns_to_drop_from_test))
# Both frames now hold the same column set; put the test columns in training
# order so the two frames share one feature layout.
df_test = df_test[df_train.columns]

In [38]:
# Work on a copy so df_train itself is not rebound by the split and scaling cells.
X_train = df_train.copy()

In [39]:
# Hold out a test_data_split share of the labelled rows for evaluation.
# X_test / y_test are this hold-out slice of the training data, not the df_test frame.
X_train, X_test, y_train, y_test = train_test_split(X_train, Y_train, test_size = test_data_split, random_state=random_seed)

In [40]:
# Scale every feature to the range [-1, 1]. The scaler is fitted on the training
# slice only and then applied unchanged to the hold-out slice.
scaler_x = MinMaxScaler((-1,1))
X_train = scaler_x.fit_transform(X_train)
X_test = scaler_x.transform(X_test)

# ***RANDOM FOREST MODEL***

In [41]:
# Fit a 50-tree random forest using the entropy split criterion.
# random_state reuses the notebook seed so that repeated runs build the same
# forest and report the same scores.
classifier = RandomForestClassifier(
    n_estimators=50,
    criterion='entropy',
    random_state=random_seed,
)
classifier.fit(X_train, y_train)

In [42]:
# Accuracy on the training slice and on the hold-out slice.
rdmf_score_tr = classifier.score(X_train, y_train)
rdmf_score = classifier.score(X_test, y_test)

# Hold-out accuracy is printed first, training accuracy second.
print(rdmf_score, rdmf_score_tr, sep='\n')

# ***TEST DATA PREDICTION***

In [43]:
# The forest was fitted on MinMax-scaled features, so the submission features
# must pass through the same fitted scaler before prediction; raw values would
# be compared against split thresholds learned on the scaled range.
test_prediction = classifier.predict(scaler_x.transform(df_test))

In [44]:
# Attach the predictions to the untouched test copy and keep only the two
# columns written to the submission file.
submission_columns = ["PassengerId", "Survived"]
df_result = df_result.assign(Survived=test_prediction)[submission_columns]

In [45]:
# Write the submission file to the working directory, without the index column.
df_result.to_csv("submission.csv",index=False)

In [47]:
# Display the submission frame.
df_result