In [2]:
import pandas as pd
import seaborn as sns
import pylab as plt
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [6]:
# Run mode switch:
#   True  -> train on all of train.csv and predict Kaggle's unlabeled test.csv
#   False -> carve a labeled hold-out set out of train.csv for local scoring
kaggle = True
#kaggle = False

datapath = "/home/felix/spiced/02_week/data/"

df = pd.read_csv(datapath + "train.csv")

if kaggle:
    # Submission run: every labeled row is used for training.
    df_train = df
    df_test = pd.read_csv(datapath + "test.csv")
else:
    # Local run: hold out 20 % as test, then 20 % of the remainder as validation.
    df_tmp, df_test = train_test_split(df, test_size=0.2, random_state=12)
    df_train, df_val = train_test_split(df_tmp, test_size=0.2, random_state=12)

def add_title(df):
    """Add a ``title`` column holding the honorific parsed from ``Name``.

    Names are expected to look like ``"Surname, Title. Given names"``. The
    title is the text between the first comma and the period that follows it,
    so a multi-word honorific such as ``"the Countess"`` is kept whole instead
    of being truncated to its first word (``"the"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Name`` column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``title`` column. Names that
        do not match the expected pattern get a missing title (NaN).
    """
    # Lazy capture plus the surrounding \s* trims whitespace around the title.
    df["title"] = df["Name"].str.extract(r",\s*([^.]+?)\s*\.", expand=False)
    return df

def onehot_titles(df):
    """One-hot encode the ``title`` column after grouping rare titles.

    Titles are first normalised ('Mme' -> 'Mrs', 'Mlle'/'Ms' -> 'Miss') and
    every title outside the map is collapsed into ``"other"``. The grouped
    title is then expanded into indicator columns.

    The set of categories is fixed, so the result always contains exactly the
    columns ``Miss``, ``Mr``, ``Mrs`` and ``other`` (``Master`` is the dropped
    baseline), regardless of which titles happen to occur in ``df``. This
    keeps train and test frames column-compatible.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``title`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` joined with the four indicator columns.
    """
    title_map_dict = {'Mme': 'Mrs',
                      'Mlle': 'Miss',
                      'Ms': 'Miss',
                      'Mr': 'Mr',
                      'Mrs': 'Mrs',
                      'Miss': 'Miss',
                      'Master': 'Master'}
    # Order matters: the first category is the baseline removed by drop_first.
    title_categories = ['Master', 'Miss', 'Mr', 'Mrs', 'other']

    # Anything the map does not know (Dr, Rev, Col, ...) becomes "other".
    newtitle = df['title'].map(title_map_dict).fillna('other')
    # A categorical with explicit categories makes get_dummies emit a column
    # for every category, including ones absent from this particular frame.
    newtitle = pd.Series(
        pd.Categorical(newtitle, categories=title_categories),
        index=df.index,
    )
    onehot = pd.get_dummies(newtitle, drop_first=True)
    # Use plain string labels so the joined frame keeps an ordinary column index.
    onehot.columns = [str(col) for col in onehot.columns]
    return df.join(onehot)

def fill_Agenans_from_title(df, df_train=None):
    """Add ``new_Age``: ``Age`` with gaps filled by the mean age per title.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``title`` and ``Age`` columns. Modified in place.
    df_train : pandas.DataFrame, optional
        Reference frame (with ``title`` and ``Age``) from which the per-title
        mean ages are computed. When omitted, ``df`` itself is used and a
        notice is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``new_Age`` column added.
    """
    if df_train is None:
        print("training df is not given. Use df for means")
        df_train = df
    agemeans_from_title = df_train.groupby('title')['Age'].mean()
    # A title that is absent from the reference frame (or whose ages are all
    # missing there) has no usable group mean; fall back to the overall
    # reference mean so no NaN is left behind in new_Age.
    fallback_age = df_train['Age'].mean()
    title_age = df['title'].map(agemeans_from_title)
    df["new_Age"] = df['Age'].fillna(title_age).fillna(fallback_age)
    return df

# define functions for preprocessing (encoding and na-replacement)

def encode_sex(df):
    """Add a binary ``male`` column: 1 where ``Sex == "male"``, otherwise 0.

    The comparison is explicit rather than derived from dummy columns, so the
    result does not depend on which sexes occur in ``df`` (a frame containing
    a single sex is encoded correctly). Missing ``Sex`` values are encoded
    as 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Sex`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``male`` column added.
    """
    df["male"] = (df["Sex"] == "male").astype(int)
    return df

def add_Pclass_sex(df):
    """Add a ``class_sex`` label such as ``"3_m"`` or ``"1_f"``.

    The label is the passenger class rendered as text, an underscore, and the
    first character of ``Sex``. ``df`` is modified in place and returned.
    """
    pclass_text = df["Pclass"].apply(str)
    sex_initial = df["Sex"].str.get(0)
    df["class_sex"] = pclass_text + "_" + sex_initial
    return df

def fillna_groups(df, groupmean=None):
    """Add ``new_Age``: ``Age`` with gaps filled by the mean of its group.

    Groups are identified by the ``class_sex`` column (passenger class
    combined with sex).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``class_sex`` and ``Age`` columns. Modified in place.
    groupmean : mapping or pandas.Series, optional
        Mean age per ``class_sex`` value. When omitted, the means are derived
        from ``df`` itself. For test data, pass the means obtained from the
        training data.

    Returns
    -------
    tuple
        ``(df, groupmean)``: the frame with ``new_Age`` added and the group
        means that were applied.
    """
    if groupmean is None:
        groupmean = df.groupby("class_sex")["Age"].mean()
    age_by_group = df["class_sex"].map(groupmean)
    df["new_Age"] = df["Age"].fillna(age_by_group)
    return df, groupmean

def preprocessing(df, trainmean=None):
    """Run the class/sex based pipeline on ``df``.

    Steps: encode sex, build the ``class_sex`` label, then fill missing ages
    from the per-group means. Pass the training group means as ``trainmean``
    when processing test data; with ``None`` they are derived from ``df``.

    Returns ``(df, groupmean)`` exactly as produced by ``fillna_groups``.
    """
    labelled = add_Pclass_sex(encode_sex(df))
    return fillna_groups(labelled, groupmean=trainmean)

def preprocessing2(df, train_df=None):
    """Run the title-based pipeline on ``df``.

    Steps: encode sex, extract the title from ``Name``, one-hot encode the
    grouped title, then fill missing ages with the per-title mean age.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger frame to process.
    train_df : pandas.DataFrame, optional
        Reference frame supplying the per-title mean ages. It must carry
        ``title`` and ``Age`` columns by the time the ages are filled.
        When omitted, the module-level ``df_train`` is used, which is the
        behaviour of the original single-argument version.

    Returns
    -------
    pandas.DataFrame
        The processed frame including ``male``, ``title``, the title
        indicator columns and ``new_Age``.
    """
    if train_df is None:
        # Backward-compatible default: fall back to the notebook-level
        # training frame instead of requiring every caller to pass it.
        train_df = df_train
    df = encode_sex(df)
    df = add_title(df)
    df = onehot_titles(df)
    df = fill_Agenans_from_title(df, train_df)
    return df



# Alternative pipeline (disabled): fill Age from Pclass/Sex group means,
# reusing the means derived from the training frame for the test frame.
#df_train, trainmean = preprocessing(df_train)
#df_test ,       dum = preprocessing(df_test, trainmean)

# Active pipeline: title-based features and title-based age filling.
# Order matters: preprocessing2 reads the module-level df_train for its
# per-title age means, so the training frame is processed first.
df_train = preprocessing2(df_train)
df_test  = preprocessing2(df_test)

In [7]:
# Show the columns of the processed training frame.
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'male', 'title', 'Miss',
       'Mr', 'Mrs', 'other', 'new_Age'],
      dtype='object')

In [8]:
# Define the train and test feature matrices.
# A single feature list guarantees both matrices use the same columns in the
# same order.
feature_cols = ['male', 'Pclass', 'new_Age', 'SibSp', 'Miss', 'Mr', 'Mrs', 'other']
Xtrain = df_train[feature_cols]
Xtest = df_test[feature_cols]

ytrain = df_train["Survived"]
if not kaggle:
    # Labels for the hold-out set only exist in local (non-Kaggle) mode.
    ytest = df_test["Survived"]

# n_estimators is the number of decision trees. random_state pins the
# forest's random sampling so a rerun reproduces the same model and scores.
m = RandomForestClassifier(max_depth=3, n_estimators=1000, random_state=12)

m.fit(Xtrain, ytrain)

train_score = m.score(Xtrain, ytrain)

print("Training score :" , train_score)
if not kaggle:
    test_score = m.score(Xtest, ytest)
    print("Testing  score :" , test_score)
else:
    # Prepare the result file for submission to Kaggle: one row per test
    # passenger, indexed by PassengerId, with the predicted Survived label.
    pred = m.predict(Xtest)
    df_result = pd.DataFrame(
        {"Survived": pred},
        index=pd.Index(df_test["PassengerId"], name="PassengerId"),
    )
    df_result.to_csv("result_RF2.csv")

Training score : 0.8170594837261503
