In [97]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter

In [98]:
train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')

In [99]:
survived = train['Survived']
train.drop(['Survived'], axis=1, inplace=True)

In [100]:
def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    return multiple_outliers   

# detect outliers from Age, SibSp , Parch and Fare
outliers_to_drop = detect_outliers(train,2,["Age","SibSp","Parch","Fare"])
train = train.drop(outliers_to_drop, axis = 0).reset_index(drop=True)

  interpolation=interpolation)


In [101]:
ntrain = train.shape[0]
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data = all_data.fillna(np.nan)
all_data = all_data

In [102]:
all_data["Fare"] = all_data["Fare"].map(lambda i: np.log(i) if i > 0 else 0)
all_data["Embarked"] = all_data["Embarked"].fillna("S")
all_data["Sex"] = all_data["Sex"].map({"male": 0, "female":1})

index_NaN_age = list(all_data["Age"][all_data["Age"].isnull()].index)

for i in index_NaN_age :
    age_med = all_data["Age"].median()
    age_pred = all_data["Age"][((all_data['SibSp'] == all_data.iloc[i]["SibSp"]) & (all_data['Parch'] == all_data.iloc[i]["Parch"]) & (all_data['Pclass'] == all_data.iloc[i]["Pclass"]))].median()
    if not np.isnan(age_pred) :
        all_data['Age'].iloc[i] = age_pred
    else :
        all_data['Age'].iloc[i] = age_med

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [103]:
dataset_title = [i.split(",")[1].split(".")[0].strip() for i in all_data["Name"]]
all_data["Title"] = pd.Series(dataset_title)

all_data["Title"] = all_data["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
all_data["Title"] = all_data["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
all_data["Title"] = all_data["Title"].astype(int)

In [104]:
all_data.drop(labels = ["Name"], axis = 1, inplace = True)
all_data["Fsize"] = all_data["SibSp"] + all_data["Parch"] + 1

In [105]:
all_data['Single'] = all_data['Fsize'].map(lambda s: 1 if s == 1 else 0)
all_data['SmallF'] = all_data['Fsize'].map(lambda s: 1 if  s == 2  else 0)
all_data['MedF'] = all_data['Fsize'].map(lambda s: 1 if 3 <= s <= 4 else 0)
all_data['LargeF'] = all_data['Fsize'].map(lambda s: 1 if s >= 5 else 0)

In [106]:
all_data = pd.get_dummies(all_data, columns = ["Title"])
all_data = pd.get_dummies(all_data, columns = ["Embarked"], prefix="Em")

In [107]:
all_data["Cabin"] = pd.Series([i[0] if not pd.isnull(i) else 'X' for i in all_data['Cabin'] ])
all_data = pd.get_dummies(all_data, columns = ["Cabin"],prefix="Cabin")

In [108]:
Ticket = []
for i in list(all_data.Ticket):
    if not i.isdigit() :
        Ticket.append(i.replace(".","").replace("/","").strip().split(' ')[0]) #Take prefix
    else:
        Ticket.append("X")
        
all_data["Ticket"] = Ticket

In [109]:
all_data = pd.get_dummies(all_data, columns = ["Ticket"], prefix="T")

In [110]:
all_data["Pclass"] = all_data["Pclass"].astype("category")
all_data = pd.get_dummies(all_data, columns = ["Pclass"],prefix="Pc")

In [111]:
train = all_data[:ntrain]
test = all_data[ntrain:]

In [112]:
train['Survived'] = survived

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [113]:
train.to_csv('titanic/clean_train.csv', index=False)
test.to_csv('titanic/clean_test.csv', index=False)