In [42]:
import numpy as np
import scipy
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

TITANIC_PATH = "datasets/Titanic/train.csv"


import warnings
warnings.filterwarnings('ignore')

def load_titanic_data(titanic_path = TITANIC_PATH):
    return pd.read_csv(TITANIC_PATH)

tit_data = load_titanic_data()
#print(tit_data.info)

#separating labels and training, randomizing the sequence of the samples
tit_data2 = tit_data.copy()
X_train_df = (tit_data2.drop(["Survived", "Name"], axis = 1))
X_train = X_train_df.values
y_train = (tit_data["Survived"]).values
shuffle_index = np.random.permutation(891)    #0 to 890
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]


cols = X_train_df.columns
X_train = pd.DataFrame(X_train, columns = cols)


#making dummy variables for the string features
string_feats = X_train.iloc[:, [2, 9]]
gd = pd.get_dummies(string_feats, prefix = ['Sex', 'Embarked'])


#joining dummies and originals
X_train.drop(["Sex", "Embarked", "Cabin", "Ticket"], axis = 1, inplace = True)
X_train = pd.concat([X_train, gd], axis = 1 )


#putting median age in the Nans
from sklearn.impute import SimpleImputer
imp_med = SimpleImputer(missing_values=np.nan, strategy='median')
X_train_imp = imp_med.fit_transform(X_train)
X_train = pd.DataFrame(X_train_imp, columns = X_train.columns)
print(X_train.columns)

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female',
       'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')


In [38]:

# X_train["fam_size"] = X_train["SibSp"] + X_train["Parch"]
# X_train.drop(["PassengerId", "SibSp", "Parch"], axis = 1, inplace = True)


from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

sgd_clf = SGDClassifier()
sgd_scrs = cross_val_score(sgd_clf, X_train, y_train, cv = 3, scoring = "accuracy")
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)
print("(SGD non scaled)", sgd_scrs)

#using standard scalar
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
sgd_scrs_scaled = cross_val_score(sgd_clf, X_train_scaled, y_train, cv = 3, scoring = "accuracy")
print("(SGD scaled)", sgd_scrs_scaled)

#using randomforestclassifier unscaled
forest_clf = RandomForestClassifier(random_state = 42)
frst_scrs = cross_val_score(forest_clf, X_train, y_train, cv = 3, scoring = "accuracy")
print("(Random Forest unscaled)", frst_scrs)

#using randomforestclassifier scaled
forest_clf = RandomForestClassifier(n_estimators = 100, random_state = 42)
frst_scrs = cross_val_score(forest_clf, X_train_scaled, y_train, cv = 3, scoring = "accuracy")
print("(Random Forest scaled)", frst_scrs)


(SGD non scaled) [0.63299663 0.61952862 0.67003367]
(SGD scaled) [0.71380471 0.75420875 0.74410774]
(Random Forest unscaled) [0.81144781 0.82154882 0.81481481]
(Random Forest scaled) [0.8047138  0.83501684 0.81818182]


In [43]:
from collections import Counter
print(X_train.shape)

#removing otliers from the data
def remove_outlier_rows(f_name_array, given_number):
    outlier_cols = []
    
    for one_col in f_name_array:
        P1 = np.percentile(X_train[one_col], 25)
        P2 = np.percentile(X_train[one_col], 75)
        IQR = P2 - P1
        outlier_step = 1.5 * IQR
       
        indices_outlier_rows = X_train[(X_train[one_col] > P2 + outlier_step) | (X_train[one_col] < P1 - outlier_step)].index
        #print(X_train[(X_train[one_col] > P2 + IQR) | (X_train[one_col] < P1 - IQR)])
        outlier_cols.extend(indices_outlier_rows)
       
    check = Counter(outlier_cols)
    
    rows_to_remove = [k for k,v in check.items() if v > given_number]
    #print((rows_to_remove))
    print(X_train.loc[rows_to_remove]) # Show the outliers rows

    X_train.drop(rows_to_remove, inplace = True)

remove_outlier_rows(["Age", "Parch", "SibSp", "Fare"], 2)
print(X_train.shape)

(891, 11)
     PassengerId  Pclass    Age  SibSp  Parch      Fare  Sex_female  Sex_male  \
29           8.0     3.0   2.00    3.0    1.0   21.0750         0.0       1.0   
51         825.0     3.0   2.00    4.0    1.0   39.6875         0.0       1.0   
64         643.0     3.0   2.00    3.0    2.0   27.9000         1.0       0.0   
82         306.0     1.0   0.92    1.0    2.0  151.5500         0.0       1.0   
216        165.0     3.0   1.00    4.0    1.0   39.6875         0.0       1.0   
234        120.0     3.0   2.00    4.0    2.0   31.2750         1.0       0.0   
330        880.0     1.0  56.00    0.0    1.0   83.1583         1.0       0.0   
374        269.0     1.0  58.00    0.0    1.0  153.4625         1.0       0.0   
490        298.0     1.0   2.00    1.0    2.0  151.5500         1.0       0.0   
597        387.0     3.0   1.00    5.0    2.0   46.9000         0.0       1.0   
603         17.0     3.0   2.00    4.0    1.0   29.1250         0.0       1.0   
651        588.0  

In [4]:
from collections import Counter


def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    print(multiple_outliers)
    return multiple_outliers   

# detect outliers from Age, SibSp , Parch and Fare
Outliers_to_drop = detect_outliers(X_train,2,["Age","SibSp","Parch","Fare"])