In [17]:
import numpy as np
import scipy
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

TITANIC_PATH = "datasets/Titanic/train.csv"


import warnings
# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

def load_titanic_data(titanic_path = TITANIC_PATH):
    """Read a Titanic CSV file into a DataFrame.

    Parameters
    ----------
    titanic_path : str or path-like, default TITANIC_PATH
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv``.
    """
    # Read from the argument rather than the module constant, so callers can
    # actually point the loader at a different file.
    return pd.read_csv(titanic_path)

tit_data = load_titanic_data()
#print(tit_data.info)

# Separate the labels from the features and shuffle the order of the samples.
SHUFFLE_SEED = 42   # fixed so the shuffle (and the CV folds built on it) repeat across runs
tit_data2 = tit_data.copy()
X_train_df = tit_data2.drop(["Survived", "Name"], axis = 1)
cols = X_train_df.columns

# The permutation length comes from the data instead of a literal 891, so a
# file with a different number of rows neither raises nor silently loses rows.
# A private RandomState keeps the shuffle reproducible without touching the
# global np.random state.
shuffle_index = np.random.RandomState(SHUFFLE_SEED).permutation(len(X_train_df))

# Reorder rows positionally on the DataFrame itself: going through .values and
# back would turn every column into object dtype.
X_train = X_train_df.iloc[shuffle_index].reset_index(drop = True)
y_train = tit_data["Survived"].values[shuffle_index]


# One-hot encode the two string-valued features.
# They are selected by name, matching the name-based drop below; the previous
# positional lookup ([2, 9]) would pick the wrong columns if the column order changed.
string_feats = X_train[["Sex", "Embarked"]]
gd = pd.get_dummies(string_feats, prefix = ['Sex', 'Embarked'])


# Replace the original string columns with their dummies and discard Cabin and
# Ticket. drop() returns a new frame here instead of mutating X_train in place.
X_train = pd.concat(
    [X_train.drop(["Sex", "Embarked", "Cabin", "Ticket"], axis = 1), gd],
    axis = 1,
)


# Replace every NaN with the median of its own column (the original note
# singles out Age), then rebuild a DataFrame carrying the same column labels.
from sklearn.impute import SimpleImputer
imp_med = SimpleImputer(strategy='median', missing_values=np.nan)
X_train_imp = imp_med.fit(X_train).transform(X_train)
X_train = pd.DataFrame(data = X_train_imp, columns = X_train.columns)
print(X_train.columns)

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female',
       'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')


In [27]:

# X_train["fam_size"] = X_train["SibSp"] + X_train["Parch"]
# X_train.drop(["PassengerId", "SibSp", "Parch"], axis = 1, inplace = True)


from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

# Seeded like forest_clf (random_state = 42): an unseeded SGDClassifier gives
# different scores on every run, so the before/after feature experiments logged
# in this cell could not be compared.
sgd_clf = SGDClassifier(random_state = 42)
sgd_scrs = cross_val_score(sgd_clf, X_train, y_train, cv = 3, scoring = "accuracy")
# Out-of-fold predictions for the unscaled model.
# NOTE(review): nothing in the visible cells reads y_train_pred; drop it if no later cell needs it.
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)
print("(SGD non scaled)", sgd_scrs)

# Standardize the features with StandardScaler and score the same classifier again.
# NOTE(review): the scaler is fit on all rows before cross-validation, so each
# validation fold contributes to the scaling statistics; wrapping scaler and
# classifier in a Pipeline would keep the folds separate.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
sgd_scrs_scaled = cross_val_score(sgd_clf, X_train_scaled, y_train, cv = 3, scoring = "accuracy")
print("(SGD scaled)", sgd_scrs_scaled)

# Random forest, scored on the unscaled and then the standardized features.
# n_estimators is pinned to 100 for both runs: the unscaled run previously
# relied on the library default, which was 10 before scikit-learn 0.22, so the
# two scores differed in tree count as well as in scaling.
# cross_val_score fits clones, so one unfitted estimator serves both runs.
forest_clf = RandomForestClassifier(n_estimators = 100, random_state = 42)

# Unscaled features
frst_scrs = cross_val_score(forest_clf, X_train, y_train, cv = 3, scoring = "accuracy")
print("(Random Forest unscaled)", frst_scrs)

# Standardized features (frst_scrs is overwritten and ends up holding these scores)
frst_scrs = cross_val_score(forest_clf, X_train_scaled, y_train, cv = 3, scoring = "accuracy")
print("(Random Forest scaled)", frst_scrs)

# from sklearn import svm
# clf = svm.SVC(kernel='linear', C=1)
# svm_scr = cross_val_score(clf, X_train, y_train, cv = 3, scoring = "accuracy")
# print("(SVM)", svm_scr)

# from sklearn.neighbors import KNeighborsClassifier

# knn_clf = KNeighborsClassifier()
# knn_scrs = cross_val_score(knn_clf, X_train, y_train, cv = 3, scoring = "accuracy", n_jobs = -1)
# print("(knn classifier)", knn_scrs)

# y
# (SGD non scaled) [0.41750842 0.61279461 0.3973064 ]
# (SGD scaled) [0.73063973 0.75757576 0.65656566]
# (Random Forest unscaled) [0.83164983 0.7979798  0.81144781]
# (Random Forest scaled) [0.83501684 0.79461279 0.81481481]

                        
# after removing age and sibsp
# (SGD non scaled) [0.38383838 0.61952862 0.66666667]
# (SGD scaled) [0.4040404  0.72727273 0.76430976]
# (Random Forest unscaled) [0.8013468  0.76094276 0.79124579]
# (Random Forest scaled) [0.8013468  0.76094276 0.79124579]

#after removing passengerid
# (SGD non scaled) [0.75420875 0.71043771 0.65993266]
# (SGD scaled) [0.5993266  0.73737374 0.79461279]
# (Random Forest unscaled) [0.81481481 0.77777778 0.78114478]
# (Random Forest scaled) [0.81481481 0.77777778 0.78114478]


(SGD non scaled) [0.63636364 0.62289562 0.62962963]
(SGD scaled) [0.7979798  0.76767677 0.69360269]
(Random Forest unscaled) [0.81818182 0.8013468  0.82828283]
(Random Forest scaled) [0.83838384 0.7979798  0.81818182]


In [None]:
#print(type(y_train))
# Attach the labels as a named column so the correlation table shows
# "Survived" instead of an anonymous column labelled 0.
# assign() places y_train positionally, row for row, against X_train.
a = X_train.assign(Survived = y_train)
# Pairwise correlations between every feature and the label.
a.corr()

#a["newcol"] = a[]

In [26]:
# Index labels of the rows whose value in `col` is strictly below 10.
col = "Age"

outlier_list_col = X_train.loc[X_train[col].lt(10)].index
outlier_list_col

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
3,298.0,1.0,2.00,1.0,2.0,151.5500,1.0,0.0,0.0,0.0,1.0
58,184.0,2.0,1.00,2.0,1.0,39.0000,0.0,1.0,0.0,0.0,1.0
66,279.0,3.0,7.00,4.0,1.0,29.1250,0.0,1.0,0.0,1.0,0.0
68,446.0,1.0,4.00,0.0,2.0,81.8583,0.0,1.0,0.0,0.0,1.0
83,375.0,3.0,3.00,3.0,1.0,21.0750,1.0,0.0,0.0,0.0,1.0
99,306.0,1.0,0.92,1.0,2.0,151.5500,0.0,1.0,0.0,0.0,1.0
133,59.0,2.0,5.00,1.0,2.0,27.7500,1.0,0.0,0.0,0.0,1.0
145,542.0,3.0,9.00,4.0,2.0,31.2750,1.0,0.0,0.0,0.0,1.0
164,851.0,3.0,4.00,4.0,2.0,31.2750,0.0,1.0,0.0,0.0,1.0
180,778.0,3.0,5.00,0.0,0.0,12.4750,1.0,0.0,0.0,0.0,1.0
