In [294]:
import numpy as np
import scipy
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

TITANIC_PATH = "datasets/Titanic/train.csv"


def load_titanic_data(titanic_path = TITANIC_PATH):
    return pd.read_csv(TITANIC_PATH)

tit_data = load_titanic_data()
#print(tit_data.info)

#separating labels and training, randomizing the sequence of the samples
tit_data2 = tit_data.copy()
X_train_df = (tit_data2.drop(["Survived", "Name"], axis = 1))
X_train = X_train_df.values
y_train = (tit_data["Survived"]).values
shuffle_index = np.random.permutation(891)    #0 to 890
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]


cols = X_train_df.columns
X_train = pd.DataFrame(X_train, columns = cols)


#making dummy variables for the string features
string_feats = X_train.iloc[:, [2, 9]]
gd = pd.get_dummies(string_feats, prefix = ['Sex', 'Embarked'])


#joining dummies and originals
X_train.drop(["Sex", "Embarked", "Cabin", "Ticket"], axis = 1, inplace = True)
X_train = pd.concat([X_train, gd], axis = 1 )


#putting median age in the Nans
from sklearn.impute import SimpleImputer
imp_med = SimpleImputer(missing_values=np.nan, strategy='median')
X_train_imp = imp_med.fit_transform(X_train)
X_train = pd.DataFrame(X_train_imp, columns = X_train.columns)
print(X_train.columns)

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female',
       'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')


In [296]:
X_train.drop(["PassengerId"], axis = 1, inplace = True)


from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

sgd_clf = SGDClassifier()
sgd_scrs = cross_val_score(sgd_clf, X_train, y_train, cv = 3, scoring = "accuracy")
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)
print("(SGD non scaled)", sgd_scrs)

#using standard scalar
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
sgd_scrs_scaled = cross_val_score(sgd_clf, X_train_scaled, y_train, cv = 3, scoring = "accuracy")
print("(SGD scaled)", sgd_scrs_scaled)

#using randomforestclassifier unscaled
forest_clf = RandomForestClassifier(random_state = 42)
frst_scrs = cross_val_score(forest_clf, X_train, y_train, cv = 3, scoring = "accuracy")
print("(Random Forest unscaled)", frst_scrs)

#using randomforestclassifier scaled
forest_clf = RandomForestClassifier(random_state = 42)
frst_scrs = cross_val_score(forest_clf, X_train_scaled, y_train, cv = 3, scoring = "accuracy")
print("(Random Forest scaled)", frst_scrs)


# (SGD non scaled) [0.41750842 0.61279461 0.3973064 ]
# (SGD scaled) [0.73063973 0.75757576 0.65656566]
# (Random Forest unscaled) [0.83164983 0.7979798  0.81144781]
# (Random Forest scaled) [0.83501684 0.79461279 0.81481481]

                        
# after removing age and sibsp
# (SGD non scaled) [0.38383838 0.61952862 0.66666667]
# (SGD scaled) [0.4040404  0.72727273 0.76430976]
# (Random Forest unscaled) [0.8013468  0.76094276 0.79124579]
# (Random Forest scaled) [0.8013468  0.76094276 0.79124579

#after removing passengerid
# (SGD non scaled) [0.75420875 0.71043771 0.65993266]
# (SGD scaled) [0.5993266  0.73737374 0.79461279]
# (Random Forest unscaled) [0.81481481 0.77777778 0.78114478]
# (Random Forest scaled) [0.81481481 0.77777778 0.78114478]


(SGD non scaled) [0.75420875 0.71043771 0.65993266]
(SGD scaled) [0.5993266  0.73737374 0.79461279]
(Random Forest unscaled) [0.81481481 0.77777778 0.78114478]
(Random Forest scaled) [0.81481481 0.77777778 0.78114478]




In [291]:
print(type(y_train))
a = pd.concat([X_train, pd.Series(y_train)], axis = 1)
print(a)
a.corr()

<class 'numpy.ndarray'>
     PassengerId  Pclass   Age  SibSp  Parch      Fare  Sex_female  Sex_male  \
0          317.0     2.0  24.0    1.0    0.0   26.0000         1.0       0.0   
1          584.0     1.0  36.0    0.0    0.0   40.1250         0.0       1.0   
2          293.0     2.0  36.0    0.0    0.0   12.8750         0.0       1.0   
3           49.0     3.0  28.0    2.0    0.0   21.6792         0.0       1.0   
4          873.0     1.0  33.0    0.0    0.0    5.0000         0.0       1.0   
5          710.0     3.0  28.0    1.0    1.0   15.2458         0.0       1.0   
6           33.0     3.0  28.0    0.0    0.0    7.7500         1.0       0.0   
7          395.0     3.0  24.0    0.0    2.0   16.7000         1.0       0.0   
8          615.0     3.0  35.0    0.0    0.0    8.0500         0.0       1.0   
9          113.0     3.0  22.0    0.0    0.0    8.0500         0.0       1.0   
10         624.0     3.0  21.0    0.0    0.0    7.8542         0.0       1.0   
11         440.0

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,0
PassengerId,1.0,-0.035144,0.034212,-0.057527,-0.001652,0.012658,-0.042939,0.042939,-0.001205,-0.033606,0.022148,-0.005007
Pclass,-0.035144,1.0,-0.339898,0.083081,0.018443,-0.5495,-0.1319,0.1319,-0.243292,0.221009,0.08172,-0.338481
Age,0.034212,-0.339898,1.0,-0.233296,-0.172482,0.096688,-0.081163,0.081163,0.030248,-0.031415,-0.014665,-0.06491
SibSp,-0.057527,0.083081,-0.233296,1.0,0.414838,0.159651,0.114631,-0.114631,-0.059528,-0.026354,0.070941,-0.035322
Parch,-0.001652,0.018443,-0.172482,0.414838,1.0,0.216225,0.245489,-0.245489,-0.011069,-0.081228,0.063036,0.081629
Fare,0.012658,-0.5495,0.096688,0.159651,0.216225,1.0,0.182333,-0.182333,0.269335,-0.117216,-0.166603,0.257307
Sex_female,-0.042939,-0.1319,-0.081163,0.114631,0.245489,0.182333,1.0,-1.0,0.082853,0.074115,-0.125722,0.543351
Sex_male,0.042939,0.1319,0.081163,-0.114631,-0.245489,-0.182333,-1.0,1.0,-0.082853,-0.074115,0.125722,-0.543351
Embarked_C,-0.001205,-0.243292,0.030248,-0.059528,-0.011069,0.269335,0.082853,-0.082853,1.0,-0.148258,-0.778359,0.16824
Embarked_Q,-0.033606,0.221009,-0.031415,-0.026354,-0.081228,-0.117216,0.074115,-0.074115,-0.148258,1.0,-0.496624,0.00365


In [245]:
#a = range(10)
