## Experiment #4 - DecisionTreeClassifier 

## Train and Test Data Preprocessing

In [469]:
%matplotlib inline
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (20.0, 10.0)

In [470]:
data = pd.read_csv(r'C:\Users\apaiu\GitHub Repositories\Tekwill-Machine-Learning-Course\Data Sources\titanic_kaggle_train.csv')

In [471]:
test_data = pd.read_csv(r'C:\Users\apaiu\GitHub Repositories\Tekwill-Machine-Learning-Course\Data Sources\titanic_kaggle_test.csv')

In [472]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45.0,0,0,223596,13.5,,S
1,737,0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",female,48.0,1,3,W./C. 6608,34.375,,S
2,370,1,1,"Aubart, Mme. Leontine Pauline",female,24.0,0,0,PC 17477,69.3,B35,C
3,133,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",female,47.0,1,0,A/5. 3337,14.5,,S
4,415,1,3,"Sundman, Mr. Johan Julian",male,44.0,0,0,STON/O 2. 3101269,7.925,,S


In [473]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623 entries, 0 to 622
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  623 non-null    int64  
 1   Survived     623 non-null    int64  
 2   Pclass       623 non-null    int64  
 3   Name         623 non-null    object 
 4   Sex          623 non-null    object 
 5   Age          501 non-null    float64
 6   SibSp        623 non-null    int64  
 7   Parch        623 non-null    int64  
 8   Ticket       623 non-null    object 
 9   Fare         623 non-null    float64
 10  Cabin        135 non-null    object 
 11  Embarked     622 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 58.5+ KB


In [474]:
#Data checks for Train Data
#Checking if there is any null values within the dataframe - THESE COLUMNS HAVE NULLS ['Age', 'Cabin', 'Embarked']
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            122
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          488
Embarked         1
dtype: int64

In [475]:
#Filling NULL values with a constant (Unknown)
data['Age'] = data['Age'].fillna(0)
data['Cabin'] = data['Cabin'].fillna('Unknown')
data['Embarked'] = data['Embarked'].fillna('Unknown')

In [476]:
#No null values present anymore after corrections
data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [477]:
#Checking if there are any duplicated rows - NO DUPLICATES IDENTIFIED
data[data.duplicated()].sum()

PassengerId    0.0
Survived       0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Cabin          0.0
Embarked       0.0
dtype: float64

In [478]:
#Transforming ['Sex'] categorical column into a numeric one
dictt = {"female": 0, "male": 1}
data['Sex'] = data['Sex'].map(dictt)

In [479]:
data['Sex'].value_counts()

1    402
0    221
Name: Sex, dtype: int64

In [480]:
#Transforming [Cabin] & [Embarked] feature into numeric ones:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(data['Cabin'])
# le.classes_
data['Cabin'] = le.transform(data['Cabin']) 

In [481]:
le.fit(data['Embarked'])
data['Embarked'] = le.transform(data['Embarked']) 

In [482]:
data['Embarked'].value_counts(), data['Cabin'].value_counts()

(2    448
 0    121
 1     53
 3      1
 Name: Embarked, dtype: int64,
 106    488
 37       4
 51       3
 100      3
 69       2
       ... 
 53       1
 24       1
 40       1
 52       1
 38       1
 Name: Cabin, Length: 107, dtype: int64)

In [483]:
#Collecting X and Y
y = data['Survived'].values
X = data.drop(['Survived', 'Name', 'Ticket', 'PassengerId'], axis=1).values #Eliminated [Name] and [Ticket], [PassengerId] columns due to irrelevancy

In [484]:
X, y

(array([[  2.    ,   0.    ,  45.    , ...,  13.5   , 106.    ,   2.    ],
        [  3.    ,   0.    ,  48.    , ...,  34.375 , 106.    ,   2.    ],
        [  1.    ,   0.    ,  24.    , ...,  69.3   ,  18.    ,   0.    ],
        ...,
        [  3.    ,   1.    ,  25.    , ...,   0.    , 106.    ,   2.    ],
        [  3.    ,   1.    ,   0.    , ...,   7.8958, 106.    ,   2.    ],
        [  1.    ,   0.    ,  53.    , ...,  51.4792,  38.    ,   2.    ]]),
 array([1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
        1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
        1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
        0, 0, 1, 

## Working with TEST data set

In [485]:
#Data checks
#Checking if there is any null values within the dataframe - THESE COLUMNS HAVE NULLS ['Age', 'Cabin', 'Embarked']
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 0 to 267
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  268 non-null    int64  
 1   Pclass       268 non-null    int64  
 2   Name         268 non-null    object 
 3   Sex          268 non-null    object 
 4   Age          213 non-null    float64
 5   SibSp        268 non-null    int64  
 6   Parch        268 non-null    int64  
 7   Ticket       268 non-null    object 
 8   Fare         268 non-null    float64
 9   Cabin        69 non-null     object 
 10  Embarked     267 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 23.2+ KB


In [486]:
#Checking if there is any null values within the dataframe - THESE COLUMNS HAVE NULLS ['Age', 'Cabin', 'Embarked']
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             55
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          199
Embarked         1
dtype: int64

In [487]:
#Filling NULL values with a constant (Unknown)
test_data['Age'] = test_data['Age'].fillna(0)
test_data['Cabin'] = test_data['Cabin'].fillna('Unknown')
test_data['Embarked'] = test_data['Embarked'].fillna('Unknown')

In [488]:
#No null values present anymore after corrections
test_data.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [489]:
#Checking if there are any duplicated rows - NO DUPLICATES IDENTIFIED
test_data[test_data.duplicated()].sum()

PassengerId    0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Cabin          0.0
Embarked       0.0
dtype: float64

In [490]:
#Transforming ['Sex'] categorical column into a numeric one
dictt = {"female": 0, "male": 1}
test_data['Sex'] = test_data['Sex'].map(dictt)

In [491]:
test_data['Sex'].value_counts()

1    175
0     93
Name: Sex, dtype: int64

In [492]:
#Transforming [Cabin] & [Embarked] feature into numeric ones:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(test_data['Cabin'])
# le.classes_
test_data['Cabin'] = le.transform(test_data['Cabin']) 

In [493]:
le.fit(test_data['Embarked'])
test_data['Embarked'] = le.transform(test_data['Embarked']) 

In [494]:
test_data['Embarked'].value_counts(), test_data['Cabin'].value_counts()

(2    196
 0     47
 1     24
 3      1
 Name: Embarked, dtype: int64,
 63    199
 51      2
 62      2
 37      2
 42      2
      ... 
 17      1
 1       1
 43      1
 6       1
 55      1
 Name: Cabin, Length: 64, dtype: int64)

In [495]:
#Collecting X 
X_test_data = test_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1).values #Eliminated [Name] and [Ticket], [PassengerId] columns due to irrelevancy

In [496]:
X_test_data

array([[ 1.    ,  1.    , 61.    , ..., 32.3208, 47.    ,  2.    ],
       [ 3.    ,  1.    , 26.    , ..., 18.7875, 63.    ,  0.    ],
       [ 3.    ,  1.    , 42.    , ...,  7.65  , 58.    ,  2.    ],
       ...,
       [ 3.    ,  1.    , 20.    , ...,  7.8542, 63.    ,  2.    ],
       [ 1.    ,  0.    , 39.    , ..., 79.65  , 55.    ,  2.    ],
       [ 3.    ,  1.    , 17.    , ...,  8.6625, 63.    ,  2.    ]])

In [497]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

## Decision Tree Classifier Model

In [453]:
#Predicting Results on #Model from Experiment #4:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
model = DecisionTreeClassifier(max_depth=5, random_state=0)

#Training data from train.csv
X_train = X
y_train = y

model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
print("Accuracy in train set: ", accuracy_score(y_train_pred, y_train))

y_test_pred = model.predict(X_test)
print("Accuracy in test: ", accuracy_score(y_test_pred, y_test))

Accuracy in train set:  0.8667736757624398
Accuracy in test:  0.888


In [454]:
y_test_pred = model.predict(X_test_data)

In [455]:
#Creating DataFrame as per expected solution for Kaggle

#Reshape predicted values
y_test_pred_resh = y_test_pred.reshape(y_test_pred.shape[0], -1)

In [463]:
PassengerId = np.array(test_data['PassengerId'])
PassengerId_Reshaped = PassengerId.reshape(PassengerId.shape[0], -1)

In [464]:
#Merge X_test PassengerID & predicted value 
merged_info = np.concatenate((PassengerId_Reshaped, y_test_pred_resh), axis=1)

In [465]:
#Create datamart
final = pd.DataFrame(data = merged_info, columns = ['PassengerId', 'Survived'])

In [466]:
#Converting all columns from float to int
final = final.astype(int)

In [467]:
final

Unnamed: 0,PassengerId,Survived
0,626,1
1,208,0
2,700,0
3,622,1
4,76,0
...,...,...
263,235,0
264,563,0
265,92,0
266,559,1


In [468]:
#Exporting the dataframe to a csv file
final.to_csv('DecisionTreeClassifier_EXP_05.csv', index=False)

## **To do:**
- De vazut cum se mai poate de facut data pre-processing:

     - De vazut cum se poate de facut standardizarea
     - Normalizarea
     - Sklearn pentru data pre-processing features categorice
     
    
- De incercat diferiti algoritmi cu datele cum sunt dupa pre-processarea curenta
- De incercat diferiti algoritmi cu datele cum sunt dupa pre-processarea mai buna + standardizare + normalizare, etc.

## Experiment #6 - Logistic Regression Model

In [499]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, roc_auc_score, plot_roc_curve, classification_report, accuracy_score, confusion_matrix, recall_score, precision_score
import matplotlib.pyplot as plt
import seaborn as sns

scaler = StandardScaler()
lr = LogisticRegression()
model1 = Pipeline([('standardize', scaler),
                    ('log_reg', lr)])

In [500]:
model1.fit(X_train, y_train)

Pipeline(steps=[('standardize', StandardScaler()),
                ('log_reg', LogisticRegression())])

In [501]:
y_train_hat = model1.predict(X_train)
y_train_hat_probs = model1.predict_proba(X_train)[:,1]

train_accuracy = accuracy_score(y_train, y_train_hat)*100
train_auc_roc = roc_auc_score(y_train, y_train_hat_probs)*100

print('Training AUC: %.4f %%' % train_auc_roc)

print('Training accuracy: %.4f %%' % train_accuracy)

print('Recall score: %.4f %%' % (recall_score(y_train, y_train_hat)*100))

print('Precision score: %.4f %%' % (precision_score(y_train, y_train_hat)*100))

Training AUC: 82.0848 %
Training accuracy: 78.1124 %
Recall score: 66.4894 %
Precision score: 73.0994 %


In [502]:
y_test_hat = model1.predict(X_test)
y_test_hat_probs = model1.predict_proba(X_test)[:,1]

test_accuracy = accuracy_score(y_test, y_test_hat)*100
test_auc_roc = roc_auc_score(y_test, y_test_hat_probs)*100

print('Confusion matrix:\n', confusion_matrix(y_test, y_test_hat))

print('Testing AUC: %.4f %%' % test_auc_roc)

print('Testing accuracy: %.4f %%' % test_accuracy) 

print('Recall score: %.4f %%' % (recall_score(y_train, y_train_hat)*100))

print('Precision score: %.4f %%' % (precision_score(y_train, y_train_hat)*100))

Confusion matrix:
 [[63  8]
 [14 40]]
Testing AUC: 89.9844 %
Testing accuracy: 82.4000 %
Recall score: 66.4894 %
Precision score: 73.0994 %


In [506]:
y_test_hat_final = model1.predict(X_test_data)

In [511]:
y_test_pred_resh

array([[0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
    

In [509]:
y_test_pred_resh = y_test_hat_final.reshape(y_test_hat_final.shape[0], -1)

In [512]:
PassengerId = np.array(test_data['PassengerId'])
PassengerId_Reshaped = PassengerId.reshape(PassengerId.shape[0], -1)
merged_info = np.concatenate((PassengerId_Reshaped, y_test_pred_resh), axis=1)
final = pd.DataFrame(data = merged_info, columns = ['PassengerId', 'Survived'])
final = final.astype(int)


In [514]:
final.to_csv('LogisticRegression_EXP_01.csv', index=False)

## Experiment #5 - Data Normalization + Decision Tree Classifier

In [415]:
#Data Normalization
mm_scaler = MinMaxScaler()
X = mm_scaler.fit_transform(X)
X_test = mm_scaler.fit_transform(X_test_data)

In [416]:
y_test_pred = model.predict(X_test)

In [417]:
y_test_pred.shape

(268,)

In [421]:
#Data after normalization
X_test.shape, X_test.shape

((268, 8), (268, 8))

In [736]:
# Experiment #2 - Decission Tree Classifier
#Changed Max_Depth = 10

model2 = DecisionTreeClassifier(max_depth=10, random_state=0)

#Training data from train.csv
X_train = X
y_train = y

model2.fit(X_train, y_train)
y_train_pred = model2.predict(X_train)
print("Accuracy in train set: ", accuracy_score(y_train_pred, y_train))

Accuracy in train set:  0.9582664526484751


In [737]:
#Predicting Results on #Model from Experiment #2:

y_test_pred = model2.predict(X_test)

In [738]:
y_test_pred_resh = y_test_pred.reshape(y_test_pred.shape[0], -1)
merged_info = np.concatenate((X_test[:,[0]], y_test_pred_resh), axis=1)
final = pd.DataFrame(data = merged_info, columns = ['PassengerId', 'Survived'])
final = final.astype(int)

In [739]:
#Exporting the dataframe to a csv file
final.to_csv('DecisionTreeClassifier_EXP_02.csv', index=False)

In [661]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

## Experiment #4 - Decision Tree Classifier + eliminating PassengerID

In [221]:
#Collecting X and Y
y = data['Survived'].values
X = data.drop(['Survived', 'Name', 'Ticket', 'PassengerId'], axis=1).values #Eliminated [Name] and [Ticket] columns due to irrelevancy

In [222]:
X, y

(array([[  2.    ,   0.    ,  45.    , ...,  13.5   , 106.    ,   2.    ],
        [  3.    ,   0.    ,  48.    , ...,  34.375 , 106.    ,   2.    ],
        [  1.    ,   0.    ,  24.    , ...,  69.3   ,  18.    ,   0.    ],
        ...,
        [  3.    ,   1.    ,  25.    , ...,   0.    , 106.    ,   2.    ],
        [  3.    ,   1.    ,   0.    , ...,   7.8958, 106.    ,   2.    ],
        [  1.    ,   0.    ,  53.    , ...,  51.4792,  38.    ,   2.    ]]),
 array([1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
        1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
        1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
        0, 0, 1, 

In [271]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

In [270]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
model4 = DecisionTreeClassifier(max_depth=5, random_state=0)

model4.fit(X_train, y_train)
y_train_pred = model4.predict(X_train)
print("Accuracy in train: ", accuracy_score(y_train_pred, y_train))

y_test_pred = model4.predict(X_test)
print("Accuracy in test: ", accuracy_score(y_test_pred, y_test))

Accuracy in train:  0.8654618473895582


ValueError: Found input variables with inconsistent numbers of samples: [268, 125]

In [225]:
#Collecting X from test dataset
X_test = test_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1).values #Eliminated [Name] and [Ticket], [PassengerId] columns due to irrelevancy

In [205]:
X_test

array([[ 1.    ,  1.    , 61.    , ..., 32.3208, 47.    ,  2.    ],
       [ 3.    ,  1.    , 26.    , ..., 18.7875, 63.    ,  0.    ],
       [ 3.    ,  1.    , 42.    , ...,  7.65  , 58.    ,  2.    ],
       ...,
       [ 3.    ,  1.    , 20.    , ...,  7.8542, 63.    ,  2.    ],
       [ 1.    ,  0.    , 39.    , ..., 79.65  , 55.    ,  2.    ],
       [ 3.    ,  1.    , 17.    , ...,  8.6625, 63.    ,  2.    ]])

In [266]:
y_test_pred = model4.predict(X_test)

In [267]:
y_test_pred_resh = y_test_pred.reshape(y_test_pred.shape[0], -1)
merged_info = np.concatenate((X_test[:,[0]], y_test_pred_resh), axis=1)
final = pd.DataFrame(data = merged_info, columns = ['PassengerId', 'Survived'])
final = final.astype(int)

array([0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 0], dtype=int64)

In [262]:
final

Unnamed: 0,PassengerId,Survived
0,1,0
1,3,0
2,3,0
3,1,1
4,3,0
...,...,...
263,2,0
264,2,0
265,3,0
266,1,1


In [252]:
test_data['PassengerId']

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,626,1,"Sutton, Mr. Frederick",1,61.0,0,0,36963,32.3208,47,2
1,208,3,"Albimona, Mr. Nassef Cassem",1,26.0,0,0,2699,18.7875,63,0
2,700,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",1,42.0,0,0,348121,7.6500,58,2
3,622,1,"Kimball, Mr. Edwin Nelson Jr",1,42.0,1,0,11753,52.5542,38,2
4,76,3,"Moen, Mr. Sigurd Hansen",1,25.0,0,0,348123,7.6500,59,2
...,...,...,...,...,...,...,...,...,...,...,...
263,235,2,"Leyson, Mr. Robert William Norman",1,24.0,0,0,C.A. 29566,10.5000,63,2
264,563,2,"Norman, Mr. Robert Douglas",1,28.0,0,0,218629,13.5000,63,2
265,92,3,"Andreasson, Mr. Paul Edvin",1,20.0,0,0,347466,7.8542,63,2
266,559,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",0,39.0,1,1,110413,79.6500,55,2
