# Cross Validation - Train Test Split

Performing model cross-validation with scikit-learn, starting with a simple train/test split.

In [1]:
import sklearn
import pandas as pd
import numpy as np

In [2]:
# Load the full Titanic training set into a DataFrame with a single call.
# With its default settings, read_csv uses the first row of the file as the column names.
df = pd.read_csv(filepath_or_buffer="data/titanic/train.csv")

## Train Test Split

In [3]:
from sklearn.model_selection import train_test_split

### Sometimes we separate features `X` and labels `y` first, then split

In [8]:
# Labels: the 'Survived' column on its own.
y = df['Survived']
# Features: every remaining column, i.e. the frame without 'Survived'.
X = df.drop(columns=['Survived'])

In [9]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1278,
)

In [10]:
# Show the shape of each feature matrix followed by the shape of its label vector.
print("Training data:", X_train.shape, y_train.shape)
print("Test data:", X_test.shape, y_test.shape)

Training data: (623, 11) (623,)
Test data: (268, 11) (268,)


In [11]:
# Shape of the training feature matrix as a (rows, columns) tuple.
X_train.shape

(623, 11)

In [12]:
# Shape of the test feature matrix as a (rows, columns) tuple.
X_test.shape

(268, 11)

In [13]:
# Shape of the training labels; one-dimensional, so a single-element tuple.
y_train.shape

(623,)

In [14]:
# Shape of the test labels; one-dimensional, so a single-element tuple.
y_test.shape

(268,)

### Sometimes we train-test split the data first, then separate features and labels

In [15]:
# Split the complete DataFrame (features and label column together),
# holding out 30% of the rows for testing; random_state fixes the shuffle.
df_train, df_test = train_test_split(
    df,
    test_size=0.30,
    random_state=1278,
)

In [16]:
# Separate each split into labels (the 'Survived' column) and features (all other columns).
y_train = df_train['Survived']
X_train = df_train.drop(columns=['Survived'])
y_test = df_test['Survived']
X_test = df_test.drop(columns=['Survived'])

In [17]:
# Show the shape of each feature matrix followed by the shape of its label vector.
print("Training data:", X_train.shape, y_train.shape)
print("Test data:", X_test.shape, y_test.shape)

Training data: (623, 11) (623,)
Test data: (268, 11) (268,)
