# Train Valid Test data split

A skill-building tutorial on how to create a train/validation/test split, by Marcus Mariano.

**For more information about Marcus Mariano: [Web site](https://marcusmariano.github.io/mmariano/)**  


## Import Packages

In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

# Use seaborn's "darkgrid" style as the global plotting theme.
sns.set(style="darkgrid", color_codes=True)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Load dataset

In [2]:
# Load seaborn's "iris" example data set and preview its first five rows.
iris = sns.load_dataset("iris")
iris.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## Split into independent and dependent variables (predictors and class)

In [3]:
# Separate the predictors (X) from the class label (y).
# Columns are selected by name instead of by position so the split does not
# silently pick the wrong columns if the column order of `iris` ever changes.
TARGET_COLUMN = "species"

X = iris.drop(columns=TARGET_COLUMN)  # the four measurement columns
y = iris[TARGET_COLUMN]               # the species label of each sample

# Show both shapes: the number of rows must match.
X.shape, y.shape

((150, 4), (150,))

In [4]:
# Preview the first rows of the predictor table.
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Preview the first rows of the class (species) column.
y.head()

0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
Name: species, dtype: object

In [6]:
# Count the samples per class to check whether the classes are balanced.
y.value_counts()

virginica     50
versicolor    50
setosa        50
Name: species, dtype: int64

## Transform categorical variables into discrete numerical variables

In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode the species names as integer class ids.
# The encoder is fitted on the raw `species` column of `iris` rather than on
# `y` itself: `y` is overwritten below, so fitting on `y` would make a re-run
# of this cell refit the encoder on the already-encoded integers and lose the
# species-name -> id mapping stored in `labelencoder.classes_`.
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(iris["species"])

In [8]:
# List the distinct encoded class labels.
pd.unique(y)

array([0, 1, 2])

## Split into training, validation and test sets

In [34]:
# Split the data set into training, validation and test subsets.
from sklearn.model_selection import train_test_split

# Target share of the whole data set for each subset.
train_ratio = 0.70
valid_ratio = 0.15
test_ratio = 0.15

# Fixed seed so both splits are reproducible.
SEED = 0

# The three shares must cover the whole data set (tolerance because they are floats).
assert abs(train_ratio + valid_ratio + test_ratio - 1.0) < 1e-9, (
    "train_ratio, valid_ratio and test_ratio must sum to 1"
)

# Step 1: hold out the test set.
# After this split the test set is `test_ratio` (15%) of the entire data set
# and the remainder (train + valid, 85%) is split again in step 2.
# NOTE: no stratification is requested, so class proportions may differ
# between subsets; pass `stratify=` to train_test_split to preserve them.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    X,
    y,
    test_size=test_ratio,
    random_state=SEED,
)

# Step 2 splits only the remaining data, and `test_size` is relative to the
# data passed in, so the validation share has to be rescaled:
# 0.15 of the whole set == 0.15 / 0.85 of the remainder.
ratio_remaining = 1 - test_ratio
ratio_val_adjusted = valid_ratio / ratio_remaining

print(
    f"Train+Valid {ratio_remaining:.0%} and Test {test_ratio:.0%} data: "
    f"{(X_train_valid.shape, X_test.shape)} "
    f"{(y_train_valid.shape, y_test.shape)}"
)

# Step 2: split the remainder into the final training and validation sets.
# Train is now ~70% and validation ~15% of the initial data set (subset sizes
# are whole numbers of samples, so the shares are approximate).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid,
    y_train_valid,
    test_size=ratio_val_adjusted,
    random_state=SEED,
)

# Every sample must land in exactly one of the three subsets.
assert len(X_train) + len(X_valid) + len(X_test) == len(X)
assert len(y_train) + len(y_valid) + len(y_test) == len(y)

print(
    f"Train {train_ratio:.0%}, Valid {valid_ratio:.0%} and Test {test_ratio:.0%} data: "
    f"{(X_train.shape, X_valid.shape, X_test.shape)} "
    f"{(y_train.shape, y_valid.shape, y_test.shape)}"
)

Train 70%  and Test 30%  data: ((127, 4), (23, 4)) {(127,), (23,)}
Train 70%, Valid 15% and Test 15% data: ((104, 4), (23, 4), (23, 4)) {(23,), (104,)}


In [16]:
# Realised share of all samples in the training set (target: train_ratio).
len(y_train) / len(y)

0.6933333333333334

In [17]:
# Realised share of all samples in the validation set (target: valid_ratio).
len(y_valid) / len(y)

0.15333333333333332

In [18]:
# Realised share of all samples in the test set (target: test_ratio).
len(y_test) / len(y)

0.15333333333333332