In [1]:
# How To Split data into test and train data

from sklearn import datasets
from sklearn.model_selection import train_test_split

# Load the dataset in as NumPy array

iris = datasets.load_iris()
features = iris.data
labels = iris.target 
print("The length of the entire(Training + Test) array is: ", len(features))

# Split the dataset

features_train, features_test, labels_train, labels_test  = train_test_split(
    features,
    labels,
    test_size = 0.5,
    random_state = 1, # same split or shuffle every time I run the code
    stratify = labels  # keeps the classes balance
)

print("The length of the input array is: ", len(features_train))
#print(features_test)


The length of the entire(Training + Test) array is:  150
The length of the input array is:  75


In [None]:
# Spliting data exercise

from sklearn import datasets
from sklearn.model_selection import train_test_split

# Load in the dataset as a NumPy array

wine = datasets.load_wine()


# Split the dataset into train and test

features = wine.data
labels = wine.target

features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size = 0.7,  # 70% for train and 30% for test
    random_state = 1, # split in the same order or shuffle
    stratify = labels # divide  in the same proportion
)

print("The length of the original data is: ", len(labels))
print("The length of the labels train data is: ", len(labels_train))



In [None]:
# Spliting dataset into training and test with different percentages
# for a regression example
# for different test sizes


from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Load the dataset in
housing =  fetch_california_housing()
 # using a regression dataset
 

# Split the dataset
features = housing.data
labels = housing.target

for size in [0.2, 0.35, 0.75]:
     
    features_train, features_test, labels_train, labels_test = train_test_split(
        features,
        labels,
        test_size = size, # 
        random_state = 1
        
    #stratify = labels  , only works for a classification problem
         
)

    print(f"\nFor test size: {size}")
    print("Total data points: ", len(labels))
    
    print("Training data points: ", len(features_train))
    
    print("Test data points: ", len(features_test))

In [15]:
# Showing a table for different sizes 
# Another regression dataset


from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd  # because we need dataframe for making a table


# Load the dataset in

cancer = datasets.load_breast_cancer()
features = cancer.data
labels = cancer.target

print(len(cancer))



# Split the dataset for five different sizes
for size in [0.1, 0.3, 0.5, 0.8, 0.95]:
    

    features_train, features_test, labels_train, labels_test = train_test_split(
        features, 
        labels,
        test_size = size,
        random_state = 1
)
    results = [] # to create an empty list
    
    #  Save these data in a dictionary format
    #  each dictionary equals one row of table

    results.append({
        "Total Size " : len(labels),
        "Test Size " : size,
        "Training Size " : len(features_train),
        "Test Size " : len(features_test)
    })
    df = pd.DataFrame(results)
    print(df)
    

    # Convert the dictionary to a dataframe so we can display it as a table
      
    


    



8
   Total Size   Test Size   Training Size 
0          569          57             512
   Total Size   Test Size   Training Size 
0          569         171             398
   Total Size   Test Size   Training Size 
0          569         285             284
   Total Size   Test Size   Training Size 
0          569         456             113
   Total Size   Test Size   Training Size 
0          569         541              28


In [20]:
# ANOTHER EX WITH WINE DATA
# X IS THE INPUT, Y IS THE OUTPUT

from sklearn.model_selection import train_test_split
from sklearn import datasets

# load the datasets as a dataframe

wine = datasets.load_wine(as_frame=True)
# print(wine.frame)

# pick X as the input(features) and Y as the output(targets)

X = wine.frame.drop(columns = ['target'])  # remove target from the  input

#  print(X)
Y = wine.frame['target']
# print(Y)

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size = 0.45,
    random_state = 1,
    stratify = Y  # keep the same proportion or class
)

# check shape  i.e (rows=samples, columns = features) for a DF and just rows for a Series

print(X_train.shape) 
print(Y_test.shape)
 



(97, 13)
(81,)
