### <font color=lime>Train-Test Split Procedure in Scikit-Learn</font>
### <font color=lime>Common split percentages include:</font>
* <font color=pink>Train: 80%, Test: 20%</font>
* <font color=pink>Train: 67%, Test: 33%</font>
* <font color=pink>Train: 50%, Test: 50%</font>

In [None]:
# Split a synthetic dataset into train and test sets, twice, to show that a
# fixed random_state makes train_test_split repeatable: the second call sizes
# the split via the complementary test_size and should give the same rows.
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
# create dataset; seed the generator so the printed rows are identical on
# every fresh run of the notebook
X, y = make_blobs(n_samples=1000, random_state=1)
# split into train test sets (67% train, the remainder is the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.67, random_state=10)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# summarize first 5 rows
print(X_train[:5, :])
# split again with the same seed, and we should see the same split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=10)
# summarize first 5 rows and their labels
print(X_train[:5, :])
print(y_train[0:5])


In [None]:
# Baseline for the stratification comparison: a plain random split of an
# imbalanced dataset, with nothing done to preserve the class ratio.
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from collections import Counter

# generate 100 labelled samples (seeded, so the dataset is repeatable)
X, y = make_classification(
    random_state=1,
    flip_y=0,
    weights=[0.94],
    n_samples=100,
)
# class counts for the whole dataset
print(Counter(y))

# seeded 50/50 random split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.50,
)
# class counts in each half
print(Counter(y_train))
print(Counter(y_test))

In [None]:
# Same imbalanced dataset as the unstratified example, but the split is
# stratified on the labels.
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from collections import Counter

# generate 100 labelled samples (seeded, so the dataset is repeatable)
X, y = make_classification(
    random_state=1,
    flip_y=0,
    weights=[0.94],
    n_samples=100,
)
# class counts for the whole dataset
print(Counter(y))

# seeded 50/50 split; stratify=y passes the labels as the stratification key
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
    test_size=0.50,
)
# class counts in each half
print(Counter(y_train))
print(Counter(y_test))

### <font color=lime>Train-Test Split for Classification</font>

In [None]:
import pandas as pd

# summarize the sonar dataset
from pandas import read_csv
# load dataset; header=None reads the first row as data, not column names.
# Forward slashes keep the relative path valid on every OS and avoid
# backslash escape sequences inside the string literal.
dataframe = read_csv("../CSV/sonar.csv", header=None)
# split into input and output elements: the last column is the output,
# every preceding column is an input
data = dataframe.values
X, y = data[:, :-1], data[:, -1]
print(data.shape, X.shape, y.shape)

In [None]:
# Hold out 33% of the samples as the test set.
# Either test_size or train_size may be given; fixing random_state seeds the
# shuffle, so the same rows land in each subset on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.33,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# train a random forest classifier on the training split
# (random_state=1 seeds the estimator)
model = RandomForestClassifier(random_state=1)
# fit() is the cell's last expression, so the notebook displays its return value
model.fit(X=X_train, y=y_train)

In [None]:
# predict labels for the held-out rows
yhat = model.predict(X=X_test)
# compare predictions against the true held-out labels
acc = accuracy_score(y_true=y_test, y_pred=yhat)
print('Accuracy: %.3f' % acc)

### <font color=lime>Train-Test Split for Regression</font>

In [None]:

# train-test split evaluation random forest on the housing dataset
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


In [None]:
# load the housing dataset; header=None reads the first row as data, not
# column names. Forward slashes keep the relative path valid on every OS and
# avoid backslash escape sequences inside the string literal.
dataframe = pd.read_csv("../CSV/housing.csv", header=None)
# work with the underlying array rather than the DataFrame
housingdata = dataframe.values
print(housingdata.shape)

In [None]:
# separate inputs from output: the last column is the output,
# all preceding columns are inputs
X = housingdata[:, :-1]
y = housingdata[:, -1]
print(housingdata.shape, X.shape, y.shape)

In [None]:
# 67% train / remainder test, seeded for a repeatable partition.
# Note the unpacking order: X_train, X_test, y_train, y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=3,
    train_size=0.67,
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# train a random forest regressor on the training split (seeded estimator)
model = RandomForestRegressor(random_state=1)
model.fit(X=X_train, y=y_train)
# predict outputs for the held-out rows
yhat = model.predict(X=X_test)
# mean absolute error of the predictions against the true held-out outputs
mae = mean_absolute_error(y_true=y_test, y_pred=yhat)
# raw value first, then formatted to three decimals
print(mae)
print('MAE: %.3f' % mae)