# Train, test, and validate using train_test_split


In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm




In [2]:
# Load the iris data as a (features, labels) pair and show both shapes.
X, y = datasets.load_iris(return_X_y=True)
X.shape, y.shape


((150, 4), (150,))

In [3]:
# Hold out 40% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0
)

In [4]:

 # Shapes of the training features and training labels.
 X_train.shape, y_train.shape

((90, 4), (90,))

In [5]:
# Shapes of the held-out test features and test labels.
X_test.shape, y_test.shape

((60, 4), (60,))

In [6]:
# Train a linear-kernel SVM on the training split, then score it on the
# held-out split.
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9666666666666667

# Computing cross-validated metrics

In [7]:
from sklearn.model_selection import cross_val_score

# Score a linear-kernel SVC with 5-fold cross-validation on the full X, y.
clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(
    clf, X, y,
    cv=5,
)
scores

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [8]:
# Summarise the per-fold scores by their mean and standard deviation.
mean_score = scores.mean()
score_spread = scores.std()
print("%0.2f accuracy with a standard deviation of %0.2f" % (mean_score, score_spread))

0.98 accuracy with a standard deviation of 0.02


In [9]:
from sklearn import metrics

# Same 5-fold evaluation, scored with macro-averaged F1 via the `scoring` name.
scores = cross_val_score(
    clf, X, y,
    cv=5,
    scoring='f1_macro',
)
scores

array([0.96658312, 1.        , 0.96658312, 0.96658312, 1.        ])

In [10]:
from sklearn.model_selection import ShuffleSplit

# Row count of X (not referenced again by the cells visible in this notebook).
n_samples = X.shape[0]
# Five random splits, each holding out 30% of the rows, with a fixed seed.
cv = ShuffleSplit(
    n_splits=5,
    test_size=0.3,
    random_state=0,
)
cross_val_score(clf, X, y, cv=cv)

array([0.97777778, 0.97777778, 1.        , 0.95555556, 1.        ])

In [11]:
def custom_cv_2folds(X):
    """Yield two (train, test) index pairs, one for each contiguous half of X.

    The row count is taken from ``X.shape[0]``. Each yielded pair contains the
    same index array twice, so a fold is evaluated on exactly the rows it was
    fitted on.
    """
    n_rows = X.shape[0]
    for fold_no in (1, 2):
        # The bounds are floats; ``dtype=int`` turns them into integer indices.
        half = np.arange(n_rows * (fold_no - 1) / 2, n_rows * fold_no / 2, dtype=int)
        yield half, half

# Create the generator and hand it to cross_val_score as the `cv` argument;
# it supplies the (train, test) index pairs for each of the two folds.
custom_cv = custom_cv_2folds(X)
cross_val_score(clf, X, y, cv=custom_cv)

array([1.        , 0.97333333])

In [12]:
from sklearn import preprocessing

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0
)

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

clf = svm.SVC(C=1)
clf.fit(X_train_transformed, y_train)
clf.score(X_test_transformed, y_test)

0.9333333333333333

In [13]:
from sklearn.pipeline import make_pipeline

# Chain the scaler and the classifier into one estimator and cross-validate
# it with the `cv` splitter defined earlier.
clf = make_pipeline(
    preprocessing.StandardScaler(),
    svm.SVC(C=1),
)
cross_val_score(clf, X, y, cv=cv)

array([0.97777778, 0.93333333, 0.95555556, 0.93333333, 0.97777778])

In [14]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

# Evaluate two metrics in one cross_validate call; the result is indexed by
# key, with one entry per metric plus timing entries.
scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(clf, X, y, scoring=scoring)

# Check the returned keys explicitly rather than leaving the expected list
# as a bare expression whose value is silently discarded.
assert sorted(scores.keys()) == [
    'fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro'
]
scores['test_recall_macro']

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [15]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

# Same multi-metric cross_validate run as the previous cell: two metrics are
# evaluated in one call and the result is indexed by key.
scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(clf, X, y, scoring=scoring)

# Check the returned keys explicitly rather than leaving the expected list
# as a bare expression whose value is silently discarded.
assert sorted(scores.keys()) == [
    'fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro'
]
scores['test_recall_macro']

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [16]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

# Same multi-metric cross_validate run as the previous cell: two metrics are
# evaluated in one call and the result is indexed by key.
scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(clf, X, y, scoring=scoring)

# Check the returned keys explicitly rather than leaving the expected list
# as a bare expression whose value is silently discarded.
assert sorted(scores.keys()) == [
    'fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro'
]
scores['test_recall_macro']

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

# Obtaining predictions by cross-validation

# K-fold cross-validation

In [17]:
import numpy as np
from sklearn.model_selection import KFold

X = ["a", "b", "c", "d"]
kf = KFold(n_splits=2)

# Print the train and test index arrays produced for each of the two folds.
for fold in kf.split(X):
    train, test = fold
    print("%s %s" % (train, test))

[2 3] [0 1]
[0 1] [2 3]


In [18]:
X = np.array([[0., 0.], [1., 1.], [-1., -1.], [2., 2.]])
y = np.array([0, 1, 0, 1])

# Take the final KFold split explicitly instead of relying on `train` / `test`
# loop variables left behind by an earlier cell. `kf` is the two-fold splitter
# created above, so this yields the same indices that loop ended on.
train, test = list(kf.split(X))[-1]

# Index the arrays with the split's positions to build the train/test sets.
X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

# Group Shuffle Split


In [19]:
from sklearn.model_selection import GroupShuffleSplit

X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001]
y = ["a", "b", "b", "b", "c", "c", "c", "a"]
groups = [1, 1, 2, 2, 3, 3, 4, 4]

# Four seeded splits with half of the data requested for testing; `groups`
# is passed so the splitter can take group membership into account.
gss = GroupShuffleSplit(
    n_splits=4,
    test_size=0.5,
    random_state=0,
)
for fold in gss.split(X, y, groups=groups):
    train, test = fold
    print("%s %s" % (train, test))

[0 1 2 3] [4 5 6 7]
[2 3 6 7] [0 1 4 5]
[2 3 4 5] [0 1 6 7]
[4 5 6 7] [0 1 2 3]


# Using cross-validation iterators to split train and test

In [20]:
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

X = np.array([0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001])
y = np.array(["a", "b", "b", "b", "c", "c", "c", "a"])
groups = np.array([1, 1, 2, 2, 3, 3, 4, 4])

# Only the first split produced by the seeded splitter is used.
splitter = GroupShuffleSplit(random_state=7)
train_indx, test_indx = next(splitter.split(X, y, groups))

X_train, X_test = X[train_indx], X[test_indx]
y_train, y_test = y[train_indx], y[test_indx]


In [21]:

# Distinct group ids that landed in the training side and in the test side.
np.unique(groups[train_indx]), np.unique(groups[test_indx])


(array([1, 2, 4]), array([3]))

In [22]:
# Sizes of the resulting training and test feature arrays.
X_train.shape, X_test.shape

((6,), (2,))

# Train/test split on the King County house data

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [24]:
# Read the King County house CSV and keep only the columns of interest.
url = 'https://raw.githubusercontent.com/mGalarnyk/Tutorial_Data/master/King_County/kingCountyHouseData.csv'
columns = ['bedrooms','bathrooms','sqft_living','sqft_lot','floors','price']
df = pd.read_csv(url).loc[:, columns]
df.head(10)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,price
0,3,1.0,1180,5650,1.0,221900.0
1,3,2.25,2570,7242,2.0,538000.0
2,2,1.0,770,10000,1.0,180000.0
3,4,3.0,1960,5000,1.0,604000.0
4,3,2.0,1680,8080,1.0,510000.0
5,4,4.5,5420,101930,1.0,1225000.0
6,3,2.25,1715,6819,2.0,257500.0
7,3,1.5,1060,9711,1.0,291850.0
8,3,1.0,1780,7470,1.0,229500.0
9,3,2.5,1890,6560,2.0,323000.0


In [25]:
# Feature columns go into X; the target stays a one-column DataFrame in y.
features = ['bedrooms','bathrooms','sqft_living','sqft_lot','floors']
X = df[features]
y = df[['price']]

In [26]:
# Display the feature frame.
X

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors
0,3,1.00,1180,5650,1.0
1,3,2.25,2570,7242,2.0
2,2,1.00,770,10000,1.0
3,4,3.00,1960,5000,1.0
4,3,2.00,1680,8080,1.0
...,...,...,...,...,...
21608,3,2.50,1530,1131,3.0
21609,4,2.50,2310,5813,2.0
21610,2,0.75,1020,1350,2.0
21611,3,2.50,1600,2388,2.0


In [27]:
# Display the one-column target frame (price).
y

Unnamed: 0,price
0,221900.0
1,538000.0
2,180000.0
3,604000.0
4,510000.0
...,...
21608,360000.0
21609,400000.0
21610,402101.0
21611,400000.0


In [28]:
# Keep 75% of the rows for training; the seed fixes which rows are chosen.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.75, random_state=0
)

In [29]:
# Display the training features.
X_train

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors
1956,1,1.00,960,40946,1.0
15678,3,1.75,1510,9720,1.0
8729,3,2.25,1670,6414,1.0
19064,3,2.00,2660,192099,1.0
11291,3,1.75,1690,13500,1.0
...,...,...,...,...,...
13123,5,5.00,3960,94089,2.0
19648,2,2.50,1400,1262,2.0
9845,3,2.25,2360,14950,1.0
10799,4,2.00,2370,76665,2.0


In [30]:
# Display the test features.
X_test

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors
17384,2,1.50,1430,1650,3.0
722,4,3.25,4670,51836,2.0
2680,2,0.75,1440,3700,1.0
18754,2,1.00,1130,2640,1.0
14554,4,2.50,3180,9603,2.0
...,...,...,...,...,...
8709,4,2.25,2130,11900,2.0
12346,3,2.00,1010,14183,1.0
10458,4,2.50,3410,179419,2.0
10894,3,1.50,1180,12324,1.0


In [31]:
# Display the training targets.
y_train

Unnamed: 0,price
1956,420850.0
15678,335000.0
8729,587100.0
19064,495000.0
11291,780000.0
...,...
13123,1180000.0
19648,299000.0
9845,431000.0
10799,411000.0


In [32]:
# Display the test targets.
y_test

Unnamed: 0,price
17384,297000.0
722,1578000.0
2680,562100.0
18754,631500.0
14554,780000.0
...,...
8709,649990.0
12346,390000.0
10458,774950.0
10894,372500.0


In [33]:
# A depth-2 regression tree with a fixed seed.
reg = DecisionTreeRegressor(
    max_depth=2,
    random_state=0,
)

In [34]:
# Fit the regressor on the training features and training prices.
reg.fit(X_train, y_train)

In [35]:
# Predict prices for the first ten test rows, selected by position.
reg.predict(X_test.iloc[:10])

array([ 406622.58288211, 1095030.54807692,  406622.58288211,
        406622.58288211,  657115.94280443,  406622.58288211,
        406622.58288211,  657115.94280443,  657115.94280443,
       1095030.54807692])

In [36]:
# Show the first test row, which is the row predicted on next.
X_test.head(1)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors
17384,2,1.5,1430,1650,3.0


In [37]:
# Predict for the first test row only. Selecting it with a list indexer keeps
# it as a one-row DataFrame, so the regressor receives the same column names
# it was fitted with instead of a bare reshaped ndarray.
reg.predict(X_test.iloc[[0]])



array([406622.58288211])

# Another example: logistic regression on iris

In [38]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load iris and report how many samples it contains.
iris = load_iris()
X = iris.data
Y = iris.target
print("Size of Dataset {}".format(len(X)))

# Hold out 30% of the samples for testing, with a fixed seed.
logreg = LogisticRegression()
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
logreg.fit(x_train, y_train)
predict = logreg.predict(x_test)

# accuracy_score expects (y_true, y_pred): true labels first, predictions second.
print("Accuracy score on training set is {}".format(accuracy_score(y_train, logreg.predict(x_train))))
print("Accuracy score on test set is {}".format(accuracy_score(y_test, predict)))

Size of Dataset 150
Accuracy score on training set is 0.9619047619047619
Accuracy score on test set is 1.0
