# Lab 7: Evaluation
### Amir Ali (317554)

## Task 1

#### Import required libraries

In [30]:
import numpy as np
from scipy import stats
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

### Generate Data

In [31]:
# original data: breast-cancer dataset bundled with scikit-learn
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
X, y = data.data, data.target

# No random_state is passed, so this partition differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [32]:
# artificial data
def generate_dataset(b, k, n, random_state=None):
    """Simulate a binary-classification dataset from a logistic model.

    Features are drawn from a standard multivariate normal (zero mean,
    identity covariance). The coefficient vector has ``b`` in its first five
    entries and zeros in the remaining ``k``, so the last ``k`` features carry
    no signal. Labels are Bernoulli draws with success probability
    ``1 / (1 + exp(-(X @ beta)))``.

    Parameters
    ----------
    b : float
        Coefficient shared by the five informative features.
    k : int
        Number of extra features with a zero coefficient.
    n : int
        Number of samples to draw.
    random_state : None, int or numpy.random.RandomState, optional
        Seed (or generator) making the draw reproducible. With ``None``
        (the default) NumPy's global random state is used, exactly as before
        this parameter existed.

    Returns
    -------
    X : numpy.ndarray of shape (n, 5 + k)
        Feature matrix.
    Y : numpy.ndarray of shape (n, 1)
        Column vector of 0/1 labels.
    """
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    # Without a dedicated generator, fall back to the module-level functions,
    # which draw from the global state.
    sampler = np.random if rng is None else rng

    beta_size = 5 + k
    X = sampler.multivariate_normal(np.zeros(beta_size), np.eye(beta_size), size=n)
    # Column vector, so X @ beta (and therefore p and Y) has shape (n, 1).
    beta = np.concatenate((np.full(5, b), np.zeros(k))).reshape(-1, 1)
    p = 1 / (1 + np.exp(-(X @ beta)))
    # scipy treats random_state=None as "use the global NumPy state".
    Y = stats.bernoulli.rvs(p=p, random_state=rng)
    return X, Y

# Small example set: coefficient b=1, k=5 zero-coefficient features, n=100 samples
X_art, Y_art = generate_dataset(b=1, k=5, n=100)

### Classification Model

In [33]:
from sklearn.tree import DecisionTreeClassifier

#### 1. Refitting

In [34]:
# perform on original data
# Refitting: the tree is scored on the very samples it was trained on.
clf1 = DecisionTreeClassifier()
clf1.fit(X, y)
clf1.score(X, y)

1.0

In [35]:
# perform on artificial data
# Refitting: the tree is scored on the very samples it was trained on.
clf2 = DecisionTreeClassifier()
clf2.fit(X_art, Y_art)
clf2.score(X_art, Y_art)

1.0

#### 2. Cross Validation

In [36]:
# perform on original data
# Mean cross-validated accuracy of a fresh decision tree, first with k = 5, then k = 10
for n_folds in (5, 10):
    fold_scores = cross_val_score(DecisionTreeClassifier(), X, y, cv=n_folds)
    print(fold_scores.mean())

0.9156342182890855
0.9157581453634085


In [37]:
# perform on artificial data
# Mean cross-validated accuracy of a fresh decision tree, first with k = 5, then k = 10
for n_folds in (5, 10):
    fold_scores = cross_val_score(DecisionTreeClassifier(), X_art, Y_art, cv=n_folds)
    print(fold_scores.mean())

0.5700000000000001
0.61


#### 3. Bootstrap

In [38]:
# perform on original data
# Features and label side by side; the label is the last column.
data_c = np.column_stack((X, y))

# Resample row *indices* rather than rows: the out-of-bag set is then exactly
# the indices that were never drawn. This avoids the quadratic row-by-row
# membership test and float equality, and keeps an out-of-bag row even if it
# happens to be identical to a sampled one.
all_idx = np.arange(len(data_c))
boot_idx = resample(all_idx, replace=True, random_state=1)
oob_idx = np.setdiff1d(all_idx, boot_idx)
boot, oob = data_c[boot_idx], data_c[oob_idx]

# Train on the bootstrap sample, evaluate on the out-of-bag rows.
clf3 = DecisionTreeClassifier().fit(boot[:, :-1], boot[:, -1])
clf3.score(oob[:, :-1], oob[:, -1])

0.92

In [39]:
# perform on artificial data
# Features and label side by side; the label is the last column.
data_c = np.column_stack((X_art, Y_art))

# Resample row *indices* rather than rows: the out-of-bag set is then exactly
# the indices that were never drawn. This avoids the quadratic row-by-row
# membership test and float equality, and keeps an out-of-bag row even if it
# happens to be identical to a sampled one.
all_idx = np.arange(len(data_c))
boot_idx = resample(all_idx, replace=True, random_state=1)
oob_idx = np.setdiff1d(all_idx, boot_idx)
boot, oob = data_c[boot_idx], data_c[oob_idx]

# Train on the bootstrap sample, evaluate on the out-of-bag rows.
clf4 = DecisionTreeClassifier().fit(boot[:, :-1], boot[:, -1])
clf4.score(oob[:, :-1], oob[:, -1])

0.53125

#### 4. Bootstrap 0.632

In [40]:
# perform on original data
# Features and label side by side; the label is the last column.
data_c = np.column_stack((X, y))

# Resample row *indices* rather than rows: the out-of-bag set is then exactly
# the indices that were never drawn. This avoids the quadratic row-by-row
# membership test and float equality, and keeps an out-of-bag row even if it
# happens to be identical to a sampled one.
all_idx = np.arange(len(data_c))
boot_idx = resample(all_idx, replace=True, random_state=1)
oob_idx = np.setdiff1d(all_idx, boot_idx)
boot, oob = data_c[boot_idx], data_c[oob_idx]

clf5 = DecisionTreeClassifier().fit(boot[:, :-1], boot[:, -1])
# 0.632 estimate: weight the out-of-bag accuracy by 0.632 and the accuracy on
# the bootstrap sample itself by the remaining 0.368.
oob_acc = clf5.score(oob[:, :-1], oob[:, -1])
boot_acc = clf5.score(boot[:, :-1], boot[:, -1])
print(0.632 * oob_acc + (1 - 0.632) * boot_acc)

0.9431200000000001


In [41]:
# perform on artificial data
data_c = np.column_stack((X_art,Y_art))
boot = resample(data_c, replace=True,random_state=1)
oob = np.array([x for x in data_c.tolist() if x not in boot.tolist()])
clf6 = DecisionTreeClassifier().fit(boot[:,0:10], boot[:,10])
print(0.632*clf6.score(oob[:,0:10], oob[:,10]) + (1-0.632)*clf6.score(boot[:,0:10], boot[:,10]))

0.6247499999999999


### For artificial dataset, try different values of n, k and b

### b

In [42]:
# Sweep the coefficient b (k = 5, n = 10000) and print the hold-out accuracy
# of a decision tree for each value.
for b in np.arange(0, 2, 0.25):
    print(f"b equal to {b}")
    X_art, y_art = generate_dataset(b=b, k=5, n=10000)
    X_train, X_test, y_train, y_test = train_test_split(X_art, y_art)
    clf7 = DecisionTreeClassifier()
    clf7.fit(X_train, y_train)
    holdout_accuracy = clf7.score(X_test, y_test)
    print(holdout_accuracy)

b equal to 0.0
0.4936
b equal to 0.25
0.528
b equal to 0.5
0.5892
b equal to 0.75
0.672
b equal to 1.0
0.6816
b equal to 1.25
0.732
b equal to 1.5
0.7468
b equal to 1.75
0.7744


### k

In [44]:
# Sweep the number of zero-coefficient features k (b = 1, n = 10000) and print
# the hold-out accuracy of a decision tree for each value.
for k in np.arange(0, 3000, 1000):
    print(f"k equal to {k}")
    X_art, y_art = generate_dataset(b=1, k=k, n=10000)
    X_train, X_test, y_train, y_test = train_test_split(X_art, y_art)
    clf8 = DecisionTreeClassifier()
    clf8.fit(X_train, y_train)
    holdout_accuracy = clf8.score(X_test, y_test)
    print(holdout_accuracy)

k equal to 0
0.7016
k equal to 1000
0.6528
k equal to 2000
0.642


### n

In [45]:
# Sweep the sample size n (b = 1, k = 5) and print the hold-out accuracy of a
# decision tree for each value.
for n in np.arange(500, 5000, 1000):
    print(f"n equal to {n}")
    X_art, y_art = generate_dataset(b=1, k=5, n=n)
    X_train, X_test, y_train, y_test = train_test_split(X_art, y_art)
    clf9 = DecisionTreeClassifier()
    clf9.fit(X_train, y_train)
    holdout_accuracy = clf9.score(X_test, y_test)
    print(holdout_accuracy)

n equal to 500
0.64
n equal to 1500
0.6986666666666667
n equal to 2500
0.6768
n equal to 3500
0.6925714285714286
n equal to 4500
0.6817777777777778
