# Config

In [138]:
import pandas as pd
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFE
from sklearn import svm
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Load datasets

In [4]:
dlbcl_train = pd.read_csv(r'data/dlbcl_training.csv')
dlbcl_test = pd.read_csv(r'data/dlbcl_testing.csv')

prostate_train = pd.read_csv(r'data/prostate_training.csv')
prostate_test = pd.read_csv(r'data/prostate_testing.csv')

In [8]:
print('Shapes of datasets: \n')
print(f"dlbcl_train: {dlbcl_train.shape}")
print(f"dlbcl_test: {dlbcl_test.shape}\n")
print(f"prostate_train: {prostate_train.shape}")
print(f"prostate_test: {prostate_test.shape}")

Shapes of datasets: 

dlbcl_train: (61, 2648)
dlbcl_test: (16, 2648)

prostate_train: (82, 2136)
prostate_test: (20, 2136)


# DLBCL feature slection

In [None]:
X = dlbcl_train.loc[:, dlbcl_train.columns != 'class']
Y = dlbcl_train.loc[:, dlbcl_train.columns == 'class']

X_test = dlbcl_test.loc[:, dlbcl_test.columns != 'class']
Y_test = dlbcl_test.loc[:, dlbcl_test.columns == 'class']

## KBest

In [121]:
# feature extraction
test = SelectKBest(score_func=f_classif, k=50)
fit = test.fit(X, Y)

  y = column_or_1d(y, warn=True)


In [122]:
# summarize scores
set_printoptions(precision=5)
print(fit.scores_)

[0.29624 2.57349 0.14217 ... 2.92261 2.88384 0.51462]


In [123]:
features = fit.transform(X)
features_test = fit.transform(X_test)

#### SVM

In [124]:
clf = svm.SVC(kernel='linear')
clf.fit(features, Y)
y_pred = clf.predict(features_test)

  y = column_or_1d(y, warn=True)


In [125]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))
print("Precision:",metrics.precision_score(Y_test, y_pred))
print("Recall:",metrics.recall_score(Y_test, y_pred))

Accuracy: 0.8125
Precision: 0.8
Recall: 1.0


#### 5NN

In [126]:
knn = KNeighborsClassifier(n_neighbors = 5) #setting up the KNN model to use 5NN
knn.fit(features, Y) #fitting the KNN
y_pred_knn = knn.predict(features_test)

  return self._fit(X, y)


In [127]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred_knn))
print("Precision:",metrics.precision_score(Y_test, y_pred_knn))
print("Recall:",metrics.recall_score(Y_test, y_pred_knn))

Accuracy: 0.875
Precision: 0.8571428571428571
Recall: 1.0


#### J48

In [128]:
dtc = DecisionTreeClassifier()
dtc = dtc.fit(features,Y)
y_pred_dtc = dtc.predict(features_test)

In [129]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred_dtc))
print("Precision:",metrics.precision_score(Y_test, y_pred_dtc))
print("Recall:",metrics.recall_score(Y_test, y_pred_dtc))

Accuracy: 0.6875
Precision: 0.8888888888888888
Recall: 0.6666666666666666


## RFE

In [141]:
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=50)
rfe.fit(X, Y)

In [151]:
features = rfe.transform(X)
features_test = rfe.transform(X_test)

#### SVM

In [152]:
clf = svm.SVC(kernel='linear')
clf.fit(features, Y)
y_pred = clf.predict(features_test)

  y = column_or_1d(y, warn=True)


In [153]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))
print("Precision:",metrics.precision_score(Y_test, y_pred))
print("Recall:",metrics.recall_score(Y_test, y_pred))

Accuracy: 0.75
Precision: 0.7857142857142857
Recall: 0.9166666666666666


#### 5NN

In [154]:
knn = KNeighborsClassifier(n_neighbors = 5) #setting up the KNN model to use 5NN
knn.fit(features, Y) #fitting the KNN
y_pred_knn = knn.predict(features_test)

  return self._fit(X, y)


In [155]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred_knn))
print("Precision:",metrics.precision_score(Y_test, y_pred_knn))
print("Recall:",metrics.recall_score(Y_test, y_pred_knn))

Accuracy: 0.875
Precision: 0.8571428571428571
Recall: 1.0


#### J48

In [156]:
dtc = DecisionTreeClassifier()
dtc = dtc.fit(features,Y)
y_pred_dtc = dtc.predict(features_test)

In [157]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred_dtc))
print("Precision:",metrics.precision_score(Y_test, y_pred_dtc))
print("Recall:",metrics.recall_score(Y_test, y_pred_dtc))

Accuracy: 0.875
Precision: 0.8571428571428571
Recall: 1.0


## Without feature selection

#### SVM 

In [158]:
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)
y_pred = clf.predict(X_test)

  y = column_or_1d(y, warn=True)


In [159]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))
print("Precision:",metrics.precision_score(Y_test, y_pred))
print("Recall:",metrics.recall_score(Y_test, y_pred))

Accuracy: 0.875
Precision: 0.9166666666666666
Recall: 0.9166666666666666


#### 5NN

In [160]:
knn = KNeighborsClassifier(n_neighbors = 5) #setting up the KNN model to use 5NN
knn.fit(X, Y) #fitting the KNN
y_pred_knn = knn.predict(X_test)

  return self._fit(X, y)


In [161]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred_knn))
print("Precision:",metrics.precision_score(Y_test, y_pred_knn))
print("Recall:",metrics.recall_score(Y_test, y_pred_knn))

Accuracy: 0.8125
Precision: 0.8461538461538461
Recall: 0.9166666666666666


#### J48

In [162]:
dtc = DecisionTreeClassifier()
dtc = dtc.fit(X,Y)
y_pred_dtc = dtc.predict(X_test)

In [163]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred_dtc))
print("Precision:",metrics.precision_score(Y_test, y_pred_dtc))
print("Recall:",metrics.recall_score(Y_test, y_pred_dtc))

Accuracy: 0.8125
Precision: 0.9090909090909091
Recall: 0.8333333333333334


# Prostate feature selection

In [164]:
X = prostate_train.loc[:, prostate_train.columns != 'class']
Y = prostate_train.loc[:, prostate_train.columns == 'class']

X_test = prostate_test.loc[:, prostate_test.columns != 'class']
Y_test = prostate_test.loc[:, prostate_test.columns == 'class']

## KBest

In [168]:
# feature extraction
test = SelectKBest(score_func=f_classif, k=50)
fit = test.fit(X, Y)

  y = column_or_1d(y, warn=True)


In [169]:
# summarize scores
set_printoptions(precision=5)
print(fit.scores_)

[2.05414 6.60965 0.08449 ... 0.22597 3.89381 1.79179]


In [170]:
features = fit.transform(X)
features_test = fit.transform(X_test)

#### SVM

In [171]:
clf = svm.SVC(kernel='linear')
clf.fit(features, Y)
y_pred = clf.predict(features_test)

  y = column_or_1d(y, warn=True)


In [173]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))

Accuracy: 0.85


#### 5NN

In [175]:
knn = KNeighborsClassifier(n_neighbors = 5) #setting up the KNN model to use 5NN
knn.fit(features, Y) #fitting the KNN
y_pred_knn = knn.predict(features_test)

  return self._fit(X, y)


In [177]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred_knn))

Accuracy: 0.85


#### J48

In [178]:
dtc = DecisionTreeClassifier()
dtc = dtc.fit(X,Y)
y_pred_dtc = dtc.predict(X_test)

In [179]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred_dtc))

Accuracy: 0.9


## RFE

In [180]:
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=50)
rfe.fit(X, Y)

In [181]:
features = rfe.transform(X)
features_test = rfe.transform(X_test)

#### SVM

In [182]:
clf = svm.SVC(kernel='linear')
clf.fit(features, Y)
y_pred = clf.predict(features_test)

  y = column_or_1d(y, warn=True)


In [184]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))

Accuracy: 0.85


#### 5NN

In [185]:
knn = KNeighborsClassifier(n_neighbors = 5) #setting up the KNN model to use 5NN
knn.fit(features, Y) #fitting the KNN
y_pred_knn = knn.predict(features_test)

  return self._fit(X, y)


In [186]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred_knn))

Accuracy: 0.8


#### J48

In [187]:
dtc = DecisionTreeClassifier()
dtc = dtc.fit(features,Y)
y_pred_dtc = dtc.predict(features_test)

In [188]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred_dtc))

Accuracy: 0.9


## Without feature selection

#### SVM

In [189]:
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)
y_pred = clf.predict(X_test)

  y = column_or_1d(y, warn=True)


In [190]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))

Accuracy: 0.85


#### 5NN

In [191]:
knn = KNeighborsClassifier(n_neighbors = 5) #setting up the KNN model to use 5NN
knn.fit(X, Y) #fitting the KNN
y_pred_knn = knn.predict(X_test)

  return self._fit(X, y)


In [192]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred_knn))

Accuracy: 0.75


#### J48

In [193]:
dtc = DecisionTreeClassifier()
dtc = dtc.fit(X,Y)
y_pred_dtc = dtc.predict(X_test)

In [194]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred_dtc))

Accuracy: 0.9
