# Import dependencies

In [68]:
from ucimlrepo import fetch_ucirepo, list_available_datasets
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Config Params

In [69]:
# Fraction of the rows held out as the test set (passed as test_size to train_test_split)
TEST_SIZE = 0.2
# Seed passed as random_state so the train/test split is reproducible
RANDOM_SEED = 42
# Number of cross-validation folds.
# NOTE(review): the visible cells hard-code cv=5 and never read this value — confirm which is intended
K_FOLDS = 3

# Our chosen datasets
TODO
We have chosen the datasets based on the following criteria:
- datasets that need no special preprocessing, so that all of them can be handled by a single pipeline
- rather small datasets, so that no high computational power is needed

In [70]:
# Candidate datasets: short name -> id that is passed to fetch_ucirepo(id=...)
# NOTE(review): the keys "accute_inflammation" and "glass_identifcation" are misspelled;
# renaming them means updating every lookup into this dict as well
dataset_id = {"iris": 53, 
              "heart_disease": 45,
              "wine_quality": 186,
              "breast_cancer_wisconsin": 17,
              "car_evaluation": 19, # TODO: probably not a good fit, consider removing
              "abalone": 1,
              "mushroom": 73, # contains a lot of categorical data
              "statlog" : 144, # German credit data
              "student_performance" : 320,
              "accute_inflammation" : 284,
              "credit_approval" : 143,
              "wholesale_customers" : 292,
              "glass_identifcation" : 42,
              "ilpd" : 225,
              "hcv" : 503,
              "land_mines" : 763,
              "balance_scale" : 12
              }


# Import and preprocess datasets
For the preprocessing we will do the following steps:
1. Remove any missing values. The article states: "Given that our classifiers are not oriented to data with missing features, the missing inputs are treated as zero, which should not bias the comparison results." Unlike the article, we drop the rows that contain missing values instead of replacing them with zero, so that we can focus on the full pipeline rather than on individual datasets. Another option would have been interpolation.
2. Encode categorical data into numerical data. This we have to do to use the classifiers later on.
3. Remove certain columns if they are highly correlated to others.
4. Split the data into a train and a test set. We will use an 80/20 split.
5. Scale the data to zero mean and unit standard deviation. This is done with scikit-learn's `StandardScaler`.

In [71]:
def encode_categorical_features(X, encoder):
    """Convert categorical features into numerical ones.

    Parameters
    ----------
    X : the categorical columns to convert.
    encoder : object exposing ``fit_transform``; it is fitted on ``X`` here.

    Returns
    -------
    Whatever ``encoder.fit_transform(X)`` returns.
    """
    return encoder.fit_transform(X)

In [72]:
def import_dataset(uci_id, encoder):
    """Fetch a UCI dataset and return its encoded features and its target.

    Rows containing missing values are dropped. Non-numeric feature columns
    are converted with ``encoder``; the target column is returned untouched.

    Parameters
    ----------
    uci_id : int
        Dataset id passed to ``fetch_ucirepo``.
    encoder : object
        Encoder exposing ``fit_transform`` (e.g. ``OrdinalEncoder``). It must
        return exactly one output column per input column.

    Returns
    -------
    X : pandas.DataFrame
        Feature columns only, with non-numeric columns replaced by their
        encoded values.
    y : pandas.DataFrame
        Single-column frame holding the last column of the dataset. If the
        dataset ships several target columns, only the last one is returned
        and the others are discarded instead of being left among the features.
    """
    # get the dataset
    dataset = fetch_ucirepo(id=uci_id)
    features = dataset["data"]["features"]
    targets = dataset["data"]["targets"]

    # load data into one dataframe so that dropping a row removes it from
    # features and target alike (pd.concat silently skips a missing targets frame)
    df = pd.concat([features, targets], axis=1).dropna()

    # Every target column is kept out of X; without a targets frame the last
    # column is treated as the target.
    n_target_cols = 0 if targets is None else targets.shape[1]
    n_feature_cols = df.shape[1] - max(n_target_cols, 1)

    # copy so that the writes below never hit a view of df
    X = df.iloc[:, :n_feature_cols].copy()
    y = df.iloc[:, -1:]

    # encode categorical data only for features, not for the target itself;
    # select_dtypes keeps the original column order (bool counts as numeric)
    categorical_cols = list(X.select_dtypes(exclude=["number", "bool"]).columns)
    if categorical_cols:
        # Plain column assignment replaces the columns, so they take the dtype
        # of the encoded values instead of staying object-typed.
        X[categorical_cols] = encode_categorical_features(X[categorical_cols], encoder)

    # check if encoding has worked: any column that is still non-numeric
    # cannot be consumed by the classifiers later on
    for c, dtype in X.dtypes.items():
        if not pd.api.types.is_numeric_dtype(dtype):
            print(f"WARNING: Column {c} still has categorical values!")

    return X, y

In [73]:
# Run the import/encoding step on one dataset and look at the returned target
ordinal_encoder = OrdinalEncoder()
X, y = import_dataset(
    uci_id=dataset_id["breast_cancer_wisconsin"],
    encoder=ordinal_encoder,
)
print(y)

    Diagnosis
0           M
1           M
2           M
3           M
4           M
..        ...
564         M
565         M
566         M
567         M
568         B

[569 rows x 1 columns]


In [74]:
# Hold out TEST_SIZE of the rows as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

In [75]:
# Baseline: cross-validated logistic regression on the training split
log_reg = LogisticRegression(max_iter=1000)

# Ravel to convert from (len, 1) shape to (len,), warning from sk-learn
Y_train = np.ravel(y_train)

# Scaled copy of the full training split; kept in case other cells rely on
# `scaler` / `X_scaled`, but deliberately NOT used for cross-validation below.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

# Scaling lives inside the pipeline so the scaler is re-fitted on the training
# part of every fold. Cross-validating on X_scaled would let the mean/std of
# each validation fold leak into training and make the scores optimistic.
log_reg_pipe = Pipeline([('scaler', StandardScaler()), ('log_reg', log_reg)])

# NOTE(review): cv=5 is hard-coded although K_FOLDS = 3 is defined in the config cell — confirm which is intended
scores = cross_val_score(
    log_reg_pipe, X_train, Y_train, scoring='accuracy', cv=5)
# accuracy
print('Accuracy: %.3f ,\nStandard Deviations :%.3f' %
      (np.mean(scores), np.std(scores)))

# TODO check whether an explicit KFold(n_splits=K_FOLDS) loop is a better implementation

Accuracy: 0.974 ,
Standard Deviations :0.018


In [None]:
# Grid search over a scaler + k-NN pipeline, following
# https://towardsdatascience.com/gridsearchcv-for-beginners-db48a90114ee
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])
# The "knn__" prefix addresses the parameters of the pipeline step named 'knn'
params = [{
    'knn__n_neighbors': [3, 5, 7, 9],
    'knn__weights': ['uniform', 'distance'],
    'knn__leaf_size': [15, 20],
}]
gs_knn = GridSearchCV(pipe, param_grid=params, scoring='accuracy', cv=5)

# Flatten the target from (len, 1) to (len,) to avoid the warning from sk-learn
y_train = np.ravel(y_train)

gs_knn.fit(X_train, y_train)
print(gs_knn.best_params_)
# Accuracy of the best configuration when scored on the training data itself
gs_knn.score(X_train, y_train)

# test on test set


{'knn__leaf_size': 15, 'knn__n_neighbors': 5, 'knn__weights': 'uniform'}


0.9802197802197802