**Check the DATA1030 environment**

In [5]:
from __future__ import print_function
from distutils.version import LooseVersion as Version
import sys

# Status tags wrapped in ANSI escape sequences (background colour, then reset).
OK = "\x1b[42m[ OK ]\x1b[0m"
FAIL = "\x1b[41m[FAIL]\x1b[0m"

# importlib is used by import_version below; if it cannot be imported,
# report that as a failed Python-version requirement.
try:
    import importlib
except ImportError:
    print(FAIL, "Python version 3.7 is required, but %s is installed." % sys.version)

def import_version(pkg, min_ver, fail_msg=""):
    """Import ``pkg`` and report whether its installed version matches ``min_ver``.

    Exactly one OK/FAIL status line is printed. A package that cannot be
    imported is reported as a failure instead of raising.

    Parameters
    ----------
    pkg : str
        Importable module name, e.g. ``'numpy'``.
    min_ver : str
        The required version. Despite the parameter name, the check is an
        exact match (``==``), not a lower bound.
    fail_msg : str, optional
        Extra text appended to the message printed when ``pkg`` cannot be
        imported.

    Returns
    -------
    module or None
        The imported module, or ``None`` if the import failed.
    """
    mod = None
    try:
        mod = importlib.import_module(pkg)
        # PIL's version is read from VERSION; every other package from __version__.
        if pkg in {'PIL'}:
            ver = mod.VERSION
        else:
            ver = mod.__version__
        # Report `pkg` (this function's argument). The messages previously used
        # `lib`, a loop variable of the calling cell, so the function only
        # worked when called from inside that particular loop.
        if Version(ver) == min_ver:
            print(OK, "%s version %s is installed."
                  % (pkg, min_ver))
        else:
            print(FAIL, "%s version %s is required, but %s installed."
                  % (pkg, min_ver, ver))
    except ImportError:
        print(FAIL, '%s not installed. %s' % (pkg, fail_msg))
    return mod


# first check the python version
# sys.version_info is a structured tuple and compares reliably against
# (major, minor); sys.version is a free-form string intended for display only.
pyversion = sys.version_info[:2]
if pyversion >= (3, 7):
    print(OK, "Python version is %s" % sys.version)
else:
    print(FAIL, "Python version 3.7 is required,"
                " but %s is installed." % sys.version)


print()
# package name -> exact version required for the course environment
requirements = {
    'numpy': "1.18.5",
    'matplotlib': "3.2.2",
    'sklearn': "0.23.1",
    'pandas': "1.0.5",
    'xgboost': "1.1.1",
    'shap': "0.35.0",
}

# now the dependencies
for lib, required_version in requirements.items():
    import_version(lib, required_version)

[42m[ OK ][0m Python version is 3.7.6 | packaged by conda-forge | (default, Jun  1 2020, 18:33:30) 
[Clang 9.0.1 ]

[42m[ OK ][0m numpy version 1.18.5 is installed.
[42m[ OK ][0m matplotlib version 3.2.2 is installed.
[42m[ OK ][0m sklearn version 0.23.1 is installed.
[42m[ OK ][0m pandas version 1.0.5 is installed.
[42m[ OK ][0m xgboost version 1.1.1 is installed.
[42m[ OK ][0m shap version 0.35.0 is installed.


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GroupShuffleSplit, GroupKFold, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler

**Read in, split, and preprocess a dataset**

First, read the diabetes dataset into a pandas DataFrame using the tab-delimited file linked at [this page](https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html).


In [7]:
# read in the data in this cell
DIABETES_URL = "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt"
df = pd.read_csv(DIABETES_URL, sep="\t")  # the file is tab-delimited
df.head()

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135


In [8]:
y = df['Y']
X = df.loc[:, df.columns != 'Y']

# every feature except SEX goes through the standard scaler
scaler_col = list(X.columns)
scaler_col.remove('SEX')


def prep_basic(X, y, ohe_ftrs, std_ftrs, random_state):
    """Do a basic 60-20-20 train-validation-test split and preprocess the features.

    The one hot encoder and the standard scaler are fit on the training set
    only and are then applied unchanged to the validation and test sets.

    Parameters
    ----------
    X : pandas.DataFrame
        The feature matrix.
    y : pandas.Series
        The target variable.
    ohe_ftrs : list of str
        Feature names to preprocess with the one hot encoder.
    std_ftrs : list of str
        Feature names to preprocess with the standard scaler.
    random_state : int
        The random state for reproducibility; it is used for both splits.

    Returns
    -------
    X_train_pre, y_train, X_val_pre, y_val, X_test_pre, y_test
        The preprocessed feature DataFrames (one hot encoded columns first,
        then the standardized columns) and the matching targets. The
        DataFrames get a fresh 0..n-1 index, while the targets keep the index
        they had in ``y``.
    """
    # split the data: 60% train, then the remaining 40% is halved into val/test
    X_train, X_other, y_train, y_other = train_test_split(
        X, y, train_size=0.6, random_state=random_state)
    X_val, X_test, y_val, y_test = train_test_split(
        X_other, y_other, train_size=0.5, random_state=random_state)

    # preprocess the data: fit on the training set only
    enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
    scaler = StandardScaler()
    enc.fit(X_train[ohe_ftrs])
    scaler.fit(X_train[std_ftrs])

    pre_col = list(enc.get_feature_names(ohe_ftrs)) + list(std_ftrs)

    def _preprocess(X_part):
        # apply the already-fitted transformers to one split and label the columns
        parts = [enc.transform(X_part[ohe_ftrs]), scaler.transform(X_part[std_ftrs])]
        return pd.DataFrame(np.concatenate(parts, axis=1), columns=pre_col)

    return (_preprocess(X_train), y_train,
            _preprocess(X_val), y_val,
            _preprocess(X_test), y_test)


for i in range(10):
    random_state = 42 * i

    X_train_pre, y_train, X_val_pre, y_val, X_test_pre, y_test = prep_basic(
        X, y, ['SEX'], scaler_col, random_state)

    # print stuff out to make sure your code is reproducible
    print(X_train_pre.head())

   SEX_1  SEX_2       AGE       BMI        BP        S1        S2        S3  \
0    0.0    1.0 -0.101364 -0.835511 -0.948322 -1.661115 -1.549201 -0.698947   
1    0.0    1.0  1.014762  1.252631  1.475618  0.269567  0.435301 -0.473754   
2    0.0    1.0  0.047453  1.275084 -0.066889  1.263300  1.308739 -1.224396   
3    0.0    1.0 -0.175772  3.587757  0.300374  0.610275  0.705039 -0.473754   
4    1.0    0.0 -0.994265  0.129974  0.226922 -0.780951 -0.367491 -0.398690   

         S4        S5        S6  
0 -0.798531  0.205609 -0.189521  
1  0.701191  0.443799  0.537180  
2  2.200913  1.393977  2.475052  
3  0.701191  0.679407  0.617925  
4 -0.048670 -0.806330 -0.431755  
   SEX_1  SEX_2       AGE       BMI        BP        S1        S2        S3  \
0    1.0    0.0 -0.285158  1.213237  1.090432  0.972804  0.610896 -0.385769   
1    1.0    0.0  1.412975  1.123179  1.520650 -0.313446 -0.822460 -0.690588   
2    0.0    1.0  1.335787 -0.587927  0.229994  1.315804  1.010142  0.604894   
3    