In [8]:
import pandas as pd
import numpy as np

In [9]:
df = pd.read_csv('../data/Drug_Consumption.csv')

# remove ID column and semeron (fictitious drug introduced to identify over-claimers)
df = df.drop('ID', axis = 1)
df = df.drop('Semer', axis = 1)
# df.head()

In [10]:
all_subs = set(df.columns[13:])

# used legal substance within in last WEEK or more frequent: 1, otherwise: 0
legal = {'Alcohol', 'Caff', 'Choc', 'Legalh', 'Nicotine'}
mapping_legal = {'CL0': 0, 'CL1': 0, 'CL2': 0, 'CL3': 0, 'CL4': 0, 'CL5': 1, 'CL6': 1}
for subs in legal:
    df = df.replace({subs: mapping_legal})

# used illegal substance within in last YEAR or more frequent: 1, otherwise: 0
illegal = all_subs - legal
# print(illegal)
mapping_illegal = {'CL0': 0, 'CL1': 0, 'CL2': 0, 'CL3': 1, 'CL4': 1, 'CL5': 1, 'CL6': 1}
for subs in illegal:
    df = df.replace({subs: mapping_legal})

df['NumIllegal']= df[list(illegal)].sum(axis=1)

y = df['NumIllegal']
X = df.loc[:, df.columns != 'NumIllegal']

In [11]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

In [12]:
# baseline RMSE
baseline_pred = np.ones(len(y))*np.mean(y)
baseline_pred_round = np.ones(len(y))*np.round(np.mean(y))

baseline_RMSE = np.sqrt(mean_squared_error(y,baseline_pred))
print('The baseline root mean squared error is:', baseline_RMSE)

baseline_accuracy = accuracy_score(y,np.zeros(len(y))) # predict class 0 for all points
print('The baseline accuracy is:', baseline_accuracy)

baseline_acc_from_mean = accuracy_score(y, baseline_pred_round) # predict rounded mean for all points
print('The baseline accuracy of rounded mean is:', baseline_acc_from_mean)

The baseline root mean squared error is: 1.1219791124813923
The baseline accuracy is: 0.552547770700637
The baseline accuracy of rounded mean is: 0.24363057324840764


In [13]:
# getting ready for preprocessing
ordinal_ftrs = ['Age','Education']
ordinal_cats = [['18-24', '25-34', '35-44', '45-54', '55-64', '65+'], \
                ['Left school before 16 years', 'Left school at 16 years', 'Left school at 17 years',\
                'Left school at 18 years', 'Some college or university, no certificate or degree',\
                'Professional certificate/ diploma', 'University degree', 'Masters degree', 'Doctorate degree']]

onehot_ftrs = ['Gender','Country','Ethnicity']

std_ftrs = ['Nscore', 'Escore', 'Oscore', 'AScore', 'Cscore', 'Impulsive', 'SS']

prep = ColumnTransformer(
    transformers=[
        ('ord', OrdinalEncoder(categories = ordinal_cats), ordinal_ftrs),
        ('onehot', OneHotEncoder(sparse=False,handle_unknown='ignore',drop='if_binary'), onehot_ftrs),
        ('std', StandardScaler(), std_ftrs)])

In [14]:
# function
def MLpipe_KFold_RMSE(X, y, clf, param_grid):    
    num_rs = 5
    test_RMSE = np.zeros(num_rs)
    test_accuracy = np.zeros(num_rs)
    final_models = []
    
    for rs in range(num_rs):

        # 80 train and validation, 20 test
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = 0.2, random_state = rs)

        # K fold on other
        kf = KFold(n_splits = 4, shuffle = True, random_state = rs)
    
        # preprocess
        X_other = prep.fit_transform(X_other)
        X_test = prep.transform(X_test)

        # standard scale all features
        scaler = StandardScaler()
        X_other = scaler.fit_transform(X_other)
        X_test = scaler.transform(X_test)
        
        # GridSearchCV
        try:
            model = clf(random_state = rs)
        except:
            model = clf()
        pipe = make_pipeline(model)
    
        grid = GridSearchCV(pipe, param_grid, scoring = 'neg_root_mean_squared_error',
                            cv=kf, return_train_score = True, n_jobs=-1, verbose=True)

        grid.fit(X_other, y_other)
        # results = pd.DataFrame(grid.cv_results_)
        # print(results)

        print('Best model parameters:',grid.best_params_)
        print('Validation RMSE score:',-grid.best_score_) # turn neg_root_mean_squared_error positive
        final_models.append(grid)
        
        # calculate and save the test RMSE
        y_test_pred = final_models[-1].predict(X_test)
        test_RMSE[rs] =  np.sqrt(mean_squared_error(y_test,y_test_pred))
        print('Test RMSE score:',test_RMSE[rs])
        
        # round regression prediction
        y_test_pred_rounded = y_test_pred.round()
        test_accuracy[rs] = accuracy_score(y_test, y_test_pred_rounded)
        
    return final_models, test_RMSE, test_accuracy

In [26]:
# Model 1: elastic net

from sklearn.linear_model import ElasticNet
clf = ElasticNet

param_grid = {
              'elasticnet__alpha': [0.005, 0.01, 0.05, 0.1, 0.5],
              'elasticnet__l1_ratio': [0.05, 0.1, 0.2, 0.4],
              'elasticnet__max_iter': [3000]
              } 

final_models, test_RMSE, test_accuracy = MLpipe_KFold_RMSE(X, y, clf, param_grid)

print('\nMean of test RMSE score:', np.mean(test_RMSE))
print('Standard deviation of test RMSE score:', np.std(test_RMSE))

print('\nAccuracy of rounded prediction:', np.mean(test_accuracy))

Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.1, 'elasticnet__max_iter': 3000}
Validation RMSE score: 0.9339326439904706
Test RMSE score: 0.9303891559329407
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.1, 'elasticnet__max_iter': 3000}
Validation RMSE score: 0.9351285849676448
Test RMSE score: 0.9445193474122403
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.1, 'elasticnet__max_iter': 3000}
Validation RMSE score: 0.9317448179364423
Test RMSE score: 0.9365948275593227
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.1, 'elasticnet__max_iter': 3000}
Validation RMSE score: 0.9204496300516417
Test RMSE score: 0.9778269700906712
Fitting 

In [27]:
# Model 2: KNN regressor 

from sklearn.neighbors import KNeighborsRegressor
clf = KNeighborsRegressor

param_grid = {
              'kneighborsregressor__n_neighbors': [80, 100, 120, 140, 160],
              'kneighborsregressor__weights': ['uniform', 'distance']
              } 

final_models, test_RMSE, test_accuracy = MLpipe_KFold_RMSE(X, y, clf, param_grid)

print('\nMean of test RMSE score:', np.mean(test_RMSE))
print('Standard deviation of test RMSE score:', np.std(test_RMSE))

print('\nAccuracy of rounded prediction:', np.mean(test_accuracy))

Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best model parameters: {'kneighborsregressor__n_neighbors': 160, 'kneighborsregressor__weights': 'distance'}
Validation RMSE score: 0.9364442135230503
Test RMSE score: 0.91702651477412
Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best model parameters: {'kneighborsregressor__n_neighbors': 140, 'kneighborsregressor__weights': 'distance'}
Validation RMSE score: 0.9323245155421198
Test RMSE score: 0.939922360988858
Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best model parameters: {'kneighborsregressor__n_neighbors': 140, 'kneighborsregressor__weights': 'distance'}
Validation RMSE score: 0.9340190618935501
Test RMSE score: 0.9298086354622838
Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best model parameters: {'kneighborsregressor__n_neighbors': 120, 'kneighborsregressor__weights': 'distance'}
Validation RMSE score: 0.9205800451175158
Test RMSE score: 0.992293357984388
Fitting 4 fo

In [30]:
# Model 3: random forest regressor

from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor

param_grid = {
              'randomforestregressor__max_depth': [1, 5, 10, 15, 20],
              'randomforestregressor__max_features': [0.1, 0.25, 0.5]
              }

final_models, test_RMSE, test_accuracy = MLpipe_KFold_RMSE(X, y, clf, param_grid)

print('\nMean of test RMSE score:', np.mean(test_RMSE))
print('Standard deviation of test RMSE score:', np.std(test_RMSE))

print('\nAccuracy of rounded prediction:', np.mean(test_accuracy))

Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'randomforestregressor__max_depth': 5, 'randomforestregressor__max_features': 0.5}
Validation RMSE score: 0.9317524216993301
Test RMSE score: 0.9137564818857843
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'randomforestregressor__max_depth': 10, 'randomforestregressor__max_features': 0.1}
Validation RMSE score: 0.9291018611324615
Test RMSE score: 0.9446869202975555
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'randomforestregressor__max_depth': 5, 'randomforestregressor__max_features': 0.25}
Validation RMSE score: 0.922580205261825
Test RMSE score: 0.9395254742695885
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'randomforestregressor__max_depth': 5, 'randomforestregressor__max_features': 0.25}
Validation RMSE score: 0.9188753287130058
Test RMSE score: 0.9861618626214286
Fitting 4 fold

In [32]:
# Model 4: support vector mechine 

from sklearn.svm import SVR

clf = SVR

param_grid = {
              'svr__gamma': [0.0001, 0.001, 0.01, 0.1, 1],
              'svr__C': [0.1, 1, 10, 100]
              }

final_models, test_RMSE, test_accuracy = MLpipe_KFold_RMSE(X, y, clf, param_grid)

print('\nMean of test RMSE score:', np.mean(test_RMSE))
print('Standard deviation of test RMSE score:', np.std(test_RMSE))

print('\nAccuracy of rounded prediction:', np.mean(test_accuracy))

Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'svr__C': 1, 'svr__gamma': 0.01}
Validation RMSE score: 0.965968375990625
Test RMSE score: 0.9720136673857176
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'svr__C': 1, 'svr__gamma': 0.1}
Validation RMSE score: 0.9619443037196376
Test RMSE score: 0.9762630377205486
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'svr__C': 1, 'svr__gamma': 0.01}
Validation RMSE score: 0.9607164175882124
Test RMSE score: 0.9747927395849345
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'svr__C': 1, 'svr__gamma': 0.01}
Validation RMSE score: 0.9483919706458896
Test RMSE score: 1.0241356316629415
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'svr__C': 1, 'svr__gamma': 0.01}
Validation RMSE score: 0.9681208900442161
Test RMSE score: 0.9943200981283246

Mean of test RMSE sco