# Model creation

In [30]:
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.utils import class_weight
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Dataset

In [2]:
df = pd.read_csv(
    "drug_consumption_2.txt",
    header=None,
    names=[
        "idx",
        "N-Score",
        "E-Score",
        "O-Score",
        "A-Score",
        "C-Score",
        "Impulsiveness",
        "Sensation-seeking",
        "Amphet",
        "Benzo",
        "Cannabis",
        "Heroin",
        "Ketamine",
        "Methadone",
        "Semeron",
    ],
)

df.head()

Unnamed: 0,idx,N-Score,E-Score,O-Score,A-Score,C-Score,Impulsiveness,Sensation-seeking,Amphet,Benzo,Cannabis,Heroin,Ketamine,Methadone,Semeron
0,1,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,-0.21712,-1.18084,0,0,0,0,0,0,0
1,2,-0.67825,1.93886,1.43533,0.76096,-0.14277,-0.71126,-0.21575,0,0,0,0,0,0,0
2,3,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,-1.37983,0.40148,0,0,0,0,0,0,0
3,4,-0.14882,-0.80615,-0.01928,0.59042,0.58489,-1.37983,-1.18084,0,0,0,0,0,0,0
4,5,0.73545,-1.6334,-0.45174,-0.30172,1.30612,-0.21712,-0.21575,0,0,0,0,0,0,0


In [3]:
# Normalize score ranges to be [0, 1].
for column in df.columns[1:8]:
    column_min, column_max = df[column].min(), df[column].max()
    
    column_normalized = (df[column] - column_min) / (column_max - column_min)
    
    df[column] = column_normalized

In [4]:
print("Column ranges:")
for column in df.columns:
    column_range = (df[column].min(), df[column].max())
    
    print(f"   { column}: [{column_range[0]}, { column_range[1] }]")

Column ranges:
   idx: [1, 1888]
   N-Score: [0.0, 1.0]
   E-Score: [0.0, 1.0]
   O-Score: [0.0, 1.0]
   A-Score: [0.0, 1.0]
   C-Score: [0.0, 1.0]
   Impulsiveness: [0.0, 1.0]
   Sensation-seeking: [0.0, 1.0]
   Amphet: [0, 1]
   Benzo: [0, 1]
   Cannabis: [0, 1]
   Heroin: [0, 1]
   Ketamine: [0, 1]
   Methadone: [0, 1]
   Semeron: [0, 1]


In [5]:
print("Class counts:")

for target in df.iloc[:, 8:15].columns:
    # Get train and test data splits, stratisfy for target.
    target_train_df, target_test_df = train_test_split(
        df, train_size=0.8, shuffle=True, stratify=df[target], random_state=0
    )

    # Get input and target from the data split.
    target_x_train, target_y_train = (
        target_train_df.iloc[:, 1:8],
        target_train_df.iloc[:, 8:15][target],
    )
    target_x_test, target_y_test = (
        target_test_df.iloc[:, 1:8],
        target_test_df.iloc[:, 8:15][target],
    )
    
    print(f"  { target } dataset\n     Train dataset: 0: { len(target_y_train[target_y_train == 0]) }, 1: { len(target_y_train[target_y_train == 1]) }\n     Test dataset: 0: { len(target_y_test[target_y_test == 0]) }, 1: { len(target_y_test[target_y_test == 1]) }")

Class counts:
  Amphet dataset
     Train dataset: 0: 1426, 1: 82
     Test dataset: 0: 357, 1: 20
  Benzo dataset
     Train dataset: 0: 1432, 1: 76
     Test dataset: 0: 358, 1: 19
  Cannabis dataset
     Train dataset: 0: 1138, 1: 370
     Test dataset: 0: 284, 1: 93
  Heroin dataset
     Train dataset: 0: 1485, 1: 23
     Test dataset: 0: 371, 1: 6
  Ketamine dataset
     Train dataset: 0: 1505, 1: 3
     Test dataset: 0: 376, 1: 1
  Methadone dataset
     Train dataset: 0: 1450, 1: 58
     Test dataset: 0: 362, 1: 15
  Semeron dataset
     Train dataset: 0: 1502, 1: 6
     Test dataset: 0: 375, 1: 2


## Decision Tree

In [15]:
models = {}
metrics = {}

for target in df.iloc[:, 8:15].columns:
    # Get train and test data splits, stratisfy for target.
    target_train_df, target_test_df = train_test_split(
        df, train_size=0.8, shuffle=True, stratify=df[target], random_state=0
    )

    # Get input and target from the data split.
    target_x_train, target_y_train = (
        target_train_df.iloc[:, 1:8],
        target_train_df.iloc[:, 8:15][target],
    )
    target_x_test, target_y_test = (
        target_test_df.iloc[:, 1:8],
        target_test_df.iloc[:, 8:15][target],
    )

    # Create classifier.
    target_clf = DecisionTreeClassifier(criterion="gini", max_depth=15, min_samples_leaf=3, class_weight="balanced", random_state=0)

    # Train model with data specified for target.
    target_clf.fit(target_x_train, target_y_train)
    models[target] = target_clf
    
    joblib.dump(target_clf, f"./models/decision_tree_{ target.lower() }.joblib", compress=3)

    # Evaluate trained classifier.
    target_y_predictions = target_clf.predict(target_x_test)

    # Calculate metrics.
    accuracy = accuracy_score(target_y_test, target_y_predictions)
    precision = precision_score(target_y_test, target_y_predictions, zero_division=np.nan)
    recall = recall_score(target_y_test, target_y_predictions)
    f1 = f1_score(target_y_test, target_y_predictions)

    metrics[target] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    }

models, metrics

({'Amphet': DecisionTreeClassifier(class_weight='balanced', max_depth=15,
                         min_samples_leaf=3, random_state=0),
  'Benzo': DecisionTreeClassifier(class_weight='balanced', max_depth=15,
                         min_samples_leaf=3, random_state=0),
  'Cannabis': DecisionTreeClassifier(class_weight='balanced', max_depth=15,
                         min_samples_leaf=3, random_state=0),
  'Heroin': DecisionTreeClassifier(class_weight='balanced', max_depth=15,
                         min_samples_leaf=3, random_state=0),
  'Ketamine': DecisionTreeClassifier(class_weight='balanced', max_depth=15,
                         min_samples_leaf=3, random_state=0),
  'Methadone': DecisionTreeClassifier(class_weight='balanced', max_depth=15,
                         min_samples_leaf=3, random_state=0),
  'Semeron': DecisionTreeClassifier(class_weight='balanced', max_depth=15,
                         min_samples_leaf=3, random_state=0)},
 {'Amphet': {'accuracy': 0.8620689655172

## K-NN

In [19]:
models = {}
metrics = {}

for target in df.iloc[:, 8:15].columns:
    # Get train and test data splits, stratisfy for target.
    target_train_df, target_test_df = train_test_split(
        df, train_size=0.8, shuffle=True, stratify=df[target], random_state=0
    )

    # Get input and target from the data split.
    target_x_train, target_y_train = (
        target_train_df.iloc[:, 1:8],
        target_train_df.iloc[:, 8:15][target],
    )
    target_x_test, target_y_test = (
        target_test_df.iloc[:, 1:8],
        target_test_df.iloc[:, 8:15][target],
    )

    # Create classifier.
    target_neigh = KNeighborsClassifier(
        n_neighbors=3, 
    )

    # Train model with data specified for target.
    target_neigh.fit(target_x_train, target_y_train)
    models[target] = target_neigh
    
    joblib.dump(target_neigh, f"./models/knn_{ target.lower() }.joblib", compress=3)

    # Evaluate trained classifier.
    target_y_predictions = models[target].predict(target_x_test)

    # Calculate metrics.
    accuracy = accuracy_score(target_y_test, target_y_predictions)
    precision = precision_score(target_y_test, target_y_predictions, zero_division=np.nan)
    recall = recall_score(target_y_test, target_y_predictions)
    f1 = f1_score(target_y_test, target_y_predictions)

    metrics[target] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    }

models, metrics

({'Amphet': KNeighborsClassifier(n_neighbors=3),
  'Benzo': KNeighborsClassifier(n_neighbors=3),
  'Cannabis': KNeighborsClassifier(n_neighbors=3),
  'Heroin': KNeighborsClassifier(n_neighbors=3),
  'Ketamine': KNeighborsClassifier(n_neighbors=3),
  'Methadone': KNeighborsClassifier(n_neighbors=3),
  'Semeron': KNeighborsClassifier(n_neighbors=3)},
 {'Amphet': {'accuracy': 0.9442970822281167,
   'precision': 0.3333333333333333,
   'recall': 0.05,
   'f1_score': 0.08695652173913045},
  'Benzo': {'accuracy': 0.9283819628647215,
   'precision': 0.1,
   'recall': 0.05263157894736842,
   'f1_score': 0.06896551724137931},
  'Cannabis': {'accuracy': 0.7002652519893899,
   'precision': 0.375,
   'recall': 0.3225806451612903,
   'f1_score': 0.34682080924855496},
  'Heroin': {'accuracy': 0.9840848806366048,
   'precision': nan,
   'recall': 0.0,
   'f1_score': 0.0},
  'Ketamine': {'accuracy': 0.9973474801061007,
   'precision': nan,
   'recall': 0.0,
   'f1_score': 0.0},
  'Methadone': {'accurac

## Random Forest

In [28]:
models = {}
metrics = {}

for target in df.iloc[:, 8:15].columns:
    # Get train and test data splits, stratisfy for target.
    target_train_df, target_test_df = train_test_split(
        df, train_size=0.8, shuffle=True, stratify=df[target], random_state=0
    )

    # Get input and target from the data split.
    target_x_train, target_y_train = (
        target_train_df.iloc[:, 1:8],
        target_train_df.iloc[:, 8:15][target],
    )
    target_x_test, target_y_test = (
        target_test_df.iloc[:, 1:8],
        target_test_df.iloc[:, 8:15][target],
    )

    # Create classifier.
    target_clf = RandomForestClassifier(
        n_estimators=50, 
        max_depth=25,
        min_samples_leaf=3,
        class_weight="balanced",
        random_state=0
    )

    # Train model with data specified for target.
    target_clf.fit(target_x_train, target_y_train)
    models[target] = target_clf
    
    joblib.dump(target_clf, f"./models/random_forest_{ target.lower() }.joblib", compress=3)

    # Evaluate trained classifier.
    target_y_predictions = target_clf.predict(target_x_test)

    # Calculate metrics.
    accuracy = accuracy_score(target_y_test, target_y_predictions)
    precision = precision_score(target_y_test, target_y_predictions, zero_division=np.nan)
    recall = recall_score(target_y_test, target_y_predictions)
    f1 = f1_score(target_y_test, target_y_predictions)

    metrics[target] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    }

models, metrics

({'Amphet': RandomForestClassifier(class_weight='balanced', max_depth=25,
                         min_samples_leaf=3, n_estimators=50, random_state=0),
  'Benzo': RandomForestClassifier(class_weight='balanced', max_depth=25,
                         min_samples_leaf=3, n_estimators=50, random_state=0),
  'Cannabis': RandomForestClassifier(class_weight='balanced', max_depth=25,
                         min_samples_leaf=3, n_estimators=50, random_state=0),
  'Heroin': RandomForestClassifier(class_weight='balanced', max_depth=25,
                         min_samples_leaf=3, n_estimators=50, random_state=0),
  'Ketamine': RandomForestClassifier(class_weight='balanced', max_depth=25,
                         min_samples_leaf=3, n_estimators=50, random_state=0),
  'Methadone': RandomForestClassifier(class_weight='balanced', max_depth=25,
                         min_samples_leaf=3, n_estimators=50, random_state=0),
  'Semeron': RandomForestClassifier(class_weight='balanced', max_depth=25,
 

## ANN

In [6]:
def create_ann_model(class_weights):
    model = tf.keras.models.Sequential(
        [
            tf.keras.layers.Dense(64, activation="relu", input_shape=(7,)),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(1, activation="sigmoid"),
        ]
    )

    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy", "precision", "recall"],
        class_weight=class_weights,
    )

    return model

In [7]:
models = {}
metrics = {}

for target in df.iloc[:, 8:15].columns:
    # Get train and test data splits, stratisfy for target.
    target_train_df, target_test_df = train_test_split(
        df, train_size=0.8, shuffle=True, stratify=df[target], random_state=0
    )

    # Get input and target from the data split.
    target_x_train, target_y_train = (
        target_train_df.iloc[:, 1:8],
        target_train_df.iloc[:, 8:15][target],
    )
    target_x_test, target_y_test = (
        target_test_df.iloc[:, 1:8],
        target_test_df.iloc[:, 8:15][target],
    )
    
    # Calculate class weights.
    target_class_weights = dict(enumerate(class_weight.compute_class_weight("balanced", classes=np.unique(target_y_train), y=target_y_train)))

    # Create classifier.
    target_model = create_ann_model(target_class_weights)

    # Train model with data specified for target.
    target_model.fit(target_x_train, target_y_train, epochs=25, batch_size=16, verbose=0)
    models[target] = target_model
    
    target_model.save(f"./models/ann_{ target }.h5")

    # Evaluate trained classifier.
    target_y_predictions = (target_model.predict(target_x_test) >= 0.5).astype("int32")

    # Calculate metrics.
    accuracy = accuracy_score(target_y_test, target_y_predictions)
    precision = precision_score(target_y_test, target_y_predictions)
    recall = recall_score(target_y_test, target_y_predictions)
    f1 = f1_score(target_y_test, target_y_predictions)

    metrics[target] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    }

models, metrics

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


  _warn_prf(average, modifier, msg_start, len(result))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


  _warn_prf(average, modifier, msg_start, len(result))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


  _warn_prf(average, modifier, msg_start, len(result))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


  _warn_prf(average, modifier, msg_start, len(result))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


  _warn_prf(average, modifier, msg_start, len(result))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


KeyboardInterrupt: 