# Model creation

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Dataset

In [2]:
df = pd.read_csv("drug_consumption_2.txt", header=None, names=["idx", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]) # TODO: Rename other columns.

df.head()

Unnamed: 0,idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,-0.21712,-1.18084,0,0,0,0,0,0,0
1,2,-0.67825,1.93886,1.43533,0.76096,-0.14277,-0.71126,-0.21575,0,0,0,0,0,0,0
2,3,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,-1.37983,0.40148,0,0,0,0,0,0,0
3,4,-0.14882,-0.80615,-0.01928,0.59042,0.58489,-1.37983,-1.18084,0,0,0,0,0,0,0
4,5,0.73545,-1.6334,-0.45174,-0.30172,1.30612,-0.21712,-0.21575,0,0,0,0,0,0,0


In [3]:
train_df, test_df = train_test_split(df, train_size=0.8, random_state=0)

In [4]:
train_df.info(), test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1508 entries, 840 to 684
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   idx     1508 non-null   int64  
 1   1       1508 non-null   float64
 2   2       1508 non-null   float64
 3   3       1508 non-null   float64
 4   4       1508 non-null   float64
 5   5       1508 non-null   float64
 6   6       1508 non-null   float64
 7   7       1508 non-null   float64
 8   8       1508 non-null   int64  
 9   9       1508 non-null   int64  
 10  10      1508 non-null   int64  
 11  11      1508 non-null   int64  
 12  12      1508 non-null   int64  
 13  13      1508 non-null   int64  
 14  14      1508 non-null   int64  
dtypes: float64(7), int64(8)
memory usage: 188.5 KB
<class 'pandas.core.frame.DataFrame'>
Index: 377 entries, 220 to 616
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   idx     377 non-null    int64  
 1 

(None, None)

In [5]:
x_train, y_train = train_df.iloc[:, 1:8], train_df.iloc[:, 8:15]
x_test, y_test = test_df.iloc[:, 1:8], test_df.iloc[:, 8:15]

x_train, y_train

(            1        2        3        4        5        6        7
 840  -0.67825  0.00332  0.29338  1.81866  1.30612 -0.21712 -0.52593
 1317  0.82562  0.96248 -0.17779 -0.30172 -0.27607  1.29221  0.76540
 1385  0.62967  0.47617  0.44585 -0.60633 -0.89891  1.29221  0.40148
 347  -0.58016  0.32197 -0.45174 -0.15487  0.25953 -0.21712 -1.18084
 602  -0.67825 -0.80615 -0.45174 -1.47955  0.12331 -0.71126 -0.21575
 ...       ...      ...      ...      ...      ...      ...      ...
 835  -0.34799  0.32197  0.14143 -1.21213  0.25953  0.19268  0.40148
 1216  0.31287  1.28610 -0.17779 -0.76096 -1.01450  0.19268 -0.52593
 1653  1.72012 -1.23177 -0.71727 -0.60633  0.12331 -1.37983  0.40148
 559   1.02119  0.16767 -0.71727  0.28783  0.41594 -0.71126 -0.52593
 684   1.60383  0.96248  1.43533 -0.30172 -0.89891  0.88113  1.22470
 
 [1508 rows x 7 columns],
       8  9  10  11  12  13  14
 840   0  0   1   0   0   0   0
 1317  0  0   0   0   0   0   0
 1385  0  0   0   0   0   0   0
 347   0  0   0 

## Decision Tree

In [6]:
# Save classifiers in dictionary.
models = {}

# Create classifier for each target.
for target_column in y_train.columns:
    # Create classifier.
    target_clf = DecisionTreeClassifier()
    
    # Save classifier to specific target.
    models[target_column] = target_clf
    
models

{8: DecisionTreeClassifier(),
 9: DecisionTreeClassifier(),
 10: DecisionTreeClassifier(),
 11: DecisionTreeClassifier(),
 12: DecisionTreeClassifier(),
 13: DecisionTreeClassifier(),
 14: DecisionTreeClassifier()}

In [7]:
# Train each model
for model in models.keys():
    models[model].fit(x_train, y_train[model])

In [8]:
# Evaluate each model
metrics = {}

for target, model in models.items():
    y_predictions = model.predict(x_test)
    
    accuracy = accuracy_score(y_test[target], y_predictions)
    precision = precision_score(y_test[target], y_predictions)
    recall = recall_score(y_test[target], y_predictions)
    f1 = f1_score(y_test[target], y_predictions)
    
    metrics[target] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    }
    
metrics

  _warn_prf(average, modifier, msg_start, len(result))


{8: {'accuracy': 0.8938992042440318,
  'precision': 0.15,
  'recall': 0.11538461538461539,
  'f1_score': 0.13043478260869565},
 9: {'accuracy': 0.8779840848806366,
  'precision': 0.1111111111111111,
  'recall': 0.12,
  'f1_score': 0.11538461538461538},
 10: {'accuracy': 0.6737400530503979,
  'precision': 0.32608695652173914,
  'recall': 0.32967032967032966,
  'f1_score': 0.32786885245901637},
 11: {'accuracy': 0.9575596816976127,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0},
 12: {'accuracy': 0.9973474801061007,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0},
 13: {'accuracy': 0.9257294429708223,
  'precision': 0.1,
  'recall': 0.16666666666666666,
  'f1_score': 0.125},
 14: {'accuracy': 0.986737400530504,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0}}

## K-NN