## For this notebook, we will programmatically review features for accuracy
This notebook contains a workflow:
- Start with features captured as part of baseline assessment
- Programmatically add new features, representing tests and surveys from the different drug classes
<br>
There are 99 baseline features and 56 optional features to test for prediction value<br>
We will capture the top 16 features from the surveys and tests, in terms of best accuracy on the test set<br>

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display,Markdown

# Load the data
data = pd.read_csv('../data/all_features.csv')
new_features = pd.read_csv('../data/new_features.csv')
baseline_features = pd.read_csv('../data/baseline_features.csv')
del baseline_features['dropout'] # remove the target column

In [38]:
X_features = baseline_features.columns
Y_labels = 'dropout'
new_features = new_features.columns


In [39]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Assuming data is your DataFrame, and X_features, Y_labels, new_features are your lists

def evaluate_model_with_features(X_features, Y_labels, new_features, data):
	results = []
	X = data[X_features]
	Y = data[Y_labels]
	
	# Split data into training and testing sets
	X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
	
	for feature in new_features:
		# Add the new feature to X
		X_train_expanded = X_train.copy()
		X_test_expanded = X_test.copy()
		X_train_expanded[feature] = data[feature][:len(X_train)]
		X_test_expanded[feature] = data[feature][-len(X_test):]
		
		# Define the model and grid search parameters
		model = xgb.XGBClassifier(use_label_encoder=False)
		parameters = {'max_depth': [3, 5, 7], 'n_estimators': [50, 100, 200]}
		grid_search = GridSearchCV(model, parameters, cv=5, scoring='f1')
		
		# Perform grid search and train the model
		grid_search.fit(X_train_expanded, Y_train)
		best_model = grid_search.best_estimator_
		
		# Evaluate the model
		Y_train_pred = best_model.predict(X_train_expanded)
		Y_test_pred = best_model.predict(X_test_expanded)
		train_accuracy = accuracy_score(Y_train, Y_train_pred)
		test_accuracy = accuracy_score(Y_test, Y_test_pred)
		
		# Record the results
		results.append({'feature': feature, 'train_accuracy': train_accuracy, 'test_accuracy': test_accuracy})
	
	return results


In [40]:
results = evaluate_model_with_features(X_features, Y_labels, new_features, data)

results

[{'feature': 'test_Propoxyphene_0',
  'train_accuracy': 0.9642276422764228,
  'test_accuracy': 0.6493506493506493},
 {'feature': 'test_Amphetamines_0',
  'train_accuracy': 1.0,
  'test_accuracy': 0.6493506493506493},
 {'feature': 'test_Cannabinoids_0',
  'train_accuracy': 0.9967479674796748,
  'test_accuracy': 0.6363636363636364},
 {'feature': 'test_Benzodiazepines_0',
  'train_accuracy': 1.0,
  'test_accuracy': 0.6428571428571429},
 {'feature': 'test_MMethadone_0',
  'train_accuracy': 0.9934959349593496,
  'test_accuracy': 0.6298701298701299},
 {'feature': 'test_Oxycodone_0',
  'train_accuracy': 1.0,
  'test_accuracy': 0.6363636363636364},
 {'feature': 'test_Cocaine_0',
  'train_accuracy': 1.0,
  'test_accuracy': 0.6168831168831169},
 {'feature': 'test_Methamphetamine_0',
  'train_accuracy': 1.0,
  'test_accuracy': 0.6363636363636364},
 {'feature': 'test_Propoxyphene_1',
  'train_accuracy': 1.0,
  'test_accuracy': 0.6558441558441559},
 {'feature': 'test_Amphetamines_1',
  'train_accur

In [43]:
# format the results dict into a pandas DataFrame
results_df = pd.DataFrame(results)

results_df = results_df.sort_values(by='test_accuracy', ascending=False)

In [54]:
test_names = [
 'Propoxyphene',
 'Amphetamines',
 'Cannabinoids',
 'Benzodiazepines',
 'MMethadone',
 'Oxycodone',
 'Cocaine',
 'Methamphetamine',
 'Opiate300'
 ]
for test in test_names:
    globals()[test] = results_df[results_df['feature'].str.contains('test_'+test)].sort_index().sort_values('test_accuracy', ascending=False)
    # convert output of globals to markdown and display in ipython
    display(Markdown(globals()[test].to_markdown()))
    

|    | feature             |   train_accuracy |   test_accuracy |
|---:|:--------------------|-----------------:|----------------:|
| 16 | test_Propoxyphene_2 |         1        |        0.675325 |
|  8 | test_Propoxyphene_1 |         1        |        0.655844 |
|  0 | test_Propoxyphene_0 |         0.964228 |        0.649351 |
| 24 | test_Propoxyphene_3 |         0.962602 |        0.642857 |
| 32 | test_Propoxyphene_4 |         0.995122 |        0.642857 |

|    | feature             |   train_accuracy |   test_accuracy |
|---:|:--------------------|-----------------:|----------------:|
| 33 | test_Amphetamines_4 |         1        |        0.668831 |
|  9 | test_Amphetamines_1 |         0.949593 |        0.662338 |
| 17 | test_Amphetamines_2 |         0.957724 |        0.655844 |
|  1 | test_Amphetamines_0 |         1        |        0.649351 |
| 25 | test_Amphetamines_3 |         1        |        0.636364 |

|    | feature             |   train_accuracy |   test_accuracy |
|---:|:--------------------|-----------------:|----------------:|
| 34 | test_Cannabinoids_4 |         1        |        0.668831 |
| 18 | test_Cannabinoids_2 |         0.95935  |        0.642857 |
|  2 | test_Cannabinoids_0 |         0.996748 |        0.636364 |
| 10 | test_Cannabinoids_1 |         0.962602 |        0.623377 |
| 26 | test_Cannabinoids_3 |         0.996748 |        0.623377 |

|    | feature                |   train_accuracy |   test_accuracy |
|---:|:-----------------------|-----------------:|----------------:|
| 19 | test_Benzodiazepines_2 |         0.95935  |        0.655844 |
| 27 | test_Benzodiazepines_3 |         0.954472 |        0.649351 |
|  3 | test_Benzodiazepines_0 |         1        |        0.642857 |
| 35 | test_Benzodiazepines_4 |         1        |        0.642857 |
| 11 | test_Benzodiazepines_1 |         0.96748  |        0.636364 |

|    | feature           |   train_accuracy |   test_accuracy |
|---:|:------------------|-----------------:|----------------:|
| 36 | test_MMethadone_4 |         0.99187  |        0.655844 |
| 20 | test_MMethadone_2 |         0.993496 |        0.642857 |
|  4 | test_MMethadone_0 |         0.993496 |        0.62987  |
| 28 | test_MMethadone_3 |         0.960976 |        0.62987  |
| 12 | test_MMethadone_1 |         1        |        0.623377 |

|    | feature          |   train_accuracy |   test_accuracy |
|---:|:-----------------|-----------------:|----------------:|
|  5 | test_Oxycodone_0 |         1        |        0.636364 |
| 37 | test_Oxycodone_4 |         1        |        0.636364 |
| 21 | test_Oxycodone_2 |         1        |        0.62987  |
| 13 | test_Oxycodone_1 |         1        |        0.623377 |
| 29 | test_Oxycodone_3 |         0.995122 |        0.61039  |

|    | feature        |   train_accuracy |   test_accuracy |
|---:|:---------------|-----------------:|----------------:|
| 14 | test_Cocaine_1 |         0.957724 |        0.642857 |
| 30 | test_Cocaine_3 |         0.957724 |        0.642857 |
| 38 | test_Cocaine_4 |         1        |        0.636364 |
| 22 | test_Cocaine_2 |         0.95935  |        0.62987  |
|  6 | test_Cocaine_0 |         1        |        0.616883 |

|    | feature                |   train_accuracy |   test_accuracy |
|---:|:-----------------------|-----------------:|----------------:|
| 23 | test_Methamphetamine_2 |         0.952846 |        0.675325 |
| 31 | test_Methamphetamine_3 |         0.993496 |        0.655844 |
|  7 | test_Methamphetamine_0 |         1        |        0.636364 |
| 15 | test_Methamphetamine_1 |         0.99187  |        0.636364 |
| 39 | test_Methamphetamine_4 |         0.960976 |        0.62987  |

| feature   | train_accuracy   | test_accuracy   |
|-----------|------------------|-----------------|

In [63]:
import pandas as pd

# Assuming results_df and test_names are defined as per your previous context

# Step 1: Initialize an empty list to store the first row of each DataFrame
first_rows = []

# Step 2: Iterate through the test names
for test in test_names:
	# Assuming each DataFrame is stored in globals() as per your code snippet
	df = globals()[test]
	if not df.empty:
		# Step 3: Append the first row to the list
		first_rows.append(df.iloc[0])

# Step 4: Create a new DataFrame from the list of first rows
new_df = pd.concat(first_rows, axis=1).transpose()

# Display the new DataFrame
top_tests = new_df

In [64]:
survey_names = [
'cannabis',
'cocaine',
'alcohol',
'oxycodone',
'mmethadone',
'amphetamine',
'methamphetamine',
'opiates',
'benzodiazepines'
]

for survey in survey_names:
    globals()[survey] = results_df[results_df['feature'].str.contains(survey)].sort_index().sort_values('test_accuracy', ascending=False)
    # convert output of globals to markdown and display in ipython
    display(Markdown(globals()[survey].to_markdown()))

|    | feature           |   train_accuracy |   test_accuracy |
|---:|:------------------|-----------------:|----------------:|
| 48 | survey_cannabis_4 |         0.95935  |        0.642857 |
| 40 | survey_cannabis_0 |         0.996748 |        0.636364 |

|    | feature          |   train_accuracy |   test_accuracy |
|---:|:-----------------|-----------------:|----------------:|
| 49 | survey_cocaine_4 |         0.957724 |        0.636364 |
| 41 | survey_cocaine_0 |         1        |        0.623377 |

|    | feature          |   train_accuracy |   test_accuracy |
|---:|:-----------------|-----------------:|----------------:|
| 42 | survey_alcohol_0 |         0.962602 |        0.62987  |
| 50 | survey_alcohol_4 |         1        |        0.623377 |

|    | feature            |   train_accuracy |   test_accuracy |
|---:|:-------------------|-----------------:|----------------:|
| 43 | survey_oxycodone_0 |                1 |        0.642857 |
| 51 | survey_oxycodone_4 |                1 |        0.636364 |

|    | feature             |   train_accuracy |   test_accuracy |
|---:|:--------------------|-----------------:|----------------:|
| 44 | survey_mmethadone_0 |                1 |        0.642857 |
| 52 | survey_mmethadone_4 |                1 |        0.642857 |

|    | feature                  |   train_accuracy |   test_accuracy |
|---:|:-------------------------|-----------------:|----------------:|
| 23 | test_Methamphetamine_2   |         0.952846 |        0.675325 |
| 45 | survey_amphetamine_0     |         1        |        0.668831 |
| 31 | test_Methamphetamine_3   |         0.993496 |        0.655844 |
| 53 | survey_amphetamine_4     |         0.95935  |        0.642857 |
|  7 | test_Methamphetamine_0   |         1        |        0.636364 |
| 15 | test_Methamphetamine_1   |         0.99187  |        0.636364 |
| 54 | survey_methamphetamine_4 |         1        |        0.636364 |
| 39 | test_Methamphetamine_4   |         0.960976 |        0.62987  |
| 46 | survey_methamphetamine_0 |         0.954472 |        0.623377 |

|    | feature                  |   train_accuracy |   test_accuracy |
|---:|:-------------------------|-----------------:|----------------:|
| 54 | survey_methamphetamine_4 |         1        |        0.636364 |
| 46 | survey_methamphetamine_0 |         0.954472 |        0.623377 |

| feature   | train_accuracy   | test_accuracy   |
|-----------|------------------|-----------------|

|    | feature                  |   train_accuracy |   test_accuracy |
|---:|:-------------------------|-----------------:|----------------:|
| 55 | survey_benzodiazepines_4 |         1        |        0.642857 |
| 47 | survey_benzodiazepines_0 |         0.962602 |        0.636364 |

In [65]:
import pandas as pd

# Assuming results_df and test_names are defined as per your previous context

# Step 1: Initialize an empty list to store the first row of each DataFrame
first_rows = []

# Step 2: Iterate through the test names
for survey in survey_names:
	# Assuming each DataFrame is stored in globals() as per your code snippet
	df = globals()[survey]
	if not df.empty:
		# Step 3: Append the first row to the list
		first_rows.append(df.iloc[0])

# Step 4: Create a new DataFrame from the list of first rows
new_df = pd.concat(first_rows, axis=1).transpose()

# Display the new DataFrame
top_surveys = new_df

In [70]:
# concat the top_surveys and top_tests
top_features = pd.concat([top_surveys, top_tests])

top_features = top_features.sort_index()

top_features.to_csv('../data/top_features.csv', index=False)

top_features.shape

(16, 3)