In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Name of the CSV to analyse; it is looked up in the sibling ../data directory.
FILENAME = "drug_sex_values.csv"
filepath = "../data/" + FILENAME

# Read the whole file into a DataFrame using pandas' default parsing options.
drug_sex_df = pd.read_csv(filepath_or_buffer=filepath)

# Preview the first five rows as a quick sanity check of the columns.
drug_sex_df.head(n=5)

Unnamed: 0,sex,time,start_time,end_time,setting,all drugs,all opioids,stimulants,cannabis,benzodiazepine
0,female,1,2020-01-01,2020-01-31,In Patient,4812.0,583.0,230.0,303.0,91.0
1,female,1,2020-01-01,2020-01-31,Emergency Department,18839.0,767.0,580.0,1116.0,151.0
2,male,1,2020-01-01,2020-01-31,In Patient,5482.0,778.0,537.0,446.0,154.0
3,male,1,2020-01-01,2020-01-31,Emergency Department,18367.0,1304.0,1181.0,1641.0,291.0
4,female,2,2020-02-01,2020-02-29,In Patient,4659.0,630.0,236.0,280.0,99.0


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

# Train a Random Forest that predicts the care setting from the 'all drugs'
# value. Operates on `drug_sex_df`, the frame loaded in the first cell.

# Fixed seed shared by the split and the forest so that the metrics printed
# below are identical on every Restart Kernel -> Run All.
RANDOM_STATE = 42

# Encode the string 'setting' labels as integers. The fitted encoder is kept
# so predictions can be mapped back to the original labels later on.
encoder = LabelEncoder()
drug_sex_df['setting_encoded'] = encoder.fit_transform(drug_sex_df['setting'])

# Single feature ('all drugs', kept 2-D by the double brackets) and the
# encoded target.
X = drug_sex_df[['all drugs']]
y = drug_sex_df['setting_encoded']

# Hold out 20% of the rows for evaluation; seeded for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Fit a 100-tree forest; seeded so bootstrap sampling is reproducible too.
clf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = clf.predict(X_test)

# Report accuracy and the confusion matrix on the held-out rows.
print('Accuracy Score: ', accuracy_score(y_test, y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))

Accuracy Score:  1.0
Confusion Matrix: 
 [[14  0]
 [ 0 19]]


Once we have trained our model, we can use it to predict the `setting` for new, unseen `all drugs` data. Let's say we have a new data point for `all drugs` and we want to predict whether the setting would be "In Patient" or "Emergency Department":

In [10]:
# Sanity-check the fitted classifier (`clf`) on two hand-made observations.
#
# The mock values are chosen from the ranges visible in the head() preview of
# the data: Emergency Department rows have 'all drugs' around 18,000, while
# In Patient rows are around 5,000. (The previous "high" value of 7000 sat
# much closer to the In Patient range than to the Emergency Department one.)
#
# The inputs are wrapped in DataFrames that carry the training column name,
# because the model was fitted on a DataFrame; passing bare lists makes
# scikit-learn warn that X does not have valid feature names.

# mock a high value as ed data
mock_ed_data = pd.DataFrame({'all drugs': [18000]})

# mock a low value as ip data
mock_ip_data = pd.DataFrame({'all drugs': [5000]})

# make a prediction
new_pred_ed = clf.predict(mock_ed_data)
new_pred_ip = clf.predict(mock_ip_data)

# print the predicted setting, mapping the encoded label back to its original name
print('Predicted setting (most likely ed): ', encoder.inverse_transform(new_pred_ed))
print('Predicted setting (most likely ip): ', encoder.inverse_transform(new_pred_ip))

Predicted setting (most likely ip):  ['In Patient']
Predicted setting (most likely ed):  ['Emergency Department']




Note that the Random Forest model only gives you a predicted outcome based on the patterns it learned from the training data. It does not take into account any causal relationships between the features and the target, and the prediction accuracy can vary greatly depending on the quality and representativeness of your training data.
 