In [27]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from src.helpers import load_csv


# Name of the input CSV file.
FILENAME = "drug_sex_values.csv"
# Relative path to that file inside the sibling data directory; passed to load_csv.
filepath = "../data/" + FILENAME

In [28]:
# Load the table and discard the per-substance columns in a single expression,
# so this analysis only carries the aggregate 'all drugs' column forward.
drug_sex_df = (
    load_csv(filepath)
    .drop(columns=['all opioids', 'stimulants', 'cannabis', 'benzodiazepine'])
)

# linear regression equation
$drug_{count} = \beta_0 + \beta_1*Time + \beta_2*Sex + \beta_3*Setting + \epsilon$

In [29]:
# Preview the first three rows of the loaded frame to check the remaining columns.
drug_sex_df.head(3)

Unnamed: 0,sex,time,start_time,end_time,setting,all drugs
0,female,1,2020-01-01,2020-01-31,In Patient,4812.0
1,female,1,2020-01-01,2020-01-31,Emergency Department,18839.0
2,male,1,2020-01-01,2020-01-31,In Patient,5482.0


# One-hot encoding the categorical variables 'sex' and 'setting'

In [30]:
# One-hot encode the two categorical predictors. drop='first' omits one level per
# variable, so each binary variable becomes a single 0/1 indicator column.
categorical_columns = ['sex', 'setting']
encoder = OneHotEncoder(drop='first')

# fit_transform returns a SciPy sparse matrix by default; densify it explicitly
# instead of passing `sparse=False`, which was deprecated in scikit-learn 1.2 and
# removed in 1.4 (this form works on every release).
encoded_features = encoder.fit_transform(drug_sex_df[categorical_columns]).toarray()

# Reuse the source frame's index so the column-wise concat below lines up
# row-for-row even when drug_sex_df does not carry a default RangeIndex.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=drug_sex_df.index,
)

# Concatenate the encoded features with the original dataframe
data_encoded = pd.concat([drug_sex_df, encoded_df], axis=1)

# Drop the original categorical columns and other columns not needed for regression
data_encoded = data_encoded.drop(columns=['sex', 'setting', 'start_time', 'end_time'])



# Splitting the data into training and testing sets (80% train, 20% test)


In [31]:
# Separate the response from the predictors, then hold out 20% of the rows for
# testing. random_state is pinned so the same partition is produced on every run.
target_column = 'all drugs'
y = data_encoded[target_column]
X = data_encoded.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Peek at the training predictors
X_train.head()


Unnamed: 0,time,sex_male,setting_In Patient
84,22,0.0,1.0
2,1,1.0,1.0
94,24,1.0,1.0
45,12,0.0,0.0
42,11,1.0,1.0


# Multiple Linear Regression Model

In [40]:
# Fit a multiple linear regression on the training split
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict the held-out rows
y_pred = lr.predict(X_test)

# Error metrics on the test split; RMSE is the square root of MSE, so it is
# expressed in the same units as the target.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# The last label reads "rmse" (root mean squared error) to match the value printed.
print(f"Multiple Linear regression mae: {mae:.2f}, mse: {mse:.2f}, rmse: {rmse:.2f}")

Multiple Linear regression mae: 844.92, mse: 1497236.42, rsme: 1223.62


# Extracting the coefficients and intercept

In [33]:
# Pull the fitted parameters off the linear model
intercept = lr.intercept_
coefficients = lr.coef_

# One row per predictor, labelled with the training column names
coeff_df = pd.DataFrame({'Coefficient': coefficients}, index=X_train.columns)

# Display the coefficient table together with the intercept
coeff_df, intercept

(                     Coefficient
 time                  -31.777578
 sex_male              245.820863
 setting_In Patient -10216.630061,
 15145.311128886573)

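# linear regression equation with coefficients:
$drug_{count} = 15145.31 + (-31.78*Time) + (245.82*Sex) + (-10216.63*Setting) + \epsilon$

where $Sex = 1$ for male (0 for female) and $Setting = 1$ for In Patient (0 for Emergency Department).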

# Random Forest

In [None]:
# Preview the first three rows again before the random forest section.
# NOTE(review): this cell has no execution count and its saved output shows an extra
# `all_drugs` column that no visible cell creates — likely stale kernel state; confirm
# by running Restart Kernel -> Run All.
drug_sex_df.head(3)

Unnamed: 0,sex,time,start_time,end_time,setting,all drugs,all_drugs
0,female,1,2020-01-01,2020-01-31,In Patient,4812.0,4812.0
1,female,1,2020-01-01,2020-01-31,Emergency Department,18839.0,18839.0
2,male,1,2020-01-01,2020-01-31,In Patient,5482.0,5482.0


In [41]:
# RandomForestRegressor, the metric functions and numpy are imported in the
# notebook's top import cell rather than re-imported here.

# Random forest with 100 trees; random_state is pinned so the fit is reproducible
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Train on the same split used for the linear model so the metrics are comparable
rf.fit(X_train, y_train)

# Predict the held-out rows
y_pred_rf = rf.predict(X_test)

# Error metrics on the test split; RMSE is the square root of MSE, so it is
# expressed in the same units as the target.
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

# The last label reads "rmse" (root mean squared error) to match the value printed.
print(f"Random Forest Regression mae: {mae_rf:.2f}, mse: {mse_rf:.2f}, rmse: {rmse_rf:.2f}")

Random Forest Regression mae: 419.11, mse: 370593.38, rsme: 608.76


In [37]:
# Extract feature importance from the trained Random Forest model
feature_importance = rf.feature_importances_

# Create a DataFrame with features and their importances
features = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importance})

# Sort the DataFrame by importance values
feature_importance_sorted = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_sorted)

              Feature  Importance
2  setting_In Patient    0.928435
0                time    0.067411
1            sex_male    0.004155
