In [60]:
import pandas as pd
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

from src.helpers import load_csv


# Input CSV name, and its location relative to this notebook's working directory.
FILENAME = "drug_sex_values.csv"
filepath = "../data/" + FILENAME

In [61]:
# Load the CSV and discard the columns that are not used in this regression.
unused_columns = ['all opioids', 'stimulants', 'cannabis', 'benzodiazepine', 'value']
drug_sex_df = load_csv(filepath).drop(columns=unused_columns)

# linear regression equation
$drug_{count}$ = $\beta_0$ + $\beta_1*Time$ + $\beta_2*Sex$ + $\beta_3*Setting$ + $\epsilon$

In [62]:
# Peek at the first three rows to confirm which columns remain after the drop.
drug_sex_df.head(n=3)

Unnamed: 0,sex,time,start_time,end_time,setting,all drugs
0,female,1,01/01/2020,01/31/2020,ip,4812.0
1,female,1,01/01/2020,01/31/2020,ed,18839.0
2,male,1,01/01/2020,01/31/2020,ip,5482.0


# One-hot encoding the categorical variables 'sex' and 'setting'

In [63]:
# Categorical predictors that must be dummy-coded before fitting the regression.
categorical_cols = ['sex', 'setting']

# drop='first' removes the first category of each feature so the dummies are not
# perfectly collinear with the intercept; the dropped level is the reference level.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name first
# and fall back to the old one for older installations.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)

encoded_features = encoder.fit_transform(drug_sex_df[categorical_cols])

# Reuse the source frame's index: pd.concat(axis=1) aligns rows by index label,
# so a fresh RangeIndex here would misalign rows (and introduce NaNs) whenever
# drug_sex_df does not carry a default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=drug_sex_df.index,
)

# Concatenate the encoded features with the original dataframe
data_encoded = pd.concat([drug_sex_df, encoded_df], axis=1)

# Drop the original categorical columns and other columns not needed for regression
data_encoded = data_encoded.drop(columns=categorical_cols + ['start_time', 'end_time'])



# Splitting the data into training and testing sets (80% train, 20% test)


In [64]:
# Separate the response column ('all drugs') from the predictors.
y = data_encoded['all drugs']
X = data_encoded.drop(columns=['all drugs'])

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preview the training predictors.
X_train.head()


Unnamed: 0,time,sex_male,setting_ip
84,22,0.0,1.0
2,1,1.0,1.0
94,24,1.0,1.0
45,12,0.0,0.0
42,11,1.0,1.0


# Multiple Linear Regression Model

In [65]:
# Create the linear regression model and fit it on the training split
# (fit() returns the estimator itself, so the two steps can be chained).
lr = LinearRegression().fit(X_train, y_train)

# Predict the response for the held-out test rows.
y_pred = lr.predict(X_test)

# Test-set error metrics; RMSE is the square root of MSE.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Display the three metrics as the cell output.
mae, mse, rmse

(844.9198045679931, 1497236.4165746025, 1223.6161230445612)

# Extracting the coefficients and intercept

In [66]:
# Pull the fitted parameters off the trained model.
intercept = lr.intercept_
coefficients = lr.coef_

# One row per predictor, labelled with the training column names, so each
# coefficient can be read next to the feature it belongs to.
coeff_df = pd.DataFrame({'Coefficient': coefficients}, index=X_train.columns)

# Display the coefficient table together with the intercept.
coeff_df, intercept

(             Coefficient
 time          -31.777578
 sex_male      245.820863
 setting_ip -10216.630061,
 15145.311128886573)

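# linear regression equation with fitted coefficients:
$\widehat{drug_{count}}$ = $15145.31$ $- 31.78*Time$ $+ 245.82*Sex_{male}$ $- 10216.63*Setting_{ip}$

where $Sex_{male}$ is 1 for male rows (female is the reference level) and $Setting_{ip}$ is 1 for `ip` rows (`ed` is the reference level). The error term $\epsilon$ is omitted because this is the fitted prediction equation.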