In [112]:
# Sampling libraries
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample


# SalesInsightsPreprocessing
from PreprocessingScript import *

# Set print options
pd.set_option('display.max_columns', None)

df = mergedHospitalPharmacyDfYearAndSortedWithTypes

# Separate the rows where Type is 0 and 1
df_type_0 = df[df['Type'] == 0]
df_type_1 = df[df['Type'] == 1]


#### Define features/independent variables 'X', and specify our target/dependent variable, y

In [124]:
# Below, we make a list of features/independent variables 'X', and specify our target/dependent variable, y
# The model will guess/predict the 'y' feature (our target) based on the list of features, 'X'
# Running the cell will not produce any output. This is because we are defining X and y, which we will be using in the next section to train our model

X = df.drop(['Volume', 'Account Description', 'Size', 'Year Month (after 2000) in Datetime', 'Value'], axis=1).values
y = df['Volume'].values

X_type_0 = df_type_0.values
y_type_0 = df_type_0['Volume'].values


In [125]:
# split data into test and train - 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_type_0, X_test_type_0, y_train_type_0, y_test_type_0 = train_test_split(X_type_0, y_type_0, test_size=0.2, random_state=42)

#### Oversample hospital values

In [126]:
# oversampling for df_type_0
ros = RandomOverSampler(random_state=42)
X_over, y_over = ros.fit_resample(X_train_type_0, y_train_type_0)

In [130]:
# make the resampled data into a dataframe
hospital_df_resampled = pd.DataFrame(X_over, columns=df.columns)

print(len(hospital_df_resampled))
print(len(df_type_1))

oversampled_df = pd.concat([hospital_df_resampled, df_type_1], ignore_index=True)

6752
5811


#### Add more random values to df and oversampled_df

In [133]:
# Define the number of rows to generate
n_samples = 20000

# Generate additional rows for df and oversampled_df
df_resampled = resample(df, replace=True, n_samples=n_samples, random_state=42)
oversampled_df_resampled = resample(oversampled_df, replace=True, n_samples=n_samples, random_state=42)

# Concatenate the resampled dataframes
df_resampled = pd.concat([df, df_resampled], ignore_index=True)
oversampled_df_resampled = pd.concat([oversampled_df, oversampled_df_resampled], ignore_index=True)


#### Show all dataframes after sampling etc

In [135]:
# L04AC05 = 0
# Hospital = 0
# Pharmacy = 1

df_C05 = df[df['WHO ATC 5 Code'] == 0]
df_over_C05 = oversampled_df[oversampled_df['WHO ATC 5 Code'] == 0]
df_res_C05 = df_resampled[df_resampled['WHO ATC 5 Code'] == 0]
df_over_res_C05 = oversampled_df_resampled[oversampled_df_resampled['WHO ATC 5 Code'] == 0]

df_c05_hospitals = df_C05[df_C05['Type'] == 0]
df_c05_pharmacies = df_C05[df_C05['Type'] == 1]

df_over_c05_hospitals = df_over_C05[df_over_C05['Type'] == 0]
df_over_c05_pharmacies = df_over_C05[df_over_C05['Type'] == 1]

df_res_c05_hospitals = df_res_C05[df_res_C05['Type'] == 0]
df_res_c05_pharmacies = df_res_C05[df_res_C05['Type'] == 1]

df_over_res_c05_hospitals = df_over_res_C05[df_over_res_C05['Type'] == 0]
df_over_res_c05_pharmacies = df_over_res_C05[df_over_res_C05['Type'] == 1]


print('df:')
print('Total rows of L04AC05:', len(df_C05))
print('Total rows of L04AC05 in hospitals:', len(df_c05_hospitals))
print('Total rows of L04AC05 in pharmacies:', len(df_c05_pharmacies))
print()
print('oversampled df:')
print('Total rows of L04AC05:', len(df_over_C05))
print('Total rows of L04AC05 in hospitals:', len(df_over_c05_hospitals))
print('Total rows of L04AC05 in pharmacies:', len(df_over_c05_pharmacies))
print()
print('resampled df:')
print('Total rows of L04AC05:', len(df_res_C05))
print('Total rows of L04AC05 in hospitals:', len(df_res_c05_hospitals))
print('Total rows of L04AC05 in pharmacies:', len(df_res_c05_pharmacies))
print()
print('oversampled resampled df:')
print('Total rows of L04AC05:', len(df_over_res_C05))
print('Total rows of L04AC05 in hospitals:', len(df_over_res_c05_hospitals))
print('Total rows of L04AC05 in pharmacies:', len(df_over_res_c05_pharmacies))


df:
Total rows of L04AC05: 2126
Total rows of L04AC05 in hospitals: 356
Total rows of L04AC05 in pharmacies: 1770

oversampled df:
Total rows of L04AC05: 5386
Total rows of L04AC05 in hospitals: 3616
Total rows of L04AC05 in pharmacies: 1770

resampled df:
Total rows of L04AC05: 8264
Total rows of L04AC05 in hospitals: 1407
Total rows of L04AC05 in pharmacies: 6857

oversampled resampled df:
Total rows of L04AC05: 13981
Total rows of L04AC05 in hospitals: 9405
Total rows of L04AC05 in pharmacies: 4576


In [136]:
# L04AB02 = 1
# Hospital = 0
# Pharmacy = 1

df_B02 = df[df['WHO ATC 5 Code'] == 1]
df_over_B02 = oversampled_df[oversampled_df['WHO ATC 5 Code'] == 1]
df_res_B02 = df_resampled[df_resampled['WHO ATC 5 Code'] == 1]
df_over_res_B02 = oversampled_df_resampled[oversampled_df_resampled['WHO ATC 5 Code'] == 1]

df_b02_hospitals = df_B02[df_B02['Type'] == 0]
df_b02_pharmacies = df_B02[df_B02['Type'] == 1]

df_over_b02_hospitals = df_over_B02[df_over_B02['Type'] == 0]
df_over_b02_pharmacies = df_over_B02[df_over_B02['Type'] == 1]

df_res_b02_hospitals = df_res_B02[df_res_B02['Type'] == 0]
df_res_b02_pharmacies = df_res_B02[df_res_B02['Type'] == 1]

df_over_res_b02_hospitals = df_over_res_B02[df_over_res_B02['Type'] == 0]
df_over_res_b02_pharmacies = df_over_res_B02[df_over_res_B02['Type'] == 1]

print('df:')
print('Total rows of L04AB02:', len(df_B02))
print('Total rows of L04AB02 in hospitals:', len(df_b02_hospitals))
print('Total rows of L04AB02 in pharmacies:', len(df_b02_pharmacies))
print()
print('oversampled df:')
print('Total rows of L04AB02:', len(df_over_B02))
print('Total rows of L04AB02 in hospitals:', len(df_over_b02_hospitals))
print('Total rows of L04AB02 in pharmacies:', len(df_over_b02_pharmacies))
print()
print('resampled df:')
print('Total rows of L04AB02:', len(df_res_B02))
print('Total rows of L04AB02 in hospitals:', len(df_res_b02_hospitals))
print('Total rows of L04AB02 in pharmacies:', len(df_res_b02_pharmacies))
print()
print('oversampled resampled df:')
print('Total rows of L04AB02:', len(df_over_res_B02))
print('Total rows of L04AB02 in hospitals:', len(df_over_res_b02_hospitals))
print('Total rows of L04AB02 in pharmacies:', len(df_over_res_b02_pharmacies))

df:
Total rows of L04AB02: 1245
Total rows of L04AB02 in hospitals: 228
Total rows of L04AB02 in pharmacies: 1017

oversampled df:
Total rows of L04AB02: 3022
Total rows of L04AB02 in hospitals: 2005
Total rows of L04AB02 in pharmacies: 1017

resampled df:
Total rows of L04AB02: 4882
Total rows of L04AB02 in hospitals: 887
Total rows of L04AB02 in pharmacies: 3995

oversampled resampled df:
Total rows of L04AB02: 7798
Total rows of L04AB02 in hospitals: 5204
Total rows of L04AB02 in pharmacies: 2594


In [137]:
# L04AB05 = 2
# Hospital = 0
# Pharmacy = 1

df_B05 = df[df['WHO ATC 5 Code'] == 2]
df_over_B05 = oversampled_df[oversampled_df['WHO ATC 5 Code'] == 2]
df_res_B05 = df_resampled[df_resampled['WHO ATC 5 Code'] == 2]
df_over_res_B05 = oversampled_df_resampled[oversampled_df_resampled['WHO ATC 5 Code'] == 2]

df_b05_hospitals = df_B05[df_B05['Type'] == 0]
df_b05_pharmacies = df_B05[df_B05['Type'] == 1]

df_over_b05_hospitals = df_over_B05[df_over_B05['Type'] == 0]
df_over_b05_pharmacies = df_over_B05[df_over_B05['Type'] == 1]

df_res_b05_hospitals = df_res_B05[df_res_B05['Type'] == 0]
df_res_b05_pharmacies = df_res_B05[df_res_B05['Type'] == 1]

df_over_res_b05_hospitals = df_over_res_B05[df_over_res_B05['Type'] == 0]
df_over_res_b05_pharmacies = df_over_res_B05[df_over_res_B05['Type'] == 1]

print('df:')
print('Total rows of L04AB05:', len(df_B05))
print('Total rows of L04AB05 in hospitals:', len(df_b05_hospitals))
print('Total rows of L04AB05 in pharmacies:', len(df_b05_pharmacies))
print()
print('oversampled df:')
print('Total rows of L04AB05:', len(df_over_B05))
print('Total rows of L04AB05 in hospitals:', len(df_over_b05_hospitals))
print('Total rows of L04AB05 in pharmacies:', len(df_over_b05_pharmacies))
print()
print('resampled df:')
print('Total rows of L04AB05:', len(df_res_B05))
print('Total rows of L04AB05 in hospitals:', len(df_res_b05_hospitals))
print('Total rows of L04AB05 in pharmacies:', len(df_res_b05_pharmacies))
print()
print('oversampled resampled df:')
print('Total rows of L04AB05:', len(df_over_res_B05))
print('Total rows of L04AB05 in hospitals:', len(df_over_res_b05_hospitals))
print('Total rows of L04AB05 in pharmacies:', len(df_over_res_b05_pharmacies))

df:
Total rows of L04AB05: 3505
Total rows of L04AB05 in hospitals: 481
Total rows of L04AB05 in pharmacies: 3024

oversampled df:
Total rows of L04AB05: 4155
Total rows of L04AB05 in hospitals: 1131
Total rows of L04AB05 in pharmacies: 3024

resampled df:
Total rows of L04AB05: 13730
Total rows of L04AB05 in hospitals: 1860
Total rows of L04AB05 in pharmacies: 11870

oversampled resampled df:
Total rows of L04AB05: 10784
Total rows of L04AB05 in hospitals: 2948
Total rows of L04AB05 in pharmacies: 7836


#### Summarization of the dataframes we will use

In [138]:
# L04AC05
df_C05
df_over_C05
df_res_C05
df_over_res_C05

# L04AB02
df_B02
df_over_B02
df_res_B02
df_over_res_B02

# L04AB05
df_B05
df_over_B05
df_res_B05
df_over_res_B05

Unnamed: 0,Account Description ID,Type,Account Description,Size,Size Numeric,Strength (mg),WHO ATC 5 Code,Volume,Value,Year Month (after 2000),Year Month (after 2000) in Datetime,Region_Hovedstaden,Region_Midtjylland,Region_Nordjylland,Region_Sjælland,Region_Syddanmark,Product_Cimzia,Product_Inflectra,Product_Remicade,Product_Remsima,Product_Stelara,Product_Zessly
0,0,0,4070 Sjællands Universitetshospital. Roskilde,1 stk. (0.5 ml),1.0,45.0,2,3.0,81223.86,1111.0,2011-11,0,0,0,1,0,0,0,0,0,1,0
1,1,0,4120 Odense Universitetshospital,1 stk. (0.5 ml),1.0,45.0,2,24.0,587208.0,1708.0,2017-08,0,0,0,0,1,0,0,0,0,1,0
2,0,0,4070 Sjællands Universitetshospital. Roskilde,1 stk. (0.5 ml),1.0,45.0,2,6.0,162447.72,1104.0,2011-04,0,0,0,1,0,0,0,0,0,1,0
4,1,0,4120 Odense Universitetshospital,1 stk. (0.5 ml),1.0,45.0,2,21.0,540494.01,1509.0,2015-09,0,0,0,0,1,0,0,0,0,1,0
5,0,0,4070 Sjællands Universitetshospital. Roskilde,1 eng. spr. a 1 ml,1.0,90.0,2,24.0,602264.64,1612.0,2016-12,0,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32552,107,1,0140 Køge Torvets apotek,1 stk. (0.5 ml),1.0,45.0,2,1.0,25737.81,1509.0,2015-09,0,0,0,1,0,0,0,0,0,1,0
32555,0,0,4070 Sjællands Universitetshospital. Roskilde,1 eng. spr. a 1 ml,1.0,90.0,2,11.0,297820.82,1212.0,2012-12,0,0,0,1,0,0,0,0,0,1,0
32556,1,0,4120 Odense Universitetshospital,1 eng. spr. a 1 ml,1.0,90.0,2,32.0,782944.0,1705.0,2017-05,0,0,0,0,1,0,0,0,0,1,0
32557,200,1,2480 Hedensted apotek,1 stk. (0.5 ml),1.0,45.0,2,1.0,25737.81,1412.0,2014-12,0,1,0,0,0,0,0,0,0,1,0


In [139]:
X_train
y_train

X_over = oversampled_df.drop(['Volume', 'Account Description', 'Size', 'Year Month (after 2000) in Datetime', 'Value'], axis=1).values
y_over = oversampled_df['Volume'].values

X_res = df_resampled.drop(['Volume', 'Account Description', 'Size', 'Year Month (after 2000) in Datetime', 'Value'], axis=1).values
y_res = df_resampled['Volume'].values

X_res_over = oversampled_df_resampled.drop(['Volume', 'Account Description', 'Size', 'Year Month (after 2000) in Datetime', 'Value'], axis=1).values
y_res_over = oversampled_df_resampled['Volume'].values