In [55]:
# Sampling libraries
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from collections import Counter
from sklearn.model_selection import train_test_split


# SalesInsightsPreprocessing
from PreprocessingScript import *

# Render every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Short working alias for the merged hospital/pharmacy frame.
# NOTE(review): the long name is presumably supplied by the
# `from PreprocessingScript import *` star-import — confirm.
df = mergedHospitalPharmacyDfYearAndSortedWithTypes

# Partition the rows on the encoded 'Type' flag: one frame per value (0 and 1).
df_type_0 = df.loc[df['Type'].eq(0)]
df_type_1 = df.loc[df['Type'].eq(1)]


#### Define features/independent variables 'X', and specify our target/dependent variable, y

In [60]:
# Build the model inputs: X holds the features (independent variables) and y the
# target (dependent variable) the model will predict from X.
# This cell only defines variables for the training section, so it prints nothing.

# Columns kept out of the feature matrix: the target itself ('Volume') plus the
# other columns the original feature selection dropped.
excluded_columns = [
    'Volume',
    'Account Description',
    'Size',
    'Year Month (after 2000) in Datetime',
    'Value',
]

# Features/target over every row of the frame.
X = df.drop(columns=excluded_columns).values
y = df['Volume'].values

# Same construction restricted to the Type == 0 rows.
X_type_0 = df_type_0.drop(columns=excluded_columns).values
y_type_0 = df_type_0['Volume'].values


In [61]:
# Hold out 20% of the rows for testing (80/20 train/test split).
# Both splits use the same options, including a fixed seed, so the partition is
# reproducible from run to run.
split_options = {'test_size': 0.2, 'random_state': 42}

# Split over all rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Split over the Type == 0 rows only.
(
    X_train_type_0,
    X_test_type_0,
    y_train_type_0,
    y_test_type_0,
) = train_test_split(X_type_0, y_type_0, **split_options)

In [52]:
# Inspect the working frame; the rendered table is truncated with a "..." row
# between the first and last rows.
df

Unnamed: 0,Account Description ID,Type,Account Description,Size,Size Numeric,Strength (mg),WHO ATC 5 Code,Volume,Value,Year Month (after 2000),Year Month (after 2000) in Datetime,Region_Hovedstaden,Region_Midtjylland,Region_Nordjylland,Region_Sjælland,Region_Syddanmark,Product_Cimzia,Product_Inflectra,Product_Remicade,Product_Remsima,Product_Stelara,Product_Zessly
0,0,0,3061 Aalborg ø Fyrkilden,1 eng. spr. a 1 ml,1.0,90.0,0,2.0,51475.62,1508.0,2015-08,0,0,1,0,0,0,0,0,0,1,0
1,0,0,3061 Aalborg ø Fyrkilden,1 eng. spr. a 1 ml,1.0,90.0,0,2.0,51475.62,1510.0,2015-10,0,0,1,0,0,0,0,0,0,1,0
2,0,0,3061 Aalborg ø Fyrkilden,1 stk. (0.5 ml),1.0,45.0,0,1.0,25737.81,1511.0,2015-11,0,0,1,0,0,0,0,0,0,1,0
3,0,0,3061 Aalborg ø Fyrkilden,1 eng. spr. a 1 ml,1.0,90.0,0,1.0,25737.81,1512.0,2015-12,0,0,1,0,0,0,0,0,0,1,0
4,0,0,3061 Aalborg ø Fyrkilden,1 eng. spr. a 1 ml,1.0,90.0,0,1.0,25737.81,1602.0,2016-02,0,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6871,257,1,4398 Viborg Sygehus. apoteket,1 htgl.,1.0,100.0,1,472.0,2028009.36,1310.0,2013-10,0,1,0,0,0,0,0,1,0,0,0
6872,257,1,4398 Viborg Sygehus. apoteket,2 stk.,2.0,200.0,2,45.0,354665.70,1311.0,2013-11,0,1,0,0,0,1,0,0,0,0,0
6873,257,1,4398 Viborg Sygehus. apoteket,1 htgl.,1.0,100.0,1,410.0,1761618.30,1311.0,2013-11,0,1,0,0,0,0,0,1,0,0,0
6874,257,1,4398 Viborg Sygehus. apoteket,2 stk.,2.0,200.0,2,52.0,409835.92,1312.0,2013-12,0,1,0,0,0,1,0,0,0,0,0


#### Oversample hospital values

In [53]:
# Encoded values used by the filters in this cell:
#   WHO ATC 5 Code: 0 -> L04AC05
#   Type:           0 -> Hospital, 1 -> Pharmacy

# Every row recorded for ATC code L04AC05.
df_C05 = df.loc[df['WHO ATC 5 Code'].eq(0)]

# The same rows split by account type.
df_c05_hospitals = df_C05.loc[df_C05['Type'].eq(0)]
df_c05_pharmacies = df_C05.loc[df_C05['Type'].eq(1)]

# Report the row counts: overall, then per account type.
for _label, _frame in (
    ('Total rows of L04AC05:', df_C05),
    ('Total rows of L04AC05 in hospitals:', df_c05_hospitals),
    ('Total rows of L04AC05 in pharmacies:', df_c05_pharmacies),
):
    print(_label, len(_frame))

Total rows of L04AC05: 3505
Total rows of L04AC05 in hospitals: 500
Total rows of L04AC05 in pharmacies: 3005


In [64]:
# Randomly over-sample the Type == 0 *training* split; the held-out test split is
# not touched here.
# NOTE(review): sampling_strategy='minority' resamples only the least frequent
# target value, and 'Volume' looks like a numeric quantity rather than a class
# label — confirm this is the intended balancing strategy.
print("Count before:", Counter(y_train_type_0))

# Fixed seed (same value as the train/test split) so the duplicated rows are
# identical on every Restart & Run All instead of changing from run to run.
ranOveSam = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_train_type_0_resampled, y_train_type_0_resampled = ranOveSam.fit_resample(
    X_train_type_0, y_train_type_0
)
print("Count after:", Counter(y_train_type_0_resampled))

# Backward-compatible aliases for cells that still read the old names.
# NOTE(review): X_type_0 / y_type_0 originally held the full, unsplit Type == 0
# data; rebinding them here means re-running the split cell afterwards would split
# the already-resampled training rows. Prefer the *_resampled names in new code.
X_type_0, y_type_0 = X_train_type_0_resampled, y_train_type_0_resampled

Count before: Counter({np.float64(1.0): 41, np.float64(2.0): 30, np.float64(10.0): 29, np.float64(8.0): 27, np.float64(384.0): 27, np.float64(7.0): 24, np.float64(3.0): 22, np.float64(4.0): 21, np.float64(480.0): 20, np.float64(5.0): 18, np.float64(11.0): 16, np.float64(30.0): 16, np.float64(6.0): 16, np.float64(14.0): 15, np.float64(576.0): 15, np.float64(20.0): 15, np.float64(16.0): 14, np.float64(12.0): 14, np.float64(40.0): 14, np.float64(26.0): 13, np.float64(22.0): 13, np.float64(21.0): 13, np.float64(23.0): 13, np.float64(15.0): 12, np.float64(13.0): 11, np.float64(17.0): 10, np.float64(18.0): 10, np.float64(25.0): 10, np.float64(50.0): 10, np.float64(28.0): 9, np.float64(9.0): 9, np.float64(19.0): 9, np.float64(288.0): 9, np.float64(24.0): 8, np.float64(48.0): 8, np.float64(1260.0): 8, np.float64(60.0): 7, np.float64(37.0): 7, np.float64(70.0): 7, np.float64(34.0): 6, np.float64(51.0): 6, np.float64(42.0): 5, np.float64(1470.0): 5, np.float64(58.0): 5, np.float64(41.0): 5, np.f

In [18]:
# Encoded values used by the filters in this cell:
#   WHO ATC 5 Code: 1 -> L04AB02
#   Type:           0 -> Hospital, 1 -> Pharmacy
#
# The frame stores both columns as integer codes (the L04AC05 cell filters on
# them the same way), so the filters must compare against the codes. Comparing
# against the raw labels ('L04AB02', 'Hospital', 'Pharmacy') matches no rows.

# Every row recorded for ATC code L04AB02.
df_B02 = df[df['WHO ATC 5 Code'] == 1]

# The same rows split by account type.
df_b02_hospitals = df_B02[df_B02['Type'] == 0]
df_b02_pharmacies = df_B02[df_B02['Type'] == 1]

print('Total rows of L04AB02:', len(df_B02))
print('Total rows of L04AB02 in hospitals:', len(df_b02_hospitals))
print('Total rows of L04AB02 in pharmacies:', len(df_b02_pharmacies))

Total rows of L04AB02: 2126
Total rows of L04AB02 in hospitals: 356
Total rows of L04AB02 in pharmacies: 1770


In [20]:
# Encoded values used by the filters in this cell:
#   WHO ATC 5 Code: 2 -> L04AB05
#   Type:           0 -> Hospital, 1 -> Pharmacy
#
# The frame stores both columns as integer codes (the L04AC05 cell filters on
# them the same way), so the filters must compare against the codes. Comparing
# against the raw labels ('L04AB05', 'Hospital', 'Pharmacy') matches no rows.

# Every row recorded for ATC code L04AB05.
df_B05 = df[df['WHO ATC 5 Code'] == 2]

# The same rows split by account type.
df_b05_hospitals = df_B05[df_B05['Type'] == 0]
df_b05_pharmacies = df_B05[df_B05['Type'] == 1]

print('Total rows of L04AB05:', len(df_B05))
print('Total rows of L04AB05 in hospitals:', len(df_b05_hospitals))
print('Total rows of L04AB05 in pharmacies:', len(df_b05_pharmacies))

Total rows of L04AB05: 1245
Total rows of L04AB05 in hospitals: 228
Total rows of L04AB05 in pharmacies: 1017
