In [65]:
# Sampling libraries
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from collections import Counter
from sklearn.model_selection import train_test_split


# SalesInsightsPreprocessing
from PreprocessingScript import *

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Short working alias for the merged hospital/pharmacy frame.
# NOTE(review): neither `pd` nor this frame is imported explicitly in this cell;
# both presumably arrive through the star import of PreprocessingScript — confirm.
df = mergedHospitalPharmacyDfYearAndSortedWithTypes

# Partition the rows by the encoded 'Type' column
# (the notebook's own legend further down: 0 = hospital, 1 = pharmacy).
df_type_0 = df.loc[df['Type'] == 0]
df_type_1 = df.loc[df['Type'] == 1]


#### Define features/independent variables 'X', and specify our target/dependent variable, y

In [66]:
# Build the feature matrices (X) and target vectors (y) used by the later cells.
# The target is 'Volume'; the model predicts it from whatever columns remain in X.
# This cell only defines variables, so it produces no output.

# Columns left out of the feature matrix: the target itself plus four other
# columns that are not used as features.
non_feature_columns = [
    'Volume',
    'Account Description',
    'Size',
    'Year Month (after 2000) in Datetime',
    'Value',
]

# All rows (every 'Type').
X = df.drop(columns=non_feature_columns).values
y = df['Volume'].values

# Type-0 rows only.
X_type_0 = df_type_0.drop(columns=non_feature_columns).values
y_type_0 = df_type_0['Volume'].values


In [67]:
# Hold out 20% of the rows for testing (80/20 split); the fixed seed keeps the
# split reproducible across runs.
split_options = {'test_size': 0.2, 'random_state': 42}

# Split of the full data set.
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Independent split of the type-0 rows only.
(
    X_train_type_0,
    X_test_type_0,
    y_train_type_0,
    y_test_type_0,
) = train_test_split(X_type_0, y_type_0, **split_options)

#### Oversample hospital values

In [68]:
# Randomly over-sample the training split of the type-0 rows (df_type_0).
# NOTE(review): the sampler balances the distinct values of the `y` it is given,
# and `y` here is 'Volume', not 'Type' — confirm that replicating rows per Volume
# value (rather than balancing hospital vs. pharmacy rows) is what is intended.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(
    X=X_train_type_0,
    y=y_train_type_0,
)

In [69]:
# Turn the resampled type-0 feature matrix back into a DataFrame, labelling the
# columns with the same feature columns (and order) that X_type_0 was built from.
feature_columns = df.drop(
    columns=['Volume', 'Account Description', 'Size', 'Year Month (after 2000) in Datetime', 'Value']
).columns
hospital_df_resampled = pd.DataFrame(X_resampled, columns=feature_columns)

# Re-attach the resampled target. X_resampled does not contain 'Volume', so
# without this every resampled row would end up with Volume = NaN once it is
# concatenated with df_type_1 (which still carries its 'Volume' column).
hospital_df_resampled['Volume'] = y_resampled

print(len(hospital_df_resampled))
print(len(df_type_1))

# Stack the resampled type-0 rows on top of the untouched type-1 rows.
# NOTE(review): the type-0 rows come from the *training* split only, while
# df_type_1 holds every type-1 row; the other dropped columns
# ('Account Description', 'Size', 'Year Month (after 2000) in Datetime', 'Value')
# remain NaN for the resampled rows — confirm both are acceptable downstream.
oversampled_df = pd.concat([hospital_df_resampled, df_type_1], ignore_index=True)

6752
5811


#### Add more random values to df

#### Show row counts per WHO ATC 5 code, before and after oversampling

In [83]:
# Encodings used by the filters below:
#   'WHO ATC 5 Code' == 0 -> L04AC05
#   'Type' == 0 -> hospital, 'Type' == 1 -> pharmacy

# L04AC05 rows in the original frame and in the oversampled frame.
df_C05 = df.loc[df['WHO ATC 5 Code'] == 0]
df_over_C05 = oversampled_df.loc[oversampled_df['WHO ATC 5 Code'] == 0]

# Break each of those down by account type.
df_c05_hospitals = df_C05.loc[df_C05['Type'] == 0]
df_c05_pharmacies = df_C05.loc[df_C05['Type'] == 1]
df_over_c05_hospitals = df_over_C05.loc[df_over_C05['Type'] == 0]
df_over_c05_pharmacies = df_over_C05.loc[df_over_C05['Type'] == 1]

# One entry per report section: heading, all rows, hospital rows, pharmacy rows.
c05_report_sections = (
    ('og df:', df_C05, df_c05_hospitals, df_c05_pharmacies),
    ('oversampled df:', df_over_C05, df_over_c05_hospitals, df_over_c05_pharmacies),
)

for section_index, (heading, all_rows, hospital_rows, pharmacy_rows) in enumerate(c05_report_sections):
    if section_index:
        print()  # blank line between the two sections
    print(heading)
    print('Total rows of L04AC05:', len(all_rows))
    print('Total rows of L04AC05 in hospitals:', len(hospital_rows))
    print('Total rows of L04AC05 in pharmacies:', len(pharmacy_rows))


og df:
Total rows of L04AC05: 2126
Total rows of L04AC05 in hospitals: 356
Total rows of L04AC05 in pharmacies: 1770

oversampled df:
Total rows of L04AC05: 5386
Total rows of L04AC05 in hospitals: 3616
Total rows of L04AC05 in pharmacies: 1770


In [84]:
# Encodings used by the filters below:
#   'WHO ATC 5 Code' == 1 -> L04AB02
#   'Type' == 0 -> hospital, 'Type' == 1 -> pharmacy

# L04AB02 rows in the original frame and in the oversampled frame.
df_B02 = df.loc[df['WHO ATC 5 Code'] == 1]
df_over_B02 = oversampled_df.loc[oversampled_df['WHO ATC 5 Code'] == 1]

# Break each of those down by account type.
df_b02_hospitals = df_B02.loc[df_B02['Type'] == 0]
df_b02_pharmacies = df_B02.loc[df_B02['Type'] == 1]
df_over_b02_hospitals = df_over_B02.loc[df_over_B02['Type'] == 0]
df_over_b02_pharmacies = df_over_B02.loc[df_over_B02['Type'] == 1]

# One entry per report section: heading, all rows, hospital rows, pharmacy rows.
b02_report_sections = (
    ('og df:', df_B02, df_b02_hospitals, df_b02_pharmacies),
    ('oversampled df:', df_over_B02, df_over_b02_hospitals, df_over_b02_pharmacies),
)

for section_index, (heading, all_rows, hospital_rows, pharmacy_rows) in enumerate(b02_report_sections):
    if section_index:
        print()  # blank line between the two sections
    print(heading)
    print('Total rows of L04AB02:', len(all_rows))
    print('Total rows of L04AB02 in hospitals:', len(hospital_rows))
    print('Total rows of L04AB02 in pharmacies:', len(pharmacy_rows))

og df:
Total rows of L04AB02: 1245
Total rows of L04AB02 in hospitals: 228
Total rows of L04AB02 in pharmacies: 1017

oversampled df:
Total rows of L04AB02: 3022
Total rows of L04AB02 in hospitals: 2005
Total rows of L04AB02 in pharmacies: 1017


In [85]:
# Encodings used by the filters below:
#   'WHO ATC 5 Code' == 2 -> L04AB05
#   'Type' == 0 -> hospital, 'Type' == 1 -> pharmacy

# L04AB05 rows in the original frame and in the oversampled frame.
df_B05 = df.loc[df['WHO ATC 5 Code'] == 2]
df_over_B05 = oversampled_df.loc[oversampled_df['WHO ATC 5 Code'] == 2]

# Break each of those down by account type.
df_b05_hospitals = df_B05.loc[df_B05['Type'] == 0]
df_b05_pharmacies = df_B05.loc[df_B05['Type'] == 1]
df_over_b05_hospitals = df_over_B05.loc[df_over_B05['Type'] == 0]
df_over_b05_pharmacies = df_over_B05.loc[df_over_B05['Type'] == 1]

# One entry per report section: heading, all rows, hospital rows, pharmacy rows.
b05_report_sections = (
    ('og df:', df_B05, df_b05_hospitals, df_b05_pharmacies),
    ('oversampled df:', df_over_B05, df_over_b05_hospitals, df_over_b05_pharmacies),
)

for section_index, (heading, all_rows, hospital_rows, pharmacy_rows) in enumerate(b05_report_sections):
    if section_index:
        print()  # blank line between the two sections
    print(heading)
    print('Total rows of L04AB05:', len(all_rows))
    print('Total rows of L04AB05 in hospitals:', len(hospital_rows))
    print('Total rows of L04AB05 in pharmacies:', len(pharmacy_rows))

og df:
Total rows of L04AB05: 3505
Total rows of L04AB05 in hospitals: 481
Total rows of L04AB05 in pharmacies: 3024

oversampled df:
Total rows of L04AB05: 4155
Total rows of L04AB05 in hospitals: 1131
Total rows of L04AB05 in pharmacies: 3024
