In [None]:
import os
import warnings
import datetime as dt
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly_express as px
import pmdarima as pm
import vaex as vx
from dateutil import parser
from matplotlib import dates as mpl_dates
from scipy.signal import savgol_filter
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller, arma_order_select_ic
from st_app.data_structure import dtypes
# from tensorflow.keras.layers import LSTM, Dense, Flatten
# univariate lstm example
# from tensorflow.keras.models import Sequential

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/vaex/statsmodels.
warnings.filterwarnings('ignore')

# Interactive matplotlib figures rendered as notebook widgets.
%matplotlib widget
# Load the line_profiler IPython extension.
%load_ext line_profiler


In [None]:
# Colour overrides, grouped by rc section. Each call writes the same
# "<group>.<key>" entries into plt.rcParams as a flat dict update would.
plt.rc("lines", color="white")
plt.rc("patch", edgecolor="white")
plt.rc("text", color="black")
plt.rc("axes", facecolor="white", edgecolor="lightgray", labelcolor="white")
plt.rc("xtick", color="white")
plt.rc("ytick", color="white")
plt.rc("grid", color="lightgray")
plt.rc("figure", facecolor="black", edgecolor="black")
plt.rc("savefig", facecolor="black", edgecolor="black")

# NOTE(review): the style sheet is applied after the overrides above, so any
# key it also defines replaces the value set here - confirm this order is
# the intended one.
plt.style.use('dark_background')


In [None]:
# Base directory for every data file below: the kernel's current working
# directory at the time this cell runs.
path = os.getcwd()

# Import the consumption data


def parse_dates(x):
    """Parse a day-first ``DD/MM/YYYY`` string into a ``datetime.datetime``.

    Raises ``ValueError`` (from ``strptime``) when ``x`` does not match
    that layout.
    """
    day_first_format = '%d/%m/%Y'
    parsed = dt.datetime.strptime(x, day_first_format)
    return parsed


# You can try both to check speed of import
# First try the HDF5 file. If it is not found, read the CSV in chunks of
# 500,000 rows instead, parsing 'Sale Date' with parse_dates (DD/MM/YYYY).
# The `convert` target is the same HDF5 path that the `try` branch opens.
try:
    data = vx.open('%s//data//bigdata.hdf5' % path)
except (FileNotFoundError):
    data = vx.read_csv('%s//data//consumption_data.csv' % path, dtype=dtypes,
                       parse_dates=['Sale Date'], date_parser=parse_dates,
                       chunk_size=500_000, convert='%s//data//bigdata.hdf5' % path)


In [None]:
# Strip surrounding whitespace from the facility names (the filtering on
# active facilities happens in a later cell, not here).
# NOTE(review): the lambda is applied before rows with missing values are
# dropped below - presumably 'Sale Facility' has no missing entries at this
# point, otherwise x.strip() would fail; confirm.
data['Sale Facility'] = data['Sale Facility'].apply(lambda x: x.strip()).values

# Drop rows where 'Product Name' or 'Sale Facility' is missing
data = data.dropmissing(['Product Name', 'Sale Facility'])


In [129]:
# Import VFL_CHECK: the facility list. 'DISPENSATION VFL' is whitespace-
# normalised so it can be matched against the stripped 'Sale Facility' names.
vfl_check = pd.read_csv('%s//data//vfl_check.csv' % path)
vfl_check['DISPENSATION VFL'] = vfl_check['DISPENSATION VFL'].str.strip()
vfl_list = vfl_check['DISPENSATION VFL'].tolist()
vfl_vx = vx.from_pandas(vfl_check)

# Import VDL Check: the product list. 'Product ID' is read as a string so
# that the identifiers are not converted to numbers.
vdl_check = pd.read_csv('%s//data//vdl_check.csv' % path, encoding='cp1252', dtype={"Product ID":"str"})
# Normalise the descriptions BEFORE de-duplicating, so that rows differing
# only by leading/trailing whitespace are recognised as duplicates.
vdl_check['Product Description'] = vdl_check['Product Description'].str.strip()
vdl_check.drop_duplicates(['Product Description'], inplace=True)
# Remove only a leading 'GH-' prefix. str.strip('GH-') treats its argument as
# a set of characters and would also eat any 'G', 'H' or '-' found at either
# end of the identifier itself.
vdl_check['Product ID'] = vdl_check['Product ID'].str.replace(
    r'^GH-', '', regex=True)
# Keep the first run of digits found in the quantity text as an integer.
vdl_check['Corresponding Quantity'] = vdl_check['Corresponding Quantity'].apply(
    lambda x: int(re.findall(r'\d+', x)[0]))
vdl_vx = vx.from_pandas(vdl_check)


In [None]:
def new_assign(vx_df, columns):
    """Overwrite each listed column of ``vx_df`` with its raw ``.values.data``.

    For every name in ``columns`` the column's ``.values`` is read and its
    underlying ``.data`` attribute is assigned back under the same name.
    The frame is modified in place; nothing is returned.
    """
    for column_name in columns:
        raw_array = vx_df[column_name].values.data
        vx_df[column_name] = raw_array


In [None]:
# Keep only the rows whose facility appears in the active-facility list, then
# extract the selection into a standalone frame.
filtered_data = data[data['Sale Facility'].isin(vfl_list)].extract()


In [None]:
# Left-join the VDL check data onto the filtered sales rows, matching
# 'Vdl Drug ID' against 'Product ID', to bring in the product attributes
# (categories, corresponding quantity, ...). The join is done in place.
filtered_data.join(vdl_vx, how='left', left_on='Vdl Drug ID',
                   right_on='Product ID', inplace=True)


# The joined VDL columns come back as masked arrays; replace each one with the
# underlying data array (see new_assign).
# NOTE(review): this discards the mask, so rows without a VDL match keep
# whatever value sits in the underlying buffer - confirm that is acceptable.
new_assign(filtered_data, vdl_check.columns.tolist())

# The join key from the right-hand table is no longer needed
filtered_data.drop(['Product ID'], inplace=True)

# Packs sold = units sold / units per pack, stored as an evaluated column
filtered_data['Packs_Sold'] = (
    filtered_data['Sum of Quantity In Units'] / filtered_data['Corresponding Quantity']).values


In [None]:
# Left-join the VFL data onto filtered_data (in place), matching
# 'Sale Facility' against 'DISPENSATION VFL', to get the facility locations
filtered_data.join(vfl_vx, how='left', left_on='Sale Facility',
                   right_on='DISPENSATION VFL', inplace=True)
# Drop the VFL columns that are not needed after the join
filtered_data.drop(['SC VFL', 'DISPENSATION VFL'], inplace=True)


In [None]:
def col_rename(data):
    """Rename the columns of ``data`` in place so they contain no spaces or hyphens.

    Every name returned by ``data.get_column_names()`` has ``" "`` and ``"-"``
    replaced by ``"_"``; columns whose name changes are renamed through
    ``data.rename(old, new)``. Names that are already clean are left alone.

    Parameters
    ----------
    data : frame-like object exposing ``get_column_names()`` and
        ``rename(old, new)``.

    Returns
    -------
    None
    """
    # Snapshot the names up front so renaming cannot disturb the iteration.
    for old_name in list(data.get_column_names()):
        new_name = old_name.replace(" ", "_").replace("-", "_")
        if new_name != old_name:
            data.rename(old_name, new_name)


# Rename the columns of filtered_data in place (spaces and hyphens -> underscores)
col_rename(filtered_data)


In [None]:
# Extract the cleaned, joined sales rows into a standalone frame and write
# them to the data folder (relative to the current working directory).
df = filtered_data.extract()
# The path is built with os.path.join instead of a literal containing a
# backslash: a backslash followed by "c" is an invalid escape sequence, and it
# only acts as a directory separator on Windows.
df.export_csv(os.path.join("data", "cleaned_data.csv"))


**Stock Balance Data Analysis**


In [155]:
# Importing Stock Balance Data
# Both stock-balance measures are read as float32 and the VDL product id as a
# string.
stk_balance = vx.read_csv('%s//data//stock_balance.csv' % path, dtype={
                          "Sum of stock balance in units": "float32", "Sum of stock_balance_in_packs": "float32", 
                          'product id (vdl)':'str'})

# Dropping Unwanted Columns (in place)
stk_balance.drop(columns=['inventory_date'], inplace=True)


#,facility_name,product id (vdl),product_name,Sum of stock balance in units,Sum of stock_balance_in_packs
0,CENTRIXMAS PHARMACY,106365,Ambesyl 10mg Tablets x30,30.0,1.0
1,CENTRIXMAS PHARMACY,106373,Atacand 32mg Tablets x28,56.0,2.0
2,CENTRIXMAS PHARMACY,106374,Atacand 8mg Tablets x28,56.0,2.0
3,CENTRIXMAS PHARMACY,106377,Atacand Plus 32mg/25mg Tablets x28,56.0,2.0
4,CENTRIXMAS PHARMACY,106378,Atenolol (Exeter) 50mg Tablets x28,28.0,1.0
...,...,...,...,...,...
168007,mPharma Kumasi Delivery Pharmacy,215040,Domi-10 Suppository x5,50.0,10.0
168008,mPharma Kumasi Delivery Pharmacy,240967,Se'Clear Eye Drops 10ml x1,3.0,3.0
168009,mPharma Kumasi Delivery Pharmacy,298342,zymax 500mg tablet x500,30.0,0.0
168010,mPharma Kumasi Delivery Pharmacy,315679,Co-amoksiklav (Vega) 228mg/5ml Susp x1,5.0,5.0


In [156]:
# Keep only the stock rows whose facility_name is in vfl_list (the stripped
# 'DISPENSATION VFL' names).
# NOTE(review): stk_balance is rebound to its own filtered view, so the
# unfiltered frame is no longer reachable under this name.
stk_balance = stk_balance[stk_balance['facility_name'].isin(vfl_list)]

In [157]:
# Total units in stock per (facility, product id, product name).
grouped_stk = stk_balance.groupby(
    by=['facility_name', 'product id (vdl)', 'product_name'],
    agg={'Sum of stock balance in units': "sum"},
)
# Keep only the combinations that actually have stock.
grouped_stk = grouped_stk[grouped_stk['Sum of stock balance in units'] > 0]
# Rename the key column so it has the same name as the VDL check key.
grouped_stk.rename('product id (vdl)', 'Product ID')

'Product ID'

In [158]:
# Left-join the VDL check data onto the grouped stock rows by 'Product ID'
# (in place). The right-hand key collides with the left one, so it receives
# the '_sc' suffix and is dropped straight after.
grouped_stk.join(vdl_vx, how='left', left_on='Product ID',
                   right_on='Product ID', rsuffix='_sc', inplace=True)
grouped_stk.drop("Product ID_sc", inplace=True)

#,facility_name,Product ID,product_name,Sum of stock balance in units,Product Description,Form,Strength,Brand / Proprietary Name,Manufacturer,OTC/POM,Tier,VDL Sub Category,Corresponding Quantity
0,Immaculate Pharmacy Ltd,106815,Symbicort 160mcg/4.5mcg (60 Doses) Turbuhaler x1,9.0,SYMBICORT TURBUHALER 160/4.5MCG (60 DOSE,Inhaler,160mcg/4.5mcg,SYMBICORT,AstraZeneca,POM,Tier 1,Anti-Asthma,1
1,Minimax Pharmacy,234448,Ascoryl 125ml Syrup x1,47.0,Ascoryl 125ml Syrup x1,Syrup,1,ASCORYL SYR 125ML,M & G Pharmaceuticals,OTC,Tier 4,Cough & Flu,1
2,Fresh Spring Chemists Ltd,222663,Sinus Headache PE (Allegiant Health) Capsules x1,16.0,SINUS HEADACHE PE CAPS x1,Caplet,325MG/5MG,SINUS HEADACHE PE CAPS,Allegiant Health,OTC,Tier 2,Analgesics,1
3,Immaculate Pharmacy Ltd,186799,Irish Spring Bar x1,10.0,Irish Spring Bar x1,Soap,1,IRISH SPRING,Colgate-Palmolive,OTC,Tier 4,FMCG,1
4,UNIHAM PHARMACY,109112,Proximexa 500mg Tablets x10,800.0,Proximexa 500mg Tablets x10,Tablet,500MG,PROXIMEXA,GSK,POM,Tier 1,Anti-Infectives,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51247,Pesca Pharmacy,165682,Ginsomin Eve (30 Doses) Softgel Capsules x1,3.0,GINSOMIN EVE x1,Capsule,1,GINSOMIN EVE,Mega,OTC,Tier 3,Supplements,1
51248,Jags Pharmacy,106801,Nexium 20mg Tablet x14,126.0,Nexium 20mg Tablet x14,Capsule,20MG,NEXIUM,AstraZeneca,POM,Tier 1,Gastro-Intestinal,14
51249,Jags Pharmacy,108560,Bioferon (30 Doses) Capsules x1,8.0,Bioferon (30 Doses) Capsules x1,Capsule,1,BIOFERON,Medreich,OTC,Tier 1,Supplements,30
51250,Jags Pharmacy,162622,Luex Baby Cough 100ml Syrup x1,20.0,Luex Baby Cough 100ml Syrup x1,Syrup,1,LUEX BABY x1,Luex,OTC,Tier 3,Cough & Flu,1


In [159]:
# Stock balance = units in stock divided by the product's 'Corresponding Quantity'
grouped_stk['Stock Balance'] = grouped_stk['Sum of stock balance in units'] / grouped_stk['Corresponding Quantity']

In [160]:
# Extract the filtered/joined stock rows into a standalone frame, then write
# them to CSV in the data folder (path relative to the working directory).
grouped_stk = grouped_stk.extract()
grouped_stk.export_csv("data//cleaned_stock_balance.csv")

**Category Based Analysis**


In [None]:
# Sum of 'Pack(s)_Sold' per VDL sub-category, largest first.
# NOTE(review): the earlier cells only create a 'Packs_Sold' column, and the
# rename step only replaces spaces and hyphens - confirm that 'Pack(s)_Sold'
# really exists in the source data, otherwise this should be 'Packs_Sold'.
Loc_Cat_df = filtered_data.groupby(by=['VDL_Sub_Category'], agg={
    'Pack(s)_Sold': 'sum'}).sort(by='Pack(s)_Sold', ascending=False)


In [None]:
# Distinct values of the 'LOCATION' column (brought in by the VFL join)
location = filtered_data['LOCATION'].unique()


In [None]:
# Packs sold per VDL sub-category. Loc_Cat_df is aggregated by
# 'VDL_Sub_Category' only, so that is the only column available for the
# x axis (there is no 'Sale_Date' column in the aggregated frame).
# The small aggregated vaex frame is handed to plotly as a pandas DataFrame.
px.line(data_frame=Loc_Cat_df.to_pandas_df(), x='VDL_Sub_Category', y='Pack(s)_Sold')


In [None]:
# Plotly Express example on its bundled "tips" dataset: tip against total
# bill, coloured by sex, marker symbol by smoker, one facet column per time.
# NOTE(review): this rebinds `df`, which an earlier cell assigned from
# filtered_data.extract() - confirm nothing later expects the cleaned frame
# under that name.
df = px.data.tips()
fig = px.scatter(df, x="total_bill", y="tip", color="sex", symbol="smoker", facet_col="time",
                 labels={"sex": "Gender", "smoker": "Smokes"})
fig.show()
