In [None]:
import os
import pandas as pd
import network_util

In [None]:
# Load the reimbursed-medicines sheet. Only the Excel columns D, E, L and Q
# are read, and the first spreadsheet row is skipped.
columns = "D,E,L,Q"
reimbursed_medicines_path = os.path.join("data", "Refundowane.xlsx")

# Polish spreadsheet headers -> short English names used in the rest of the notebook.
column_translations = {
    "Zawartość opakowania": "Package",
    "Numer GTIN lub inny kod jednoznacznie identyfikujący produkt": "GTIN",
    "Cena detaliczna": "Price",
    "Wysokość dopłaty świadczeniobiorcy": "Supplement",
}

reimbursed_medicines_df = (
    pd.read_excel(
        reimbursed_medicines_path,
        sheet_name="A1",
        skiprows=1,
        usecols=columns,
        decimal=",",
    )
    .rename(columns=column_translations)
    .astype({"Price": float, "Supplement": float})
    .assign(
        # Refund = retail price minus the supplement column.
        Refund_amount=lambda frame: frame["Price"] - frame["Supplement"],
        # Keep only the first run of digits found in the package description.
        Package=lambda frame: frame["Package"]
        .str.extract(r"(\d+)", expand=False)
        .astype(int),
    )
    # Price and Supplement were only needed to derive Refund_amount.
    .drop(columns=["Price", "Supplement"])
)
display(reimbursed_medicines_df)

In [None]:
# Read the semicolon-separated RPL export and trim it to the columns and rows
# that are used further down.
rpl_path = os.path.join("data", "RPL_20240102.csv")
# 0-based positions of the columns to discard: 0-7, 9-12 and 17-25.
cols_to_drop = [*range(0, 8), *range(9, 13), *range(17, 26)]

rpl_df = pd.read_csv(rpl_path, sep=";")
rpl_df = rpl_df.drop(columns=rpl_df.columns[cols_to_drop]).dropna(
    # Rows lacking either of these two fields are removed.
    subset=["Substancja czynna", "Podmiot odpowiedzialny"]
)
display(rpl_df)

In [None]:
# Turn the free-text "Opakowanie" column into one row per GTIN code.

# Flatten embedded line breaks so each description is scanned as one string.
rpl_df["Opakowanie"] = rpl_df["Opakowanie"].str.replace("\n", " ")

# Gather every 12-14 digit or 8 digit run found in a description into a
# per-row list; rows without any match end up as NaN after index alignment.
rpl_df["GTIN"] = (
    rpl_df["Opakowanie"]
    .str.extractall(r"(\d{12,14}|\d{8})")
    .groupby(level=0)[0]
    .apply(list)
)

# One output row per extracted code; the source text column is no longer
# needed, and rows that yielded no code are discarded.
rpl_df = (
    rpl_df.explode("GTIN")
    .drop(columns="Opakowanie")
    .dropna(subset=["GTIN"])
)

# Cast to an explicit 64-bit integer. Plain `int` maps to the platform C long,
# which is 32-bit on Windows with NumPy < 2 and cannot hold 12-14 digit codes.
rpl_df["GTIN"] = rpl_df["GTIN"].astype("int64")

In [None]:
# Inner-join the reimbursement list with the RPL registry on GTIN, then give
# the registry columns short English names.
rpl_column_translations = {
    "Postać farmaceutyczna": "Form",
    "Podmiot odpowiedzialny": "Responsible_entity",
    "Substancja czynna": "Substance",
    "Nazwa wytwórcy": "Producer",
}
merged_df = reimbursed_medicines_df.merge(rpl_df, how="inner", on="GTIN").rename(
    columns=rpl_column_translations
)
# Producer names may span several lines in the source; flatten them.
merged_df = merged_df.assign(Producer=merged_df["Producer"].str.replace("\n", " "))

In [None]:
# Average refund per package unit for every (Form, Responsible_entity,
# Substance) combination, broadcast back onto each row of that group.
grouping_keys = ["Form", "Responsible_entity", "Substance"]
refund_per_unit = merged_df["Refund_amount"] / merged_df["Package"]
merged_df["Mean_refund"] = (
    # The per-unit values live only on this temporary frame, so they never
    # need to be dropped from merged_df afterwards.
    merged_df.assign(Refund_per_unit=refund_per_unit)
    .groupby(grouping_keys)["Refund_per_unit"]
    .transform("mean")
)
# The inputs of the average are not used past this point.
merged_df = merged_df.drop(columns=["Refund_amount", "Package"])
display(merged_df)

In [None]:
# Keep the first row for each distinct combination of these five fields.
dedup_keys = ["Form", "Responsible_entity", "Substance", "Producer", "Mean_refund"]
merged_df = merged_df.drop_duplicates(subset=dedup_keys)
display(merged_df)

In [None]:
import matplotlib.pyplot as plt
import networkx as nx

In [None]:
# Build the substance / responsible-entity graph and its layout with the
# project helpers in network_util.
substance_nodes = merged_df["Substance"]
entity_nodes = merged_df["Responsible_entity"]
# NOTE(review): presumably used by the helper as an edge attribute/weight;
# confirm against network_util.create_bipartite_graph.
mean_refunds = merged_df["Mean_refund"]

G = network_util.create_bipartite_graph(substance_nodes, entity_nodes, mean_refunds)
pos = network_util.create_bipartite_layout(substance_nodes, entity_nodes)

In [None]:
# Render the graph on a tall 12x20 figure using the precomputed positions.
draw_options = {
    "with_labels": True,
    "font_weight": "bold",
    "node_color": "skyblue",
    "font_size": 4,
}
plt.figure(figsize=(12, 20))
nx.draw(G, pos=pos, **draw_options)
plt.show()