## Input Data Cleaning and Drug Similarity Calculation

In [32]:
import pandas as pd
import numpy as np


def CleanData(file_path):
    # Reading the file

    # Load the data from the CSV file
    data = pd.read_csv(file_path)

    # Removing the specified columns and reordering the 'Item Number – 8 digit' column
    columns_to_remove = ['Item Number – 6 digit', 'UPC Number', 'Constant', 
                        'Customer-Specific Item Number', 'Pack Size Divisor', 
                        'RX/OTC Indicator']

    # Removing the columns
    data_cleaned = data.drop(columns=columns_to_remove)

    # Reordering 'Item Number – 8 digit' to the left
    column_to_move = data_cleaned.pop('Item Number – 8 digit')
    data_cleaned.insert(0, 'Item Number – 8 digit', column_to_move)

    # Moving all price columns and the contract flag to the right
    columns_to_move = ['AWP Price', 'Acquisition Price', 'Retail Price', 'WAC Price', 'Contract Flag']
    for col in columns_to_move:
        data_cleaned[col] = data_cleaned.pop(col)

    import re

    # Function to split the generic description into generic name and form
    def split_description(desc):
        match = re.search(r'[A-Z]', desc)
        if match:
            index = match.start()
            return desc[:index].strip(), desc[index:].strip()
        else:
            return desc, ''

    # Applying the function to split 'Generic Description'
    data_cleaned['Generic Name'], data_cleaned['Form'] = zip(*data_cleaned['Generic Description'].apply(split_description))
    data_cleaned.drop(columns=['Generic Description'], inplace=True)

    # Removing rows where 'Generic Name' is empty or whitespace
    data_cleaned = data_cleaned[data_cleaned['Generic Name'].str.strip() != '']

    # Function to split the description into name and size
    def split_description_on_number(desc):
        match = re.search(r'\d', desc)
        if match:
            index = match.start()
            return desc[:index].strip(), desc[index:].strip()
        else:
            return desc, ''

    # Applying the function to split 'Description'
    data_cleaned['Name'], data_cleaned['Size'] = zip(*data_cleaned['Description'].apply(split_description_on_number))
    data_cleaned.drop(columns=['Description'], inplace=True)

    data_cleaned
    return data_cleaned

In [33]:
def ExactDrugAlgoFunction(drug_code, data):
    # Item number of the drug to run the similarity test on
    reference_item_number = drug_code

    # Find and print the row for the given reference_item_number
    matching_row = data[data['Item Number – 8 digit'] == reference_item_number]
    

    # Find the Generic Name for the given reference_item_number
    reference_generic_name = data.loc[data['Item Number – 8 digit'] == reference_item_number, 'Generic Name'].iloc[0]


    # Create a copy of the dataframe filtered by Generic Name
    data_generic = data[data['Generic Name'] == reference_generic_name].copy()

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Selecting the reference item
    reference_item = data_generic[data_generic['Item Number – 8 digit'] == reference_item_number]
    if reference_item.empty:
        return "Reference item not found in the dataset."

    # Extracting the form of the reference item
    reference_form = reference_item.iloc[0]['Form']
    forms = data_generic['Form'].tolist()
    forms.insert(0, reference_form)

    # Vectorizing the forms using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(forms)

    # Calculating cosine similarity
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    data_generic['Similarity'] = cosine_similarities

    # Filtering the dataset to show only items with a similarity score above 0.9
    similarity_items = data_generic[data_generic['Similarity'] > 0].copy()

    # Extracting the size of the reference item
    reference_size = reference_item.iloc[0]['Size']
    sizes = similarity_items['Size'].tolist()
    sizes.insert(0, reference_size)

    # Vectorizing the sizes using TF-IDF
    tfidf_matrix_sizes = vectorizer.fit_transform(sizes)

    # Calculating cosine similarity for sizes
    cosine_similarities_sizes = cosine_similarity(tfidf_matrix_sizes[0:1], tfidf_matrix_sizes[1:]).flatten()
    similarity_items['Size Similarity'] = cosine_similarities_sizes

    # Remove input item
    similarity_items = similarity_items[similarity_items['Item Number – 8 digit'] != reference_item_number]

    # Define true similarity
    w1 = 1
    w2 = 1
    similarity_items['True Similarity'] = (w1 * similarity_items['Similarity'] + w2 * similarity_items['Size Similarity']) / (w1 + w2)

    # Cleaning and sort the data
    similarity_items = similarity_items.drop(columns=['Similarity', 'Size Similarity'])
    similarity_items = similarity_items.sort_values(by=['True Similarity'], ascending=False)
    
    
    return similarity_items, matching_row

In [47]:
inputData = CleanData('Daily Snapshot.csv')
data, input = ExactDrugAlgoFunction(10004888, inputData)

input

# data.head(10)

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size
836,10004888,68084031901,100,1,2304,1406,0,1920,C,lamotrigine,ORAL TABLET 100 MG,LAMOTRIGINE,100 MG TAB 100 UD


In [48]:
data.head(20)

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size,True Similarity
38914,10059898,51672413101,100,1,51420,575,0,700,C,lamotrigine,ORAL TABLET 100 MG,LAMOTRIGINE,100 MG TAB 100,0.848545
31969,10049166,68382000801,100,1,51940,384,0,2311,C,lamotrigine,ORAL TABLET 100 MG,LAMOTRIGINE,100 MG TAB 100,0.848545
26027,10185319,62332003831,100,1,47374,601,0,650,N,lamotrigine,ORAL TABLET 100 MG,LAMOTRIGINE,100 MG TAB 100,0.848545
4780,10112023,65862022801,100,1,47512,601,0,650,N,lamotrigine,ORAL TABLET 100 MG,LAMOTRIGINE,100 MG TAB 100,0.848545
2795,10100298,13668004701,100,1,47500,647,0,700,N,lamotrigine,ORAL TABLET 100 MG,LAMOTRIGINE,100 MG TAB 100,0.848545
3636,10014951,173064255,100,1,218185,166197,0,181821,C,lamotrigine,ORAL TABLET 100 MG,LAMICTAL,100 MG TAB 100,0.848545
38391,10233874,904700861,100,1,1662,907,0,1385,C,lamotrigine,ORAL TABLET 100 MG,LAMOTRIGINE,100 MG TAB 100 (10,0.736504
39508,10060987,68084031801,100,1,2230,1316,0,1858,C,lamotrigine,ORAL TABLET 25 MG,LAMOTRIGINE,25 MG TAB 100 UD A,0.726193
35542,10221590,29300011205,500,1,238864,1770,0,3500,C,lamotrigine,ORAL TABLET 100 MG,LAMOTRIGINE,100 MG TAB 500,0.697201
29822,10276848,72888002605,500,1,237500,1631,0,1766,N,lamotrigine,ORAL TABLET 100 MG,LAMOTRIGINE,100 MG TAB 500,0.697201


### Exract Dosage Information

In [49]:
import re


# Adjust function to extract dosage information including MG, %, and ML
def extractDosage(size_str):
    # Ensure the input is a string
    size_str = str(size_str)
    # Initialize a dictionary to hold the extracted values
    extracted_values = {'MG': None, '%': None, 'ML': None, 'GM': None, 'MCG': None, 'M': None, 'OZ': None, 'IU': None, 'MEQ': None, 'UN': None, 'MM': None, 'HR': None, 'MMOL': None, 'KG': None, 'BP': None, 'L': None, 'CM': None, 'CC': None, 'CAL': None, 'LB': None, 'IN': None, 'GR': None, 'GAL': None, 'LT': None, 'USP': None, 'G': None, 'MU': None}

    # Turn string to uppercase for case-insensitive matching
    size_str = size_str.upper()

    # Change 'X' to ' * ' for easier matching
    size_str = size_str.replace('X', ' * ')

    # Change '/' to ' / ' for easier matching
    size_str = size_str.replace('/', ' / ')
    
    # Simplified logic for MG, %, ML extraction
    # Looks for a number (with optional decimal) immediately before the unit, with optional space
    for unit in ['MG', '%', 'ML', 'GM', 'MCG', 'OZ', 'IU', 'MEQ', 'UN', 'MM', 'HR', 'MMOL', 'KG', 'BP', 'L', 'CM', 'CC', 'CAL', 'LB', 'IN', 'GR', 'GAL', 'LT', 'USP', 'MU']:
        pattern = r'(\d+(?:\.\d*)?)\s*{}'.format(unit)
        match = re.search(pattern, size_str, re.IGNORECASE)
        if match and 'X' not in match.group(1):
            # Convert matched value to float and assign to the correct unit
            extracted_values[unit] = float(match.group(1))

    # If no MG, ML, MCG, MEQ, MM, MMOL, MU, GM, GR, GAL are not found, check for M and G
    if extracted_values['MG'] is None and extracted_values['ML'] is None and extracted_values['MCG'] is None and extracted_values['MEQ'] is None and extracted_values['MM'] is None and extracted_values['MMOL'] is None and extracted_values['MU'] is None and extracted_values['GM'] is None and extracted_values['GR'] is None and extracted_values['GAL'] is None:
        # Check for M and G
        for unit in ['M', 'G']:
            pattern = r'(\d+(?:\.\d*)?)\s*{}'.format(unit)
            match = re.search(pattern, size_str, re.IGNORECASE)
            if match and 'X' not in match.group(1):
                # Convert matched value to float and assign to the correct unit
                extracted_values[unit] = float(match.group(1))
    
    return extracted_values


# Apply the adjusted function to extract all values
df_updated_extracted = inputData['Size'].apply(extractDosage)

# Update the DataFrame with the new extracted values
inputData['MG'] = df_updated_extracted.apply(lambda x: x['MG'])
inputData['%'] = df_updated_extracted.apply(lambda x: x['%'])
inputData['ML'] = df_updated_extracted.apply(lambda x: x['ML'])
inputData['GM'] = df_updated_extracted.apply(lambda x: x['GM'])
inputData['MCG'] = df_updated_extracted.apply(lambda x: x['MCG'])
inputData['M'] = df_updated_extracted.apply(lambda x: x['M'])
inputData['OZ'] = df_updated_extracted.apply(lambda x: x['OZ'])
inputData['IU'] = df_updated_extracted.apply(lambda x: x['IU'])
inputData['MEQ'] = df_updated_extracted.apply(lambda x: x['MEQ'])
inputData['UN'] = df_updated_extracted.apply(lambda x: x['UN'])
inputData['MM'] = df_updated_extracted.apply(lambda x: x['MM'])
inputData['HR'] = df_updated_extracted.apply(lambda x: x['HR'])
inputData['MMOL'] = df_updated_extracted.apply(lambda x: x['MMOL'])
inputData['KG'] = df_updated_extracted.apply(lambda x: x['KG'])
inputData['BP'] = df_updated_extracted.apply(lambda x: x['BP'])
inputData['L'] = df_updated_extracted.apply(lambda x: x['L'])
inputData['CM'] = df_updated_extracted.apply(lambda x: x['CM'])
inputData['CC'] = df_updated_extracted.apply(lambda x: x['CC'])
inputData['CAL'] = df_updated_extracted.apply(lambda x: x['CAL'])
inputData['LB'] = df_updated_extracted.apply(lambda x: x['LB'])
inputData['IN'] = df_updated_extracted.apply(lambda x: x['IN'])
inputData['GR'] = df_updated_extracted.apply(lambda x: x['GR'])
inputData['GAL'] = df_updated_extracted.apply(lambda x: x['GAL'])
inputData['LT'] = df_updated_extracted.apply(lambda x: x['LT'])
inputData['USP'] = df_updated_extracted.apply(lambda x: x['USP'])
inputData['G'] = df_updated_extracted.apply(lambda x: x['G'])
inputData['MU'] = df_updated_extracted.apply(lambda x: x['MU'])

# Display the updated DataFrame
inputData.to_csv('inputDataProcessed.csv', index=False)
inputData.head(10)

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,...,CC,CAL,LB,IN,GR,GAL,LT,USP,G,MU
0,10000009,51672408306,45,1,17928,9425,0,14342,C,hydrocortisone butyrate,...,,,,,,,,,,
1,10083412,23334807,,1,164760,137300,0,137300,N,dexamethasone,...,,,,,,,,,,
2,10000013,93227234,20,1,11751,5038,0,8814,C,amoxicillin/potassium clav,...,,,,,,,,,,
3,10083430,51991062033,30,1,40038,831,0,900,N,anastrozole,...,,,,,,,,,,
5,10083433,67457022005,6X5,1,763841,587712,0,636534,N,isosulfan blue,...,,,,,,,,,,
6,10000077,3161112,30,1,164698,130386,0,137248,N,entecavir,...,,,,,,,,,,
7,10000082,3161212,30,1,164698,130386,0,137248,N,entecavir,...,,,,,,,,,,
8,10083420,2324090,90,1,100980,77696,0,84150,N,duloxetine,...,,,,,,,,,,
9,10000095,65649050130,28.4,1,73516,56565,0,61263,N,hydrocortisone,...,,,,,,,,,,
10,10000096,193658621,100,1,1195,1096,1644,996,N,lancets,...,,,,,,,,,,


In [50]:
# Perform the conversions
inputData['Total_MG'] = (inputData['MG'].fillna(0) +
                         inputData['GM'].fillna(0) * 1000 +
                         inputData['KG'].fillna(0) * 1000000 +
                         inputData['OZ'].fillna(0) * 28349.5 +
                         inputData['LB'].fillna(0) * 453592 +
                         inputData['GR'].fillna(0) * 1000 + 
                         inputData['G'].fillna(0) * 1000)
inputData['Total_MG'] = inputData['Total_MG'].replace(0.0, np.nan)

inputData['Total_ML'] = (inputData['ML'].fillna(0) +
                         inputData['L'].fillna(0) * 1000 +
                         inputData['GAL'].fillna(0) * 3785.41 +
                         inputData['LT'].fillna(0) * 1000 +
                         inputData['CC'].fillna(0) * 1)
inputData['Total_ML'] = inputData['Total_ML'].replace(0.0, np.nan)

inputData['Total_MM'] = (inputData['MM'].fillna(0) +
                         inputData['CM'].fillna(0) * 10 +
                         inputData['IN'].fillna(0) * 25.4)
inputData['Total_MM'] = inputData['Total_MM'].replace(0.0, np.nan)

# List of columns to drop (all the original measurement columns)
cols_to_drop = ['MG', 'ML', 'GM', 'KG', 'OZ', 'LB', 'GR', 'L', 'GAL', 'LT', 'CC', 'MM', 'CM', 'IN', 'G']

# Drop the original measurement columns
inputData.drop(columns=cols_to_drop, inplace=True)

# Now inputData contains only the totalized columns and any other non-related columns
inputData.to_csv('inputDataProcessed.csv', index=False)
inputData.head(10)


Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,...,UN,HR,MMOL,BP,CAL,USP,MU,Total_MG,Total_ML,Total_MM
0,10000009,51672408306,45,1,17928,9425,0,14342,C,hydrocortisone butyrate,...,,,,,,,,,,
1,10083412,23334807,,1,164760,137300,0,137300,N,dexamethasone,...,,,,,,,,,,
2,10000013,93227234,20,1,11751,5038,0,8814,C,amoxicillin/potassium clav,...,,,,,,,,57.0,,
3,10083430,51991062033,30,1,40038,831,0,900,N,anastrozole,...,,,,,,,,1.0,,
5,10083433,67457022005,6X5,1,763841,587712,0,636534,N,isosulfan blue,...,,,,,,,,,5.0,
6,10000077,3161112,30,1,164698,130386,0,137248,N,entecavir,...,,,,,,,,0.5,,
7,10000082,3161212,30,1,164698,130386,0,137248,N,entecavir,...,,,,,,,,1.0,,
8,10083420,2324090,90,1,100980,77696,0,84150,N,duloxetine,...,,,,,,,,30.0,,
9,10000095,65649050130,28.4,1,73516,56565,0,61263,N,hydrocortisone,...,,,,,,,,28400.0,,
10,10000096,193658621,100,1,1195,1096,1644,996,N,lancets,...,,,,,,,,,,


### Finalize Function

In [51]:
def addDosageInfo(inputData):
    def extractDosage(size_str):
        # Ensure the input is a string
        size_str = str(size_str)
        # Initialize a dictionary to hold the extracted values
        extracted_values = {'MG': None, '%': None, 'ML': None, 'GM': None, 'MCG': None, 'M': None, 'OZ': None, 'IU': None, 'MEQ': None, 'UN': None, 'MM': None, 'HR': None, 'MMOL': None, 'KG': None, 'BP': None, 'L': None, 'CM': None, 'CC': None, 'CAL': None, 'LB': None, 'IN': None, 'GR': None, 'GAL': None, 'LT': None, 'USP': None, 'G': None, 'MU': None}

        # Turn string to uppercase for case-insensitive matching
        size_str = size_str.upper()

        # Change 'X' to ' * ' for easier matching
        size_str = size_str.replace('X', ' * ')

        # Change '/' to ' / ' for easier matching
        size_str = size_str.replace('/', ' / ')
        
        # Simplified logic for MG, %, ML extraction
        # Looks for a number (with optional decimal) immediately before the unit, with optional space
        for unit in ['MG', '%', 'ML', 'GM', 'MCG', 'OZ', 'IU', 'MEQ', 'UN', 'MM', 'HR', 'MMOL', 'KG', 'BP', 'L', 'CM', 'CC', 'CAL', 'LB', 'IN', 'GR', 'GAL', 'LT', 'USP', 'MU']:
            pattern = r'(\d+(?:\.\d*)?)\s*{}'.format(unit)
            match = re.search(pattern, size_str, re.IGNORECASE)
            if match and 'X' not in match.group(1):
                # Convert matched value to float and assign to the correct unit
                extracted_values[unit] = float(match.group(1))

        # If no MG, ML, MCG, MEQ, MM, MMOL, MU, GM, GR, GAL are not found, check for M and G
        if extracted_values['MG'] is None and extracted_values['ML'] is None and extracted_values['MCG'] is None and extracted_values['MEQ'] is None and extracted_values['MM'] is None and extracted_values['MMOL'] is None and extracted_values['MU'] is None and extracted_values['GM'] is None and extracted_values['GR'] is None and extracted_values['GAL'] is None:
            # Check for M and G
            for unit in ['M', 'G']:
                pattern = r'(\d+(?:\.\d*)?)\s*{}'.format(unit)
                match = re.search(pattern, size_str, re.IGNORECASE)
                if match and 'X' not in match.group(1):
                    # Convert matched value to float and assign to the correct unit
                    extracted_values[unit] = float(match.group(1))
        
        return extracted_values

    # Apply the adjusted function to extract all values
    df_updated_extracted = inputData['Size'].apply(extractDosage)

    # Update the DataFrame with the new extracted values
    inputData['MG'] = df_updated_extracted.apply(lambda x: x['MG'])
    inputData['%'] = df_updated_extracted.apply(lambda x: x['%'])
    inputData['ML'] = df_updated_extracted.apply(lambda x: x['ML'])
    inputData['GM'] = df_updated_extracted.apply(lambda x: x['GM'])
    inputData['MCG'] = df_updated_extracted.apply(lambda x: x['MCG'])
    inputData['M'] = df_updated_extracted.apply(lambda x: x['M'])
    inputData['OZ'] = df_updated_extracted.apply(lambda x: x['OZ'])
    inputData['IU'] = df_updated_extracted.apply(lambda x: x['IU'])
    inputData['MEQ'] = df_updated_extracted.apply(lambda x: x['MEQ'])
    inputData['UN'] = df_updated_extracted.apply(lambda x: x['UN'])
    inputData['MM'] = df_updated_extracted.apply(lambda x: x['MM'])
    inputData['HR'] = df_updated_extracted.apply(lambda x: x['HR'])
    inputData['MMOL'] = df_updated_extracted.apply(lambda x: x['MMOL'])
    inputData['KG'] = df_updated_extracted.apply(lambda x: x['KG'])
    inputData['BP'] = df_updated_extracted.apply(lambda x: x['BP'])
    inputData['L'] = df_updated_extracted.apply(lambda x: x['L'])
    inputData['CM'] = df_updated_extracted.apply(lambda x: x['CM'])
    inputData['CC'] = df_updated_extracted.apply(lambda x: x['CC'])
    inputData['CAL'] = df_updated_extracted.apply(lambda x: x['CAL'])
    inputData['LB'] = df_updated_extracted.apply(lambda x: x['LB'])
    inputData['IN'] = df_updated_extracted.apply(lambda x: x['IN'])
    inputData['GR'] = df_updated_extracted.apply(lambda x: x['GR'])
    inputData['GAL'] = df_updated_extracted.apply(lambda x: x['GAL'])
    inputData['LT'] = df_updated_extracted.apply(lambda x: x['LT'])
    inputData['USP'] = df_updated_extracted.apply(lambda x: x['USP'])
    inputData['G'] = df_updated_extracted.apply(lambda x: x['G'])
    inputData['MU'] = df_updated_extracted.apply(lambda x: x['MU'])

    # Perform the conversions
    inputData['Total_MG'] = (inputData['MG'].fillna(0) +
                            inputData['GM'].fillna(0) * 1000 +
                            inputData['KG'].fillna(0) * 1000000 +
                            inputData['OZ'].fillna(0) * 28349.5 +
                            inputData['LB'].fillna(0) * 453592 +
                            inputData['GR'].fillna(0) * 1000 + 
                            inputData['G'].fillna(0) * 1000)
    inputData['Total_MG'] = inputData['Total_MG'].replace(0.0, np.nan)

    inputData['Total_ML'] = (inputData['ML'].fillna(0) +
                            inputData['L'].fillna(0) * 1000 +
                            inputData['GAL'].fillna(0) * 3785.41 +
                            inputData['LT'].fillna(0) * 1000 +
                            inputData['CC'].fillna(0) * 1)
    inputData['Total_ML'] = inputData['Total_ML'].replace(0.0, np.nan)

    inputData['Total_MM'] = (inputData['MM'].fillna(0) +
                            inputData['CM'].fillna(0) * 10 +
                            inputData['IN'].fillna(0) * 25.4)
    inputData['Total_MM'] = inputData['Total_MM'].replace(0.0, np.nan)

    # List of columns to drop (all the original measurement columns)
    cols_to_drop = ['MG', 'ML', 'GM', 'KG', 'OZ', 'LB', 'GR', 'L', 'GAL', 'LT', 'CC', 'MM', 'CM', 'IN', 'G']

    # Drop the original measurement columns
    inputData.drop(columns=cols_to_drop, inplace=True)

    # Now inputData contains only the totalized columns and any other non-related columns
    return inputData
    

## Produce replacements

### Determine Basic Form

In [52]:
# Remove items with a True Similarity less that 0.5
data = data[data['True Similarity'] > 0.5]

# Apply the function to the dataset
data = addDosageInfo(data)

In [53]:
# Define a function to classify the basic form based on the units present
def classify_basic_form(row):
    if pd.notna(row['Total_ML']):
        return 'Liquid'
    elif pd.notna(row['Total_MG']):
        return 'Solid'
    # Add more classifications as needed based on other units
    else:
        return 'Other/Undefined'

# Apply the classification function to each row
data['Basic Form'] = data.apply(classify_basic_form, axis=1)

# Display the updated DataFrame
data.head(10)


Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,...,HR,MMOL,BP,CAL,USP,MU,Total_MG,Total_ML,Total_MM,Basic Form
38914,10059898,51672413101,100,1,51420,575,0,700,C,lamotrigine,...,,,,,,,100.0,,,Solid
31969,10049166,68382000801,100,1,51940,384,0,2311,C,lamotrigine,...,,,,,,,100.0,,,Solid
26027,10185319,62332003831,100,1,47374,601,0,650,N,lamotrigine,...,,,,,,,100.0,,,Solid
4780,10112023,65862022801,100,1,47512,601,0,650,N,lamotrigine,...,,,,,,,100.0,,,Solid
2795,10100298,13668004701,100,1,47500,647,0,700,N,lamotrigine,...,,,,,,,100.0,,,Solid
3636,10014951,173064255,100,1,218185,166197,0,181821,C,lamotrigine,...,,,,,,,100.0,,,Solid
38391,10233874,904700861,100,1,1662,907,0,1385,C,lamotrigine,...,,,,,,,100.0,,,Solid
39508,10060987,68084031801,100,1,2230,1316,0,1858,C,lamotrigine,...,,,,,,,25.0,,,Solid
35542,10221590,29300011205,500,1,238864,1770,0,3500,C,lamotrigine,...,,,,,,,100.0,,,Solid
29822,10276848,72888002605,500,1,237500,1631,0,1766,N,lamotrigine,...,,,,,,,100.0,,,Solid


In [54]:
input = addDosageInfo(input)

# Apply the classification function to each row
input['Basic Form'] = input.apply(classify_basic_form, axis=1)

input

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputData['MG'] = df_updated_extracted.apply(lambda x: x['MG'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputData['%'] = df_updated_extracted.apply(lambda x: x['%'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputData['ML'] = df_updated_extracted.apply(lambda x: x['ML'])
A value is tryi

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,...,HR,MMOL,BP,CAL,USP,MU,Total_MG,Total_ML,Total_MM,Basic Form
836,10004888,68084031901,100,1,2304,1406,0,1920,C,lamotrigine,...,,,,,,,100.0,,,Solid


### Solids

In [55]:
def isMultipleOf(num, multiple):
    return num % multiple == 0

In [56]:
if input['Basic Form'].iloc[0] == 'Solid' :
    print('Solid')
    solids_data = data[data['Basic Form'] == 'Solid']

    # Remove items with a Total MG bigger than the input Total MG
    solids_data = solids_data[solids_data['Total_MG'] <= input['Total_MG'].iloc[0]]

    # Remove items where the input Total MG is not a multiple of the item's Total MG
    solids_data = solids_data[solids_data['Total_MG'].apply(lambda x: isMultipleOf(input['Total_MG'].iloc[0], x))]

    result = solids_data
    

#solids_data.head(10)

Solid


In [57]:
# For all other forms with all other units (%,MCG,M,IU,MEQ,UN,HR,MMOL,BP,CAL,USP,MU,Total_ML,Total_MM,Total_MG(In combinations with others))
if input['Basic Form'].iloc[0] != 'Solid':
    print('Liquid')
    

    # Remove items where all units are not equal to the input units
    units = ['%', 'MCG', 'M', 'IU', 'MEQ', 'UN', 'HR', 'MMOL', 'BP', 'CAL', 'USP', 'MU', 'Total_ML', 'Total_MM', 'Total_MG']
    for unit in units:
        result = data[data[unit] == input[unit].iloc[0]]

result
    



    

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,...,HR,MMOL,BP,CAL,USP,MU,Total_MG,Total_ML,Total_MM,Basic Form
38914,10059898,51672413101,100,1,51420,575,0,700,C,lamotrigine,...,,,,,,,100.0,,,Solid
31969,10049166,68382000801,100,1,51940,384,0,2311,C,lamotrigine,...,,,,,,,100.0,,,Solid
26027,10185319,62332003831,100,1,47374,601,0,650,N,lamotrigine,...,,,,,,,100.0,,,Solid
4780,10112023,65862022801,100,1,47512,601,0,650,N,lamotrigine,...,,,,,,,100.0,,,Solid
2795,10100298,13668004701,100,1,47500,647,0,700,N,lamotrigine,...,,,,,,,100.0,,,Solid
3636,10014951,173064255,100,1,218185,166197,0,181821,C,lamotrigine,...,,,,,,,100.0,,,Solid
38391,10233874,904700861,100,1,1662,907,0,1385,C,lamotrigine,...,,,,,,,100.0,,,Solid
39508,10060987,68084031801,100,1,2230,1316,0,1858,C,lamotrigine,...,,,,,,,25.0,,,Solid
35542,10221590,29300011205,500,1,238864,1770,0,3500,C,lamotrigine,...,,,,,,,100.0,,,Solid
29822,10276848,72888002605,500,1,237500,1631,0,1766,N,lamotrigine,...,,,,,,,100.0,,,Solid


### Finalize Function

In [58]:
def getReplacements(input, data, isMultiple = False):
    units = ['%', 'MCG', 'M', 'IU', 'MEQ', 'UN', 'HR', 'MMOL', 'BP', 'CAL', 'USP', 'MU', 'Total_ML', 'Total_MM', 'Total_MG']
    # Define a function to classify the basic form based on the units present
    def classify_basic_form(row):
        if pd.notna(row['Total_MG']) and pd.isna(row[units]).all():
            return 'Solid'
        # Add more classifications as needed based on other units
        else:
            return 'Other/Undefined'
        
    def isMultipleOf(num, multiple):
        return num % multiple == 0
    
    # Remove items with a True Similarity less that 0.5
    data = data[data['True Similarity'] > 0.5]

    # Apply the function to the dataset
    data = addDosageInfo(data)

    # Add dosage info to the input
    input = addDosageInfo(input)

    # Apply the classification function to each row
    data['Basic Form'] = data.apply(classify_basic_form, axis=1)

    # Apply the classification function to intput
    input['Basic Form'] = input.apply(classify_basic_form, axis=1)

    if input['Basic Form'].iloc[0] == 'Solid' and isMultiple:
        print('Solid')
        solids_data = data[data['Basic Form'] == 'Solid']

        # Remove items with a Total MG bigger than the input Total MG
        solids_data = solids_data[solids_data['Total_MG'] <= input['Total_MG'].iloc[0]]

        # Remove items where the input Total MG is not a multiple of the item's Total MG
        solids_data = solids_data[solids_data['Total_MG'].apply(lambda x: isMultipleOf(input['Total_MG'].iloc[0], x))]

        result = solids_data
    else:
        print('Not Solid or rule off')

        # Remove items where all units are not equal to the input units
        units = ['%', 'MCG', 'M', 'IU', 'MEQ', 'UN', 'HR', 'MMOL', 'BP', 'CAL', 'USP', 'MU', 'Total_ML', 'Total_MM', 'Total_MG']
        for unit in units:
            result = data[data[unit] == input[unit].iloc[0]]

    return result

In [59]:
# Test the function
getReplacements(input, data)

Not Solid or rule off


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputData['MG'] = df_updated_extracted.apply(lambda x: x['MG'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputData['%'] = df_updated_extracted.apply(lambda x: x['%'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputData['ML'] = df_updated_extracted.apply(lambda x: x['ML'])
A value is tryi

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,...,HR,MMOL,BP,CAL,USP,MU,Total_MG,Total_ML,Total_MM,Basic Form
38914,10059898,51672413101,100,1,51420,575,0,700,C,lamotrigine,...,,,,,,,100.0,,,Other/Undefined
31969,10049166,68382000801,100,1,51940,384,0,2311,C,lamotrigine,...,,,,,,,100.0,,,Other/Undefined
26027,10185319,62332003831,100,1,47374,601,0,650,N,lamotrigine,...,,,,,,,100.0,,,Other/Undefined
4780,10112023,65862022801,100,1,47512,601,0,650,N,lamotrigine,...,,,,,,,100.0,,,Other/Undefined
2795,10100298,13668004701,100,1,47500,647,0,700,N,lamotrigine,...,,,,,,,100.0,,,Other/Undefined
3636,10014951,173064255,100,1,218185,166197,0,181821,C,lamotrigine,...,,,,,,,100.0,,,Other/Undefined
38391,10233874,904700861,100,1,1662,907,0,1385,C,lamotrigine,...,,,,,,,100.0,,,Other/Undefined
35542,10221590,29300011205,500,1,238864,1770,0,3500,C,lamotrigine,...,,,,,,,100.0,,,Other/Undefined
29822,10276848,72888002605,500,1,237500,1631,0,1766,N,lamotrigine,...,,,,,,,100.0,,,Other/Undefined
4515,10016172,68382000810,1000,1,519400,3826,0,23110,C,lamotrigine,...,,,,,,,100.0,,,Other/Undefined
