## Input Data Cleaning and Drug Similarity Calculation

In [9]:
import pandas as pd
import numpy as np


def CleanData(file_path):
    # Reading the file

    # Load the data from the CSV file
    data = pd.read_csv(file_path)

    # Removing the specified columns and reordering the 'Item Number – 8 digit' column
    columns_to_remove = ['Item Number – 6 digit', 'UPC Number', 'Constant', 
                        'Customer-Specific Item Number', 'Pack Size Divisor', 
                        'RX/OTC Indicator']

    # Removing the columns
    data_cleaned = data.drop(columns=columns_to_remove)

    # Reordering 'Item Number – 8 digit' to the left
    column_to_move = data_cleaned.pop('Item Number – 8 digit')
    data_cleaned.insert(0, 'Item Number – 8 digit', column_to_move)

    # Moving all price columns and the contract flag to the right
    columns_to_move = ['AWP Price', 'Acquisition Price', 'Retail Price', 'WAC Price', 'Contract Flag']
    for col in columns_to_move:
        data_cleaned[col] = data_cleaned.pop(col)

    import re

    # Function to split the generic description into generic name and form
    def split_description(desc):
        match = re.search(r'[A-Z]', desc)
        if match:
            index = match.start()
            return desc[:index].strip(), desc[index:].strip()
        else:
            return desc, ''

    # Applying the function to split 'Generic Description'
    data_cleaned['Generic Name'], data_cleaned['Form'] = zip(*data_cleaned['Generic Description'].apply(split_description))
    data_cleaned.drop(columns=['Generic Description'], inplace=True)

    # Removing rows where 'Generic Name' is empty or whitespace
    data_cleaned = data_cleaned[data_cleaned['Generic Name'].str.strip() != '']

    # Function to split the description into name and size
    def split_description_on_number(desc):
        match = re.search(r'\d', desc)
        if match:
            index = match.start()
            return desc[:index].strip(), desc[index:].strip()
        else:
            return desc, ''

    # Applying the function to split 'Description'
    data_cleaned['Name'], data_cleaned['Size'] = zip(*data_cleaned['Description'].apply(split_description_on_number))
    data_cleaned.drop(columns=['Description'], inplace=True)

    data_cleaned
    return data_cleaned

In [10]:
def ExactDrugAlgoFunction(drug_code, data):
    # Item number of the drug to run the similarity test on
    reference_item_number = drug_code

    # Find and print the row for the given reference_item_number
    matching_row = data[data['Item Number – 8 digit'] == reference_item_number]
    

    # Find the Generic Name for the given reference_item_number
    reference_generic_name = data.loc[data['Item Number – 8 digit'] == reference_item_number, 'Generic Name'].iloc[0]


    # Create a copy of the dataframe filtered by Generic Name
    data_generic = data[data['Generic Name'] == reference_generic_name].copy()

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Selecting the reference item
    reference_item = data_generic[data_generic['Item Number – 8 digit'] == reference_item_number]
    if reference_item.empty:
        return "Reference item not found in the dataset."

    # Extracting the form of the reference item
    reference_form = reference_item.iloc[0]['Form']
    forms = data_generic['Form'].tolist()
    forms.insert(0, reference_form)

    # Vectorizing the forms using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(forms)

    # Calculating cosine similarity
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    data_generic['Similarity'] = cosine_similarities

    # Filtering the dataset to show only items with a similarity score above 0.9
    similarity_items = data_generic[data_generic['Similarity'] > 0].copy()

    # Extracting the size of the reference item
    reference_size = reference_item.iloc[0]['Size']
    sizes = similarity_items['Size'].tolist()
    sizes.insert(0, reference_size)

    # Vectorizing the sizes using TF-IDF
    tfidf_matrix_sizes = vectorizer.fit_transform(sizes)

    # Calculating cosine similarity for sizes
    cosine_similarities_sizes = cosine_similarity(tfidf_matrix_sizes[0:1], tfidf_matrix_sizes[1:]).flatten()
    similarity_items['Size Similarity'] = cosine_similarities_sizes

    # Remove input item
    similarity_items = similarity_items[similarity_items['Item Number – 8 digit'] != reference_item_number]

    # Define true similarity
    w1 = 1
    w2 = 1
    similarity_items['True Similarity'] = (w1 * similarity_items['Similarity'] + w2 * similarity_items['Size Similarity']) / (w1 + w2)

    # Cleaning and sort the data
    similarity_items = similarity_items.drop(columns=['Similarity', 'Size Similarity'])
    similarity_items = similarity_items.sort_values(by=['True Similarity'], ascending=False)
    
    
    return similarity_items, matching_row

In [11]:
inputData = CleanData('Daily Snapshot.csv')
data, input = ExactDrugAlgoFunction(10000077, inputData)

input

# data.head(10)

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size
6,10000077,3161112,30,1,164698,130386,0,137248,N,entecavir,ORAL TABLET 0.5 MG,BARACLUDE,0.5 MG TAB 30


In [13]:
data.head(20)

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size,True Similarity
7,10000082,3161212,30,1,164698,130386,0,137248,N,entecavir,ORAL TABLET 1 MG,BARACLUDE,1 MG TAB 30,1.0
19885,10249597,42806065830,30,1,133305,739,0,800,N,entecavir,ORAL TABLET 0.5 MG,ENTECAVIR,0.5 MG TAB 30,1.0
36541,10229260,43547043703,30,1,133305,9233,0,10000,N,entecavir,ORAL TABLET 1 MG,ENTECAVIR,1 MG TAB 30,1.0
28613,10186738,51991089633,30,1,133305,18466,0,20000,N,entecavir,ORAL TABLET 1 MG,ENTECAVIR,1 MG TAB 30,1.0
24759,10184880,68382092106,30,1,133305,7239,0,7840,N,entecavir,ORAL TABLET 1 MG,ENTECAVIR,1 MG TAB 30,1.0
24756,10184859,68382092006,30,1,133305,7239,0,7840,N,entecavir,ORAL TABLET 0.5 MG,ENTECAVIR,0.5 MG TAB 30,1.0
19886,10249686,42806065930,30,1,133305,831,0,900,N,entecavir,ORAL TABLET 1 MG,ENTECAVIR,1 MG TAB 30,1.0
36542,10229261,43547043603,30,1,133305,9233,0,10000,N,entecavir,ORAL TABLET 0.5 MG,ENTECAVIR,0.5 MG TAB 30,1.0
12320,10161780,31722083330,30,1,133297,2954,0,4000,C,entecavir,ORAL TABLET 0.5 MG,ENTECAVIR,0.5MG TAB 30,0.706612
12119,10160949,65862084130,30,1,133305,5910,0,6400,N,entecavir,ORAL TABLET 0.5 MG,ENTECAVIR,0.5MG TAB 30,0.706612


### Exract Dosage Information

In [19]:
import re
# Adjust function to extract dosage information including MG, %, and ML
def extractDosage(size_str):
    # Ensure the input is a string
    size_str = str(size_str)
    # Initialize a dictionary to hold the extracted values
    extracted_values = {'MG': None, '%': None, 'ML': None, 'GM': None, 'MCG': None, 'M': None, 'OZ': None, 'IU': None, 'MEQ': None, 'UN': None, 'MM': None, 'HR': None, 'MMOL': None, 'KG': None, 'BP': None, 'L': None, 'CM': None, 'CC': None, 'CAL': None, 'LB': None, 'IN': None, 'GR': None, 'GAL': None, 'LT': None, 'USP': None}

    # Turn string to uppercase for case-insensitive matching
    size_str = size_str.upper()
    
    # Simplified logic for MG, %, ML extraction
    # Looks for a number (with optional decimal) immediately before the unit, with optional space
    for unit in ['MG', '%', 'ML', 'GM', 'MCG', 'M', 'OZ', 'IU', 'MEQ', 'UN', 'MM', 'HR', 'MMOL', 'KG', 'BP', 'L', 'CM', 'CC', 'CAL', 'LB', 'IN', 'GR', 'GAL', 'LT', 'USP']:
        pattern = r'(\d+(?:\.\d*)?)\s*{}'.format(unit)
        match = re.search(pattern, size_str, re.IGNORECASE)
        if match:
            # Convert matched value to float and assign to the correct unit
            extracted_values[unit] = float(match.group(1))
    
    return extracted_values


# Apply the adjusted function to extract all values
df_updated_extracted = inputData['Size'].apply(extractDosage)

# Update the DataFrame with the new extracted values
inputData['MG'] = df_updated_extracted.apply(lambda x: x['MG'])
inputData['%'] = df_updated_extracted.apply(lambda x: x['%'])
inputData['ML'] = df_updated_extracted.apply(lambda x: x['ML'])
inputData['GM'] = df_updated_extracted.apply(lambda x: x['GM'])
inputData['MCG'] = df_updated_extracted.apply(lambda x: x['MCG'])
inputData['M'] = df_updated_extracted.apply(lambda x: x['M'])
inputData['OZ'] = df_updated_extracted.apply(lambda x: x['OZ'])
inputData['IU'] = df_updated_extracted.apply(lambda x: x['IU'])
inputData['MEQ'] = df_updated_extracted.apply(lambda x: x['MEQ'])
inputData['UN'] = df_updated_extracted.apply(lambda x: x['UN'])
inputData['MM'] = df_updated_extracted.apply(lambda x: x['MM'])
inputData['HR'] = df_updated_extracted.apply(lambda x: x['HR'])
inputData['MMOL'] = df_updated_extracted.apply(lambda x: x['MMOL'])
inputData['KG'] = df_updated_extracted.apply(lambda x: x['KG'])
inputData['BP'] = df_updated_extracted.apply(lambda x: x['BP'])
inputData['L'] = df_updated_extracted.apply(lambda x: x['L'])
inputData['CM'] = df_updated_extracted.apply(lambda x: x['CM'])
inputData['CC'] = df_updated_extracted.apply(lambda x: x['CC'])
inputData['CAL'] = df_updated_extracted.apply(lambda x: x['CAL'])
inputData['LB'] = df_updated_extracted.apply(lambda x: x['LB'])
inputData['IN'] = df_updated_extracted.apply(lambda x: x['IN'])
inputData['GR'] = df_updated_extracted.apply(lambda x: x['GR'])
inputData['GAL'] = df_updated_extracted.apply(lambda x: x['GAL'])
inputData['LT'] = df_updated_extracted.apply(lambda x: x['LT'])
inputData['USP'] = df_updated_extracted.apply(lambda x: x['USP'])


# Display the updated DataFrame
inputData.to_csv('inputDataProcessed.csv', index=False)
inputData.head(10)

## Package Analysis