In [36]:
def ExactDrugAlgoFunction(drug_code):
    # Item number of the drug to run the similarity test on
    reference_item_number = drug_code

    # Let's open and read the content of the uploaded file to understand its structure and data.
    file_path = 'Daily Snapshot.csv'

    # Reading the file
    import pandas as pd

    # Load the data from the CSV file
    data = pd.read_csv(file_path)

    # Removing the specified columns and reordering the 'Item Number – 8 digit' column
    columns_to_remove = ['Item Number – 6 digit', 'UPC Number', 'Constant', 
                        'Customer-Specific Item Number', 'Pack Size Divisor', 
                        'RX/OTC Indicator']

    # Removing the columns
    data_cleaned = data.drop(columns=columns_to_remove)

    # Reordering 'Item Number – 8 digit' to the left
    column_to_move = data_cleaned.pop('Item Number – 8 digit')
    data_cleaned.insert(0, 'Item Number – 8 digit', column_to_move)

    # Moving all price columns and the contract flag to the right
    columns_to_move = ['AWP Price', 'Acquisition Price', 'Retail Price', 'WAC Price', 'Contract Flag']
    for col in columns_to_move:
        data_cleaned[col] = data_cleaned.pop(col)

    import re

    # Function to split the generic description into generic name and form
    def split_description(desc):
        match = re.search(r'[A-Z]', desc)
        if match:
            index = match.start()
            return desc[:index].strip(), desc[index:].strip()
        else:
            return desc, ''

    # Applying the function to split 'Generic Description'
    data_cleaned['Generic Name'], data_cleaned['Form'] = zip(*data_cleaned['Generic Description'].apply(split_description))
    data_cleaned.drop(columns=['Generic Description'], inplace=True)

    # Removing rows where 'Generic Name' is empty or whitespace
    data_cleaned = data_cleaned[data_cleaned['Generic Name'].str.strip() != '']

    # Function to split the description into name and size
    def split_description_on_number(desc):
        match = re.search(r'\d', desc)
        if match:
            index = match.start()
            return desc[:index].strip(), desc[index:].strip()
        else:
            return desc, ''

    # Applying the function to split 'Description'
    data_cleaned['Name'], data_cleaned['Size'] = zip(*data_cleaned['Description'].apply(split_description_on_number))
    data_cleaned.drop(columns=['Description'], inplace=True)

    # Find the Generic Name for the given reference_item_number
    reference_generic_name = data_cleaned.loc[data_cleaned['Item Number – 8 digit'] == reference_item_number, 'Generic Name'].iloc[0]

    # Create a copy of the dataframe filtered by Generic Name
    data_lidocaine = data_cleaned[data_cleaned['Generic Name'] == reference_generic_name].copy()

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Selecting the reference item
    reference_item = data_lidocaine[data_lidocaine['Item Number – 8 digit'] == reference_item_number]
    if reference_item.empty:
        return "Reference item not found in the dataset."

    # Extracting the form of the reference item
    reference_form = reference_item.iloc[0]['Form']
    forms = data_lidocaine['Form'].tolist()
    forms.insert(0, reference_form)

    # Vectorizing the forms using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(forms)

    # Calculating cosine similarity
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    data_lidocaine['Similarity'] = cosine_similarities

    # Filtering the dataset to show only items with a similarity score above 0.9
    similarity_items = data_lidocaine[data_lidocaine['Similarity'] > 0].copy()

    # Extracting the size of the reference item
    reference_size = reference_item.iloc[0]['Size']
    sizes = similarity_items['Size'].tolist()
    sizes.insert(0, reference_size)

    # Vectorizing the sizes using TF-IDF
    tfidf_matrix_sizes = vectorizer.fit_transform(sizes)

    # Calculating cosine similarity for sizes
    cosine_similarities_sizes = cosine_similarity(tfidf_matrix_sizes[0:1], tfidf_matrix_sizes[1:]).flatten()
    similarity_items['Size Similarity'] = cosine_similarities_sizes

    # Filtering by size similarity
    high_similarity_items_filtered_by_size = similarity_items[similarity_items['Size Similarity'] > 0]

    # Remove the first row
    high_similarity_items_filtered_by_size = high_similarity_items_filtered_by_size.iloc[1:]
    
    high_similarity_items_filtered_by_size = high_similarity_items_filtered_by_size.sort_values(by=['Similarity'], ascending=False)
    high_similarity_items_filtered_by_size_2 = high_similarity_items_filtered_by_size.sort_values(by=['Size Similarity'], ascending=False)
    return high_similarity_items_filtered_by_size.head(10), high_similarity_items_filtered_by_size_2.head(10)

# Example usage
# results = ExactDrugAlgoFunction(your_drug_code)
# print(results.head())


In [37]:
similarity, size_similarity = ExactDrugAlgoFunction(10104126)
similarity

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size,Similarity,Size Similarity
3584,10104126,76329301205,25X5,1,17250,11036,0,14375,C,lidocaine,HCl MUCOUS MEM JEL/P,LIDOCAINE HCL,2% UROJET 25X5 M,1.0,1.0
3796,10105336,76329301105,25X5,1,18000,11036,0,15000,C,lidocaine,HCl MUCOUS MEM JEL/P,LIDOCAINE HCL,2% UROJET 25X5 M,1.0,1.0
3797,10105337,76329301305,25X10,1,19230,13180,0,16025,C,lidocaine,HCl MUCOUS MEM JEL/P,LIDOCAINE HCL,2% UROJET 25X10,1.0,0.559412
4014,10106047,76329301505,25X20,1,27750,19020,0,23125,C,lidocaine,HCl MUCOUS MEM JEL/P,LIDOCAINE HCL,2% UROJET 25X20,1.0,0.576749
2232,10096249,63323049507,25X5,1,10770,8287,0,8975,N,lidocaine,HCl/PF INJECTION VIA,XYLOCAINE-MPF,2% SDV 25X5 ML,0.061751,0.443028
9539,10025375,409427801,25X50,1,12962,9095,0,10802,C,lidocaine,HCl/PF INJECTION VIA,LIDOCAINE HCL PF,0.5% SDV 25X5,0.061751,0.491206
19090,10180260,143959525,25X5,1,7380,2715,0,6150,C,lidocaine,HCl/PF INJECTION VIA,LIDOCAINE HCL,1 % SDV 25X5 ML,0.061751,0.443028
19092,10180263,143959425,25X5,1,6630,2240,0,5525,C,lidocaine,HCl/PF INJECTION VIA,LIDOCAINE HCL,2 % SDV 25X5 ML,0.061751,0.443028
30952,10275575,71351002125,25X5,1,7469,2401,0,5975,C,lidocaine,HCl/PF INJECTION VIA,LIDOCAINE HCL MF,1 % SDV 25X5,0.061751,0.491206
9545,10025389,409428301,25X5,1,14504,7848,0,12087,C,lidocaine,HCl/PF INJECTION AMP,LIDOCAINE HCL PF,4% AMP 25X5 M,0.055919,0.445631


In [38]:
size_similarity

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size,Similarity,Size Similarity
3584,10104126,76329301205,25X5,1,17250,11036,0,14375,C,lidocaine,HCl MUCOUS MEM JEL/P,LIDOCAINE HCL,2% UROJET 25X5 M,1.0,1.0
3796,10105336,76329301105,25X5,1,18000,11036,0,15000,C,lidocaine,HCl MUCOUS MEM JEL/P,LIDOCAINE HCL,2% UROJET 25X5 M,1.0,1.0
4014,10106047,76329301505,25X20,1,27750,19020,0,23125,C,lidocaine,HCl MUCOUS MEM JEL/P,LIDOCAINE HCL,2% UROJET 25X20,1.0,0.576749
3797,10105337,76329301305,25X10,1,19230,13180,0,16025,C,lidocaine,HCl MUCOUS MEM JEL/P,LIDOCAINE HCL,2% UROJET 25X10,1.0,0.559412
9539,10025375,409427801,25X50,1,12962,9095,0,10802,C,lidocaine,HCl/PF INJECTION VIA,LIDOCAINE HCL PF,0.5% SDV 25X5,0.061751,0.491206
30952,10275575,71351002125,25X5,1,7469,2401,0,5975,C,lidocaine,HCl/PF INJECTION VIA,LIDOCAINE HCL MF,1 % SDV 25X5,0.061751,0.491206
9545,10025389,409428301,25X5,1,14504,7848,0,12087,C,lidocaine,HCl/PF INJECTION AMP,LIDOCAINE HCL PF,4% AMP 25X5 M,0.055919,0.445631
9581,10025442,409471302,25X5,1,5124,2170,0,4270,C,lidocaine,HCl/PF INJECTION AMP,LIDOCAINE HCL PF,1% AMP 25X5 M,0.055919,0.445631
2232,10096249,63323049507,25X5,1,10770,8287,0,8975,N,lidocaine,HCl/PF INJECTION VIA,XYLOCAINE-MPF,2% SDV 25X5 ML,0.061751,0.443028
19090,10180260,143959525,25X5,1,7380,2715,0,6150,C,lidocaine,HCl/PF INJECTION VIA,LIDOCAINE HCL,1 % SDV 25X5 ML,0.061751,0.443028
