In [15]:
import pandas as pd
import random
from sklearn.feature_extraction.text import TfidfVectorizer

def select_random_item_number():
    # Load the data from the CSV file
    data = pd.read_csv('Daily Snapshot.csv')

    # Check if the 'Item Number – 8 digit' column exists
    if 'Item Number – 8 digit' in data.columns:
        # Select a random 'Item Number – 8 digit'
        random_item_number = random.choice(data['Item Number – 8 digit'].tolist())
        return random_item_number

def ExactDrugAlgoFunction(drug_code):
    # Item number of the drug to run the similarity test on
    reference_item_number = drug_code

    # Let's open and read the content of the uploaded file to understand its structure and data.
    file_path = 'Daily Snapshot.csv'

    # Reading the file

    # Load the data from the CSV file
    data = pd.read_csv(file_path)

    # Removing the specified columns and reordering the 'Item Number – 8 digit' column
    columns_to_remove = ['Item Number – 6 digit', 'UPC Number', 'Constant', 
                        'Customer-Specific Item Number', 'Pack Size Divisor', 
                        'RX/OTC Indicator']

    # Removing the columns
    data_cleaned = data.drop(columns=columns_to_remove)

    # Reordering 'Item Number – 8 digit' to the left
    column_to_move = data_cleaned.pop('Item Number – 8 digit')
    data_cleaned.insert(0, 'Item Number – 8 digit', column_to_move)

    # Moving all price columns and the contract flag to the right
    columns_to_move = ['AWP Price', 'Acquisition Price', 'Retail Price', 'WAC Price', 'Contract Flag']
    for col in columns_to_move:
        data_cleaned[col] = data_cleaned.pop(col)

    import re

    # Function to split the generic description into generic name and form
    def split_description(desc):
        match = re.search(r'[A-Z]', desc)
        if match:
            index = match.start()
            return desc[:index].strip(), desc[index:].strip()
        else:
            return desc, ''

    # Applying the function to split 'Generic Description'
    data_cleaned['Generic Name'], data_cleaned['Form'] = zip(*data_cleaned['Generic Description'].apply(split_description))
    data_cleaned.drop(columns=['Generic Description'], inplace=True)

    # Removing rows where 'Generic Name' is empty or whitespace
    data_cleaned = data_cleaned[data_cleaned['Generic Name'].str.strip() != '']

    # Function to split the description into name and size
    def split_description_on_number(desc):
        match = re.search(r'\d', desc)
        if match:
            index = match.start()
            return desc[:index].strip(), desc[index:].strip()
        else:
            return desc, ''

    # Applying the function to split 'Description'
    data_cleaned['Name'], data_cleaned['Size'] = zip(*data_cleaned['Description'].apply(split_description_on_number))
    data_cleaned.drop(columns=['Description'], inplace=True)

    # Find and print the row for the given reference_item_number
    matching_row = data_cleaned[data_cleaned['Item Number – 8 digit'] == reference_item_number]
    

    # Find the Generic Name for the given reference_item_number
    reference_generic_name = data_cleaned.loc[data_cleaned['Item Number – 8 digit'] == reference_item_number, 'Generic Name'].iloc[0]


    # Create a copy of the dataframe filtered by Generic Name
    data_lidocaine = data_cleaned[data_cleaned['Generic Name'] == reference_generic_name].copy()

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Selecting the reference item
    reference_item = data_lidocaine[data_lidocaine['Item Number – 8 digit'] == reference_item_number]
    if reference_item.empty:
        return "Reference item not found in the dataset."

    # Extracting the form of the reference item
    reference_form = reference_item.iloc[0]['Form']
    forms = data_lidocaine['Form'].tolist()
    forms.insert(0, reference_form)

    # Vectorizing the forms using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(forms)

    # Calculating cosine similarity
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    data_lidocaine['Similarity'] = cosine_similarities

    # Filtering the dataset to show only items with a similarity score above 0.9
    similarity_items = data_lidocaine[data_lidocaine['Similarity'] > 0].copy()

    # Extracting the size of the reference item
    reference_size = reference_item.iloc[0]['Size']
    sizes = similarity_items['Size'].tolist()
    sizes.insert(0, reference_size)

    # Vectorizing the sizes using TF-IDF
    tfidf_matrix_sizes = vectorizer.fit_transform(sizes)

    # Calculating cosine similarity for sizes
    cosine_similarities_sizes = cosine_similarity(tfidf_matrix_sizes[0:1], tfidf_matrix_sizes[1:]).flatten()
    similarity_items['Size Similarity'] = cosine_similarities_sizes

    # Filtering by size similarity
    high_similarity_items_filtered_by_size = similarity_items[similarity_items['Size Similarity'] > 0]

    # Remove the first row
    high_similarity_items_filtered_by_size = high_similarity_items_filtered_by_size.iloc[1:]
    
    high_similarity_items_filtered_by_size = high_similarity_items_filtered_by_size.sort_values(by=['Similarity'], ascending=False)
    high_similarity_items_filtered_by_size_2 = high_similarity_items_filtered_by_size.sort_values(by=['Size Similarity'], ascending=False)
    return high_similarity_items_filtered_by_size, matching_row

# Example usage
# results = ExactDrugAlgoFunction(your_drug_code)
# print(results.head())


In [57]:
#random_item_number = select_random_item_number()
data, input = ExactDrugAlgoFunction(10283862)

In [60]:
input

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size
23262,10283862,57896098016,473,1,539,446,669,405,N,diphenhydramine,HCl ORAL LIQUI,GERI-DRYL ALRGY RLF,12.5MG-5ML


In [59]:
data.head(10)

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size,Similarity,Size Similarity
16689,10042965,24385037926,4,1,449,110,165,100,N,diphenhydramine,HCl ORAL LIQUI,GNP DIPHENHYDRA CHILD,12.5MG C,1.0,0.755689
20306,10253743,69339015117,40X5,1,10200,5927,8891,8500,C,diphenhydramine,HCl ORAL LIQUI,DIPHENHYDRAMINE HCL,12.5MG SOL,1.0,0.62203
20308,10253805,69339015119,100X5,1,24612,13815,20723,20510,C,diphenhydramine,HCl ORAL LIQUI,DIPHENHYDRAMINE,12.5 MG SOL 10,1.0,0.225527
20763,10252187,121086530,30X5,1,7674,7035,10553,7035,N,diphenhydramine,HCl ORAL LIQUI,DIPHENHYDRAMINE,12.5MG SOL 30X,1.0,0.438869
22385,10255186,121086500,100X5,1,24612,10285,15428,20510,C,diphenhydramine,HCl ORAL LIQUI,DIPHENHYDRAMINE,12.5MG SOL 100,1.0,0.512303
23262,10283862,57896098016,473,1,539,446,669,405,N,diphenhydramine,HCl ORAL LIQUI,GERI-DRYL ALRGY RLF,12.5MG-5ML,1.0,1.0
26646,10261857,81033000350,50X5,1,0,6875,10313,6875,N,diphenhydramine,HCl ORAL LIQUI,DIPHENHYDRAMINE HCL,12.5MG SOL,1.0,0.62203
29592,10280059,57237030512,118,1,540,495,743,495,N,diphenhydramine,HCl ORAL LIQUI,DIPHENHYDRAMINE HCL,12.5MG-5ML,1.0,1.0
36654,10229357,58657052816,473,1,1796,1306,1959,1306,N,diphenhydramine,HCl ORAL LIQUI,M-DRYL,12.5MG/5ML SOL 473 ML,1.0,0.611085
36655,10229359,58657052804,120,1,449,373,560,373,N,diphenhydramine,HCl ORAL LIQUI,M-DRYL,12.5MG/5ML SOL 120 ML,1.0,0.611085


In [65]:
w1 = 2
w2 = 1
data['True Similarity'] = (w1 * data['Similarity'] + w2 * data['Size Similarity']) / (w1 + w2)

data = data.sort_values(by=['True Similarity'], ascending=False)

data.head(20)

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size,Similarity,Size Similarity,True Similarity
23262,10283862,57896098016,473,1,539,446,669,405,N,diphenhydramine,HCl ORAL LIQUI,GERI-DRYL ALRGY RLF,12.5MG-5ML,1.0,1.0,1.0
29592,10280059,57237030512,118,1,540,495,743,495,N,diphenhydramine,HCl ORAL LIQUI,DIPHENHYDRAMINE HCL,12.5MG-5ML,1.0,1.0,1.0
36769,10229731,904698516,473,1,564,239,359,470,C,diphenhydramine,HCl ORAL LIQUI,DIPHENHYDRAMINE,12.5MG-5ML SOL,1.0,0.886734,0.962245
38207,10233480,904698520,118,1,287,120,180,239,C,diphenhydramine,HCl ORAL LIQUI,DIPHENHYDRAMINE,12.5MG-5ML SOL,1.0,0.886734,0.962245
16689,10042965,24385037926,4,1,449,110,165,100,N,diphenhydramine,HCl ORAL LIQUI,GNP DIPHENHYDRA CHILD,12.5MG C,1.0,0.755689,0.918563
20306,10253743,69339015117,40X5,1,10200,5927,8891,8500,C,diphenhydramine,HCl ORAL LIQUI,DIPHENHYDRAMINE HCL,12.5MG SOL,1.0,0.62203,0.87401
26646,10261857,81033000350,50X5,1,0,6875,10313,6875,N,diphenhydramine,HCl ORAL LIQUI,DIPHENHYDRAMINE HCL,12.5MG SOL,1.0,0.62203,0.87401
36654,10229357,58657052816,473,1,1796,1306,1959,1306,N,diphenhydramine,HCl ORAL LIQUI,M-DRYL,12.5MG/5ML SOL 473 ML,1.0,0.611085,0.870362
36655,10229359,58657052804,120,1,449,373,560,373,N,diphenhydramine,HCl ORAL LIQUI,M-DRYL,12.5MG/5ML SOL 120 ML,1.0,0.611085,0.870362
22385,10255186,121086500,100X5,1,24612,10285,15428,20510,C,diphenhydramine,HCl ORAL LIQUI,DIPHENHYDRAMINE,12.5MG SOL 100,1.0,0.512303,0.837434


In [62]:
input

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size
23262,10283862,57896098016,473,1,539,446,669,405,N,diphenhydramine,HCl ORAL LIQUI,GERI-DRYL ALRGY RLF,12.5MG-5ML
