In [3]:
import pandas as pd
import random
from sklearn.feature_extraction.text import TfidfVectorizer

def select_random_item_number():
    # Load the data from the CSV file
    data = pd.read_csv('Daily Snapshot.csv')

    # Check if the 'Item Number – 8 digit' column exists
    if 'Item Number – 8 digit' in data.columns:
        # Select a random 'Item Number – 8 digit'
        random_item_number = random.choice(data['Item Number – 8 digit'].tolist())
        return random_item_number

def ExactDrugAlgoFunction(drug_code):
    # Item number of the drug to run the similarity test on
    reference_item_number = drug_code

    # Let's open and read the content of the uploaded file to understand its structure and data.
    file_path = 'Daily Snapshot.csv'

    # Reading the file

    # Load the data from the CSV file
    data = pd.read_csv(file_path)

    # Removing the specified columns and reordering the 'Item Number – 8 digit' column
    columns_to_remove = ['Item Number – 6 digit', 'UPC Number', 'Constant', 
                        'Customer-Specific Item Number', 'Pack Size Divisor', 
                        'RX/OTC Indicator']

    # Removing the columns
    data_cleaned = data.drop(columns=columns_to_remove)

    # Reordering 'Item Number – 8 digit' to the left
    column_to_move = data_cleaned.pop('Item Number – 8 digit')
    data_cleaned.insert(0, 'Item Number – 8 digit', column_to_move)

    # Moving all price columns and the contract flag to the right
    columns_to_move = ['AWP Price', 'Acquisition Price', 'Retail Price', 'WAC Price', 'Contract Flag']
    for col in columns_to_move:
        data_cleaned[col] = data_cleaned.pop(col)

    import re

    # Function to split the generic description into generic name and form
    def split_description(desc):
        match = re.search(r'[A-Z]', desc)
        if match:
            index = match.start()
            return desc[:index].strip(), desc[index:].strip()
        else:
            return desc, ''

    # Applying the function to split 'Generic Description'
    data_cleaned['Generic Name'], data_cleaned['Form'] = zip(*data_cleaned['Generic Description'].apply(split_description))
    data_cleaned.drop(columns=['Generic Description'], inplace=True)

    # Removing rows where 'Generic Name' is empty or whitespace
    data_cleaned = data_cleaned[data_cleaned['Generic Name'].str.strip() != '']

    # Function to split the description into name and size
    def split_description_on_number(desc):
        match = re.search(r'\d', desc)
        if match:
            index = match.start()
            return desc[:index].strip(), desc[index:].strip()
        else:
            return desc, ''

    # Applying the function to split 'Description'
    data_cleaned['Name'], data_cleaned['Size'] = zip(*data_cleaned['Description'].apply(split_description_on_number))
    data_cleaned.drop(columns=['Description'], inplace=True)

    # Find and print the row for the given reference_item_number
    matching_row = data_cleaned[data_cleaned['Item Number – 8 digit'] == reference_item_number]
    

    # Find the Generic Name for the given reference_item_number
    reference_generic_name = data_cleaned.loc[data_cleaned['Item Number – 8 digit'] == reference_item_number, 'Generic Name'].iloc[0]


    # Create a copy of the dataframe filtered by Generic Name
    data_lidocaine = data_cleaned[data_cleaned['Generic Name'] == reference_generic_name].copy()

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Selecting the reference item
    reference_item = data_lidocaine[data_lidocaine['Item Number – 8 digit'] == reference_item_number]
    if reference_item.empty:
        return "Reference item not found in the dataset."

    # Extracting the form of the reference item
    reference_form = reference_item.iloc[0]['Form']
    forms = data_lidocaine['Form'].tolist()
    forms.insert(0, reference_form)

    # Vectorizing the forms using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(forms)

    # Calculating cosine similarity
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    data_lidocaine['Similarity'] = cosine_similarities

    # Filtering the dataset to show only items with a similarity score above 0.9
    similarity_items = data_lidocaine[data_lidocaine['Similarity'] > 0].copy()

    # Extracting the size of the reference item
    reference_size = reference_item.iloc[0]['Size']
    sizes = similarity_items['Size'].tolist()
    sizes.insert(0, reference_size)

    # Vectorizing the sizes using TF-IDF
    tfidf_matrix_sizes = vectorizer.fit_transform(sizes)

    # Calculating cosine similarity for sizes
    cosine_similarities_sizes = cosine_similarity(tfidf_matrix_sizes[0:1], tfidf_matrix_sizes[1:]).flatten()
    similarity_items['Size Similarity'] = cosine_similarities_sizes

    # Filtering by size similarity
    high_similarity_items_filtered_by_size = similarity_items[similarity_items['Size Similarity'] > 0]

    # Remove the first row
    high_similarity_items_filtered_by_size = high_similarity_items_filtered_by_size.iloc[1:]
    
    high_similarity_items_filtered_by_size = high_similarity_items_filtered_by_size.sort_values(by=['Similarity'], ascending=False)
    high_similarity_items_filtered_by_size_2 = high_similarity_items_filtered_by_size.sort_values(by=['Size Similarity'], ascending=False)
    return high_similarity_items_filtered_by_size.head(10), high_similarity_items_filtered_by_size_2.head(10), matching_row

# Example usage
# results = ExactDrugAlgoFunction(your_drug_code)
# print(results.head())


In [88]:
random_item_number = select_random_item_number()
similarity, size_similarity, matching_row = ExactDrugAlgoFunction(10016113)
matching_row

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size
4452,10016113,59762332801,100,1,11911,1846,0,1999,N,clindamycin,HCl ORAL CAPSULE 1,CLINDAMYCIN,150 MG CAP 100


In [89]:
similarity

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size,Similarity,Size Similarity
2137,10011211,59762501002,100,1,37594,4231,0,4582,N,clindamycin,HCl ORAL CAPSULE 3,CLINDAMYCIN HCL,300 MG CAP 100,1.0,0.55659
15957,10041075,68084024401,100,1,12982,6151,0,11058,C,clindamycin,HCl ORAL CAPSULE 3,CLINDAMYCIN HCL,300 MG CAP 100,1.0,0.55659
39529,10237145,51407037301,100,1,4281,1318,0,1427,N,clindamycin,HCl ORAL CAPSULE 1,CLINDAMYCIN HCL,150 MG CAP 100,1.0,1.0
36552,10229225,42571025201,100,1,37171,1831,0,4300,C,clindamycin,HCl ORAL CAPSULE 3,CLINDAMYCIN HCL,300 MG CAP 100,1.0,0.55659
36522,10229212,42571025001,100,1,7249,2787,0,4995,C,clindamycin,HCl ORAL CAPSULE 7,CLINDAMYCIN HCL,75 MG CAP 100,1.0,0.400375
36451,10229003,42571025101,100,1,11915,922,0,1435,C,clindamycin,HCl ORAL CAPSULE 1,CLINDAMYCIN HCL,150 MG CAP 100,1.0,1.0
36105,10055487,63304069205,500,1,59575,6887,0,7459,N,clindamycin,HCl ORAL CAPSULE 1,CLINDAMYCIN,150 MG CAP 500,1.0,0.550392
32804,10189304,65862018601,100,1,37171,3973,0,4302,N,clindamycin,HCl ORAL CAPSULE 3,CLINDAMYCIN HCL,300 MG CAP 100,1.0,0.55659
26983,10265524,904719461,100,1,13069,10056,0,10891,N,clindamycin,HCl ORAL CAPSULE 3,CLINDAMYCIN HCL,300MG CAP 100,1.0,0.281261
15956,10041074,68084024301,100,1,6271,1645,0,4390,C,clindamycin,HCl ORAL CAPSULE 1,CLINDAMYCIN,150 MG CAP 100 UD,1.0,0.673491


In [90]:
size_similarity

Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size,Similarity,Size Similarity
39529,10237145,51407037301,100,1,4281,1318,0,1427,N,clindamycin,HCl ORAL CAPSULE 1,CLINDAMYCIN HCL,150 MG CAP 100,1.0,1.0
4452,10016113,59762332801,100,1,11911,1846,0,1999,N,clindamycin,HCl ORAL CAPSULE 1,CLINDAMYCIN,150 MG CAP 100,1.0,1.0
5093,10017208,63304069201,100,1,11915,1180,0,1435,C,clindamycin,HCl ORAL CAPSULE 1,CLINDAMYCIN,150 MG CAP 100,1.0,1.0
36451,10229003,42571025101,100,1,11915,922,0,1435,C,clindamycin,HCl ORAL CAPSULE 1,CLINDAMYCIN HCL,150 MG CAP 100,1.0,1.0
7213,10020599,9022502,100,1,1747,1384,0,1456,N,clindamycin,HCl ORAL CAPSULE 1,CLEOCIN,150 MG CAP 100,1.0,1.0
15956,10041074,68084024301,100,1,6271,1645,0,4390,C,clindamycin,HCl ORAL CAPSULE 1,CLINDAMYCIN,150 MG CAP 100 UD,1.0,0.673491
11307,10029943,904595961,100,1,7305,3046,0,3299,N,clindamycin,HCl ORAL CAPSULE 1,CLINDAMYCIN,150 MG CAP 100 UD,1.0,0.673491
15957,10041075,68084024401,100,1,12982,6151,0,11058,C,clindamycin,HCl ORAL CAPSULE 3,CLINDAMYCIN HCL,300 MG CAP 100,1.0,0.55659
7464,10021238,9039514,100,1,3410,2700,0,2842,N,clindamycin,HCl ORAL CAPSULE 3,CLEOCIN HCL,300 MG CAP 100,1.0,0.55659
2137,10011211,59762501002,100,1,37594,4231,0,4582,N,clindamycin,HCl ORAL CAPSULE 3,CLINDAMYCIN HCL,300 MG CAP 100,1.0,0.55659
