In [44]:
# Let's open and read the content of the uploaded file to understand its structure and data.
file_path = 'Daily Snapshot.csv'

# Reading the file
import pandas as pd

# Load the data from the CSV file
data = pd.read_csv(file_path)
data.head()  # Display the first few rows of the dataframe to understand its structure.


Unnamed: 0,Item Number – 6 digit,NDC Number,UPC Number,Constant,Customer-Specific Item Number,Description,Pack Size Divisor,Size Qty,RX/OTC Indicator,AWP Price,Acquisition Price,Retail Price,Contract Flag,Generic Description,Retail Pack Quantity,WAC Price,Item Number – 8 digit
0,1091,51672408306.0,351672408367,,,HYDROCORTISONE BUTYRATE 0.1% O,45000,45.0,R,17928,9425,0,C,hydrocortisone butyrate TOPICA,1,14342,10000009
1,1103,23334807.0,300233348079,,,OZURDEX IMP 1 DS,1000,,R,164760,137300,0,N,dexamethasone INTRAOCULR IMPLA,1,137300,10083412
2,1141,93227234.0,300932272347,,,AMOXICILLIN-CLAV K 400-57 MG C,20000,20.0,R,11751,5038,0,C,amoxicillin/potassium clav ORA,1,8814,10000013
3,1457,51991062033.0,351991620334,,,ANASTROZOLE 1 MG TAB 30,30000,30.0,R,40038,831,0,N,anastrozole ORAL TABLET 1 MG,1,900,10083430
4,1479,,70610209162,,,BERRY OINTMENT JAR 916 WHITE 1,1000,12.0,O,0,4641,6962,N,,1,4641,10000056


In [45]:
# Removing the specified columns and reordering the 'Item Number – 8 digit' column
columns_to_remove = ['Item Number – 6 digit', 'UPC Number', 'Constant', 
                     'Customer-Specific Item Number', 'Pack Size Divisor', 
                     'RX/OTC Indicator']

# Removing the columns
data_cleaned = data.drop(columns=columns_to_remove)

# Reordering 'Item Number – 8 digit' to the left
column_to_move = data_cleaned.pop('Item Number – 8 digit')
data_cleaned.insert(0, 'Item Number – 8 digit', column_to_move)

# Displaying the first few rows of the modified dataframe
data_cleaned.head()


Unnamed: 0,Item Number – 8 digit,NDC Number,Description,Size Qty,AWP Price,Acquisition Price,Retail Price,Contract Flag,Generic Description,Retail Pack Quantity,WAC Price
0,10000009,51672408306.0,HYDROCORTISONE BUTYRATE 0.1% O,45.0,17928,9425,0,C,hydrocortisone butyrate TOPICA,1,14342
1,10083412,23334807.0,OZURDEX IMP 1 DS,,164760,137300,0,N,dexamethasone INTRAOCULR IMPLA,1,137300
2,10000013,93227234.0,AMOXICILLIN-CLAV K 400-57 MG C,20.0,11751,5038,0,C,amoxicillin/potassium clav ORA,1,8814
3,10083430,51991062033.0,ANASTROZOLE 1 MG TAB 30,30.0,40038,831,0,N,anastrozole ORAL TABLET 1 MG,1,900
4,10000056,,BERRY OINTMENT JAR 916 WHITE 1,12.0,0,4641,6962,N,,1,4641


In [46]:
# Moving all price columns and the contract flag to the right
columns_to_move = ['AWP Price', 'Acquisition Price', 'Retail Price', 'WAC Price', 'Contract Flag']

# Reordering the columns
for col in columns_to_move:
    data_cleaned[col] = data_cleaned.pop(col)

# Displaying the first few rows of the rearranged dataframe
data_cleaned.head()


Unnamed: 0,Item Number – 8 digit,NDC Number,Description,Size Qty,Generic Description,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag
0,10000009,51672408306.0,HYDROCORTISONE BUTYRATE 0.1% O,45.0,hydrocortisone butyrate TOPICA,1,17928,9425,0,14342,C
1,10083412,23334807.0,OZURDEX IMP 1 DS,,dexamethasone INTRAOCULR IMPLA,1,164760,137300,0,137300,N
2,10000013,93227234.0,AMOXICILLIN-CLAV K 400-57 MG C,20.0,amoxicillin/potassium clav ORA,1,11751,5038,0,8814,C
3,10083430,51991062033.0,ANASTROZOLE 1 MG TAB 30,30.0,anastrozole ORAL TABLET 1 MG,1,40038,831,0,900,N
4,10000056,,BERRY OINTMENT JAR 916 WHITE 1,12.0,,1,0,4641,6962,4641,N


In [47]:
import re

# Function to split the generic description into generic name and form
def split_description(desc):
    # Find the index of the first capital letter
    match = re.search(r'[A-Z]', desc)
    if match:
        index = match.start()
        return desc[:index].strip(), desc[index:].strip()
    else:
        return desc, ''

# Applying the function to the 'Generic Description' column
data_cleaned['Generic Name'], data_cleaned['Form'] = zip(*data_cleaned['Generic Description'].apply(split_description))

# Dropping the original 'Generic Description' column
data_cleaned.drop(columns=['Generic Description'], inplace=True)

# Displaying the first few rows of the updated dataframe
data_cleaned.head()


Unnamed: 0,Item Number – 8 digit,NDC Number,Description,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form
0,10000009,51672408306.0,HYDROCORTISONE BUTYRATE 0.1% O,45.0,1,17928,9425,0,14342,C,hydrocortisone butyrate,TOPICA
1,10083412,23334807.0,OZURDEX IMP 1 DS,,1,164760,137300,0,137300,N,dexamethasone,INTRAOCULR IMPLA
2,10000013,93227234.0,AMOXICILLIN-CLAV K 400-57 MG C,20.0,1,11751,5038,0,8814,C,amoxicillin/potassium clav,ORA
3,10083430,51991062033.0,ANASTROZOLE 1 MG TAB 30,30.0,1,40038,831,0,900,N,anastrozole,ORAL TABLET 1 MG
4,10000056,,BERRY OINTMENT JAR 916 WHITE 1,12.0,1,0,4641,6962,4641,N,,


In [48]:
# Removing rows where the 'Generic Name' is empty or consists only of whitespace
data_cleaned = data_cleaned[data_cleaned['Generic Name'].str.strip() != '']

# Displaying the first few rows of the updated dataframe
data_cleaned.head()


Unnamed: 0,Item Number – 8 digit,NDC Number,Description,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form
0,10000009,51672408306,HYDROCORTISONE BUTYRATE 0.1% O,45,1,17928,9425,0,14342,C,hydrocortisone butyrate,TOPICA
1,10083412,23334807,OZURDEX IMP 1 DS,,1,164760,137300,0,137300,N,dexamethasone,INTRAOCULR IMPLA
2,10000013,93227234,AMOXICILLIN-CLAV K 400-57 MG C,20,1,11751,5038,0,8814,C,amoxicillin/potassium clav,ORA
3,10083430,51991062033,ANASTROZOLE 1 MG TAB 30,30,1,40038,831,0,900,N,anastrozole,ORAL TABLET 1 MG
5,10083433,67457022005,ISOSULFAN BLUE 1% SDV 6X5 ML,6X5,1,763841,587712,0,636534,N,isosulfan blue,SUBCUT VIAL 1 %


In [49]:
# Function to split the description into name and size
def split_description_on_number(desc):
    # Find the index of the first number
    match = re.search(r'\d', desc)
    if match:
        index = match.start()
        return desc[:index].strip(), desc[index:].strip()
    else:
        return desc, ''

# Applying the function to the 'Description' column
data_cleaned['Name'], data_cleaned['Size'] = zip(*data_cleaned['Description'].apply(split_description_on_number))

# Dropping the original 'Description' column
data_cleaned.drop(columns=['Description'], inplace=True)

# Displaying the first few rows of the updated dataframe
data_cleaned.head()


Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size
0,10000009,51672408306,45,1,17928,9425,0,14342,C,hydrocortisone butyrate,TOPICA,HYDROCORTISONE BUTYRATE,0.1% O
1,10083412,23334807,,1,164760,137300,0,137300,N,dexamethasone,INTRAOCULR IMPLA,OZURDEX IMP,1 DS
2,10000013,93227234,20,1,11751,5038,0,8814,C,amoxicillin/potassium clav,ORA,AMOXICILLIN-CLAV K,400-57 MG C
3,10083430,51991062033,30,1,40038,831,0,900,N,anastrozole,ORAL TABLET 1 MG,ANASTROZOLE,1 MG TAB 30
5,10083433,67457022005,6X5,1,763841,587712,0,636534,N,isosulfan blue,SUBCUT VIAL 1 %,ISOSULFAN BLUE,1% SDV 6X5 ML


In [50]:
# Filtering the dataset for items with the exact generic name "lidocaine"
data_lidocaine = data_cleaned[data_cleaned['Generic Name'].str.lower() == 'cyclophosphamide']

data_lidocaine


Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size
7913,10141322,54038225,100.0,1,93620,69150,0,74894,N,cyclophosphamide,ORAL CAPSULE,CYCLOPHOSPHAM,25 MG CAP 100
7914,10141323,54038325,100.0,1,171811,126908,0,137450,N,cyclophosphamide,ORAL CAPSULE,CYCLOPHOSPHAM,50 MG CAP 100
8814,10145810,781323394,,1,41203,8144,0,28019,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,500 MG SDV
8815,10145698,781324494,,1,82406,18097,0,56036,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,1 GM SDV
8816,10145699,781325594,,1,164813,40899,0,112073,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,2 GM SDV
14060,10253816,10019098401,100.0,1,50519,15327,0,42099,C,cyclophosphamide,ORAL TABLET 5,CYCLOPHOSPHAMIDE,50 MG TAB 100
14061,10253817,10019098201,100.0,1,35352,10711,0,29460,C,cyclophosphamide,ORAL TABLET 2,CYCLOPHOSPHAMIDE,25 MG TAB 100
17105,10176522,38779050605,100.0,1,1618330,69750,0,69750,N,cyclophosphamide,MISCELL POWDE,CYCLOPHOSPHAMIDE USP,100 GM DS
21404,10251866,68001044432,,1,175800,135264,0,146500,N,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,2 GM SDV
21405,10251921,68001044327,,1,87900,67632,0,73250,N,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,1 GM SDV


In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Selecting the reference item
reference_item_number = 10061501
reference_item = data_lidocaine[data_lidocaine['Item Number – 8 digit'] == reference_item_number]

# Check if the reference item is found
if reference_item.empty:
    similarity_results = "Reference item not found in the dataset."
else:
    # Extracting the form of the reference item
    reference_form = reference_item.iloc[0]['Form']

    # Preparing data for TF-IDF vectorization (including the reference form)
    forms = data_lidocaine['Form'].tolist()
    forms.insert(0, reference_form)

    # Vectorizing the forms using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(forms)

    # Calculating cosine similarity
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # Adding the similarity scores to the lidocaine dataset
    # We exclude the first element of cosine_similarities as it is the similarity of the reference item with itself
    data_lidocaine['Form Similarity'] = cosine_similarities

    # Displaying the dataset with similarity scores
    similarity_results = data_lidocaine

  # Display the first few rows of the results

# Filtering the dataset to show only items with a similarity score above 0.9
similarity_items = data_lidocaine[data_lidocaine['Form Similarity'] > 0.9]

similarity_items



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_lidocaine['Form Similarity'] = cosine_similarities


Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size,Form Similarity
8814,10145810,781323394,,1,41203,8144,0,28019,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,500 MG SDV,1.0
8815,10145698,781324494,,1,82406,18097,0,56036,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,1 GM SDV,1.0
8816,10145699,781325594,,1,164813,40899,0,112073,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,2 GM SDV,1.0
21404,10251866,68001044432,,1,175800,135264,0,146500,N,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,2 GM SDV,1.0
21405,10251921,68001044327,,1,87900,67632,0,73250,N,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,1 GM SDV,1.0
21406,10251835,68001044226,,1,43950,33816,0,36625,N,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,500 MG SDV,1.0
22289,10253968,70860021803,2.5,1,43950,12580,0,36625,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,500 MG MDV 2.,1.0
22313,10254172,70860021805,5.0,1,87900,25183,0,73250,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,1 GM MDV 5 ML,1.0
22446,10238211,50742052005,5.0,1,87900,25183,0,73250,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,1 GM MDV 5 ML,1.0
22447,10238183,50742051902,2.5,1,43950,12580,0,36625,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,500 MG MDV 2.,1.0


In [52]:
# Extracting the size of the reference item
reference_size = reference_item.iloc[0]['Size']

# Preparing data for TF-IDF vectorization (including the reference size)
sizes = similarity_items['Size'].tolist()
sizes.insert(0, reference_size)

# Vectorizing the sizes using TF-IDF
tfidf_matrix_sizes = vectorizer.fit_transform(sizes)

# Calculating cosine similarity for sizes
cosine_similarities_sizes = cosine_similarity(tfidf_matrix_sizes[0:1], tfidf_matrix_sizes[1:]).flatten()

# Adding the similarity scores for sizes to the high_similarity_items dataset
# We exclude the first element of cosine_similarities_sizes as it is the similarity of the reference item with itself
similarity_items['Size Similarity'] = cosine_similarities_sizes

# Displaying the dataset with size similarity scores
high_similarity_items_filtered_by_size = similarity_items[similarity_items['Size Similarity'] > 0.25]
# high_similarity_items_filtered_by_size  # Display the first few rows of the results

# Sorting the results by 'Size Similarity' in descending order
sorted_similarity_items = high_similarity_items_filtered_by_size.sort_values(by='Size Similarity', ascending=False)
sorted_similarity_items  # Display the first few rows of the sorted results



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similarity_items['Size Similarity'] = cosine_similarities_sizes


Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size,Form Similarity,Size Similarity
39772,10061502,10019095601,,1,87900,27223,0,73250,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,1 GM SDV,1.0,1.0
8815,10145698,781324494,,1,82406,18097,0,56036,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,1 GM SDV,1.0,1.0
8816,10145699,781325594,,1,164813,40899,0,112073,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,2 GM SDV,1.0,1.0
21404,10251866,68001044432,,1,175800,135264,0,146500,N,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,2 GM SDV,1.0,1.0
21405,10251921,68001044327,,1,87900,67632,0,73250,N,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,1 GM SDV,1.0,1.0
39771,10061501,10019095701,,1,175800,51724,0,146500,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,2 GM SDV,1.0,1.0
31799,10188531,70121124001,,1,158220,37054,0,131850,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,2 GM SDV,1.0,1.0
31798,10188530,70121123901,,1,79110,17368,0,65925,C,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,1 GM SDV,1.0,1.0
27507,10281650,43598066111,2.0,1,87600,67401,0,73000,N,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,1 GM SDV 2 ML,1.0,0.625084
26602,10261764,55150027101,5.0,1,87600,67401,0,73000,N,cyclophosphamide,INTRAVEN VIAL,CYCLOPHOSPHAMIDE,1 GM SDV 5 ML,1.0,0.625084
