In [4]:
# Let's open and read the content of the uploaded file to understand its structure and data.
file_path = 'Daily Snapshot.csv'

# Reading the file
import pandas as pd

# Load the data from the CSV file
data = pd.read_csv(file_path)
data.head()  # Display the first few rows of the dataframe to understand its structure.


Unnamed: 0,Item Number – 6 digit,NDC Number,UPC Number,Constant,Customer-Specific Item Number,Description,Pack Size Divisor,Size Qty,RX/OTC Indicator,AWP Price,Acquisition Price,Retail Price,Contract Flag,Generic Description,Retail Pack Quantity,WAC Price,Item Number – 8 digit
0,1091,51672408306.0,351672408367,,,HYDROCORTISONE BUTYRATE 0.1% O,45000,45.0,R,17928,9425,0,C,hydrocortisone butyrate TOPICA,1,14342,10000009
1,1103,23334807.0,300233348079,,,OZURDEX IMP 1 DS,1000,,R,164760,137300,0,N,dexamethasone INTRAOCULR IMPLA,1,137300,10083412
2,1141,93227234.0,300932272347,,,AMOXICILLIN-CLAV K 400-57 MG C,20000,20.0,R,11751,5038,0,C,amoxicillin/potassium clav ORA,1,8814,10000013
3,1457,51991062033.0,351991620334,,,ANASTROZOLE 1 MG TAB 30,30000,30.0,R,40038,831,0,N,anastrozole ORAL TABLET 1 MG,1,900,10083430
4,1479,,70610209162,,,BERRY OINTMENT JAR 916 WHITE 1,1000,12.0,O,0,4641,6962,N,,1,4641,10000056


In [5]:
# Removing the specified columns and reordering the 'Item Number – 8 digit' column
columns_to_remove = ['Item Number – 6 digit', 'UPC Number', 'Constant', 
                     'Customer-Specific Item Number', 'Pack Size Divisor', 
                     'RX/OTC Indicator']

# Removing the columns
data_cleaned = data.drop(columns=columns_to_remove)

# Reordering 'Item Number – 8 digit' to the left
column_to_move = data_cleaned.pop('Item Number – 8 digit')
data_cleaned.insert(0, 'Item Number – 8 digit', column_to_move)

# Displaying the first few rows of the modified dataframe
data_cleaned.head()


Unnamed: 0,Item Number – 8 digit,NDC Number,Description,Size Qty,AWP Price,Acquisition Price,Retail Price,Contract Flag,Generic Description,Retail Pack Quantity,WAC Price
0,10000009,51672408306.0,HYDROCORTISONE BUTYRATE 0.1% O,45.0,17928,9425,0,C,hydrocortisone butyrate TOPICA,1,14342
1,10083412,23334807.0,OZURDEX IMP 1 DS,,164760,137300,0,N,dexamethasone INTRAOCULR IMPLA,1,137300
2,10000013,93227234.0,AMOXICILLIN-CLAV K 400-57 MG C,20.0,11751,5038,0,C,amoxicillin/potassium clav ORA,1,8814
3,10083430,51991062033.0,ANASTROZOLE 1 MG TAB 30,30.0,40038,831,0,N,anastrozole ORAL TABLET 1 MG,1,900
4,10000056,,BERRY OINTMENT JAR 916 WHITE 1,12.0,0,4641,6962,N,,1,4641


In [6]:
# Moving all price columns and the contract flag to the right
columns_to_move = ['AWP Price', 'Acquisition Price', 'Retail Price', 'WAC Price', 'Contract Flag']

# Reordering the columns
for col in columns_to_move:
    data_cleaned[col] = data_cleaned.pop(col)

# Displaying the first few rows of the rearranged dataframe
data_cleaned.head()


Unnamed: 0,Item Number – 8 digit,NDC Number,Description,Size Qty,Generic Description,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag
0,10000009,51672408306.0,HYDROCORTISONE BUTYRATE 0.1% O,45.0,hydrocortisone butyrate TOPICA,1,17928,9425,0,14342,C
1,10083412,23334807.0,OZURDEX IMP 1 DS,,dexamethasone INTRAOCULR IMPLA,1,164760,137300,0,137300,N
2,10000013,93227234.0,AMOXICILLIN-CLAV K 400-57 MG C,20.0,amoxicillin/potassium clav ORA,1,11751,5038,0,8814,C
3,10083430,51991062033.0,ANASTROZOLE 1 MG TAB 30,30.0,anastrozole ORAL TABLET 1 MG,1,40038,831,0,900,N
4,10000056,,BERRY OINTMENT JAR 916 WHITE 1,12.0,,1,0,4641,6962,4641,N


In [7]:
import re

# Function to split the generic description into generic name and form
def split_description(desc):
    # Find the index of the first capital letter
    match = re.search(r'[A-Z]', desc)
    if match:
        index = match.start()
        return desc[:index].strip(), desc[index:].strip()
    else:
        return desc, ''

# Applying the function to the 'Generic Description' column
data_cleaned['Generic Name'], data_cleaned['Form'] = zip(*data_cleaned['Generic Description'].apply(split_description))

# Dropping the original 'Generic Description' column
data_cleaned.drop(columns=['Generic Description'], inplace=True)

# Displaying the first few rows of the updated dataframe
data_cleaned.head()


Unnamed: 0,Item Number – 8 digit,NDC Number,Description,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form
0,10000009,51672408306.0,HYDROCORTISONE BUTYRATE 0.1% O,45.0,1,17928,9425,0,14342,C,hydrocortisone butyrate,TOPICA
1,10083412,23334807.0,OZURDEX IMP 1 DS,,1,164760,137300,0,137300,N,dexamethasone,INTRAOCULR IMPLA
2,10000013,93227234.0,AMOXICILLIN-CLAV K 400-57 MG C,20.0,1,11751,5038,0,8814,C,amoxicillin/potassium clav,ORA
3,10083430,51991062033.0,ANASTROZOLE 1 MG TAB 30,30.0,1,40038,831,0,900,N,anastrozole,ORAL TABLET 1 MG
4,10000056,,BERRY OINTMENT JAR 916 WHITE 1,12.0,1,0,4641,6962,4641,N,,


In [10]:
# Removing rows where the 'Generic Name' is empty or consists only of whitespace
data_cleaned = data_cleaned[data_cleaned['Generic Name'].str.strip() != '']

# Displaying the first few rows of the updated dataframe
data_cleaned.head()


Unnamed: 0,Item Number – 8 digit,NDC Number,Description,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form
0,10000009,51672408306,HYDROCORTISONE BUTYRATE 0.1% O,45,1,17928,9425,0,14342,C,hydrocortisone butyrate,TOPICA
1,10083412,23334807,OZURDEX IMP 1 DS,,1,164760,137300,0,137300,N,dexamethasone,INTRAOCULR IMPLA
2,10000013,93227234,AMOXICILLIN-CLAV K 400-57 MG C,20,1,11751,5038,0,8814,C,amoxicillin/potassium clav,ORA
3,10083430,51991062033,ANASTROZOLE 1 MG TAB 30,30,1,40038,831,0,900,N,anastrozole,ORAL TABLET 1 MG
5,10083433,67457022005,ISOSULFAN BLUE 1% SDV 6X5 ML,6X5,1,763841,587712,0,636534,N,isosulfan blue,SUBCUT VIAL 1 %


In [12]:
# Function to split the description into name and size
def split_description_on_number(desc):
    # Find the index of the first number
    match = re.search(r'\d', desc)
    if match:
        index = match.start()
        return desc[:index].strip(), desc[index:].strip()
    else:
        return desc, ''

# Applying the function to the 'Description' column
data_cleaned['Name'], data_cleaned['Size'] = zip(*data_cleaned['Description'].apply(split_description_on_number))

# Dropping the original 'Description' column
data_cleaned.drop(columns=['Description'], inplace=True)

# Displaying the first few rows of the updated dataframe
data_cleaned.head()


Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size
0,10000009,51672408306,45,1,17928,9425,0,14342,C,hydrocortisone butyrate,TOPICA,HYDROCORTISONE BUTYRATE,0.1% O
1,10083412,23334807,,1,164760,137300,0,137300,N,dexamethasone,INTRAOCULR IMPLA,OZURDEX IMP,1 DS
2,10000013,93227234,20,1,11751,5038,0,8814,C,amoxicillin/potassium clav,ORA,AMOXICILLIN-CLAV K,400-57 MG C
3,10083430,51991062033,30,1,40038,831,0,900,N,anastrozole,ORAL TABLET 1 MG,ANASTROZOLE,1 MG TAB 30
5,10083433,67457022005,6X5,1,763841,587712,0,636534,N,isosulfan blue,SUBCUT VIAL 1 %,ISOSULFAN BLUE,1% SDV 6X5 ML


In [18]:
# Filtering the dataset for items with the exact generic name "lidocaine"
data_lidocaine = data_cleaned[data_cleaned['Generic Name'].str.lower() == 'lidocaine']

data_lidocaine


Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size
354,10001884,00264959420,24X250,1,21254,16184,0,17712,C,lidocaine,HCl/dextrose 5 %/PF,LIDOCAINE,0.4%-D5W 1 GM BAG 24
654,10004082,00264959410,24X500,1,24336,22093,0,22093,C,lidocaine,HCl/dextrose 5 %/PF,LIDOCAINE,0.4%-D5W 4MG/ML BAG
873,10089672,63323048737,25X30,1,35490,27307,0,29575,N,lidocaine,HCl/epinephrine/PF I,XYLOCAINE-EPI MPF,1%-1:200K VL
1052,10090166,63323049527,25X2,1,8790,6764,0,7325,N,lidocaine,HCl/PF INJECTION VIA,XYLOCAINE MPF,2% SDV 25X2 ML
1192,10090846,00496088206,5X5,1,4200,3850,5775,3500,N,lidocaine,TOPICAL CREAM (G) 4,LMX,4 4% CRM 5X5 GM
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38832,10059788,00409206605,10X5,1,3283,2239,0,2736,C,lidocaine,HCl/PF INJECTION VIA,LIDOCAINE HCL,2% SDV 10X5 ML
38854,10232237,69367020285,85,1,17469,3218,0,10101,C,lidocaine,HCl TOPICAL CREAM (G,LIDOCAINE HCL,3% CRM 85 GM
38868,10232207,69367020201,28.35,1,5969,1079,0,4202,C,lidocaine,HCl TOPICAL CREAM (G,LIDOCAINE HCL,3% CRM 28.35 GM
38955,10232507,00527600480,50,1,6464,2303,0,2800,C,lidocaine,HCl MUCOUS MEM SOLUT,LIDOCAINE HCL,4% TOPICAL SOL 5


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Selecting the reference item
reference_item_number = 10090846
reference_item = data_lidocaine[data_lidocaine['Item Number – 8 digit'] == reference_item_number]

# Check if the reference item is found
if reference_item.empty:
    similarity_results = "Reference item not found in the dataset."
else:
    # Extracting the form of the reference item
    reference_form = reference_item.iloc[0]['Form']

    # Preparing data for TF-IDF vectorization (including the reference form)
    forms = data_lidocaine['Form'].tolist()
    forms.insert(0, reference_form)

    # Vectorizing the forms using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(forms)

    # Calculating cosine similarity
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # Adding the similarity scores to the lidocaine dataset
    # We exclude the first element of cosine_similarities as it is the similarity of the reference item with itself
    data_lidocaine['Similarity'] = cosine_similarities

    # Displaying the dataset with similarity scores
    similarity_results = data_lidocaine

  # Display the first few rows of the results

# Filtering the dataset to show only items with a similarity score above 0.9
similarity_items = data_lidocaine[data_lidocaine['Similarity'] > 0.9]

similarity_items



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_lidocaine['Similarity'] = cosine_similarities


Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size,Similarity
1192,10090846,496088206,5X5,1,4200,3850,5775,3500,N,lidocaine,TOPICAL CREAM (G) 4,LMX,4 4% CRM 5X5 GM,1.0
2632,10098876,24357070115,15,1,1920,1093,1640,1600,C,lidocaine,TOPICAL CREAM (G) 4,ANECREAM,4% 15 GM,1.0
3416,10103480,496089230,30,1,2771,2540,3810,2309,N,lidocaine,TOPICAL CREAM (G) 5,RECTICARE,5% ANORECTAL CRM 30,1.0
3580,10104122,13925015901,28,1,5835,4326,0,4685,N,lidocaine,HCl TOPICAL CREAM (G,LIDOCAINE HCL,3% CRM 28 GM,0.921162
3581,10104123,13925015903,3,1,17505,12987,0,14065,N,lidocaine,HCl TOPICAL CREAM (G,LIDOCAINE HCL,3% CRM 85 GM,0.921162
3628,10104403,24357070230,30,1,4440,4070,6105,3700,N,lidocaine,TOPICAL CREAM (G) 5,ANECREAM,5% 30 GM,1.0
3732,10104982,24357070215,15,1,2400,2200,3300,2000,N,lidocaine,TOPICAL CREAM (G) 5,ANECREAM,5% 15 GM,1.0
5235,10114467,59088099707,85,1,12252,4726,0,10210,C,lidocaine,HCl TOPICAL CREAM (G,LIDOCAINE HCL,3% CRM 85 GM PPR,0.921162
5256,10114680,59088099703,28,1,5160,1904,0,4300,C,lidocaine,HCl TOPICAL CREAM (G,LIDOCAINE HCL,3% CRM 28 GM PPR,0.921162
7412,10136471,24357070106,5X5,1,2100,1439,2159,1750,C,lidocaine,TOPICAL CREAM (G) 4,ANECREAM,4 % TUBE CRM 5X5 GM,1.0


In [25]:
# Extracting the size of the reference item
reference_size = reference_item.iloc[0]['Size']

# Preparing data for TF-IDF vectorization (including the reference size)
sizes = similarity_items['Size'].tolist()
sizes.insert(0, reference_size)

# Vectorizing the sizes using TF-IDF
tfidf_matrix_sizes = vectorizer.fit_transform(sizes)

# Calculating cosine similarity for sizes
cosine_similarities_sizes = cosine_similarity(tfidf_matrix_sizes[0:1], tfidf_matrix_sizes[1:]).flatten()

# Adding the similarity scores for sizes to the high_similarity_items dataset
# We exclude the first element of cosine_similarities_sizes as it is the similarity of the reference item with itself
similarity_items['Size Similarity'] = cosine_similarities_sizes

# Displaying the dataset with size similarity scores
high_similarity_items_filtered_by_size = similarity_items[similarity_items['Size Similarity'] > 0]
high_similarity_items_filtered_by_size  # Display the first few rows of the results


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similarity_items['Size Similarity'] = cosine_similarities_sizes


Unnamed: 0,Item Number – 8 digit,NDC Number,Size Qty,Retail Pack Quantity,AWP Price,Acquisition Price,Retail Price,WAC Price,Contract Flag,Generic Name,Form,Name,Size,Similarity,Size Similarity
1192,10090846,496088206,5X5,1,4200,3850,5775,3500,N,lidocaine,TOPICAL CREAM (G) 4,LMX,4 4% CRM 5X5 GM,1.0,1.0
2632,10098876,24357070115,15,1,1920,1093,1640,1600,C,lidocaine,TOPICAL CREAM (G) 4,ANECREAM,4% 15 GM,1.0,0.152849
3416,10103480,496089230,30,1,2771,2540,3810,2309,N,lidocaine,TOPICAL CREAM (G) 5,RECTICARE,5% ANORECTAL CRM 30,1.0,0.095978
3580,10104122,13925015901,28,1,5835,4326,0,4685,N,lidocaine,HCl TOPICAL CREAM (G,LIDOCAINE HCL,3% CRM 28 GM,0.921162,0.242391
3581,10104123,13925015903,3,1,17505,12987,0,14065,N,lidocaine,HCl TOPICAL CREAM (G,LIDOCAINE HCL,3% CRM 85 GM,0.921162,0.229576
3628,10104403,24357070230,30,1,4440,4070,6105,3700,N,lidocaine,TOPICAL CREAM (G) 5,ANECREAM,5% 30 GM,1.0,0.162848
3732,10104982,24357070215,15,1,2400,2200,3300,2000,N,lidocaine,TOPICAL CREAM (G) 5,ANECREAM,5% 15 GM,1.0,0.152849
5235,10114467,59088099707,85,1,12252,4726,0,10210,C,lidocaine,HCl TOPICAL CREAM (G,LIDOCAINE HCL,3% CRM 85 GM PPR,0.921162,0.165941
5256,10114680,59088099703,28,1,5160,1904,0,4300,C,lidocaine,HCl TOPICAL CREAM (G,LIDOCAINE HCL,3% CRM 28 GM PPR,0.921162,0.170592
7412,10136471,24357070106,5X5,1,2100,1439,2159,1750,C,lidocaine,TOPICAL CREAM (G) 4,ANECREAM,4 % TUBE CRM 5X5 GM,1.0,0.684028
